Rename modeling_patch_moe.py to modeling_FalconTST.py

Browse files

Files changed (1) hide show

modeling_patch_moe.py → modeling_FalconTST.py +320 -426

modeling_patch_moe.py → modeling_FalconTST.py RENAMED Viewed

@@ -1,14 +1,20 @@
 import torch
-from typing import Optional
 import torch.nn as nn
 import torch.nn.functional as F
 from torch import Tensor
 import math
 from functools import reduce
 from abc import ABC, abstractmethod
-from .configuration_patch_moe import PatchMoeConfig
-from .ts_generation_mixin import PatchMoEGenerationMixin
-from transformers import PreTrainedModel
 def _rotate_half(x: Tensor, rotary_interleaved: bool) -> Tensor:
@@ -31,12 +37,12 @@ def _rotate_half(x: Tensor, rotary_interleaved: bool) -> Tensor:
 def _apply_rotary_pos_emb_bshd(
-    t: Tensor,
-    freqs: Tensor,
-    rotary_interleaved: bool = False,
-    multi_latent_attention: bool = False,
-    mscale: float = 1.0,
-) -> Tensor:
     """Apply rotary positional embedding to input tensor T.
     check https://kexue.fm/archives/8265 for detailed formulas
@@ -94,39 +100,24 @@ def topk_softmax_with_capacity(
     """
     assert logits.dim() == 2, f"Expected 2D logits [num_tokens, num_experts], got {logits.dim()}."
-    def compute_topk(
-        scores,
-        topk,
-    ):
         return torch.topk(scores, k=topk, dim=1)
     if score_function == "softmax":
         if use_pre_softmax:
             scores = torch.softmax(logits, dim=-1, dtype=torch.float32).type_as(logits)
-            probs, top_indices = compute_topk(
-                scores,
-                topk,
-            )
         else:
-            scores, top_indices = compute_topk(
-                logits,
-                topk,
-            )
             probs = torch.softmax(scores, dim=-1, dtype=torch.float32).type_as(logits)
     elif score_function == "sigmoid":
         scores = torch.sigmoid(logits.float()).type_as(logits)
         if expert_bias is not None:
             scores_for_routing = scores + expert_bias
-            _, top_indices = compute_topk(
-                scores_for_routing,
-                topk,
-            )
             scores = torch.gather(scores, dim=1, index=top_indices).type_as(logits)
         else:
-            scores, top_indices = compute_topk(
-                scores,
-                topk,
-            )
         probs = scores / (scores.sum(dim=-1, keepdim=True) + 1e-20) if topk > 1 else scores
     else:
         raise ValueError(f"Invalid score_function: {score_function}")
@@ -165,7 +156,7 @@ class RotaryEmbedding(nn.Module):
         dim = kv_channels
         self.rotary_interleaved = rotary_interleaved
-        device = "cpu" if use_cpu_initialization else torch.cuda.current_device()
         self.inv_freq = 1.0 / (
             rotary_base ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
         )
@@ -180,9 +171,8 @@ class RotaryEmbedding(nn.Module):
         freqs = torch.outer(seq, self.inv_freq)  # [seq len, dim]
         return freqs
-    def forward(
-        self, max_seq_len: int, offset: int = 0, packed_seq: bool = False, device=None
-    ) -> Tensor:
         """Forward pass of RoPE embedding.
         Args:
@@ -195,7 +185,7 @@ class RotaryEmbedding(nn.Module):
         """
         if device is None:
             device = self.inv_freq.device
-        if self.inv_freq.device.type == "cpu":
             # move `inv_freq` to GPU once at the first micro-batch forward pass
             self.inv_freq = self.inv_freq.to(device=device)
@@ -213,7 +203,7 @@ class RotaryEmbedding(nn.Module):
         return emb.to(device)
     def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
-        state_dict.pop(f"{prefix}inv_freq", None)
         return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
     def get_rotary_seq_len(
@@ -247,9 +237,9 @@ class RMSNorm(nn.Module):
         self.variance_epsilon = eps
     def forward(self, hidden_states):
-        """
-        hidden_states [bs, patch_num, d_model]
-        """
         input_dtype = hidden_states.dtype
         hidden_states = hidden_states.to(torch.float32)
         variance = hidden_states.pow(2).mean(-1, keepdim=True)
@@ -257,7 +247,7 @@ class RMSNorm(nn.Module):
         return self.weight * hidden_states.to(input_dtype)
-class TEDotProductAttention(nn.Module):
     """Implement the scaled dot product attention with softmax.
     Arguments
     ---------
@@ -274,14 +264,7 @@ class TEDotProductAttention(nn.Module):
         self.softmax_scale = softmax_scale
         self.drop = nn.Dropout(attention_dropout)
-    def forward(
-        self,
-        q,
-        k,
-        v,
-        attention_mask,
-        causal=None,
-    ):
         """Implements the multihead softmax attention.
         Arguments
         ---------
@@ -292,45 +275,47 @@ class TEDotProductAttention(nn.Module):
         """
         causal = self.causal if causal is None else causal
-        q = q.transpose(0, 1).contiguous()
-        k = k.transpose(0, 1).contiguous()
-        v = v.transpose(0, 1).contiguous()
         batch_size, seq_len = q.shape[0], q.shape[1]
         softmax_scale = self.softmax_scale or 1.0 / math.sqrt(q.shape[-1])
-        # scores
         scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale)
-        scores = scores.masked_fill(attention_mask == 0, float("-1e9"))
         # Softmax
         attention = torch.softmax(scores, dim=-1, dtype=v.dtype)
         # Dropout
         attention_drop = self.drop(attention)
         output = torch.einsum("bhts,bshd->bthd", attention_drop, v)
-        output = output.reshape(batch_size, seq_len, -1).transpose(0, 1).contiguous()
         return output
 class SelfAttention(nn.Module):
-    def __init__(
-        self,
-        config,
-    ):
         super().__init__()
         self.config = config
-        q_layernorm = config.q_layernorm
-        k_layernorm = config.k_layernorm
         self.hidden_size = config.hidden_size
-        self.core_attention = TEDotProductAttention()
-        self.linear_proj = nn.Linear(
-            self.hidden_size,
-            self.hidden_size,
-            bias=config.add_bias_linear,
-        )
-        self.linear_qkv = nn.Linear(
-            self.hidden_size,
-            3 * self.hidden_size,
-            bias=config.add_bias_linear,
         )
         if q_layernorm:
             self.q_layernorm = RMSNorm(self.hidden_size)
         else:
@@ -340,48 +325,38 @@ class SelfAttention(nn.Module):
         else:
             self.k_layernorm = IdentityOp()
-    def forward(self, x, attention_mask, rotary_pos_emb):
         qkv = self.linear_qkv(x)
-        qkv = qkv.view(qkv.size(0), qkv.size(1), self.config.num_attention_heads, -1)
         q, k, v = qkv.chunk(3, dim=-1)
-        # q/k norm
-        q = self.q_layernorm(q)
-        k = self.k_layernorm(k)
         # Apply rotary encoding to q and k
         rotary_pos_emb = (rotary_pos_emb,) * 2
         q_pos_emb, k_pos_emb = rotary_pos_emb
         q = _apply_rotary_pos_emb_bshd(q, q_pos_emb)
         k = _apply_rotary_pos_emb_bshd(k, k_pos_emb)
-        # attention
         attn_output = self.core_attention(q, k, v, attention_mask)
         output = self.linear_proj(attn_output)
         return output
 class MLP(nn.Module):
-    def __init__(self, config, in_features):
         super().__init__()
-        self.config = config
-        self.linear_fc1 = nn.Linear(
-            in_features,
-            self.config.moe_ffn_hidden_size * 2,
-            bias=self.config.add_bias_linear,
-        )
-        self.linear_fc2 = nn.Linear(
-            self.config.moe_ffn_hidden_size,
-            self.config.hidden_size,
-            bias=self.config.add_bias_linear,
-        )
     def forward(self, x):
         x = self.swiglu(self.linear_fc1(x))
         x = self.linear_fc2(x)
         return x
-    def swiglu(self, y):
         """Performs SwiGLU (Swish-Gated Linear Unit) activation function.
         Args:
@@ -404,9 +379,9 @@ class TransformerLayer(nn.Module):
             self.input_layernorm = IdentityOp()
         self.self_attention = SelfAttention(config)
         self.pre_mlp_layernorm = RMSNorm(self.config.hidden_size)
-        self.mlp = MLP(config, self.config.hidden_size)
-    def forward(self, x, attention_mask, rotary_pos_emb):
         residual = x
         x = self.input_layernorm(x)
         x = self.self_attention(x, attention_mask, rotary_pos_emb)
@@ -418,113 +393,84 @@ class TransformerLayer(nn.Module):
         return x
-class PatchMoEExpert_v2(nn.Module):
-    def __init__(self, config, patch_input_size=32, expert_output_size=336, final_layernorm=True):
         super().__init__()
         self.config = config
-        self.patch_size = patch_input_size
         self.seq_length = config.seq_length
-        assert (
-            self.seq_length % self.patch_size == 0
-        ), f"invalid patch_size: {self.patch_size} when seq_length={self.seq_length}"
         self.patch_num = self.seq_length // self.patch_size
         self.flatten_size = self.patch_num * self.config.hidden_size
-        self.layers = nn.ModuleList(
-            [
-                TransformerLayer(config, input_layernorm=config.transformer_input_layernorm)
-                for _ in range(self.config.expert_num_layers)
-            ]
-        )
         if final_layernorm:
             self.final_layernorm = RMSNorm(self.config.hidden_size)
         else:
             self.final_layernorm = IdentityOp()
         self.patch_embedding = MLP(config, in_features=patch_input_size)
-        self.output_layer = nn.Linear(
-            in_features=self.flatten_size,
-            out_features=expert_output_size,
-            bias=False,
-        )
     def _forward_patch_embedding(
         self,
-        input: Tensor,  # [batch_size, seq_len]
     ):
         """
         Perform patch embedding on the input time series.
-        This method applies a linear transformation to the input tensor to
         convert it into patches and then embeds these patches using a linear layer.
         """
         batch_size, seq_len = input.shape
-        assert (
-            seq_len == self.seq_length
-        ), f"Expected sequence length {self.seq_length}, but got {seq_len}"
         # Create input_mask based on pad_length
         # When a time point is masked, its value is mask_pad_value(default:255.)
-        input_mask = (
-            input != self.config.mask_pad_value
-        )  # 0: mask, 1: unmask   [batch_size, seq_len]
         # Open question: does zeroing the masked values have the same effect as the attention_mask?
-        input_data = input * input_mask  # [batch_size, seq_len]
         # Patchify the input
-        input_data = input_data.unfold(
-            dimension=-1, size=self.patch_size, step=self.patch_size
-        ).contiguous()  # input [batch_size, patch_num, patch_size]
-        hidden_states = self.patch_embedding(
-            input_data
-        )  # hidden_states [batch_size, patch_num, hidden_size]
-        hidden_states = hidden_states.transpose(
-            0, 1
-        ).contiguous()  # hidden_states [patch_num, batch_size, hidden_size], To adapt to the Megatron
         # Patchify the mask: only the entire time points in a patch are masked then this patch is masked
-        attention_mask = input_mask.unfold(
-            dimension=-1, size=self.patch_size, step=self.patch_size
-        ).contiguous()  # [batch_size, patch_num, patch_size]
-        attention_mask = (
-            attention_mask.sum(-1) == self.patch_size
-        )  # [batch_size, patch_num]   # 0: mask, 1: unmask
-        attention_mask[:, -1] = True  # The last patch is not masked
         _, patch_num = attention_mask.shape
-        attention_mask = attention_mask.unsqueeze(2).repeat(
-            1, 1, patch_num
-        ) * attention_mask.unsqueeze(1).repeat(
-            1, patch_num, 1
-        )  # [batch_size, patch_num, patch_num]
-        attention_mask = attention_mask.unsqueeze(
-            1
-        ).contiguous()  # [batch_size, 1, patch_num, patch_num]
         return hidden_states, attention_mask, input_mask
-    def _forward_output(
-        self, hidden_states, output_scale=None, input_mask=None, inference_context=None
-    ):
         """
-        Perform a forward pass through the output layer.
-        Args:
-            expert_input (Tensor): Expert input of shape [batch_size, seq_len]
-            hidden_states (Tensor): Transformed hidden states of shape [patch_num, batch_size, hidden_size]
-            output_scale (Tensor, optional): Expert probabilities for the output layer  [batch_size]
-            input_mask (Tensor, optional): Expert input mask of shape [batch_size, seq_len], 0:mask, 1:unmask
-        Returns:
-            expert_output (Tensor): Expert output of shape [batch_size, expert_output_size]
         """
         # [patch_num, batch_size, hidden_size] -> [batch_size, flatten_size (patch_num * hidden_size)]
         patch_num, batch_size, hidden_size = hidden_states.shape
-        assert (
-            patch_num * hidden_size
-        ) == self.flatten_size, f"patch_num ({patch_num}) * hidden_size ({hidden_size}) != flatten_size ({self.flatten_size})"
         hidden_states = hidden_states.transpose(0, 1).reshape(-1, self.flatten_size).contiguous()
-        expert_output = self.output_layer(hidden_states)  # [batch_size, expert_output_size]
         if output_scale is not None:
             original_dtype = expert_output.dtype
             expert_output = expert_output * output_scale.unsqueeze(-1)
@@ -532,33 +478,29 @@ class PatchMoEExpert_v2(nn.Module):
         return expert_output
-    def forward(self, expert_input, rotary_pos_emb, expert_probs=None):
         hidden_states, attention_mask, input_mask = self._forward_patch_embedding(expert_input)
         for layer in self.layers:
-            hidden_states = layer(
-                hidden_states, attention_mask, rotary_pos_emb[: hidden_states.shape[0]]
-            )
         hidden_states = self.final_layernorm(hidden_states)
         expert_output = self._forward_output(hidden_states, expert_probs, input_mask)
         return expert_output
-class SequentialPatchMoE(nn.Module):
-    def __init__(self, config, expert_output_size=336):
         super().__init__()
         self.config = config
         self.expert_output_size = expert_output_size
-        self.local_experts = nn.ModuleList(
-            [
-                PatchMoEExpert_v2(
-                    config,
-                    expert_output_size=expert_output_size,
-                    patch_input_size=config.patch_size_list[expert_id],
-                    final_layernorm=config.moe_expert_final_layernorm,
-                )
-                for expert_id in range(config.num_moe_experts)
-            ]
-        )
     def forward(self, input, routing_map, rotary_pos_emb, expert_probs):
         expert_output_list = []
@@ -566,19 +508,15 @@ class SequentialPatchMoE(nn.Module):
         for i, expert in enumerate(self.local_experts):
             token_mask = routing_map[:, i].bool()  # shape (batch,)
-            current_inputs = input[token_mask]  # (num_tokens_for_expert, seq_len)
-            current_probs = expert_probs[token_mask, i]
             if current_inputs.numel() == 0:
-                expert_output = torch.zeros(
-                    0, self.expert_output_size, device=input.device, dtype=input.dtype
-                )
             else:
                 expert_output = expert(current_inputs, rotary_pos_emb, current_probs)
-            full_output = torch.zeros(
-                batch_size, self.expert_output_size, device=input.device, dtype=input.dtype
-            )
             full_output[token_mask] = expert_output
             expert_output_list.append(full_output)
@@ -601,7 +539,7 @@ class RouterGatingLinearFunction(torch.autograd.Function):
         ctx.weight_dtype = weight.dtype
         inp_shape = inp.shape
         inp = inp.view(-1, inp_shape[-1])
         output = torch.mm(inp.to(router_dtype), weight.to(router_dtype).t())
         output = output.view(*inp_shape[:-1], -1)
@@ -617,12 +555,11 @@ def router_gating_linear(inp: torch.Tensor, weight: torch.Tensor, router_dtype:
     return RouterGatingLinearFunction.apply(inp, weight, router_dtype)
-class Router(ABC, nn.Module):
     """Base Router class"""
     def __init__(
-        self,
-        config: PatchMoeConfig,
     ) -> None:
         """
         Initialize the Router module.
@@ -635,28 +572,24 @@ class Router(ABC, nn.Module):
         self.config = config
         # Initialize the gate weights.
         if self.config.patch_size_list is not None:
             assert self.config.moe_router_input_size is not None
             self.weight = torch.nn.Parameter(
-                torch.empty(
-                    (self.config.num_moe_experts, self.config.moe_router_input_size),
-                    dtype=torch.float32,
-                )
             )
         else:
             self.weight = torch.nn.Parameter(
-                torch.empty(
-                    (self.config.num_moe_experts, self.config.hidden_size), dtype=torch.float32
-                )
             )
         self.reset_parameters()
     def reset_parameters(self):
         """Reset the router parameters."""
-        torch.nn.init.normal_(self.weight, mean=0, std=self.config.init_method_std)
         self.weight.data = self.weight.data.to(dtype=self.config.torch_dtype)
     def gating(self, input: torch.Tensor):
         """Forward pass of the router gate.
@@ -700,8 +633,7 @@ class TopKRouter(Router):
     """Route each token to the top-k experts."""
     def __init__(
-        self,
-        config: PatchMoeConfig,
     ) -> None:
         """Initialize the zero token dropping router.
@@ -716,17 +648,18 @@ class TopKRouter(Router):
         self.enable_expert_bias = self.config.moe_router_enable_expert_bias
         if self.enable_expert_bias:
             self.register_buffer(
-                "local_tokens_per_expert",
                 torch.zeros(self.config.num_moe_experts, dtype=torch.float32),
                 persistent=False,
             )
             self.register_buffer(
-                "expert_bias", torch.zeros(self.config.num_moe_experts, dtype=torch.float32)
             )
         else:
             self.local_tokens_per_expert = None
             self.expert_bias = None
     def routing(self, logits: torch.Tensor):
         """Top-k routing function
@@ -763,7 +696,7 @@ class TopKRouter(Router):
         return scores, routing_map
-class PatchMoEMoELayer(nn.Module):
     def __init__(self, config, layer_number):
         super().__init__()
         self.config = config
@@ -781,50 +714,46 @@ class PatchMoEMoELayer(nn.Module):
                 self.expert_output_size = config.seq_length
         if self.is_last_layer and self.config.heterogeneous_moe_layer:
-            # If heterogeneous_moe_layer is True, the backcast will be None
-            self.backcast_layernorm = None
         else:
             self.backcast_layernorm = RMSNorm(self.seq_length)
-        self.experts = SequentialPatchMoE(
-            config,
-            expert_output_size=self.expert_output_size,
-        )
-        self.shared_experts = PatchMoEExpert_v2(
-            config,
-            expert_output_size=self.expert_output_size,
-            patch_input_size=config.shared_patch_size,
-            final_layernorm=config.moe_expert_final_layernorm,
-        )
     def time_series_preprocess(self, input: torch.Tensor):
         """
-        Preprocess time series(sample) for dispatch.
-        Applies RevIN to input time series(sample), and process the input mask (0: mask, 1: unmask)
-        Args:
-            input (torch.Tensor): The input time series (samples) to the MoE layer. [batch_size, seq_len]
-        Returns:
-            input (torch.Tensor): The (RevIN) backcast time series (samples). [batch_size, seq_len]
-            means (torch.Tensor): The means of the non-masked backcast time series (samples). [batch_size, 1]
-            stdev (torch.Tensor): The standard deviation of the non-masked backcast time series (samples). [batch_size, 1]
         """
         batch_size, seq_len = input.shape
-        assert seq_len == self.seq_length, f"seq_len {seq_len} != self.seq_length {self.seq_length}"
         # Create input_mask based on pad_length
         # When a time point is masked, its value is mask_pad_value(default:255.)
-        input_mask = (
-            input != self.config.mask_pad_value
-        )  # 0: mask, 1: unmask   [batch_size, seq_len]
         self.input_mask = input_mask
         return input
     def router_and_preprocess(self, backcast: torch.Tensor):
         """Compute and preprocess time series(sample) routing for dispatch.
@@ -836,22 +765,20 @@ class PatchMoEMoELayer(nn.Module):
         # backcast [batch_size, seq_len]    means/stdev [batch_size, 1]
         backcast = self.time_series_preprocess(backcast)
-        residual = backcast  # residual: [batch_size, seq_len], the input to the shared experts
         # TODO: Check the effect of the masked values on the router
-        probs, routing_map = self.router(
-            backcast * self.input_mask
-        )  # probs/routing_map: [batch_size, num_experts]
         return backcast, probs, residual, routing_map
     def experts_compute(
         self,
-        input: torch.Tensor,  # [num_permuted_samples_after_dispatch, seq_len]
-        probs: torch.Tensor,  # [num_permuted_samples_after_dispatch]
-        residual: torch.Tensor,  # [batch_size, seq_len]
         rotary_pos_emb: torch.Tensor,
-        routing_map: torch.Tensor,  # [seq_len, 1, 1, kv_channels(hidden_size // num_heads)]
     ):
         """Computes the output of the experts on the dispatched time series(sample).
@@ -863,19 +790,20 @@ class PatchMoEMoELayer(nn.Module):
         """
         # shared_expert_output: [batch_size, seq_len (+ pred_len)]
         shared_experts_output = self.shared_experts(residual, rotary_pos_emb)
         # dispatched_input (global_input_tokens):   [num_permuted_samples_after_dispatch_postprocess(sorted), seq_len]
         # tokens_per_expert (global_probs):         [num_experts]
         # permuted_probs (global_probs):            [num_permuted_samples_after_dispatch_postprocess(sorted)]
         experts_output = self.experts(input, routing_map, rotary_pos_emb, probs)
         return experts_output, shared_experts_output
     def postprocess(
-        self,
-        backcast: torch.Tensor,  # [batch_size, seq_len]
-        forecast: torch.Tensor,  # [batch_size, pred_len]
         output_backcast: torch.Tensor,  # [batch_size, seq_len]
         output_forecast: torch.Tensor,  # [batch_size, pred_len]
     ):
@@ -889,21 +817,20 @@ class PatchMoEMoELayer(nn.Module):
             stdev (torch.Tensor): The standard deviation of the non-masked backcast time series (samples).  [batch_size, 1]
             backcast_mask (torch.Tensor): The previous layer's backcast mask of time series (samples) .     [batch_size, seq_len]
         """
-        if output_backcast is not None:
-            output_backcast = self.backcast_layernorm(output_backcast)  # LayerNorm
             if self.config.residual_backcast:
                 output_backcast = backcast - output_backcast
-            output_backcast[~self.input_mask] = (
-                self.config.mask_pad_value
-            )  # Important! Recover the mask time point back to mask_pad_value(default:255.)
-        if (
-            self.config.do_expert_forecast and forecast is not None
-        ):  # The first layer's forecast is None
             output_forecast = forecast + output_forecast
         return output_backcast, output_forecast
     def combine(
         self,
@@ -916,67 +843,60 @@ class PatchMoEMoELayer(nn.Module):
         experts (e.g., via an All-to-All communication). It then adds the output
         from the shared expert if it exists.
         """
-        assert (
-            experts_output.shape == shared_experts_output.shape
-        ), f"experts_output shape {experts_output.shape} doesn't equal to shared_experts_output shape:{shared_experts_output.shape}"
         output = experts_output + shared_experts_output
         if self.is_last_layer and self.config.heterogeneous_moe_layer:
             output_backcast = None
             output_forecast = output
-            assert (
-                output_forecast.shape[1] == self.pred_length
-            ), f"heterogeneous_moe_layer=True, expected the last moe layer's output pred len: {self.pred_length}, but got {output_forecast.shape[1]}"
         else:
             # Note: the masked time points here may not equal mask_pad_value (default: 255.); they will be postprocessed
-            output_backcast = output[:, : self.seq_length]  # [batch_size, seq_len]
             if self.config.do_expert_forecast:
-                output_forecast = output[:, self.seq_length :]  # [batch_size, pred_len]
-                assert (
-                    output_forecast.shape[1] == self.pred_length
-                ), f"do_expert_forecast=True, expected the last moe layer's output pred len: {self.pred_length}, but got {output_forecast.shape[1]}"
             else:
                 output_forecast = None
         return output_backcast, output_forecast
-    def forward(self, backcast, forecast, rotary_pos_emb):
         inputs, probs, residual, routing_map = self.router_and_preprocess(backcast)
-        experts_output, shared_experts_output = self.experts_compute(
-            inputs, probs, residual, rotary_pos_emb, routing_map
-        )
         output_backcast, output_forecast = self.combine(experts_output, shared_experts_output)
-        output_backcast, output_forecast = self.postprocess(
-            backcast, forecast, output_backcast, output_forecast
-        )
         return output_backcast, output_forecast
-class PatchMoEBlock(nn.Module):
-    def __init__(self, config):
         super().__init__()
         self.config = config
-        self.layers = nn.ModuleList(
-            [
-                PatchMoEMoELayer(config, layer_num + 1)
                 for layer_num in range(self.config.num_hidden_layers)
-            ]
-        )
-    def forward(self, x, rotary_pos_emb):
         backcast = x
         forecast = None
         for layer in self.layers:
-            backcast, forecast = layer(backcast, forecast, rotary_pos_emb)
-        return backcast, forecast
-class PatchMoEPreTrainedModel(PreTrainedModel):
-    config_class = PatchMoeConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["PatchMoEMoELayer"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
     _supports_sdpa = False
@@ -992,77 +912,73 @@ class PatchMoEPreTrainedModel(PreTrainedModel):
             if module.padding_idx is not None:
                 module.weight.data[module.padding_idx].zero_()
-class PatchMoEModel(PatchMoEPreTrainedModel):
-    def __init__(self, config: PatchMoeConfig):
         super().__init__(config)
         self.config = config
         self.seq_length = config.seq_length
         self.rotary_pos_emb = RotaryEmbedding(
-            kv_channels=self.config.kv_channels,
-            rotary_base=config.rotary_base,
-            use_cpu_initialization=self.config.use_cpu_initialization,
-            rotary_interleaved=self.config.rotary_interleaved,
         )
-        self.decoder = PatchMoEBlock(config=config)
         if self.config.do_expert_forecast and self.config.heterogeneous_moe_layer:
             self.output_layer = IdentityOp()
         else:
-            self.output_layer = nn.Linear(
-                in_features=self.seq_length,
-                out_features=self.config.pred_length,
-                bias=self.config.add_bias_linear,
-            )
     def revin(
         self,
-        input: Tensor,  # [batch_size, seq_len]
-        input_mask: Tensor,  # [batch_size, seq_len] 0:mask, 1:unmask
     ):
-        """Normalization from Non-stationary Transformer"""
         input_data = input * input_mask
-        sum_per_sample = torch.sum(
-            input_data, dim=1, keepdim=True
-        ).detach()  # [batch_size, 1], torch.bfloat16
-        count_per_sample = torch.sum(
-            input_mask, dim=1, keepdim=True
-        ).detach()  # [batch_size, 1], torch.int64
-        assert (
-            torch.any(count_per_sample == 0) == False
-        ), f"There is zero in count_per_sample, shape: {input[torch.where(count_per_sample.squeeze(1) == 0)[0]]}"
-        means = sum_per_sample / count_per_sample  # [batch_size, 1]
         input_data = input_data - means
         input_data = input_data * input_mask
-        var_per_sample = (
-            torch.sum(input_data**2, dim=1, keepdim=True).detach() / count_per_sample
-        )  # [batch_size, 1]
         stdev = torch.sqrt(var_per_sample + 1e-9)
         input_data = input_data / stdev
         input_data = input_data * input_mask
-        # recover the mask_pad_value(default:255.)
         input = input * ~(input_mask) + input_data
         return input, means, stdev
     def forward(self, input, revin):
         batch_size, input_len = input.shape
         if input_len > self.seq_length:
-            input = input[:, -self.seq_length :]
         elif input_len < self.seq_length:
             pad_len = self.seq_length - input_len
-            input = F.pad(
-                input, pad=(pad_len, 0), mode="constant", value=self.config.mask_pad_value
-            )
         input_len = self.seq_length
-        input_mask = input != self.config.mask_pad_value
         # Step1. RevIN
         if revin:
             input, means, stdev = self.revin(input, input_mask)
         # Step2. Get rotary_pos_emb
         # rotary_pos_emb [input_len, 1, 1, kv_channels(hidden_size // num_heads)]
         rotary_pos_emb = self.rotary_pos_emb(input_len, device=input.device)
@@ -1070,21 +986,23 @@ class PatchMoEModel(PatchMoEPreTrainedModel):
         # Step3. Do one-step inference to get mixed forecasts from multiple forecast heads
         # mixed_pred: [batch_size, sum(multi_forecast_head)]
         mixed_pred = self._inference_step(
-            input=input, input_mask=input_mask, rotary_pos_emb=rotary_pos_emb
         )
-        # Step4. Based on the mixed forecasts, do auto-regressive inference according to
         # the step list of each forecast head
-        if self.config.multi_forecast_head_type == "single":
             final_output = self._auto_regressive_single_head(
-                input=input,
-                input_mask=input_mask,
-                patchmoe_forecast=mixed_pred,
-                rotary_pos_emb=rotary_pos_emb,
             )
         else:
             raise NotImplementedError
         # Step5. RevIN
         if revin:
             final_output = final_output * (stdev.repeat(1, self.config.inference_length))
@@ -1093,58 +1011,57 @@ class PatchMoEModel(PatchMoEPreTrainedModel):
         return final_output.detach().float()
     def _inference_step(
-        self,
-        input,
-        input_mask,
         rotary_pos_emb,
-    ):
         if self.config.do_base_forecast:
             base_forecast, _ = self.base_output_layer(input)
         else:
             base_forecast = None
         decoder_backcast, decoder_forecast = self.decoder(
-            input,  # [batch_size, seq_len]
-            rotary_pos_emb,  # [input_len, 1, 1, kv_channels(hidden_size // num_heads)]
         )
         if self.config.do_expert_forecast:
-            assert decoder_forecast is not None, f"decoder_forecast is None"
             if self.config.heterogeneous_moe_layer:
                 decoder_forecast = self.output_layer(decoder_forecast)  # IdentityOp
             else:
-                final_forecast = self.output_layer(decoder_backcast * input_mask)
                 decoder_forecast = decoder_forecast + final_forecast
         else:
             # The decoder_backcast contains the mask_pad_val(default:255.)
             decoder_forecast, _ = self.output_layer(decoder_backcast * input_mask)
         if self.config.do_base_forecast:
-            assert base_forecast is not None, f"base_forecast is None"
-            patchmoe_forecast = base_forecast + decoder_forecast
         else:
-            patchmoe_forecast = decoder_forecast
-        return patchmoe_forecast
     def _auto_regressive_single_head(
         self,
-        input,  # [batch_size, seq_len]
-        input_mask,  # [batch_size, seq_len]
-        patchmoe_forecast,  # [batch_size, max(multi_forecast_head)]
-        rotary_pos_emb,  # [seq_len, 1, 1, kv_channels(hidden_size // num_heads)]
-        auto_regressive_strategy="from_long_to_short",
     ):
         """auto regressive prediction with [single] head"""
-        assert (
-            self.config.multi_forecast_head_type == "single"
-        ), f"_auto_regressive_single_head only support multi_forecast_head_type==single "
-        if auto_regressive_strategy == "from_long_to_short":
             # From long to short
             multi_forecast_head_list = sorted(self.config.multi_forecast_head_list, reverse=True)
-            final_output = patchmoe_forecast
             while final_output.shape[1] < self.config.inference_length:
                 # adaptive choose the forecast head
                 remain_pred_len = self.config.inference_length - final_output.shape[1]
@@ -1154,39 +1071,28 @@ class PatchMoEModel(PatchMoEPreTrainedModel):
                 if idx == len(multi_forecast_head_list):
                     idx = len(multi_forecast_head_list) - 1
                 head_pred_len = multi_forecast_head_list[idx]
                 # one-step model prediction
-                input = torch.cat([input, patchmoe_forecast], dim=1)[
-                    :, -self.seq_length :
-                ].contiguous()
                 input_mask = torch.cat(
-                    [
-                        input_mask,
-                        torch.ones(
-                            patchmoe_forecast.shape,
-                            dtype=input_mask.dtype,
-                            device=input_mask.device,
-                        ),
-                    ],
-                    dim=1,
-                )[
-                    :, -self.seq_length :
-                ].contiguous()  # 0:mask, 1:unmask
-                patchmoe_forecast = self._inference_step(
-                    input=input,
-                    input_mask=input_mask,
-                    rotary_pos_emb=rotary_pos_emb,
                 )
                 # the core idea of multi forecast head type of [single]
-                patchmoe_forecast = patchmoe_forecast[:, :head_pred_len]
-                final_output = torch.cat([final_output, patchmoe_forecast], dim=1)
-            final_output = final_output[:, : self.config.inference_length]
-        elif auto_regressive_strategy == "from_short_to_long":
             # From short to long
             # in validate_args, it has been sorted, and check the valid config
             multi_forecast_head_list = sorted(self.config.multi_forecast_head_list)
@@ -1197,15 +1103,14 @@ class PatchMoEModel(PatchMoEPreTrainedModel):
                 else:
                     ar_step = min(
                         self.config.autoregressive_step_list[idx],
-                        self.config.multi_forecast_head_list[idx + 1]
-                        // self.config.multi_forecast_head_list[idx],
                     )
                     # ar_step = multi_forecast_head_list[idx + 1] // multi_forecast_head_list[idx]
                 multi_forecast_head_dict[head_pred_len] = ar_step
             # the core idea of strategy [from_short_to_long]
-            mixed_pred = patchmoe_forecast
             output_list = []
             cur_pred = None
             cur_pred_len = 0
@@ -1219,62 +1124,50 @@ class PatchMoEModel(PatchMoEPreTrainedModel):
                 if ar_step == 0:
                     # Ignore the current forecast head
                     continue
                 # Add current head's first auto-regressive step of prediction
-                head_pred = mixed_pred[:, :head_pred_len]  # [single]
                 output_list.append(head_pred[:, cur_pred_len:])
                 cur_pred = torch.cat(output_list, dim=1)
                 cur_pred_len = cur_pred.shape[1]
                 if cur_pred_len >= self.config.inference_length:
                     break
                 # Do auto-regressive of the rest of the steps
                 for _ in range(1, ar_step + 1):
                     # one-step model prediction
-                    cur_input = torch.cat([input, cur_pred], dim=1)[
-                        :, -self.seq_length :
-                    ].contiguous()
                     cur_input_mask = torch.cat(
-                        [
-                            input_mask,
-                            torch.ones(
-                                cur_pred.shape, dtype=input_mask.dtype, device=input_mask.device
-                            ),
-                        ],
-                        dim=1,
-                    )[
-                        :, -self.seq_length :
-                    ].contiguous()  # 0:mask, 1:unmask
-                    patchmoe_forecast = self._inference_step(
-                        input=cur_input,
-                        input_mask=cur_input_mask,
-                        rotary_pos_emb=rotary_pos_emb,
                     )
-                    head_pred = patchmoe_forecast[:, :head_pred_len]
                     output_list.append(head_pred)
                     cur_pred = torch.cat(output_list, dim=1)
                     cur_pred_len = cur_pred.shape[1]
                     if cur_pred_len >= self.config.inference_length:
                         break
                 if cur_pred_len >= self.config.inference_length:
                     break
-            final_output = cur_pred[
-                :, : self.config.inference_length
-            ]  # [batch_size, inference_len]
         assert final_output.shape[1] == self.config.inference_length
         return final_output
-class PatchMoEForPrediction(PatchMoEPreTrainedModel, PatchMoEGenerationMixin):
-    def __init__(self, config: PatchMoeConfig):
         super().__init__(config)
         self.config = config
-        self.model = PatchMoEModel(self.config)
         self.post_init()
     def forward(
@@ -1287,7 +1180,10 @@ class PatchMoEForPrediction(PatchMoEPreTrainedModel, PatchMoEGenerationMixin):
         revin: Optional[bool] = False,
     ):
         self.model.config.inference_length = max_output_length
-        outputs = self.model(input=input_ids, revin=revin)
         loss = None
         logits = outputs
@@ -1309,7 +1205,7 @@ class PatchMoEForPrediction(PatchMoEPreTrainedModel, PatchMoEGenerationMixin):
         attention_mask=None,
         inputs_embeds=None,
         revin=False,
-        **kwargs,
     ):
         """
         Prepare model inputs for autoregressive generation.
@@ -1317,10 +1213,8 @@ class PatchMoEForPrediction(PatchMoEPreTrainedModel, PatchMoEGenerationMixin):
         model_inputs = {"input_ids": input_ids}
-        model_inputs.update(
-            {
-                "revin": revin,
-            }
-        )
-        return model_inputs

 import torch
+from torch._dynamo import config
+from typing import List, Optional, Union
 import torch.nn as nn
 import torch.nn.functional as F
+# import transformer_engine as te
 from torch import Tensor
 import math
+from einops import rearrange, repeat
 from functools import reduce
 from abc import ABC, abstractmethod
+from configuration_FalconTST import FalconTSTConfig
+from ts_generation_mixin import FalconTSTGenerationMixin
+from transformers import PreTrainedModel, Cache, DynamicCache
+from transformers.activations import ACT2FN
+from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
+from transformers.modeling_outputs import MoeModelOutputWithPast, MoeCausalLMOutputWithPast
 def _rotate_half(x: Tensor, rotary_interleaved: bool) -> Tensor:
 def _apply_rotary_pos_emb_bshd(
+        t: Tensor,
+        freqs: Tensor,
+        rotary_interleaved: bool = False,
+        multi_latent_attention: bool = False,
+        mscale: float = 1.0,
+    ) -> Tensor:
     """Apply rotary positional embedding to input tensor T.
     check https://kexue.fm/archives/8265 for detailed formulas
     """
     assert logits.dim() == 2, f"Expected 2D logits [num_tokens, num_experts], got {logits.dim()}."
+    def compute_topk(scores, topk,):
         return torch.topk(scores, k=topk, dim=1)
     if score_function == "softmax":
         if use_pre_softmax:
             scores = torch.softmax(logits, dim=-1, dtype=torch.float32).type_as(logits)
+            probs, top_indices = compute_topk(scores, topk, )
         else:
+            scores, top_indices = compute_topk(logits, topk, )
             probs = torch.softmax(scores, dim=-1, dtype=torch.float32).type_as(logits)
     elif score_function == "sigmoid":
         scores = torch.sigmoid(logits.float()).type_as(logits)
         if expert_bias is not None:
             scores_for_routing = scores + expert_bias
+            _, top_indices = compute_topk(scores_for_routing, topk, )
             scores = torch.gather(scores, dim=1, index=top_indices).type_as(logits)
         else:
+            scores, top_indices = compute_topk(scores, topk,)
         probs = scores / (scores.sum(dim=-1, keepdim=True) + 1e-20) if topk > 1 else scores
     else:
         raise ValueError(f"Invalid score_function: {score_function}")
         dim = kv_channels
         self.rotary_interleaved = rotary_interleaved
+        device = 'cpu' if use_cpu_initialization else torch.cuda.current_device()
         self.inv_freq = 1.0 / (
             rotary_base ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
         )
         freqs = torch.outer(seq, self.inv_freq)  # [seq len, dim]
         return freqs
+    def forward(self, max_seq_len: int, offset: int = 0, packed_seq: bool = False, device=None) -> Tensor:
         """Forward pass of RoPE embedding.
         Args:
         """
         if device is None:
             device = self.inv_freq.device
+        if self.inv_freq.device.type == 'cpu':
             # move `inv_freq` to GPU once at the first micro-batch forward pass
             self.inv_freq = self.inv_freq.to(device=device)
         return emb.to(device)
     def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
+        state_dict.pop(f'{prefix}inv_freq', None)
         return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
     def get_rotary_seq_len(
         self.variance_epsilon = eps
     def forward(self, hidden_states):
+        '''
+            hidden_states [bs, patch_num, d_model]
+        '''
         input_dtype = hidden_states.dtype
         hidden_states = hidden_states.to(torch.float32)
         variance = hidden_states.pow(2).mean(-1, keepdim=True)
         return self.weight * hidden_states.to(input_dtype)
+class FlashAttention(nn.Module):
     """Implement the scaled dot product attention with softmax.
     Arguments
     ---------
         self.softmax_scale = softmax_scale
         self.drop = nn.Dropout(attention_dropout)
+    def forward(self, q,k,v,attention_mask,causal=None, ):
         """Implements the multihead softmax attention.
         Arguments
         ---------
         """
         causal = self.causal if causal is None else causal
+        q = q.transpose(0,1).contiguous()
+        k = k.transpose(0,1).contiguous()
+        v = v.transpose(0,1).contiguous()
         batch_size, seq_len = q.shape[0], q.shape[1]
         softmax_scale = self.softmax_scale or 1.0 / math.sqrt(q.shape[-1])
+        # scores
         scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale)
+        scores = scores.masked_fill(attention_mask == 0, float('-1e9'))
         # Softmax
         attention = torch.softmax(scores, dim=-1, dtype=v.dtype)
         # Dropout
         attention_drop = self.drop(attention)
         output = torch.einsum("bhts,bshd->bthd", attention_drop, v)
+        output = output.reshape(batch_size, seq_len, -1).transpose(0,1).contiguous()
         return output
+class TEDotProductAttention(nn.Module):
+    def __init__(self, flash_attention,):
+        super().__init__()
+        self.flash_attention = flash_attention
+    def forward(self, q, k, v, mask=None):
+        # Prioritize using FlashAttention
+        return self.flash_attention(q, k, v, mask)
 class SelfAttention(nn.Module):
+    def __init__(self,config,):
         super().__init__()
         self.config = config
+        q_layernorm=config.q_layernorm
+        k_layernorm=config.k_layernorm
         self.hidden_size = config.hidden_size
+        self.core_attention = TEDotProductAttention(
+                        flash_attention=FlashAttention(),
         )
+        self.linear_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.add_bias_linear,)
+        self.linear_qkv =  nn.Linear(self.hidden_size, 3*self.hidden_size, bias=config.add_bias_linear,)
         if q_layernorm:
             self.q_layernorm = RMSNorm(self.hidden_size)
         else:
         else:
             self.k_layernorm = IdentityOp()
+    def forward(self, x, attention_mask,rotary_pos_emb):
         qkv = self.linear_qkv(x)
+        qkv = qkv.view(qkv.size(0), qkv.size(1), self.config.num_attention_heads,-1)
         q, k, v = qkv.chunk(3, dim=-1)
         # Apply rotary encoding to q and k
         rotary_pos_emb = (rotary_pos_emb,) * 2
         q_pos_emb, k_pos_emb = rotary_pos_emb
         q = _apply_rotary_pos_emb_bshd(q, q_pos_emb)
         k = _apply_rotary_pos_emb_bshd(k, k_pos_emb)
+        q = self.q_layernorm(q)
+        k = self.k_layernorm(k)
+        # attention
         attn_output = self.core_attention(q, k, v, attention_mask)
         output = self.linear_proj(attn_output)
         return output
 class MLP(nn.Module):
+    def __init__(self,config,in_features):
         super().__init__()
+        self.config= config
+        self.linear_fc1 = nn.Linear(in_features, self.config.moe_ffn_hidden_size*2, bias=self.config.add_bias_linear,)
+        self.linear_fc2 = nn.Linear(self.config.moe_ffn_hidden_size, self.config.hidden_size, bias=self.config.add_bias_linear,)
     def forward(self, x):
         x = self.swiglu(self.linear_fc1(x))
         x = self.linear_fc2(x)
         return x
+    def swiglu(self,y):
         """Performs SwiGLU (Swish-Gated Linear Unit) activation function.
         Args:
             self.input_layernorm = IdentityOp()
         self.self_attention = SelfAttention(config)
         self.pre_mlp_layernorm = RMSNorm(self.config.hidden_size)
+        self.mlp = MLP(config,self.config.hidden_size)
+    def forward(self, x, attention_mask,rotary_pos_emb):
         residual = x
         x = self.input_layernorm(x)
         x = self.self_attention(x, attention_mask, rotary_pos_emb)
         return x
+class FalconTSTExpert(nn.Module):
+    def __init__(self, config, patch_input_size=32,expert_output_size=336,final_layernorm=True):
         super().__init__()
         self.config = config
+        self.patch_size= patch_input_size
         self.seq_length = config.seq_length
+        assert self.seq_length % self.patch_size == 0, f'invalid patch_size: {self.patch_size} when seq_length={self.seq_length}'
         self.patch_num = self.seq_length // self.patch_size
         self.flatten_size = self.patch_num * self.config.hidden_size
+        self.layers = nn.ModuleList([
+            TransformerLayer(config,input_layernorm=config.transformer_input_layernorm)
+            for _ in range(self.config.expert_num_layers)
+        ])
         if final_layernorm:
             self.final_layernorm = RMSNorm(self.config.hidden_size)
         else:
             self.final_layernorm = IdentityOp()
         self.patch_embedding = MLP(config, in_features=patch_input_size)
+        self.output_layer =  nn.Linear(in_features=self.flatten_size, out_features=expert_output_size, bias=False,)
     def _forward_patch_embedding(
         self,
+        input: Tensor,                      # [batch_size, seq_len]
     ):
         """
         Perform patch embedding on the input time series.
+        This method zeroes the masked time points, splits the series into
         non-overlapping patches, and embeds each patch with the patch-embedding MLP.
         """
         batch_size, seq_len = input.shape
+        assert seq_len == self.seq_length, f'Expected sequence length {self.seq_length}, but got {seq_len}'
         # Create input_mask based on pad_length
         # When a time point is masked, its value is mask_pad_value(default:255.)
+        input_mask = (input != self.config.mask_pad_value) # 0: mask, 1: unmask   [batch_size, seq_len]
         # TODO: check whether zeroing the masked values has the same effect as the attention_mask
+        input_data = input * input_mask     # [batch_size, seq_len]
         # Patchify the input
+        input_data = input_data.unfold(dimension=-1, size=self.patch_size, step=self.patch_size).contiguous() # input [batch_size, patch_num, patch_size]
+        hidden_states= self.patch_embedding(input_data)              # hidden_states [batch_size, patch_num, hidden_size]
+        hidden_states = hidden_states.transpose(0, 1).contiguous()      # hidden_states [patch_num, batch_size, hidden_size], To adapt to the Megatron
         # Patchify the mask: a patch is kept (unmasked) only when all of its time points are unmasked
+        attention_mask = input_mask.unfold(dimension=-1, size=self.patch_size, step=self.patch_size).contiguous()   # [batch_size, patch_num, patch_size]
+        attention_mask = (attention_mask.sum(-1) == self.patch_size)  # [batch_size, patch_num]   # 0: mask, 1: unmask
+        attention_mask[:, -1] = True    # The last patch is not masked
         _, patch_num = attention_mask.shape
+        attention_mask = attention_mask.unsqueeze(2).repeat(1,1,patch_num) * attention_mask.unsqueeze(1).repeat(1,patch_num,1)  # [batch_size, patch_num, patch_num]
+        attention_mask = attention_mask.unsqueeze(1).contiguous()   # [batch_size, 1, patch_num, patch_num]
         return hidden_states, attention_mask, input_mask
+    def _forward_output(self, hidden_states, output_scale=None, input_mask=None, inference_context=None):
         """
+            Perform a forward pass through the output layer.
+            Args:
+                expert_input (Tensor): Expert input of shape [batch_size, seq_len]
+                hidden_states (Tensor): Transformed hidden states of shape [patch_num, batch_size, hidden_size]
+                output_scale (Tensor, optional): Expert probabilities for the output layer  [batch_size]
+                input_mask (Tensor, optional): Expert input mask of shape [batch_size, seq_len], 0:mask, 1:unmask
+            Returns:
+                expert_output (Tensor): Expert output of shape [batch_size, expert_output_size]
         """
         # [patch_num, batch_size, hidden_size] -> [batch_size, flatten_size (patch_num * hidden_size)]
         patch_num, batch_size, hidden_size = hidden_states.shape
+        assert (patch_num * hidden_size) == self.flatten_size, f'patch_num ({patch_num}) * hidden_size ({hidden_size}) != flatten_size ({self.flatten_size})'
         hidden_states = hidden_states.transpose(0, 1).reshape(-1, self.flatten_size).contiguous()
+        expert_output = self.output_layer(hidden_states)   # [batch_size, expert_output_size]
         if output_scale is not None:
             original_dtype = expert_output.dtype
             expert_output = expert_output * output_scale.unsqueeze(-1)
         return expert_output
+    def forward(self, expert_input, rotary_pos_emb,expert_probs=None):
         hidden_states, attention_mask, input_mask = self._forward_patch_embedding(expert_input)
         for layer in self.layers:
+            hidden_states = layer(hidden_states,attention_mask,rotary_pos_emb[:hidden_states.shape[0]])
         hidden_states = self.final_layernorm(hidden_states)
         expert_output = self._forward_output(hidden_states, expert_probs, input_mask)
         return expert_output
+class SequentialFalconTST(nn.Module):
+    def __init__(self, config,expert_output_size=336):
         super().__init__()
         self.config = config
         self.expert_output_size = expert_output_size
+        self.local_experts = nn.ModuleList([
+                            FalconTSTExpert(
+                                config,
+                                expert_output_size=expert_output_size,
+                                patch_input_size=config.patch_size_list[expert_id],
+                                final_layernorm=config.moe_expert_final_layernorm
+                            )
+                            for expert_id in range(config.num_moe_experts)
+                        ])
     def forward(self, input, routing_map, rotary_pos_emb, expert_probs):
         expert_output_list = []
         for i, expert in enumerate(self.local_experts):
             token_mask = routing_map[:, i].bool()  # shape (batch,)
+            current_inputs = input[token_mask]     # (num_tokens_for_expert, seq_len)
+            current_probs  = expert_probs[token_mask, i]
             if current_inputs.numel() == 0:
+                expert_output = torch.zeros(0, self.expert_output_size, device=input.device, dtype=input.dtype)
             else:
                 expert_output = expert(current_inputs, rotary_pos_emb, current_probs)
+            full_output = torch.zeros(batch_size, self.expert_output_size, device=input.device, dtype=input.dtype)
             full_output[token_mask] = expert_output
             expert_output_list.append(full_output)
         ctx.weight_dtype = weight.dtype
         inp_shape = inp.shape
         inp = inp.view(-1, inp_shape[-1])
         output = torch.mm(inp.to(router_dtype), weight.to(router_dtype).t())
         output = output.view(*inp_shape[:-1], -1)
     return RouterGatingLinearFunction.apply(inp, weight, router_dtype)
+class Router(ABC,nn.Module):
     """Base Router class"""
     def __init__(
+        self, config: FalconTSTConfig,
     ) -> None:
         """
         Initialize the Router module.
         self.config = config
         # Initialize the gate weights.
         if self.config.patch_size_list is not None:
             assert self.config.moe_router_input_size is not None
             self.weight = torch.nn.Parameter(
+                torch.empty((self.config.num_moe_experts, self.config.moe_router_input_size), dtype=torch.float32)
             )
         else:
             self.weight = torch.nn.Parameter(
+                torch.empty((self.config.num_moe_experts, self.config.hidden_size), dtype=torch.float32)
             )
         self.reset_parameters()
     def reset_parameters(self):
         """Reset the router parameters."""
+        torch.nn.init.normal_(self.weight,mean=0,std=self.config.init_method_std)
         self.weight.data = self.weight.data.to(dtype=self.config.torch_dtype)
     def gating(self, input: torch.Tensor):
         """Forward pass of the router gate.
     """Route each token to the top-k experts."""
     def __init__(
+        self, config: FalconTSTConfig,
     ) -> None:
         """Initialize the zero token dropping router.
         self.enable_expert_bias = self.config.moe_router_enable_expert_bias
         if self.enable_expert_bias:
             self.register_buffer(
+                'local_tokens_per_expert',
                 torch.zeros(self.config.num_moe_experts, dtype=torch.float32),
                 persistent=False,
             )
             self.register_buffer(
+                'expert_bias', torch.zeros(self.config.num_moe_experts, dtype=torch.float32)
             )
         else:
             self.local_tokens_per_expert = None
             self.expert_bias = None
     def routing(self, logits: torch.Tensor):
         """Top-k routing function
         return scores, routing_map
+class FalconTSTMoELayer(nn.Module):
     def __init__(self, config, layer_number):
         super().__init__()
         self.config = config
                 self.expert_output_size = config.seq_length
         if self.is_last_layer and self.config.heterogeneous_moe_layer:
+                # If heterogeneous_moe_layer is True, the backcast will be None
+                self.backcast_layernorm = None
         else:
             self.backcast_layernorm = RMSNorm(self.seq_length)
+        self.experts = SequentialFalconTST(
+                                config,
+                                expert_output_size=self.expert_output_size,
+                            )
+        self.shared_experts = FalconTSTExpert(config,
+                                expert_output_size=self.expert_output_size,
+                                patch_input_size=config.shared_patch_size,
+                                final_layernorm=config.moe_expert_final_layernorm)
     def time_series_preprocess(self, input: torch.Tensor):
         """
+            Preprocess time series(sample) for dispatch.
+            Builds the input mask (0: mask, 1: unmask) by comparing the input against mask_pad_value and stores it in self.input_mask.
+            NOTE(review): no RevIN is applied and no means/stdev are returned here; the backcast is normalized by backcast_layernorm in postprocess.
+            Args:
+                input (torch.Tensor): The input time series (samples) to the MoE layer. [batch_size, seq_len]
+            Returns:
+                input (torch.Tensor): The input time series (samples), returned as-is. [batch_size, seq_len]
         """
         batch_size, seq_len = input.shape
+        assert seq_len == self.seq_length, f'seq_len {seq_len} != self.seq_length {self.seq_length}'
         # Create input_mask based on pad_length
         # When a time point is masked, its value is mask_pad_value(default:255.)
+        input_mask = (input != self.config.mask_pad_value) # 0: mask, 1: unmask   [batch_size, seq_len]
         self.input_mask = input_mask
         return input
     def router_and_preprocess(self, backcast: torch.Tensor):
         """Compute and preprocess time series(sample) routing for dispatch.
         # backcast [batch_size, seq_len]    means/stdev [batch_size, 1]
         backcast = self.time_series_preprocess(backcast)
+        residual = backcast                   # residual: [batch_size, seq_len], the input to the shared experts
         # TODO: Check the effect of the masked values on the router
+        probs, routing_map = self.router(backcast * self.input_mask)    # probs/routing_map: [batch_size, num_experts]
         return backcast, probs, residual, routing_map
     def experts_compute(
         self,
+        input: torch.Tensor,            # [num_permuted_samples_after_dispatch, seq_len]
+        probs: torch.Tensor,            # [num_permuted_samples_after_dispatch]
+        residual: torch.Tensor,         # [batch_size, seq_len]
         rotary_pos_emb: torch.Tensor,
+        routing_map:torch.Tensor,   # [seq_len, 1, 1, kv_channels(hidden_size // num_heads)]
     ):
         """Computes the output of the experts on the dispatched time series(sample).
         """
         # shared_expert_output: [batch_size, seq_len (+ pred_len)]
         shared_experts_output = self.shared_experts(residual, rotary_pos_emb)
         # dispatched_input (global_input_tokens):   [num_permuted_samples_after_dispatch_postprocess(sorted), seq_len]
         # tokens_per_expert (global_probs):         [num_experts]
         # permuted_probs (global_probs):            [num_permuted_samples_after_dispatch_postprocess(sorted)]
         experts_output = self.experts(input, routing_map, rotary_pos_emb, probs)
         return experts_output, shared_experts_output
     def postprocess(
+        self,
+        backcast: torch.Tensor,         # [batch_size, seq_len]
+        forecast: torch.Tensor,         # [batch_size, pred_len]
         output_backcast: torch.Tensor,  # [batch_size, seq_len]
         output_forecast: torch.Tensor,  # [batch_size, pred_len]
     ):
             stdev (torch.Tensor): The standard deviation of the non-masked backcast time series (samples).  [batch_size, 1]
             backcast_mask (torch.Tensor): The previous layer's backcast mask of time series (samples) .     [batch_size, seq_len]
         """
+        if output_backcast is not None:
+            # 25/8/14 (modified by xiaming): replaced the RevIN with a layernorm after the MoE layer
+            # Note: multiplying output_backcast by the input mask here hurts performance
+            output_backcast = self.backcast_layernorm(output_backcast) # LayerNorm
             if self.config.residual_backcast:
                 output_backcast = backcast - output_backcast
+            output_backcast[~self.input_mask] = self.config.mask_pad_value   # Important! Recover the mask time point back to mask_pad_value(default:255.)
+        if self.config.do_expert_forecast and forecast is not None: # The first layer's forecast is None
             output_forecast = forecast + output_forecast
         return output_backcast, output_forecast
     def combine(
         self,
         experts (e.g., via an All-to-All communication). It then adds the output
         from the shared expert if it exists.
         """
+        assert experts_output.shape == shared_experts_output.shape,\
+             f'experts_output shape {experts_output.shape} doesn\'t equal to shared_experts_output shape:{shared_experts_output.shape}'
         output = experts_output + shared_experts_output
         if self.is_last_layer and self.config.heterogeneous_moe_layer:
             output_backcast = None
             output_forecast = output
+            assert output_forecast.shape[1] == self.pred_length, \
+                f'heterogeneous_moe_layer=True, expected the last moe layer\'s output pred len: {self.pred_length}, but got {output_forecast.shape[1]}'
         else:
             # Note: masked time points here may not equal mask_pad_value (default: 255.); they are restored in postprocess
+            output_backcast = output[:, :self.seq_length]   # [batch_size, seq_len]
             if self.config.do_expert_forecast:
+                output_forecast = output[:, self.seq_length:]   # [batch_size, pred_len]
+                assert output_forecast.shape[1] == self.pred_length, \
+                    f'do_expert_forecast=True, expected the last moe layer\'s output pred len: {self.pred_length}, but got {output_forecast.shape[1]}'
             else:
                 output_forecast = None
         return output_backcast, output_forecast
+    def forward(self, backcast,forecast,rotary_pos_emb):
         inputs, probs, residual, routing_map = self.router_and_preprocess(backcast)
+        experts_output, shared_experts_output = self.experts_compute(inputs, probs, residual, rotary_pos_emb, routing_map)
         output_backcast, output_forecast = self.combine(experts_output, shared_experts_output)
+        output_backcast, output_forecast = self.postprocess(backcast, forecast, output_backcast, output_forecast)
         return output_backcast, output_forecast
class FalconTSTBlock(nn.Module):
    """Sequential stack of ``config.num_hidden_layers`` ``FalconTSTMoELayer`` modules.

    Each layer receives the backcast and forecast produced by the previous one;
    the first layer receives the raw input as backcast and ``None`` as forecast.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        depth = self.config.num_hidden_layers
        # Layers are numbered from 1, not 0.
        self.layers = nn.ModuleList(
            [FalconTSTMoELayer(config, position) for position in range(1, depth + 1)]
        )

    def forward(self, x, rotary_pos_emb):
        """Thread ``x`` through every layer and return ``(backcast, forecast)``."""
        backcast, forecast = x, None
        for moe_layer in self.layers:
            backcast, forecast = moe_layer(backcast, forecast, rotary_pos_emb)
        return backcast, forecast
 # Common base class for the FalconTST models; it only declares class-level
 # settings consumed by the `PreTrainedModel` base class plus weight init.
+class FalconTSTPreTrainedModel(PreTrainedModel):
     # Configuration class paired with this model family.
+    config_class = FalconTSTConfig
     # Attribute name of the wrapped base model (FalconTSTForPrediction stores it as `self.model`).
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
     # Presumably keeps each FalconTSTMoELayer on a single device when the model
     # is sharded - TODO confirm against the transformers documentation.
+    _no_split_modules = ["FalconTSTMoELayer"]
     _skip_keys_device_placement = "past_key_values"
     # Attention-backend capability flags read by the base class.
     _supports_flash_attn_2 = True
     _supports_sdpa = False
             # Zero the weight row at padding_idx when the module defines one.
             if module.padding_idx is not None:
                 module.weight.data[module.padding_idx].zero_()
+class FalconTSTModel(FalconTSTPreTrainedModel):
+    def __init__(self, config: FalconTSTConfig):
         super().__init__(config)
         self.config = config
         self.seq_length = config.seq_length
         self.rotary_pos_emb = RotaryEmbedding(
+                kv_channels=self.config.kv_channels,
+                rotary_base=config.rotary_base,
+                use_cpu_initialization=self.config.use_cpu_initialization,
+                rotary_interleaved=self.config.rotary_interleaved
         )
+        self.decoder = FalconTSTBlock(
+            config=config
+            )
         if self.config.do_expert_forecast and self.config.heterogeneous_moe_layer:
             self.output_layer = IdentityOp()
         else:
+            self.output_layer = nn.Linear(in_features=self.seq_length, out_features=self.config.pred_length, bias=self.config.add_bias_linear,)
     def revin(
         self,
         input: Tensor,          # [batch_size, seq_len]
         input_mask: Tensor,     # [batch_size, seq_len] 0:mask, 1:unmask
     ):
         """Normalise each series over its unmasked points (Non-stationary Transformer style).

         Mean and standard deviation are computed per row from the unmasked
         positions only and are detached from the autograd graph. Unmasked
         positions are replaced by ``(x - mean) / stdev``; masked positions keep
         their original value (e.g. the pad value).

         Args:
             input: Series values, ``[batch_size, seq_len]``.
             input_mask: Same shape as ``input``; non-zero / ``True`` marks an
                 unmasked (valid) point. Boolean and 0/1 numeric masks are both
                 accepted.

         Returns:
             Tuple ``(normalised_input, means, stdev)`` where ``means`` and
             ``stdev`` have shape ``[batch_size, 1]``.

         Raises:
             AssertionError: If any row has no unmasked point.
         """
         # Use a boolean view of the mask: the final selection must not depend
         # on the mask dtype (bitwise `~` on an integer 0/1 mask yields -1/-2).
         valid = input_mask.bool()

         count_per_sample = torch.sum(valid, dim=1, keepdim=True).detach()   # [batch_size, 1]
         empty_rows = torch.nonzero(count_per_sample.squeeze(1) == 0).flatten()
         if empty_rows.numel() > 0:
             # Raised explicitly (not via `assert`) so the guard survives `python -O`;
             # without it the division below would silently produce NaNs.
             raise AssertionError(
                 f'There is zero in count_per_sample, row indices: {empty_rows.tolist()}'
             )

         input_data = input * valid
         sum_per_sample = torch.sum(input_data, dim=1, keepdim=True).detach()   # [batch_size, 1]
         means = sum_per_sample / count_per_sample                               # [batch_size, 1]

         # Re-apply the mask after centring so masked positions contribute 0 to the variance.
         centered = (input_data - means) * valid
         var_per_sample = torch.sum(centered ** 2, dim=1, keepdim=True).detach() / count_per_sample
         stdev = torch.sqrt(var_per_sample + 1e-9)
         normalized = (centered / stdev) * valid

         # Keep the original value (mask_pad_value, default 255.) at masked positions.
         output = torch.where(valid, normalized, input.to(normalized.dtype))
         return output, means, stdev
     def forward(self, input, revin):
         """Forecast ``config.inference_length`` future points for a batch of series.

         The input is first fitted to the model context: longer series keep only
         their last ``seq_length`` points, shorter ones are left-padded with
         ``config.mask_pad_value``. Every position equal to the pad value is
         treated as masked.

         Args:
             input: 2-D tensor ``[batch_size, input_len]``.
             revin: When truthy, the series is normalised with ``self.revin``
                 before inference and the forecast is mapped back to the
                 original scale afterwards.

         Returns:
             Detached ``float32`` tensor ``[batch_size, inference_length]``.

         Raises:
             NotImplementedError: If ``config.multi_forecast_head_type`` is not
                 ``'single'``.
         """
         # Unpacking also enforces that the input is 2-D.
         _, input_len = input.shape

         # @created by xiaming @modified by baichun
         # Support varied input lengths by truncating or left-padding to seq_length.
         if input_len > self.seq_length:
             input = input[:, -self.seq_length:]
         elif input_len < self.seq_length:
             pad_len = self.seq_length - input_len
             input = F.pad(input, pad=(pad_len, 0), mode='constant', value=self.config.mask_pad_value)
         input_len = self.seq_length
         input_mask = (input != self.config.mask_pad_value)

         # Step1. RevIN
         if revin:
             input, means, stdev = self.revin(input, input_mask)

         # Step2. Get rotary_pos_emb
         # rotary_pos_emb [input_len, 1, 1, kv_channels(hidden_size // num_heads)]
         rotary_pos_emb = self.rotary_pos_emb(input_len, device=input.device)

         # Step3. Do one-step inference to get mixed forecasts from multiple forecast heads
         # mixed_pred: [batch_size, sum(multi_forecast_head)]
         mixed_pred = self._inference_step(
             input=input,
             input_mask=input_mask,
             rotary_pos_emb=rotary_pos_emb,
         )

         # Step4. Based on the mixed forecasts, do auto-regressive inference according to
         # the step list of each forecast head
         if self.config.multi_forecast_head_type != 'single':
             raise NotImplementedError
         final_output = self._auto_regressive_single_head(
             input=input,
             input_mask=input_mask,
             FalconTST_forecast=mixed_pred,
             rotary_pos_emb=rotary_pos_emb,
         )

         # Step5. Inverse RevIN: undo the scaling AND restore the per-series mean.
         # means / stdev are [batch_size, 1] and broadcast over the horizon.
         if revin:
             final_output = final_output * stdev + means

         return final_output.detach().float()
     def _inference_step(
         self,
         input,
         input_mask,
         rotary_pos_emb,
     ):
         """Run a single decoder pass and return the combined forecast.

         Args:
             input: Context window, ``[batch_size, seq_len]``.
             input_mask: Mask with the same shape as ``input``; it multiplies
                 the decoder backcast before the output head, so masked
                 positions contribute zero.
             rotary_pos_emb: Rotary position embedding handed to the decoder,
                 ``[input_len, 1, 1, kv_channels(hidden_size // num_heads)]``.

         Returns:
             The decoder forecast, plus the base forecast when
             ``config.do_base_forecast`` is set.
         """

         def _first_output(result):
             # Some linear implementations return ``(output, bias)`` while
             # ``nn.Linear`` returns the tensor itself; accept both instead of
             # tuple-unpacking a tensor along its batch dimension.
             if isinstance(result, (tuple, list)):
                 return result[0]
             return result

         if self.config.do_base_forecast:
             # NOTE(review): `base_output_layer` is not created in the __init__
             # visible alongside this method - confirm where it is defined.
             base_forecast = _first_output(self.base_output_layer(input))
         else:
             base_forecast = None

         decoder_backcast, decoder_forecast = self.decoder(
             input,               # [batch_size, seq_len]
             rotary_pos_emb,      # [input_len, 1, 1, kv_channels(hidden_size // num_heads)]
         )

         if self.config.do_expert_forecast:
             assert decoder_forecast is not None, 'decoder_forecast is None'
             if self.config.heterogeneous_moe_layer:
                 decoder_forecast = self.output_layer(decoder_forecast)  # IdentityOp
             else:
                 final_forecast = self.output_layer(decoder_backcast * input_mask)
                 decoder_forecast = decoder_forecast + final_forecast
         else:
             # The decoder_backcast contains the mask_pad_val(default:255.)
             decoder_forecast = _first_output(self.output_layer(decoder_backcast * input_mask))

         if self.config.do_base_forecast:
             assert base_forecast is not None, 'base_forecast is None'
             FalconTST_forecast = base_forecast + decoder_forecast
         else:
             FalconTST_forecast = decoder_forecast
         return FalconTST_forecast
     def _auto_regressive_single_head(
         self,
+        input,               # [batch_size, seq_len]
+        input_mask,          # [batch_size, seq_len]
+        FalconTST_forecast,   # [batch_size, max(multi_forecast_head)]
+        rotary_pos_emb,      # [seq_len, 1, 1, kv_channels(hidden_size // num_heads)]
+        auto_regressive_strategy='from_long_to_short'
     ):
         """auto regressive prediction with [single] head"""
         # Extends the first forecast until config.inference_length points are
         # available, re-running _inference_step on a sliding context window.
+        assert self.config.multi_forecast_head_type == 'single', \
+            f'_auto_regressive_single_head only support multi_forecast_head_type==single '
+        if auto_regressive_strategy == 'from_long_to_short':
             # From long to short
             multi_forecast_head_list = sorted(self.config.multi_forecast_head_list, reverse=True)
             # The forecast passed in is taken as the first chunk of the output.
+            final_output = FalconTST_forecast
             while final_output.shape[1] < self.config.inference_length:
                 # adaptive choose the forecast head
                 remain_pred_len = self.config.inference_length - final_output.shape[1]
                 # NOTE(review): `idx` is read here but no assignment to it is
                 # visible in this excerpt; confirm it is initialised before this check.
                 if idx == len(multi_forecast_head_list):
                     idx = len(multi_forecast_head_list) - 1
                 head_pred_len = multi_forecast_head_list[idx]
                 # one-step model prediction
                 # Slide the context: append the latest forecast chunk and keep
                 # only the most recent seq_length points.
+                input = torch.cat([input, FalconTST_forecast], dim=1)[:, -self.seq_length:].contiguous()
                 # Appended forecast positions are marked as unmasked (ones).
                 input_mask = torch.cat(
+                    [input_mask,
+                    torch.ones(FalconTST_forecast.shape, dtype=input_mask.dtype, device=input_mask.device)],
+                    dim=1)[:, -self.seq_length:].contiguous()   # 0:mask, 1:unmask
+                FalconTST_forecast = self._inference_step(
+                    input=input,
+                    input_mask=input_mask,
+                    rotary_pos_emb=rotary_pos_emb,
                 )
                 # the core idea of multi forecast head type of [single]
                 # Only the first head_pred_len points of the new forecast are kept.
+                FalconTST_forecast = FalconTST_forecast[:, :head_pred_len]
+                final_output = torch.cat([final_output, FalconTST_forecast], dim=1)
             # The last chunk may overshoot the horizon; trim to inference_length.
+            final_output = final_output[:, :self.config.inference_length]
+        elif auto_regressive_strategy == 'from_short_to_long':
             # From short to long
             # in validate_args, it has been sorted, and check the valid config
             multi_forecast_head_list = sorted(self.config.multi_forecast_head_list)
                 # NOTE(review): the loop header and the `if` branch paired with
                 # this `else` are not visible in this excerpt.
                 else:
                     # Cap the configured step count by the ratio between the
                     # next head length and the current one.
                     ar_step = min(
                         self.config.autoregressive_step_list[idx],
+                        self.config.multi_forecast_head_list[idx + 1] // self.config.multi_forecast_head_list[idx]
                     )
                     # ar_step = multi_forecast_head_list[idx + 1] // multi_forecast_head_list[idx]
                 multi_forecast_head_dict[head_pred_len] = ar_step
             # the core idea of strategy [from_short_to_long]
+            mixed_pred = FalconTST_forecast
             output_list = []
             cur_pred = None
             cur_pred_len = 0
                 # NOTE(review): the enclosing loop header is not visible in this
                 # excerpt; presumably it iterates multi_forecast_head_dict - confirm.
                 if ar_step == 0:
                     # Ignore the current forecast head
                     continue
                 # Add current head's first auto-regressive step of prediction
+                head_pred = mixed_pred[:, :head_pred_len]     # [single]
                 # Only the part beyond what has already been collected is appended.
                 output_list.append(head_pred[:, cur_pred_len:])
                 cur_pred = torch.cat(output_list, dim=1)
                 cur_pred_len = cur_pred.shape[1]
                 if cur_pred_len >= self.config.inference_length:
                     break
                 # Do auto-regressive of the rest of the steps
                 # Up to ar_step further model calls for this head; stops early
                 # once the horizon is covered.
                 for _ in range(1, ar_step + 1):
                     # one-step model prediction
                     # Unlike the long-to-short branch, the original input/input_mask
                     # are not overwritten; the window is rebuilt from them each step.
+                    cur_input = torch.cat([input, cur_pred], dim=1)[:, -self.seq_length:].contiguous()
                     cur_input_mask = torch.cat(
+                        [input_mask,
+                        torch.ones(cur_pred.shape, dtype=input_mask.dtype, device=input_mask.device)],
+                        dim=1)[:, -self.seq_length:].contiguous()   # 0:mask, 1:unmask
+                    FalconTST_forecast = self._inference_step(
+                        input=cur_input,
+                        input_mask=cur_input_mask,
+                        rotary_pos_emb=rotary_pos_emb,
                     )
+                    head_pred = FalconTST_forecast[:, :head_pred_len]
                     output_list.append(head_pred)
                     cur_pred = torch.cat(output_list, dim=1)
                     cur_pred_len = cur_pred.shape[1]
                     if cur_pred_len >= self.config.inference_length:
                         break
                 if cur_pred_len >= self.config.inference_length:
                     break
+            final_output = cur_pred[:, :self.config.inference_length] # [batch_size, inference_len]
         # NOTE(review): no `else` branch is visible above, so an unrecognised
         # auto_regressive_strategy would reach this line with final_output
         # unbound (UnboundLocalError) - confirm.
         assert final_output.shape[1] == self.config.inference_length
         return final_output
 # Prediction wrapper: holds a FalconTSTModel as `self.model` and exposes it
 # through forward() and an input-preparation hook for generation.
+class FalconTSTForPrediction(FalconTSTPreTrainedModel, FalconTSTGenerationMixin):
+    def __init__(self, config: FalconTSTConfig):
         super().__init__(config)
         self.config = config
         # Stored under `model`, matching base_model_prefix on the parent class.
+        self.model = FalconTSTModel(self.config)
         self.post_init()
     def forward(
         revin: Optional[bool] = False,
     ):
         # Side effect: the requested horizon is written into the wrapped model's
         # config before the call and stays there afterwards.
         self.model.config.inference_length = max_output_length
+        outputs = self.model(
+            input=input_ids,
+            revin=revin
+        )
         # No loss is computed here; the forecasts are exposed under the name `logits`.
         loss = None
         logits = outputs
         attention_mask=None,
         inputs_embeds=None,
         revin=False,
+        **kwargs
     ):
         """
         Prepare model inputs for autoregressive generation.
         model_inputs = {"input_ids": input_ids}
         # Carry the revin flag along with the inputs.
+        model_inputs.update({
+            "revin": revin,
+        })
+        return model_inputs