Rename configuration_patch_moe.py to configuration_FalconTST.py
configuration_patch_moe.py → configuration_FalconTST.py (RENAMED)
@@ -1,24 +1,25 @@
 """
-Configuration class for
+Configuration class for FalconTST model.
 
-This module defines the configuration for
+This module defines the configuration for FalconTST, a large-scale time series foundation model
 that utilizes Mixture of Experts (MoE) architecture with multiple patch tokenizers.
 """
 
-from typing import List, Optional
+from typing import List, Optional, Union
 from transformers import PretrainedConfig
+import torch
 
 
-class PatchMoeConfig(PretrainedConfig):
+class FalconTSTConfig(PretrainedConfig):
     """
-    Configuration class for
-
-
+    Configuration class for FalconTST model.
+
+    FalconTST is a time series foundation model that uses Mixture of Experts architecture
     with multiple patch tokenizers for efficient time series forecasting.
-
+
     This configuration inherits from [`PretrainedConfig`] and can be used to control the model
     output. Read the documentation from [`PretrainedConfig`] for more information.
-
+
     Args:
         hidden_size (`int`, *optional*, defaults to 1024):
            Dimensionality of the encoder layers and the pooler layer.
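For orientation, here is a minimal usage sketch of the renamed class. It assumes `configuration_FalconTST.py` is importable from the repository checkout; the keyword names are taken from the `__init__` signature shown in this diff.

```python
# Minimal sketch (assumes the renamed module is on the import path).
from configuration_FalconTST import FalconTSTConfig

config = FalconTSTConfig(
    hidden_size=1024,
    num_experts=4,
    moe_router_topk=2,
    patch_size_list=[96, 64, 48, 24],  # matches the default set below
)
print(config.model_type)  # "FalconTST"
```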
@@ -97,10 +98,10 @@ class PatchMoeConfig(PretrainedConfig):
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
             Whether to tie word embeddings.
     """
-
-    model_type = "
+
+    model_type = "FalconTST"
     keys_to_ignore_at_inference = ["past_key_values"]
-
+
     def __init__(
         self,
         hidden_size: int = 1024,
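`model_type` is the key the `transformers` Auto classes use to resolve a configuration. A hedged sketch of how the renamed config could be made discoverable via `AutoConfig`; the commit itself shows no registration, so treat this as an assumption:

```python
# Hypothetical registration sketch; not shown in this commit.
from transformers import AutoConfig
from configuration_FalconTST import FalconTSTConfig

AutoConfig.register("FalconTST", FalconTSTConfig)  # keyed by model_type
config = AutoConfig.for_model("FalconTST")         # now resolvable by name
```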
@@ -113,6 +114,7 @@ class PatchMoeConfig(PretrainedConfig):
         mask_pad_value: float = 255.0,
         expert_num_layers: int = 4,
         shared_patch_size: int = 64,
+
         patch_size_list: Optional[List[int]] = None,
         multi_forecast_head_list: Optional[List[int]] = None,
         is_revin: bool = True,
@@ -126,6 +128,7 @@ class PatchMoeConfig(PretrainedConfig):
         test_data_test_len: int = 720,
         autoregressive_step_list: Optional[List[int]] = None,
         multi_forecast_head_type: str = "single",
+
         num_experts: int = 4,
         moe_router_topk: int = 2,
         moe_ffn_hidden_size: int = 4096,
@@ -142,7 +145,8 @@ class PatchMoeConfig(PretrainedConfig):
         tie_word_embeddings: bool = False,
         **kwargs,
     ):
-        """Initialize
+        """Initialize FalconTST configuration."""
+
         # Set default values for list parameters
         if patch_size_list is None:
             patch_size_list = [96, 64, 48, 24]
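The `None`-then-assign pattern above exists because a list literal used as a default argument is evaluated once and shared across all instances. A toy illustration of the pitfall the config avoids (function names here are illustrative):

```python
def bad(sizes=[96, 64]):      # one shared list for all calls
    sizes.append(0)
    return sizes

def good(sizes=None):         # fresh list per call, as the config does
    if sizes is None:
        sizes = [96, 64]
    sizes.append(0)
    return sizes

print(bad(), bad())    # [96, 64, 0] [96, 64, 0, 0]  <- state leaks across calls
print(good(), good())  # [96, 64, 0] [96, 64, 0]     <- independent
```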
@@ -150,14 +154,15 @@ class PatchMoeConfig(PretrainedConfig):
             multi_forecast_head_list = [24, 96, 336]
         if autoregressive_step_list is None:
             autoregressive_step_list = [2, 4, 1]
-
+
+        # FalconTST inference specific
         self.test_data_seq_len = test_data_seq_len
         self.inference_length = test_data_test_len
         self.autoregressive_step_list = autoregressive_step_list
         self.multi_forecast_head_type = multi_forecast_head_type
         self.use_cache = True
 
-        #
+        # FalconTST specific
         self.hidden_size = hidden_size
         self.ffn_hidden_size = ffn_hidden_size
         self.num_attention_heads = num_attention_heads
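One plausible reading of the two defaulted lists (an assumption; the diff does not spell this out) is that each forecast head is paired positionally with its autoregressive rollout count, so a head of length `h` rolled `s` times covers `h * s` forecast points:

```python
# Illustrative pairing only; the actual rollout logic lives in the model code.
multi_forecast_head_list = [24, 96, 336]
autoregressive_step_list = [2, 4, 1]
for head, steps in zip(multi_forecast_head_list, autoregressive_step_list):
    print(f"head {head} x {steps} rollouts -> {head * steps} points")
# head 24 x 2 rollouts -> 48 points
# head 96 x 4 rollouts -> 384 points
# head 336 x 1 rollouts -> 336 points
```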
@@ -165,7 +170,7 @@ class PatchMoeConfig(PretrainedConfig):
         self.initializer_range = initializer_range
         self.seq_length = seq_length
         self.multi_forecast_head_list = multi_forecast_head_list
-        self.kv_channels
+        self.kv_channels=self.hidden_size // self.num_attention_heads
         self.rotary_base = rope_theta
         self.num_hidden_layers = num_hidden_layers
         self.mask_pad_value = mask_pad_value
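The new right-hand side derives the per-head key/value width from the model width. With the documented default `hidden_size=1024` and an assumed `num_attention_heads=16` (its default is not visible in this excerpt):

```python
# kv_channels is the per-attention-head dimension.
# num_attention_heads=16 is an example value, not a default shown in the diff.
hidden_size = 1024
num_attention_heads = 16
kv_channels = hidden_size // num_attention_heads
print(kv_channels)  # 64
```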
@@ -188,7 +193,7 @@ class PatchMoeConfig(PretrainedConfig):
         self.moe_router_topk = moe_router_topk
         self.moe_router_score_function = moe_router_score_function
         self.moe_ffn_hidden_size = moe_ffn_hidden_size
-        self.moe_shared_expert_intermediate_size
+        self.moe_shared_expert_intermediate_size=moe_shared_expert_intermediate_size
         self.moe_router_enable_expert_bias = moe_router_enable_expert_bias
         self.moe_expert_final_layernorm = moe_expert_final_layernorm
         self.transformer_input_layernorm = transformer_input_layernorm
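For intuition about the MoE fields completed above: with the defaults `num_experts=4` and `moe_router_topk=2`, a router sends each token to its two highest-scoring experts. A toy sketch of that selection (illustrative only; the real router lives in the modeling code, not this config):

```python
import torch

num_experts, topk = 4, 2
router_logits = torch.randn(3, num_experts)          # scores for 3 tokens
weights, chosen = torch.topk(router_logits.softmax(-1), topk, dim=-1)
weights = weights / weights.sum(-1, keepdim=True)    # renormalize over top-k
print(chosen)   # expert indices per token, shape (3, 2)
print(weights)  # mixing weights per token, rows sum to 1
```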
@@ -196,8 +201,11 @@ class PatchMoeConfig(PretrainedConfig):
         self.q_layernorm = q_layernorm
         self.k_layernorm = k_layernorm
 
-
+
+
+        kwargs.pop('tie_word_embeddings', None)
         super().__init__(
             tie_word_embeddings=tie_word_embeddings,
             **kwargs,
         )
+
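The added `kwargs.pop('tie_word_embeddings', None)` guards against the key arriving twice: once as the explicit argument and once inside `**kwargs` (for example, when a saved `config.json` also carries it). A toy reproduction of the failure mode it prevents (names here are illustrative):

```python
def parent_init(tie_word_embeddings=False, **rest):
    return tie_word_embeddings

kwargs = {"tie_word_embeddings": True, "hidden_size": 1024}
# parent_init(tie_word_embeddings=False, **kwargs)  # TypeError: got multiple
#                                                   # values for the keyword
kwargs.pop("tie_word_embeddings", None)             # the commit's fix
parent_init(tie_word_embeddings=False, **kwargs)    # ok
```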