razmars committed
Commit b51b458 · verified · 1 Parent(s): 440913e

Upload configuration_super_linear_fs.py

Files changed (1)
  1. configuration_super_linear_fs.py +96 -0
configuration_super_linear_fs.py ADDED
@@ -0,0 +1,96 @@
+from typing import Optional, Tuple
+import torch, torch.nn as nn, torch.nn.functional as F
+
+from transformers import (
+    PretrainedConfig,
+    PreTrainedModel,
+    GenerationMixin,
+    AutoConfig,
+    AutoModelForCausalLM,
+)
+from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
+
+# 1) --------------------------------------------------------------------------
+# CONFIG
+# -----------------------------------------------------------------------------
+
+
+class SuperLinearConfigFS(PretrainedConfig):
+    """
+    Configuration for the SuperLinear MoE time-series foundation model.
+    Only *model_type* must be unique inside transformers; the rest mirrors
+    the __init__ arguments of your original Config object.
+    """
+
+    model_type = "super_linear"
+
+    def __init__(
+        self,
+        seq_len=512,
+        pred_len=96,
+        inf_pred_len=96,
+        max_horizon=96,
+        moe_n_experts=12,
+        top_k_experts=5,
+        moe=1,
+        freq_experts='mean_naive_1/4_1/6_1/7_1/8_1/12_1/14_1/16_1/21_1/24_1/28_1/30_1/32_1/36_1/42_1/48_1/52_1/56_1/60_1/72_1/84_1/90_1/96_1/120_1/144_1/168_1/180_1/224_1/252_1/288_1/336_1/365_1/504_1/672_1/1008_1/1440_1/2016_1/3600',
+        auto_regressive=1,
+        d_model=128,
+        dropout=0.0,
+        fft_len=5000,
+        freeze_experts=1,
+        layer_type="RLinear",
+        linear_checkpoints_dir="checkpoints5",
+        linear_checkpoints_path="/cs/azencot_fsas/MoE/",
+        load_linear=0,
+        load_weights=0,
+        misc_moe=10,
+        mlp_gating=0,
+        moe_norm=0,
+        model_type="super_linear",
+        moe_temp=1,
+        noisy_gating_std=0.1,
+        noisy_gating_std_decay=1,
+        torch_dtype="float32",
+        transformers_version="4.40.1",
+        use_fft=1,
+        train_epochs=30,
+        patience=5,
+        lradj="constant",
+        learning_rate=0.05,
+        channel_ind=0,
+        full_size=0,
+        **kwargs,  # any extra CLI args
+    ):
+        self.seq_len = seq_len
+        self.moe = moe
+        self.pred_len = pred_len
+        self.inf_pred_len = inf_pred_len
+        self.max_horizon = max_horizon
+        self.auto_regressive = auto_regressive
+        self.moe_n_experts = moe_n_experts
+        self.top_k_experts = top_k_experts
+        self.freq_experts = freq_experts
+        self.freeze_experts = freeze_experts
+        self.layer_type = layer_type
+        self.linear_checkpoints_path = linear_checkpoints_path
+        self.linear_checkpoints_dir = linear_checkpoints_dir
+        self.load_linear = load_linear
+        self.load_weights = load_weights
+        self.misc_moe = misc_moe
+        self.noisy_gating_std = noisy_gating_std
+        self.noisy_gating_std_decay = noisy_gating_std_decay
+        self.d_model = d_model
+        self.mlp_gating = mlp_gating
+        self.moe_norm = moe_norm
+        self.moe_temp = moe_temp
+        self.use_fft = use_fft
+        self.fft_len = fft_len
+        self.dropout = dropout
+        self.train_epochs = train_epochs
+        self.patience = patience
+        self.lradj = lradj
+        self.learning_rate = learning_rate
+        self.channel_ind = channel_ind
+        self.full_size = full_size
+        super().__init__(**kwargs)