Update 연구중.py
Browse files
연구중.py
CHANGED
|
@@ -135,18 +135,14 @@ class MixerBlock(layers.Layer):
|
|
| 135 |
self.dim = dim
|
| 136 |
|
| 137 |
self.ln_token = layers.LayerNormalization(epsilon=1e-6)
|
| 138 |
-
self.ln_gate = layers.LayerNormalization(epsilon=1e-6) # 이름 변경
|
| 139 |
self.ln_channel = layers.LayerNormalization(epsilon=1e-6)
|
| 140 |
|
| 141 |
# Token Mixer
|
| 142 |
-
self.token_fc1 = layers.Dense(seq_len *
|
| 143 |
self.token_fc2 = layers.Dense(seq_len)
|
| 144 |
|
| 145 |
-
# Gating (Sigmoid) - Temperature 불필요
|
| 146 |
-
self.gate_dense = layers.Dense(1)
|
| 147 |
-
|
| 148 |
# Channel Mixer
|
| 149 |
-
self.ch_fc1 = layers.Dense(self.dim * 4
|
| 150 |
self.ch_fc2 = layers.Dense(self.dim)
|
| 151 |
|
| 152 |
def call(self, x, training=None):
|
|
@@ -159,19 +155,11 @@ class MixerBlock(layers.Layer):
|
|
| 159 |
y = tf.transpose(y_t, perm=[0, 2, 1])
|
| 160 |
x = x + y
|
| 161 |
|
| 162 |
-
# 2. Scalar Gating (수정됨)
|
| 163 |
-
# Softmax의 1/N 희석 문제를 해결하기 위해 Sigmoid 사용
|
| 164 |
-
y = self.ln_gate(x)
|
| 165 |
-
gate = tf.nn.sigmoid(self.gate_dense(y)) # (B, L, 1) Range: 0~1
|
| 166 |
-
y = y * gate
|
| 167 |
-
x = x + y
|
| 168 |
-
|
| 169 |
-
# 3. Channel Mixer
|
| 170 |
y = self.ln_channel(x)
|
| 171 |
-
|
| 172 |
-
y = self.ch_fc2(
|
| 173 |
x = x + y
|
| 174 |
-
|
| 175 |
return x
|
| 176 |
|
| 177 |
|
|
|
|
| 135 |
self.dim = dim
|
| 136 |
|
| 137 |
self.ln_token = layers.LayerNormalization(epsilon=1e-6)
|
|
|
|
| 138 |
self.ln_channel = layers.LayerNormalization(epsilon=1e-6)
|
| 139 |
|
| 140 |
# Token Mixer
|
| 141 |
+
self.token_fc1 = layers.Dense(seq_len * 4)
|
| 142 |
self.token_fc2 = layers.Dense(seq_len)
|
| 143 |
|
|
|
|
|
|
|
|
|
|
| 144 |
# Channel Mixer
|
| 145 |
+
self.ch_fc1 = layers.Dense(self.dim * 4)
|
| 146 |
self.ch_fc2 = layers.Dense(self.dim)
|
| 147 |
|
| 148 |
def call(self, x, training=None):
|
|
|
|
| 155 |
y = tf.transpose(y_t, perm=[0, 2, 1])
|
| 156 |
x = x + y
|
| 157 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
y = self.ln_channel(x)
|
| 159 |
+
a, b = tf.split(self.ch_fc1(y), 2, axis=-1)
|
| 160 |
+
y = self.ch_fc2(a * tf.nn.gelu(b))
|
| 161 |
x = x + y
|
| 162 |
+
|
| 163 |
return x
|
| 164 |
|
| 165 |
|