OpenLab-NLP
/

HyperConv-Layer

Model card Files Files and versions

xet

Community

OpenLab-NLP committed on Dec 17, 2025

Commit

aec1b8a

verified ·

1 Parent(s): 2e02c67

Update 연구중.py

Browse files

Files changed (1) hide show

연구중.py +23 -23

연구중.py CHANGED Viewed

@@ -128,50 +128,50 @@ ds = ds.batch(BATCH_SIZE, drop_remainder=True)
 ds = ds.map(lambda v1, v2: ((v1, v2), tf.zeros([BATCH_SIZE], dtype=tf.float32)), num_parallel_calls=tf.data.AUTOTUNE)
 ds = ds.prefetch(tf.data.AUTOTUNE)
-class MixerBlock(layers.Layer):
-    def __init__(self, dim):
-        super().__init__()
 class MixerBlock(layers.Layer):
     def __init__(self, seq_len, dim, token_mlp_dim, channel_mlp_dim, dropout=0.0):
         super().__init__()
-        self.ln_token = layers.LayerNormalization(epsilon=1e-6, dtype=tf.float32)
-        self.ln_attn  = layers.LayerNormalization(epsilon=1e-6, dtype=tf.float32)
-        self.ln_channel = layers.LayerNormalization(epsilon=1e-6, dtype=tf.float32)
-        self.ch_fc1 = layers.Dense(self.dim * 4, activation=tf.nn.gelu)
-        self.ch_fc2 = layers.Dense(self.dim)
-        self.token_fc1 = layers.Dense(seq_len * 2, dtype=tf.float32)
-        self.token_fc2 = layers.Dense(seq_len, dtype=tf.float32)
-        self.attn = layers.Dense(1, dtype=tf.float32)
     def call(self, x, training=None):
-        # x: (B, L, D)
-        # ---------- Token Mixer (Pre-LN) ----------
         y = self.ln_token(x)
-        y_t = tf.transpose(y, perm=[0, 2, 1])          # (B, D, L)
         y_t = self.token_fc1(y_t)
         a, b = tf.split(y_t, 2, axis=-1)
         y_t = self.token_fc2(a * tf.nn.gelu(b))
-        y = tf.transpose(y_t, perm=[0, 2, 1])          # (B, L, D)
         x = x + y
-        # ---------- Scalar Attention Reweight (Pre-LN) ----------
-        y = self.ln_attn(x)
-        weight = tf.nn.softmax(self.attn(y), axis=1)  # (B, L, 1)
-        y = y * weight
-        x = x + y
-        # ---------- Channel Mixer (Pre-LN) ----------
         y = self.ln_channel(x)
         y = self.ch_fc1(y)
         y = self.ch_fc2(y)
         x = x + y
         return x

 ds = ds.map(lambda v1, v2: ((v1, v2), tf.zeros([BATCH_SIZE], dtype=tf.float32)), num_parallel_calls=tf.data.AUTOTUNE)
 ds = ds.prefetch(tf.data.AUTOTUNE)
 class MixerBlock(layers.Layer):
     def __init__(self, seq_len, dim, token_mlp_dim, channel_mlp_dim, dropout=0.0):
         super().__init__()
+        self.dim = dim
+        self.ln_token = layers.LayerNormalization(epsilon=1e-6)
+        self.ln_gate  = layers.LayerNormalization(epsilon=1e-6) # 이름 변경
+        self.ln_channel = layers.LayerNormalization(epsilon=1e-6)
+        # Token Mixer
+        self.token_fc1 = layers.Dense(seq_len * 2)
+        self.token_fc2 = layers.Dense(seq_len)
+        # Gating (Sigmoid) - Temperature 불필요
+        self.gate_dense = layers.Dense(1)
+        # Channel Mixer
+        self.ch_fc1 = layers.Dense(self.dim * 4, activation='gelu')
+        self.ch_fc2 = layers.Dense(self.dim)
     def call(self, x, training=None):
+        # 1. Token Mixer
         y = self.ln_token(x)
+        y_t = tf.transpose(y, perm=[0, 2, 1])
         y_t = self.token_fc1(y_t)
         a, b = tf.split(y_t, 2, axis=-1)
         y_t = self.token_fc2(a * tf.nn.gelu(b))
+        y = tf.transpose(y_t, perm=[0, 2, 1])
         x = x + y
+        # 2. Scalar Gating (수정됨)
+        # Softmax의 1/N 희석 문제를 해결하기 위해 Sigmoid 사용
+        y = self.ln_gate(x)
+        gate = tf.nn.sigmoid(self.gate_dense(y)) # (B, L, 1) Range: 0~1
+        y = y * gate
+        x = x + y
+        # 3. Channel Mixer
         y = self.ln_channel(x)
         y = self.ch_fc1(y)
         y = self.ch_fc2(y)
         x = x + y
         return x