OpenLab-NLP
/

HyperConv-Layer

Model card Files Files and versions

OpenLab-NLP committed on Dec 19, 2025

Commit

cc13462

·

verified ·

1 Parent(s): 52becc8

Update 연구중.py

Files changed (1) hide show

연구중.py +27 -14

연구중.py CHANGED Viewed

@@ -128,38 +128,51 @@ ds = ds.batch(BATCH_SIZE, drop_remainder=True)
 ds = ds.map(lambda v1, v2: ((v1, v2), tf.zeros([BATCH_SIZE], dtype=tf.float32)), num_parallel_calls=tf.data.AUTOTUNE)
 ds = ds.prefetch(tf.data.AUTOTUNE)
 class MixerBlock(layers.Layer):
     def __init__(self, seq_len, dim, token_mlp_dim, channel_mlp_dim, dropout=0.0):
         super().__init__()
         self.dim = dim
-        self.ln_token = layers.LayerNormalization(epsilon=1e-6)
-        self.ln_channel = layers.LayerNormalization(epsilon=1e-6)
-        # Token Mixer
-        self.token_fc1 = layers.Dense(seq_len * 4)
-        self.token_fc2 = layers.Dense(seq_len)
-        # Channel Mixer
-        self.ch_fc1 = layers.Dense(self.dim * 4)
-        self.ch_fc2 = layers.Dense(self.dim)
     def call(self, x, training=None):
-        # 1. Token Mixer
         y = self.ln_token(x)
-        y_t = tf.transpose(y, perm=[0, 2, 1])
         y_t = self.token_fc1(y_t)
         a, b = tf.split(y_t, 2, axis=-1)
         y_t = self.token_fc2(a * tf.nn.gelu(b))
-        y = tf.transpose(y_t, perm=[0, 2, 1])
         x = x + y
-        y = self.ln_channel(x)
-        a, b = tf.split(self.ch_fc1(y), 2, axis=-1)
-        y = self.ch_fc2(a * tf.nn.gelu(b))
         x = x + y
         return x

 ds = ds.map(lambda v1, v2: ((v1, v2), tf.zeros([BATCH_SIZE], dtype=tf.float32)), num_parallel_calls=tf.data.AUTOTUNE)
 ds = ds.prefetch(tf.data.AUTOTUNE)
+class MixerBlock(layers.Layer):
+    def __init__(self, dim):
+        super().__init__()
 class MixerBlock(layers.Layer):
     def __init__(self, seq_len, dim, token_mlp_dim, channel_mlp_dim, dropout=0.0):
         super().__init__()
         self.dim = dim
+        self.ln_token = layers.LayerNormalization(epsilon=1e-6, dtype=tf.float32)
+        self.ln_attn  = layers.LayerNormalization(epsilon=1e-6, dtype=tf.float32)
+        self.ln_channel = layers.LayerNormalization(epsilon=1e-6, dtype=tf.float32)
+        self.ch_fc1 = layers.Dense(self.dim * 4, activation=tf.nn.gelu)
+        self.ch_fc2 = layers.Dense(self.dim)
+        self.token_fc1 = layers.Dense(seq_len * 2, dtype=tf.float32)
+        self.token_fc2 = layers.Dense(seq_len, dtype=tf.float32)
+        self.attn = layers.Dense(1, dtype=tf.float32)
     def call(self, x, training=None):
+        # x: (B, L, D)
+        # ---------- Token Mixer (Pre-LN) ----------
         y = self.ln_token(x)
+        y_t = tf.transpose(y, perm=[0, 2, 1])          # (B, D, L)
         y_t = self.token_fc1(y_t)
         a, b = tf.split(y_t, 2, axis=-1)
         y_t = self.token_fc2(a * tf.nn.gelu(b))
+        y = tf.transpose(y_t, perm=[0, 2, 1])          # (B, L, D)
         x = x + y
+        # ---------- Scalar Attention Reweight (Pre-LN) ----------
+        y = self.ln_attn(x)
+        weight = tf.nn.softmax(self.attn(y), axis=1)  # (B, L, 1)
+        y = y * weight
         x = x + y
+        # ---------- Channel Mixer (Pre-LN) ----------
+        y = self.ln_channel(x)
+        y = self.ch_fc1(y)
+        y = self.ch_fc2(y)
+        x = x + y
         return x