Yuchan committed
Commit a2ab022 · verified · 1 Parent(s): 67b096a

Update Mo.py

Files changed (1)
  1. Mo.py +47 -62
Mo.py CHANGED
@@ -118,74 +118,58 @@ with strategy.scope():
     class SwiGLU(layers.Layer):
         def __init__(self, d_model, d_ff):
             super().__init__()
-            self.proj = layers.Dense(960)
+            self.proj = layers.Dense(dff)
             self.out = layers.Dense(d_model)
         def call(self, x):
             x_proj = self.proj(x)
             x_val, x_gate = tf.split(x_proj, 2, axis=-1)
             return self.out(x_val * tf.nn.silu(x_gate))
 
-    class LoU(layers.Layer):
-        def __init__(self, d_model, clip_value=5.0, eps=1e-6):
+
+    class MHLA(layers.Layer):
+        def __init__(self, embed_dim, num_heads=8, dropout=0.0):
             super().__init__()
-            self.d_model = d_model
-            self.clip_value = float(clip_value)
-            self.eps = float(eps)
-            self.Q = layers.Dense(d_model, dtype='float32')
-            self.K = layers.Dense(d_model, dtype='float32')
-            self.V = layers.Dense(d_model, dtype='float32')
-            self.norm = layers.LayerNormalization(epsilon=1e-5, dtype='float32')
-            self.norm1 = layers.LayerNormalization(epsilon=1e-5, dtype='float32')
-
-            self.alpha_linear = layers.Dense(1, activation='sigmoid', dtype='float32')
-
-            self.glu = SwiGLU(d_model, d_model)
-
-        def _ema_over_time(self, score, alpha_dynamic):
-            seq = tf.transpose(score, perm=[1, 0, 2])
-            alpha_seq = tf.transpose(alpha_dynamic, perm=[1, 0, 2])
-
-            def step(prev_ema, inputs):
-                x_t, alpha_t = inputs
-                new = alpha_t * x_t + (1.0 - alpha_t) * prev_ema
-                return new
-
-            init = seq[0]
-            first_alpha = alpha_seq[0]
-            remaining_seq = seq[1:]
-            remaining_alpha = alpha_seq[1:]
-            elems = (remaining_seq, remaining_alpha)
-            # compute the EMA over the time axis with tf.scan
-            ema_seq = tf.scan(fn=step, elems=elems, initializer=init)
-            ema_seq = tf.concat([tf.expand_dims(init, 0), ema_seq], axis=0)
-            ema = tf.transpose(ema_seq, perm=[1, 0, 2])
-            return ema
-
-        # LoU originally plays the role of a uni-directional attention/recurrent block
-        def call(self, x):
-            x_f32 = tf.cast(x, tf.float32)
-            residual = x_f32
-            x_f32 = self.norm1(x)
-
-            q = self.Q(x_f32)
-            k = self.K(x_f32)
-            V = self.V(x_f32)
-            g_q = (tf.nn.tanh(q) + 1.0) / 2.0
-            g_k = (tf.nn.tanh(k) + 1.0) / 2.0
-            score = g_q * g_k
-
-            alpha_dynamic = self.alpha_linear(x_f32)
-            score_ema = self._ema_over_time(score, alpha_dynamic)
-            mean_last = tf.reduce_mean(score_ema, axis=-1, keepdims=True)
-            denom = tf.maximum(mean_last, self.eps)
-            score_norm = score_ema / denom
-            score_clipped = tf.clip_by_value(score_norm, -self.clip_value, self.clip_value)
-            x_comb = score_clipped * V
-
-            # in the LoU block, x_comb + residual then passes through the CrossBlock
-            out = self.norm(x_comb + residual)
-            out = self.glu(out)
-            return tf.cast(out, x.dtype)
+            assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
+            self.embed_dim = embed_dim
+            self.num_heads = num_heads
+            self.head_dim = embed_dim // num_heads
+            self.Wq = layers.Dense(embed_dim, use_bias=False)
+            self.Wk = layers.Dense(embed_dim, use_bias=False)
+            self.Wv = layers.Dense(embed_dim, use_bias=False)
+            self.out = layers.Dense(embed_dim)
+            self.dropout = layers.Dropout(dropout)
+
+        def split_heads(self, x):
+            # [B, L, D] -> [B, num_heads, L, head_dim]
+            B, L, D = tf.shape(x)[0], tf.shape(x)[1], tf.shape(x)[2]
+            x = tf.reshape(x, (B, L, self.num_heads, self.head_dim))
+            return tf.transpose(x, perm=[0, 2, 1, 3])
+
+        def combine_heads(self, x):
+            # [B, num_heads, L, head_dim] -> [B, L, D]
+            x = tf.transpose(x, perm=[0, 2, 1, 3])
+            B, L, H, D = tf.shape(x)[0], tf.shape(x)[1], tf.shape(x)[2], tf.shape(x)[3]
+            return tf.reshape(x, (B, L, H*D))
+
+        def call(self, x, training=False):
+            q = tf.nn.elu(self.Wq(x)) + 1
+            k = tf.nn.elu(self.Wk(x)) + 1
+            v = self.Wv(x)
+
+            q = self.split_heads(q)
+            k = self.split_heads(k)
+            v = self.split_heads(v)
+
+            # causal linear attention cumulative sum
+            k_cum = tf.cumsum(k, axis=2)
+            kv_cum = tf.cumsum(k * v, axis=2)
+
+            z = 1.0 / tf.reduce_sum(q * k_cum, axis=-1, keepdims=True)
+            out = (q * kv_cum) * z
+            out = self.combine_heads(out)
+            out = self.dropout(out, training=training)
+            return self.out(out)
+
 
     class Lo(layers.Layer):
         def __init__(self, d_model):
@@ -202,7 +186,8 @@ class Lo(layers.Layer):
     class Block(layers.Layer):
         def __init__(self, d_model):
             super().__init__()
-            self.lou = LoU(d_model)
+            self.lou = MHLA(d_model, 8)
+            self.glu = SwiGLU(d_model, 1154)
             self.lo = Lo(d_model)
 
         def call(self, x):
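
For reference, a minimal standalone sketch (not part of the commit) of the causal linear-attention update that the new MHLA layer computes. The batch/head/length/head_dim sizes and the random tensors standing in for the split_heads(Wq(x)) / Wk / Wv projections are illustrative assumptions:

# Illustrative sketch: the causal linear-attention step used in MHLA.call.
# With feature map phi(x) = elu(x) + 1, the per-head output at position t is
#   out_t = (q_t * sum_{s<=t} k_s * v_s) / (q_t . sum_{s<=t} k_s)
# (elementwise * in the numerator, dot product in the denominator),
# and both prefix sums are obtained with tf.cumsum over the time axis (axis=2).
import tensorflow as tf

B, H, L, Dh = 2, 8, 16, 8                               # assumed batch, heads, length, head_dim
q = tf.nn.elu(tf.random.normal((B, H, L, Dh))) + 1.0    # stand-in for split_heads(Wq(x))
k = tf.nn.elu(tf.random.normal((B, H, L, Dh))) + 1.0    # stand-in for split_heads(Wk(x))
v = tf.random.normal((B, H, L, Dh))                     # stand-in for split_heads(Wv(x))

k_cum = tf.cumsum(k, axis=2)                            # running (causal) sum of keys
kv_cum = tf.cumsum(k * v, axis=2)                       # running sum of elementwise key-value products
z = 1.0 / tf.reduce_sum(q * k_cum, axis=-1, keepdims=True)
out = (q * kv_cum) * z                                  # same [B, H, L, head_dim] layout as in MHLA.call
print(out.shape)                                        # (2, 8, 16, 8)

Because the two prefix sums carry only a head_dim-sized state per position, this formulation scales linearly with sequence length, in contrast to the quadratic score matrix of standard softmax attention.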