Yuchan
committed on
Update Mo.py
Mo.py
CHANGED
@@ -124,7 +124,6 @@ class SwiGLU(layers.Layer):
        x_val, x_gate = tf.split(x_proj, 2, axis=-1)
        return self.out(x_val * tf.nn.silu(x_gate))

-
class LoU(layers.Layer):
    def __init__(self, d_model, clip_value=5.0, eps=1e-6):
        super().__init__()
@@ -137,7 +136,31 @@ class LoU(layers.Layer):
        self.norm = layers.LayerNormalization(epsilon=1e-5, dtype='float32')
        self.norm1 = layers.LayerNormalization(epsilon=1e-5, dtype='float32')

-        self.
+        self.alpha_linear = layers.Dense(1, activation='sigmoid', dtype='float32')
+
+        self.glu = SwiGLU(d_model, d_model)
+
+    def _ema_over_time(self, score, alpha_dynamic):
+        seq = tf.transpose(score, perm=[1, 0, 2])
+        alpha_seq = tf.transpose(alpha_dynamic, perm=[1, 0, 2])
+
+        def step(prev_ema, inputs):
+            x_t, alpha_t = inputs
+            new = alpha_t * x_t + (1.0 - alpha_t) * prev_ema
+            return new
+
+        init = seq[0]
+        first_alpha = alpha_seq[0]
+        remaining_seq = seq[1:]
+        remaining_alpha = alpha_seq[1:]
+        elems = (remaining_seq, remaining_alpha)
+        # Compute the EMA along the time axis with tf.scan
+        ema_seq = tf.scan(fn=step, elems=elems, initializer=init)
+        ema_seq = tf.concat([tf.expand_dims(init, 0), ema_seq], axis=0)
+        ema = tf.transpose(ema_seq, perm=[1, 0, 2])
+        return ema
+
+    # LoU originally serves as a uni-directional attention/recurrent block
    def call(self, x):
        x_f32 = tf.cast(x, tf.float32)
        residual = x_f32
@@ -150,30 +173,19 @@ class LoU(layers.Layer):
        g_k = (tf.nn.tanh(k) + 1.0) / 2.0
        score = g_q * g_k

-
-
-
-
-
-        count_for_mean = tf.cast(tf.range(seq_len) + 1, score.dtype)
-        count_for_mean = tf.reshape(count_for_mean, (1, seq_len, 1))
-
-        # Divide the cumulative sum by the number of tokens so far to get the mean cumulative sum (B, L, D)
-        score_mean = score / count_for_mean
-
-        # Set the normalization denominator
-        denom = tf.maximum(score_mean, self.eps)
-        score_norm = score / denom
-        # -----------------------------------------------
-
+        alpha_dynamic = self.alpha_linear(x_f32)
+        score_ema = self._ema_over_time(score, alpha_dynamic)
+        mean_last = tf.reduce_mean(score_ema, axis=-1, keepdims=True)
+        denom = tf.maximum(mean_last, self.eps)
+        score_norm = score_ema / denom
        score_clipped = tf.clip_by_value(score_norm, -self.clip_value, self.clip_value)
        x_comb = score_clipped * V

+        # In the LoU block, x_comb + residual then passes through the CrossBlock
        out = self.norm(x_comb + residual)
        out = self.glu(out)
        return tf.cast(out, x.dtype)

-
class Lo(layers.Layer):
    def __init__(self, d_model):
        super().__init__()
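For context on what the new `_ema_over_time` path computes, here is a minimal sketch, assuming TensorFlow 2.x: it reproduces the added recurrence (ema_0 = x_0, ema_t = alpha_t * x_t + (1 - alpha_t) * ema_{t-1}) both with an explicit Python loop and with `tf.scan`, then derives `score_norm` the way the new `call` does. The toy shapes, random inputs, and the `ema_loop` helper are illustrative only and not part of the commit.

import tensorflow as tf

def ema_loop(score, alpha):
    """Reference EMA over the time axis (illustrative helper, not in Mo.py)."""
    ema = [score[:, 0]]                      # (B, D): ema_0 = x_0
    for t in range(1, score.shape[1]):
        a_t = alpha[:, t]                    # (B, 1), broadcasts over D
        ema.append(a_t * score[:, t] + (1.0 - a_t) * ema[-1])
    return tf.stack(ema, axis=1)             # (B, L, D)

B, L, D = 2, 4, 3                             # toy shapes, chosen for the sketch
score = tf.random.uniform((B, L, D))          # stands in for g_q * g_k
alpha = tf.random.uniform((B, L, 1))          # stands in for alpha_linear(x), in (0, 1)

# The same recurrence via tf.scan over time-major sequences, as in _ema_over_time
seq, alpha_seq = tf.transpose(score, [1, 0, 2]), tf.transpose(alpha, [1, 0, 2])
scanned = tf.scan(lambda prev, inp: inp[1] * inp[0] + (1.0 - inp[1]) * prev,
                  (seq[1:], alpha_seq[1:]), initializer=seq[0])
ema_scan = tf.transpose(tf.concat([seq[:1], scanned], axis=0), [1, 0, 2])

print(tf.reduce_max(tf.abs(ema_loop(score, alpha) - ema_scan)).numpy())  # ~0.0

# Normalization as in the new call(): divide by the feature-mean of the EMA, then clip
eps, clip_value = 1e-6, 5.0
denom = tf.maximum(tf.reduce_mean(ema_scan, axis=-1, keepdims=True), eps)
score_norm = tf.clip_by_value(ema_scan / denom, -clip_value, clip_value)
print(score_norm.shape)                       # (2, 4, 3)

The practical difference from the removed code is that the denominator now tracks a learned, per-position exponential average of the scores rather than a fixed running token count, so the sigmoid output of `alpha_linear` controls how quickly the normalization scale adapts along the sequence.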