Yuchan
committed on
Update Model.py
Model.py
CHANGED
@@ -121,22 +121,43 @@ dataset = dataset.shuffle(1000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
 print("✅ TF Dataset creation complete!")
 
 class Lo(layers.Layer):
-    def __init__(self
+    def __init__(self):
         super().__init__()
         # keep internal computation in float32
-        self.
-        self.p = layers.Dense(96, use_bias=True, dtype='float32')
+        self.p = layers.Dense(48, use_bias=True, dtype='float32')
         self._out_dtype = 'float32'
 
     def call(self, x):
         # x may be bfloat16; cast to float32 for stable intermediate computation
         x_f32 = tf.cast(x, tf.float32)
-        x = self.
-        x = tf.nn.gelu(x)
-        x = self.p(x)
+        x = self.p(x_f32)
         # cast back to model dtype for consistency
         return tf.cast(x, self._out_dtype)
 
+class rGLU(layers.Layer):
+    def __init__(self, d_model, hyper_n=1):  # default added so rGLU(d_model) below runs; hyper_n is unused
+        super().__init__()
+        self.Wr = layers.Dense(48)
+        self.WB = layers.Dense(768)
+        self.Wr1 = layers.Dense(48)
+        self.W = layers.Dense(d_model)
+    def call(self, x):
+        x = self.Wr(x)
+        x = self.WB(x)
+        a, b = tf.split(x, 2, axis=-1)
+        o = tf.nn.silu(a) * b
+        o = self.Wr1(o)
+        o = self.W(o)
+        return o
+
+class Adapter(layers.Layer):
+    def __init__(self, d_model, hyper_n=1):  # default added so Adapter(d_model) below runs; hyper_n is unused
+        super().__init__()
+        self.Wr = layers.Dense(48, activation='gelu')
+        self.W = layers.Dense(d_model)
+    def call(self, x):
+        return self.W(self.Wr(x))
+
 class LoSoU(layers.Layer):
     """
     Stabilized LoSoU layer (uses dynamic alpha)
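The new rGLU block is a SwiGLU-style gated unit with a 48-dim low-rank bottleneck: the input is down-projected, expanded to 768 and split into two halves, one half gates the other through silu, and the result is projected back to d_model. A quick shape check, assuming the classes above are in scope and taking d_model = 96 purely for illustration (the commit does not pin d_model at this point):

import tensorflow as tf
# Illustrative sizes only: batch=2, seq_len=16, d_model=96 are assumptions.
x = tf.random.normal((2, 16, 96))   # (batch, seq_len, d_model)
print(rGLU(96)(x).shape)            # (2, 16, 96): 96 -> 48 -> 768 -> two 384-dim halves,
                                    # silu(a) * b -> 384 -> 48 -> back to 96
print(Adapter(96)(x).shape)         # (2, 16, 96): 96 -> 48 with GELU -> 96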
@@ -154,11 +175,17 @@ class LoSoU(layers.Layer):
         self.eps = float(eps)
 
         # projection / gating layers in float32
-        self.Q = layers.Dense(
-        self.K = layers.Dense(
-        self.V = layers.Dense(
+        self.Q = layers.Dense(d_model, dtype='float32')
+        self.K = layers.Dense(d_model, dtype='float32')
+        self.V = layers.Dense(d_model, dtype='float32')
+        self.rglu = rGLU(d_model)
+        self.adapter = Adapter(d_model)
+        self.Qr = Lo()
+        self.Kr = Lo()
+        self.Vr = Lo()
         self.proj = layers.Dense(d_model, use_bias=True, dtype='float32')
         self.norm = layers.LayerNormalization(epsilon=1e-5, dtype='float32')
+        self.norm1 = layers.LayerNormalization(epsilon=1e-5, dtype='float32')
 
         # layers for computing the dynamic alpha
         # alpha must stay in the [0, 1] range, so a sigmoid is used
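The constructor comments refer to a dynamic alpha kept in [0, 1] via a sigmoid; the layer that produces it falls outside this hunk. A typical pattern for such a bounded gate (purely illustrative, not taken from this commit; names are invented) is:

# Hypothetical sketch of a sigmoid-bounded dynamic alpha.
alpha_layer = layers.Dense(1, dtype='float32')   # one scalar gate per position
alpha = tf.nn.sigmoid(alpha_layer(x_f32))        # (B, L, 1), values in (0, 1)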
@@ -210,9 +237,9 @@ class LoSoU(layers.Layer):
         residual = x_f32
 
         # Q, K, V
-        q = self.Q(x_f32)
-        k = self.K(x_f32) # (B, L, 96)
-        V = tf.cast(self.V(x), tf.float32) # ensure V's output is float32
+        q = self.Q(self.Qr(x_f32)) # (B, L, 96)
+        k = self.K(self.Kr(x_f32)) # (B, L, 96)
+        V = tf.cast(self.V(self.Vr(x)), tf.float32) # ensure V's output is float32
 
         # gating signals in (0,1)
         g_q = tf.nn.sigmoid(q)
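The change routes each of Q, K, and V through its own low-rank Lo adapter before the full-width projection, i.e. d_model -> 48 -> d_model per path. A minimal trace of the query path, assuming Lo from the hunk above and an illustrative d_model of 96:

import tensorflow as tf
from tensorflow.keras import layers
# assumes Lo (48-dim float32 bottleneck) from the diff above is defined
Qr, Q = Lo(), layers.Dense(96, dtype='float32')
x_f32 = tf.random.normal((2, 16, 96))   # (batch, seq_len, d_model) -- illustrative
h = Qr(x_f32)                           # (2, 16, 48): low-rank bottleneck
q = Q(h)                                # (2, 16, 96): back to model width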
@@ -240,8 +267,9 @@ class LoSoU(layers.Layer):
         # combine with V
         x_comb = score_clipped * V # (B, L, d_model)
 
-        out = self.
-        out = self.norm(out)
+        out = self.rglu(x_comb) # (B, L, d_model)
+        out = self.norm(out) + x_comb
+        out = self.norm1(self.adapter(out)) + out
 
         # cast back to original dtype for downstream layers
         return tf.cast(out, x.dtype)
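The rewritten tail replaces the old projection-plus-norm with two post-norm residual branches: the gated rGLU output is normalized and added back to x_comb, then the Adapter output is normalized and added back again, so the skip path stays an identity. In isolation the ordering looks like this (Dense stand-ins replace rGLU and Adapter; sizes illustrative):

import tensorflow as tf
from tensorflow.keras import layers
# Stand-ins for rGLU and Adapter, to show only the residual/normalization order.
norm  = layers.LayerNormalization(epsilon=1e-5)
norm1 = layers.LayerNormalization(epsilon=1e-5)
branch1 = layers.Dense(96)   # stand-in for self.rglu
branch2 = layers.Dense(96)   # stand-in for self.adapter
x_comb = tf.random.normal((2, 16, 96))
out = norm(branch1(x_comb)) + x_comb   # normalize the branch, keep the skip identity
out = norm1(branch2(out)) + out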
@@ -259,10 +287,10 @@ class Block(layers.Layer):
 class ReLaM(tf.keras.Model):
     def __init__(self, vocab_size, max_seq_len, d_model, n_layers, dropout_rate=0.1):
         super().__init__()
-        self.token_embedding = layers.Embedding(vocab_size,
-        self.pos_embedding = layers.Embedding(max_seq_len,
+        self.token_embedding = layers.Embedding(vocab_size, 192)
+        self.pos_embedding = layers.Embedding(max_seq_len, 192)
         self.blocks = [Block(d_model, hyper_n=1) for _ in range(n_layers)]
-        self.proj = layers.Dense(
+        self.proj = layers.Dense(192)
         self.ln_f = layers.LayerNormalization(epsilon=1e-5, dtype="float32")
 
     def call(self, x, training=False):
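With this change the embedding tables and the projection are hard-wired to width 192 while the blocks still take d_model, so the shapes only line up when d_model = 192 (presumably what the training script passes; the commit does not show it). A quick standalone check of the embedding widths, with vocab and sequence sizes invented for illustration:

import tensorflow as tf
from tensorflow.keras import layers
tok = layers.Embedding(32000, 192)     # illustrative vocab_size
pos = layers.Embedding(512, 192)       # illustrative max_seq_len
ids = tf.constant([[5, 7, 9]])
positions = tf.range(3)[tf.newaxis, :]
h = tok(ids) + pos(positions)          # (1, 3, 192): must match the blocks' d_model
print(h.shape)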