Yuchan committed
Update Model.py
Model.py CHANGED
@@ -120,36 +120,10 @@ dataset = dataset.shuffle(1000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
 
 print("✅ TF Dataset creation complete!")
 
-class Lo(layers.Layer):
-    def __init__(self):
-        super().__init__()
-        # keep internal computation in float32
-        self.p = layers.Dense(64, use_bias=True, dtype='float32')
-        self._out_dtype = 'float32'
-
-    def call(self, x):
-        # x may be bfloat16; cast to float32 for stable intermediate computation
-        x_f32 = tf.cast(x, tf.float32)
-        x = self.p(x_f32)
-        # cast back to model dtype for consistency
-        return tf.cast(x, self._out_dtype)
-
-class rGLU(layers.Layer):
-    def __init__(self, d_model, hyper_n):
-        super().__init__()
-        self.Wr = Lo()
-        self.W2 = layers.Dense(256)
-        self.W1 = layers.Dense(256)
-        self.Wr1 = Lo()
-        self.W = layers.Dense(d_model)
-    def call(self, x):
-        x = tf.nn.silu(self.W1(Wr(x)) + x) * (self.W2(self.Wr1(x)) + x)
-        return self.W(x)
-
 class Adapter(layers.Layer):
-    def __init__(self, d_model
+    def __init__(self, d_model):
         super().__init__()
-        self.Wr =
+        self.Wr = layers.Dense(64)
         self.W = layers.Dense(d_model)
     def call(self, x):
         return self.W(tf.nn.gelu(self.Wr(x)))
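Note: after this hunk the only helper layer that survives is Adapter, a plain bottleneck (Dense(64) → GELU → Dense(d_model)). A minimal standalone sketch of that block follows; the batch shape and d_model=128 are illustrative assumptions, not values taken from the training script.

    import tensorflow as tf
    from tensorflow.keras import layers

    class Adapter(layers.Layer):
        # bottleneck adapter as defined in the new version of Model.py
        def __init__(self, d_model):
            super().__init__()
            self.Wr = layers.Dense(64)       # down-projection to a 64-dim bottleneck
            self.W = layers.Dense(d_model)   # up-projection back to d_model

        def call(self, x):
            return self.W(tf.nn.gelu(self.Wr(x)))

    # illustrative shapes only: batch=2, seq_len=16, d_model=128
    x = tf.random.normal([2, 16, 128])
    print(Adapter(128)(x).shape)  # (2, 16, 128)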
@@ -173,15 +147,10 @@ class LoSoU(layers.Layer):
         # projection / gating layers in float32
         self.Q = layers.Dense(d_model, dtype='float32')
         self.K = layers.Dense(d_model, dtype='float32')
-        self.V = layers.Dense(d_model, dtype='float32')
-        self.rglu = rGLU(d_model)
         self.adapter = Adapter(d_model)
-        self.Qr = Lo()
-        self.Kr = Lo()
-        self.Vr = Lo()
-        self.proj = layers.Dense(d_model, use_bias=True, dtype='float32')
         self.norm = layers.LayerNormalization(epsilon=1e-5, dtype='float32')
         self.norm1 = layers.LayerNormalization(epsilon=1e-5, dtype='float32')
+        self.norm2 = layers.LayerNormalization(epsilon=1e-5, dtype='float32')
 
         # layers for computing a dynamic alpha
         # alpha must lie in [0, 1], so sigmoid is used
@@ -230,12 +199,12 @@ class LoSoU(layers.Layer):
         # x: (B, L, d_model), maybe bfloat16 or float32
         # cast to float32 for all internal computations
         x_f32 = tf.cast(x, tf.float32)
+        x_f32 = self.norm2(x_f32)
         residual = x_f32
 
         # Q, K, V
-        q = self.Q(
-        k = self.K(
-        V = self.V(self.Vr(x)) + x  # ensure V's output is float32
+        q = self.Q(x_f32)  # (B, L, 96)
+        k = self.K(x_f32)  # (B, L, 96)
 
         # gating signals in (0,1)
         g_q = tf.nn.sigmoid(q)
@@ -261,10 +230,8 @@ class LoSoU(layers.Layer):
         score_clipped = tf.clip_by_value(score_norm, -self.clip_value, self.clip_value)
 
         # combine with V
-        x_comb = score_clipped
-
-        out = self.rglu(x_comb)  # (B, L, d_model)
-        out = self.norm(out) + x_comb
+        x_comb = tf.nn.silu(score_clipped)  # (B, L, d_model)
+        out = self.norm(x_comb) + residual
         out = self.norm1(self.adapter(out)) + out
 
         # cast back to original dtype for downstream layers
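Note: the net effect of the two LoSoU hunks above is that the block now pre-normalizes its float32 input with norm2, keeps only the Q/K projections, and replaces the removed rGLU with a SiLU applied to the clipped scores before the residual and adapter normalizations. A rough sketch of just that changed tail is given below, assuming score_clipped and the pre-normalized residual already have shape (B, L, d_model); the layer names mirror the diff, while the shapes, d_model=128, and the inline stand-in for the Adapter bottleneck are illustrative assumptions.

    import tensorflow as tf
    from tensorflow.keras import layers

    d_model = 128                      # illustrative; set in the final hunk of this commit
    norm = layers.LayerNormalization(epsilon=1e-5, dtype='float32')
    norm1 = layers.LayerNormalization(epsilon=1e-5, dtype='float32')
    adapter_down = layers.Dense(64)    # stand-in for the Adapter bottleneck defined earlier
    adapter_up = layers.Dense(d_model)

    # stand-ins for tensors produced earlier in call(): the pre-normalized input
    # (residual) and the clipped score tensor (score_clipped)
    residual = tf.random.normal([2, 16, d_model])
    score_clipped = tf.random.normal([2, 16, d_model])

    # new combination path introduced by this commit
    x_comb = tf.nn.silu(score_clipped)                              # (B, L, d_model)
    out = norm(x_comb) + residual
    out = norm1(adapter_up(tf.nn.gelu(adapter_down(out)))) + out
    print(out.shape)                                                # (2, 16, 128)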
@@ -283,10 +250,9 @@ class Block(layers.Layer):
 class ReLaM(tf.keras.Model):
     def __init__(self, vocab_size, max_seq_len, d_model, n_layers, dropout_rate=0.1):
         super().__init__()
-        self.token_embedding = layers.Embedding(vocab_size,
-        self.pos_embedding = layers.Embedding(max_seq_len,
+        self.token_embedding = layers.Embedding(vocab_size, 128)
+        self.pos_embedding = layers.Embedding(max_seq_len, 128)
         self.blocks = [Block(d_model, hyper_n=1) for _ in range(n_layers)]
-        self.proj = layers.Dense(192)
         self.ln_f = layers.LayerNormalization(epsilon=1e-5, dtype="float32")
 
     def call(self, x, training=False):
@@ -296,7 +262,6 @@ class ReLaM(tf.keras.Model):
         x = self.token_embedding(x) + self.pos_embedding(positions)
         for block in self.blocks:
             x = block(x)
-        x = self.proj(x)
         x = self.ln_f(x)
 
         embedding_matrix = tf.cast(self.token_embedding.embeddings, x.dtype)
@@ -329,7 +294,7 @@ def create_lr_schedule(initial_lr=5e-5, decay_steps=10000, decay_rate=0.9):
 model = ReLaM(
     vocab_size=vocab_size,
     max_seq_len=max_len,
-    d_model=
+    d_model=128,
     n_layers=1
 )
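Note: with d_model=128 in the constructor and both embedding tables now hard-coded to width 128, the summed token and positional embeddings enter the blocks as (B, L, 128) tensors. A small shape check is sketched below; vocab_size, max_len, and the batch shape are illustrative assumptions, not the values used in the script.

    import tensorflow as tf
    from tensorflow.keras import layers

    vocab_size, max_len, d_model = 1000, 64, 128     # illustrative values
    token_embedding = layers.Embedding(vocab_size, 128)
    pos_embedding = layers.Embedding(max_len, 128)

    tokens = tf.random.uniform([2, 64], maxval=vocab_size, dtype=tf.int32)
    positions = tf.range(tf.shape(tokens)[1])        # (L,), broadcast over the batch
    x = token_embedding(tokens) + pos_embedding(positions)
    print(x.shape)                                   # (2, 64, 128) == (B, L, d_model)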