HawkGPT-v0.2 / model.py
HawkLabofficial's picture
Upload model.py with huggingface_hub
4d77219 verified
Raw
History Blame Contribute Delete
5.45 kB
"""GPT-2 style Transformer Decoder in TensorFlow/Keras."""
import math
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import config
class MultiHeadSelfAttention(layers.Layer):
    """Causal multi-head self-attention with a fused Q/K/V projection.

    One bias-free ``Dense`` layer produces queries, keys and values in a single
    matmul. The result is split into ``num_heads`` heads of ``head_dim``
    features each, scores are divided by ``sqrt(head_dim)``, and a
    lower-triangular mask restricts position ``t`` to positions ``<= t``.

    Args:
        embed_dim: Width of the Q/K/V and output projections. Must be a
            multiple of ``num_heads``.
        num_heads: Number of attention heads.
        dropout: Dropout rate applied to the attention probabilities.
        **kwargs: Forwarded to ``keras.layers.Layer``.

    Raises:
        ValueError: If ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.1, **kwargs):
        super().__init__(**kwargs)
        # Raise explicitly instead of using `assert`: assertions are removed
        # under `python -O`, which would let an invalid configuration through
        # until a reshape fails much later.
        if embed_dim % num_heads != 0:
            raise ValueError("embed_dim must be divisible by num_heads")
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.qkv = layers.Dense(embed_dim * 3, use_bias=False)
        self.out_proj = layers.Dense(embed_dim, use_bias=False)
        self.dropout = layers.Dropout(dropout)
        # Divisor for the raw attention scores.
        self.scale = math.sqrt(self.head_dim)

    def call(self, x: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Apply causal self-attention.

        Args:
            x: Rank-3 tensor indexed as (batch, time, features).
            training: Enables attention dropout when true.

        Returns:
            Tensor of shape (batch, time, embed_dim).
        """
        dims = tf.shape(x)
        B, T = dims[0], dims[1]

        qkv = self.qkv(x)  # (B, T, 3 * embed_dim)
        qkv = tf.reshape(qkv, (B, T, 3, self.num_heads, self.head_dim))
        qkv = tf.transpose(qkv, (2, 0, 3, 1, 4))  # (3, B, H, T, D)
        q, k, v = qkv[0], qkv[1], qkv[2]

        # Scaled dot-product attention scores.
        att = tf.matmul(q, tf.transpose(k, (0, 1, 3, 2))) / self.scale  # (B, H, T, T)

        # Lower-triangular ones: entry (i, j) is 1 when j <= i, else 0.
        causal_mask = tf.linalg.band_part(tf.ones((T, T), dtype=att.dtype), -1, 0)
        causal_mask = tf.reshape(causal_mask, (1, 1, T, T))
        # Masked (future) positions get a large negative score so that softmax
        # assigns them negligible probability.
        att = tf.where(tf.equal(causal_mask, 0), tf.constant(-1e9, dtype=att.dtype), att)
        att = tf.nn.softmax(att, axis=-1)
        att = self.dropout(att, training=training)

        out = tf.matmul(att, v)  # (B, H, T, D)
        out = tf.transpose(out, (0, 2, 1, 3))  # (B, T, H, D)
        # Merge heads using the static embed_dim (== H * D) rather than the
        # dynamic tf.shape(x)[2], so the last dimension handed to out_proj is
        # statically known.
        out = tf.reshape(out, (B, T, self.embed_dim))
        return self.out_proj(out)
class FeedForward(layers.Layer):
    """Two-layer MLP: expand to ``ff_dim`` with GELU, project back, then dropout.

    Args:
        embed_dim: Output width of the second ``Dense`` layer.
        ff_dim: Width of the hidden (GELU) ``Dense`` layer.
        dropout: Dropout rate applied after the second ``Dense`` layer.
        **kwargs: Forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, embed_dim: int, ff_dim: int, dropout: float = 0.1, **kwargs):
        super().__init__(**kwargs)
        expand = layers.Dense(ff_dim, activation="gelu")
        project = layers.Dense(embed_dim)
        regularize = layers.Dropout(dropout)
        self.net = keras.Sequential([expand, project, regularize])

    def call(self, x: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Run ``x`` through the MLP; ``training`` toggles the dropout layer."""
        hidden = self.net(x, training=training)
        return hidden
class TransformerBlock(layers.Layer):
    """Pre-norm Transformer decoder block: attention and MLP, each with a residual.

    Args:
        embed_dim: Model width passed to the attention and feed-forward layers.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward layer.
        dropout: Dropout rate shared by the sub-layers and the residual dropout.
        **kwargs: Forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, embed_dim: int, num_heads: int, ff_dim: int, dropout: float = 0.1, **kwargs):
        super().__init__(**kwargs)
        self.ln1 = layers.LayerNormalization(epsilon=1e-5)
        self.attn = MultiHeadSelfAttention(
            embed_dim=embed_dim, num_heads=num_heads, dropout=dropout
        )
        self.ln2 = layers.LayerNormalization(epsilon=1e-5)
        self.ff = FeedForward(embed_dim=embed_dim, ff_dim=ff_dim, dropout=dropout)
        # A single Dropout layer is reused on both residual branches.
        self.drop = layers.Dropout(dropout)

    def call(self, x: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Apply the block to ``x``; ``training`` toggles every dropout layer."""
        # Pre-norm: each sub-layer sees a normalised input, while the residual
        # stream itself is left un-normalised.
        attn_out = self.attn(self.ln1(x), training=training)
        x = x + self.drop(attn_out, training=training)

        # NOTE(review): FeedForward already ends in its own Dropout, so this
        # branch passes through dropout twice — confirm that is intended.
        ff_out = self.ff(self.ln2(x), training=training)
        return x + self.drop(ff_out, training=training)
class GPTModel(keras.Model):
    """Decoder-only Transformer language model built from pre-norm blocks.

    Token embeddings and learned positional embeddings are summed, passed
    through dropout and ``num_layers`` ``TransformerBlock`` layers, layer
    normalised, and projected to vocabulary logits by a bias-free ``Dense``
    head.

    Args:
        vocab_size: Number of token ids; sets the embedding rows and the
            logits width.
        embed_dim: Model width.
        num_heads: Attention heads per block.
        num_layers: Number of Transformer blocks.
        ff_dim: Hidden width of each block's feed-forward layer.
        max_seq_len: Number of rows in the positional embedding table.
        dropout: Dropout rate used throughout the model.
        **kwargs: Forwarded to ``keras.Model``.
    """

    def __init__(
        self,
        vocab_size: int,
        embed_dim: int = config.EMBED_DIM,
        num_heads: int = config.NUM_HEADS,
        num_layers: int = config.NUM_LAYERS,
        ff_dim: int = config.FF_DIM,
        max_seq_len: int = config.MAX_SEQ_LEN,
        dropout: float = config.DROPOUT,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.vocab_size = vocab_size
        self.token_emb = layers.Embedding(vocab_size, embed_dim)
        self.pos_emb = layers.Embedding(max_seq_len, embed_dim)
        self.drop = layers.Dropout(dropout)
        self.blocks = [
            TransformerBlock(embed_dim, num_heads, ff_dim, dropout)
            for _ in range(num_layers)
        ]
        self.ln_final = layers.LayerNormalization(epsilon=1e-5)
        # Separate output projection; its kernel is its own variable, distinct
        # from the token embedding matrix.
        self.head = layers.Dense(vocab_size, use_bias=False)

    def call(self, input_ids: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Return next-token logits for ``input_ids``.

        Args:
            input_ids: Integer tensor indexed as (batch, time).
            training: Enables dropout when true.

        Returns:
            Logits of shape (batch, time, vocab_size).
        """
        seq_len = tf.shape(input_ids)[1]
        # NOTE(review): positions index pos_emb, which has max_seq_len rows;
        # inputs longer than that are out of range — confirm callers truncate.
        positions = tf.range(seq_len)
        # pos_emb(positions) has shape (time, embed_dim) and broadcasts over batch.
        x = self.token_emb(input_ids) + self.pos_emb(positions)
        x = self.drop(x, training=training)
        for block in self.blocks:
            x = block(x, training=training)
        x = self.ln_final(x)
        logits = self.head(x)  # (B, T, vocab_size)
        return logits

    def compute_loss(
        self,
        input_ids: tf.Tensor,
        training: bool = False,
        pad_token_id: "int | None" = None,
    ) -> tf.Tensor:
        """Compute cross-entropy loss for next-token prediction.

        The sequence is split internally: the model input is
        ``input_ids[:, :-1]`` and the target is ``input_ids[:, 1:]``.

        NOTE(review): this method name shadows ``keras.Model.compute_loss``
        with a different signature, so Keras' built-in ``fit`` loop is
        presumably not usable with this model — confirm training uses a
        custom loop.

        Args:
            input_ids: Full (optionally padded) sequences indexed as (B, T).
            training: Enables dropout in the forward pass when true.
            pad_token_id: If given, target positions equal to this id are
                excluded from the average. ``None`` (the default) averages
                over every position.

        Returns:
            Scalar mean loss. When ``pad_token_id`` is set and every target is
            padding, the result is 0.
        """
        x = input_ids[:, :-1]
        target = input_ids[:, 1:]
        logits = self(x, training=training)
        loss = tf.keras.losses.sparse_categorical_crossentropy(target, logits, from_logits=True)
        if pad_token_id is None:
            return tf.reduce_mean(loss)

        # Average only over real tokens so padding does not dilute the loss.
        keep = tf.cast(tf.not_equal(target, pad_token_id), loss.dtype)
        return tf.math.divide_no_nan(tf.reduce_sum(loss * keep), tf.reduce_sum(keep))

    def count_params(self) -> int:
        """Return the number of trainable scalar parameters as a Python int.

        Counts come from static variable shapes, so no tensors are evaluated
        and the total is not limited to 32-bit integer range.
        """
        return sum(int(math.prod(var.shape)) for var in self.trainable_variables)
def build_model(vocab_size: int) -> GPTModel:
    """Construct a ``GPTModel`` with config defaults, build it, and report its size.

    Args:
        vocab_size: Vocabulary size passed to ``GPTModel``.

    Returns:
        The built model, with the output head initialised from the token
        embedding matrix.
    """
    model = GPTModel(vocab_size=vocab_size)

    # Run one forward pass on zeros so the layers create their variables
    # before ``head.kernel`` is accessed below.
    warmup_shape = (1, config.MAX_SEQ_LEN)
    model(tf.zeros(warmup_shape, dtype=tf.int32))

    # Initialise the output projection from the transposed token embeddings.
    # This is a one-time value copy: ``head.kernel`` stays a separate variable
    # and is not kept in sync with the embeddings afterwards.
    embedding_matrix = model.token_emb.embeddings
    model.head.kernel.assign(tf.transpose(embedding_matrix))

    param_count = model.count_params()
    print(f"Model built: {param_count:,} parameters")
    return model