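"""HawkGPT 0.3 — Transformer decoder with ALiBi positional encoding.

ALiBi (Attention with Linear Biases) replaces learned position embeddings
with a fixed, per-head linear penalty on query-key distance that is added to
the attention scores.  It is intended to generalize better to longer
sequences, and because there is no position-embedding table, sequence length
is not capped by one.
"""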
|
|
| import math |
| import tensorflow as tf |
| from tensorflow import keras |
| from tensorflow.keras import layers |
|
|
| import config |
|
|
|
|
class MultiHeadSelfAttention(layers.Layer):
    """Causal multi-head self-attention with ALiBi positional biases.

    Queries, keys and values come from a single fused, bias-free projection.
    Every head adds a fixed (non-trainable) linear penalty, proportional to
    the query-key distance, to its attention scores; a lower-triangular mask
    then prevents each position from attending to later positions.

    Args:
        embed_dim: Width of the layer output; must be divisible by
            ``num_heads``.
        num_heads: Number of attention heads.
        dropout: Dropout rate applied to the attention probabilities.
        **kwargs: Forwarded to ``layers.Layer``.

    Raises:
        ValueError: If ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.05, **kwargs):
        super().__init__(**kwargs)
        # Explicit raise instead of ``assert`` so the check survives ``python -O``.
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        self.qkv = layers.Dense(embed_dim * 3, use_bias=False)
        self.out_proj = layers.Dense(embed_dim, use_bias=False)
        self.dropout = layers.Dropout(dropout)
        # Scores are divided by sqrt(head_dim) before the softmax.
        self.scale = math.sqrt(self.head_dim)

    def call(self, x: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Apply causal self-attention.

        Args:
            x: Rank-3 tensor ``(batch, seq_len, channels)``.
            training: Enables attention dropout when true.

        Returns:
            Tensor of shape ``(batch, seq_len, embed_dim)``.
        """
        dims = tf.shape(x)
        B, T = dims[0], dims[1]

        # Fused projection, then split into q/k/v of shape (B, heads, T, head_dim).
        qkv = self.qkv(x)
        qkv = tf.reshape(qkv, (B, T, 3, self.num_heads, self.head_dim))
        qkv = tf.transpose(qkv, (2, 0, 3, 1, 4))
        q, k, v = qkv[0], qkv[1], qkv[2]

        # Scaled dot-product scores: (B, heads, T, T).
        att = tf.matmul(q, k, transpose_b=True) / self.scale

        # Positional information: (heads, T, T) bias broadcast over the batch.
        att = att + self._get_alibi(T, att.dtype)

        # Causal mask: keep the lower triangle (key index <= query index).
        causal_mask = tf.linalg.band_part(tf.ones((T, T), dtype=att.dtype), -1, 0)
        causal_mask = tf.reshape(causal_mask, (1, 1, T, T))
        # Use the most negative finite value of the compute dtype: a literal
        # such as -1e9 is not representable in float16.
        masked_score = tf.constant(att.dtype.min, dtype=att.dtype)
        att = tf.where(tf.equal(causal_mask, 0), masked_score, att)

        att = tf.nn.softmax(att, axis=-1)
        att = self.dropout(att, training=training)

        # Weighted sum of values, then merge the heads back together.
        out = tf.matmul(att, v)
        out = tf.transpose(out, (0, 2, 1, 3))
        out = tf.reshape(out, (B, T, self.num_heads * self.head_dim))
        return self.out_proj(out)

    def _get_alibi(self, T: int, dtype) -> tf.Tensor:
        """Compute the ALiBi bias matrix.

        Args:
            T: Sequence length (Python int or scalar integer tensor).
            dtype: Floating dtype of the returned bias.

        Returns:
            Tensor of shape ``(num_heads, T, T)`` whose entry ``[h, i, j]`` is
            ``-slope_h * |i - j|``.
        """
        # Slopes form the geometric sequence 2^(-8/n), 2^(-16/n), ..., 2^(-8);
        # the exponent uses h + 1 so that the first head does not get slope 1.
        slopes = tf.constant(
            [-(2.0 ** (-8.0 * (h + 1) / self.num_heads)) for h in range(self.num_heads)],
            dtype=dtype,
        )

        # Distances are computed in int32 and cast afterwards, so large
        # position indices are not rounded in half precision before the
        # subtraction.
        positions = tf.range(T)
        dist = tf.cast(tf.abs(positions[:, None] - positions[None, :]), dtype)

        return slopes[:, None, None] * dist[None, :, :]
|
|
|
|
class FeedForward(layers.Layer):
    """Two-layer feed-forward sub-network: Dense(GELU) -> Dense -> Dropout.

    Args:
        embed_dim: Output width of the second dense layer.
        ff_dim: Width of the hidden (GELU-activated) dense layer.
        dropout: Dropout rate applied after the second dense layer.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, embed_dim: int, ff_dim: int, dropout: float = 0.05, **kwargs):
        super().__init__(**kwargs)
        # Layers are created in the order they are applied.
        hidden = layers.Dense(ff_dim, activation="gelu")
        projection = layers.Dense(embed_dim)
        output_dropout = layers.Dropout(dropout)
        self.net = keras.Sequential([hidden, projection, output_dropout])

    def call(self, x: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Run ``x`` through the stack; ``training`` toggles the dropout."""
        transformed = self.net(x, training=training)
        return transformed
|
|
|
|
class TransformerBlock(layers.Layer):
    """Pre-LayerNorm transformer block.

    Applies self-attention and then the feed-forward network, each on a
    layer-normalised copy of its input and each added back to that input
    through a dropout-regularised residual connection.

    Args:
        embed_dim: Model width.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        dropout: Dropout rate shared by the sub-layers and the residual paths.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.05, **kwargs):
        super().__init__(**kwargs)
        # Attention sub-layer and its input normalisation.
        self.ln1 = layers.LayerNormalization(epsilon=1e-5)
        self.attn = MultiHeadSelfAttention(
            embed_dim=embed_dim, num_heads=num_heads, dropout=dropout
        )
        # Feed-forward sub-layer and its input normalisation.
        self.ln2 = layers.LayerNormalization(epsilon=1e-5)
        self.ff = FeedForward(embed_dim=embed_dim, ff_dim=ff_dim, dropout=dropout)
        # One dropout layer is reused on both residual branches.
        self.drop = layers.Dropout(dropout)

    def call(self, x, training=False):
        """Return ``x`` after the attention and feed-forward residual updates."""
        attended = self.attn(self.ln1(x), training=training)
        x = x + self.drop(attended, training=training)

        transformed = self.ff(self.ln2(x), training=training)
        return x + self.drop(transformed, training=training)
|
|
|
|
class GPTModel(keras.Model):
    """Decoder-only transformer language model.

    Token embeddings are passed through dropout, a stack of
    ``TransformerBlock`` layers, a final layer normalisation and a bias-free
    linear head that produces per-token vocabulary logits.

    Args:
        vocab_size: Number of token ids; sizes the embedding table and head.
        embed_dim: Model width.
        num_heads: Attention heads per block.
        num_layers: Number of transformer blocks.
        ff_dim: Hidden width of each block's feed-forward network.
        max_seq_len: Accepted for interface compatibility; this class does not
            use it (no position-embedding table is created).
        dropout: Dropout rate for the embeddings and every block.
        **kwargs: Forwarded to ``keras.Model``.
    """

    def __init__(
        self,
        vocab_size: int,
        embed_dim: int = config.EMBED_DIM,
        num_heads: int = config.NUM_HEADS,
        num_layers: int = config.NUM_LAYERS,
        ff_dim: int = config.FF_DIM,
        max_seq_len: int = config.MAX_SEQ_LEN,
        dropout: float = config.DROPOUT,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.vocab_size = vocab_size

        self.token_emb = layers.Embedding(vocab_size, embed_dim)
        self.drop = layers.Dropout(dropout)

        self.blocks = [
            TransformerBlock(embed_dim, num_heads, ff_dim, dropout)
            for _ in range(num_layers)
        ]
        self.ln_final = layers.LayerNormalization(epsilon=1e-5)
        self.head = layers.Dense(vocab_size, use_bias=False)

    def call(self, input_ids: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Map token ids to next-token logits.

        Args:
            input_ids: Integer token ids; the model is built with a
                ``(batch, seq_len)`` tensor.
            training: Enables dropout when true.

        Returns:
            Logits with a trailing axis of size ``vocab_size``.
        """
        x = self.token_emb(input_ids)
        x = self.drop(x, training=training)

        for block in self.blocks:
            x = block(x, training=training)

        x = self.ln_final(x)
        return self.head(x)

    def count_params(self) -> int:
        """Return the number of trainable scalars as a Python ``int``.

        Only ``trainable_variables`` are counted.  The count is derived from
        the static variable shapes, so it needs no eager tensor evaluation
        and cannot wrap around a fixed-width integer type.
        """
        return sum(int(math.prod(v.shape)) for v in self.trainable_variables)
|
|
|
|
def build_model(vocab_size: int) -> GPTModel:
    """Create a ``GPTModel`` with config defaults, build it and report its size.

    Args:
        vocab_size: Number of token ids the model should support.

    Returns:
        The built model, with its output head initialised from the embedding
        table.
    """
    model = GPTModel(vocab_size=vocab_size)

    # A single forward pass on an all-zero batch creates every variable.
    warmup_ids = tf.zeros((1, config.MAX_SEQ_LEN), dtype=tf.int32)
    model(warmup_ids)

    # Initialise the head from the transposed embedding table.  This copies
    # values once; the head kernel and the embeddings stay separate variables.
    head_init = tf.transpose(model.token_emb.embeddings)
    model.head.kernel.assign(head_init)

    print(f"Model built: {model.count_params():,} parameters")
    return model
|
|