Instructions to use HawkLabofficial/HawkGPT-v0.5 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Keras
How to use HawkLabofficial/HawkGPT-v0.5 with Keras:
```python
# Available backend options are: "jax", "torch", "tensorflow".
import os
os.environ["KERAS_BACKEND"] = "jax"

import keras

model = keras.saving.load_model("hf://HawkLabofficial/HawkGPT-v0.5")
```
- Notebooks
- Google Colab
- Kaggle
| """HawkGPT 0.5 — Same proven arch: RMSNorm, GQA, ALiBi, no biases.""" | |
| import math | |
| import tensorflow as tf | |
| from tensorflow import keras | |
| from tensorflow.keras import layers | |
| import config | |
class RMSNorm(layers.Layer):
    """Root-mean-square normalisation with a learned per-feature gain.

    Divides the input by the RMS of its last axis (no mean subtraction, no
    bias) and multiplies by a trainable ``scale`` vector initialised to ones.

    Args:
        dim: Length of the ``scale`` weight; must match the size of the last
            axis of the inputs for the final multiplication to broadcast.
        eps: Constant added to the mean square before taking the square root.
        **kwargs: Forwarded to the Keras ``Layer`` constructor.
    """

    def __init__(self, dim: int, eps: float = 1e-6, **kwargs):
        super().__init__(**kwargs)
        self.eps = eps
        self.scale = self.add_weight(name="scale", shape=(dim,), initializer="ones")

    def call(self, x: tf.Tensor) -> tf.Tensor:
        """Return ``x / sqrt(mean(x**2, axis=-1) + eps) * scale`` in ``x``'s dtype."""
        # Half-precision squares overflow/underflow easily, so the statistics
        # are computed in float32 (mirroring the float32 softmax used by the
        # attention layer in this file). Wider dtypes are left untouched.
        if x.dtype in (tf.float16, tf.bfloat16):
            stats_dtype = tf.float32
        else:
            stats_dtype = x.dtype
        x_wide = tf.cast(x, stats_dtype)
        mean_sq = tf.reduce_mean(tf.square(x_wide), axis=-1, keepdims=True)
        rms = tf.sqrt(mean_sq + self.eps)
        normed = tf.cast(x_wide / rms, x.dtype)
        return normed * self.scale
class GroupedQueryAttention(layers.Layer):
    """Causal grouped-query self-attention with ALiBi position biases.

    ``num_heads`` query heads share ``num_kv_heads`` key/value heads: each
    key/value head is repeated ``num_heads // num_kv_heads`` times so that
    consecutive query heads attend over the same keys and values. All
    projections are bias-free.

    Args:
        embed_dim: Output width of the query and output projections.
        num_heads: Number of query heads; must divide ``embed_dim``.
        num_kv_heads: Number of key/value heads; must divide ``num_heads``.
        dropout: Dropout rate applied to the attention probabilities.
        **kwargs: Forwarded to the Keras ``Layer`` constructor.

    Raises:
        ValueError: If ``embed_dim`` is not divisible by ``num_heads`` or
            ``num_heads`` is not a positive multiple of ``num_kv_heads``.
    """

    def __init__(self, embed_dim: int, num_heads: int, num_kv_heads: int, dropout: float = 0.0, **kwargs):
        super().__init__(**kwargs)
        # Explicit raises instead of ``assert`` so the checks survive ``python -O``.
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by num_heads ({num_heads})"
            )
        if num_kv_heads <= 0 or num_heads % num_kv_heads != 0:
            raise ValueError(
                f"num_heads ({num_heads}) must be a positive multiple of "
                f"num_kv_heads ({num_kv_heads})"
            )
        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads
        self.head_dim = embed_dim // num_heads
        self.kv_dim = num_kv_heads * self.head_dim
        self.q_proj = layers.Dense(embed_dim, use_bias=False)
        self.k_proj = layers.Dense(self.kv_dim, use_bias=False)
        self.v_proj = layers.Dense(self.kv_dim, use_bias=False)
        self.out_proj = layers.Dense(embed_dim, use_bias=False)
        self.dropout = layers.Dropout(dropout)
        # Divisor applied to the raw dot products.
        self.scale = math.sqrt(self.head_dim)
        # One negative slope per query head: -(2 ** (-8 * h / num_heads)).
        # The head index starts at 0, so head 0 has slope -1.0. The values are
        # deliberately left unchanged: altering them would change the output
        # of weights trained with this layer.
        slopes = [-(2.0 ** (-8.0 * h / num_heads)) for h in range(num_heads)]
        self._alibi_slopes = tf.constant(slopes, dtype=tf.float32)

    def _split_heads(self, proj: tf.Tensor, batch, seq_len, heads: int) -> tf.Tensor:
        """Reshape (batch, seq, heads * head_dim) to (batch, heads, seq, head_dim)."""
        proj = tf.reshape(proj, (batch, seq_len, heads, self.head_dim))
        return tf.transpose(proj, (0, 2, 1, 3))

    def call(self, x: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Apply causal self-attention over axis 1 of ``x``.

        Args:
            x: Rank-3 input; axis 0 is treated as batch and axis 1 as sequence.
            training: Enables dropout on the attention probabilities.

        Returns:
            Tensor of shape (batch, seq, embed_dim).
        """
        batch, seq_len = tf.shape(x)[0], tf.shape(x)[1]

        q = self._split_heads(self.q_proj(x), batch, seq_len, self.num_heads)
        k = self._split_heads(self.k_proj(x), batch, seq_len, self.num_kv_heads)
        v = self._split_heads(self.v_proj(x), batch, seq_len, self.num_kv_heads)

        # Expand the shared key/value heads to line up with the query heads.
        group_size = self.num_heads // self.num_kv_heads
        k = tf.repeat(k, group_size, axis=1)
        v = tf.repeat(v, group_size, axis=1)

        att = tf.matmul(q, tf.transpose(k, (0, 1, 3, 2))) / self.scale

        # ALiBi: add slope * |i - j| per head (slopes are negative, so larger
        # distances are penalised more).
        slopes = tf.cast(self._alibi_slopes, att.dtype)
        positions = tf.cast(tf.range(seq_len, dtype=tf.float32), att.dtype)
        dist = tf.abs(positions[:, None] - positions[None, :])
        att = att + slopes[:, None, None] * dist[None, :, :]

        # Causal mask + softmax in float32 for stability: entries above the
        # diagonal (future positions) are replaced by a large negative value.
        causal_mask = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
        causal_mask = tf.reshape(causal_mask, (1, 1, seq_len, seq_len))
        att_f32 = tf.cast(att, tf.float32)
        att_f32 = tf.where(tf.equal(causal_mask, 0), tf.constant(-1e9, dtype=tf.float32), att_f32)
        att_f32 = tf.nn.softmax(att_f32, axis=-1)
        att = tf.cast(att_f32, v.dtype)
        att = self.dropout(att, training=training)

        out = tf.matmul(att, v)
        out = tf.transpose(out, (0, 2, 1, 3))
        # Merge heads using the statically known width rather than the dynamic
        # input width, so the feature dimension stays defined for out_proj.
        out = tf.reshape(out, (batch, seq_len, self.num_heads * self.head_dim))
        return self.out_proj(out)
class FeedForward(layers.Layer):
    """Position-wise MLP: Dense(ff_dim, GELU) -> Dense(embed_dim) -> Dropout.

    Both dense layers are bias-free.

    Args:
        embed_dim: Output width of the second dense layer.
        ff_dim: Width of the hidden (first) dense layer.
        dropout: Rate of the trailing dropout layer.
        **kwargs: Forwarded to the Keras ``Layer`` constructor.
    """

    def __init__(self, embed_dim: int, ff_dim: int, dropout: float = 0.0, **kwargs):
        super().__init__(**kwargs)
        # Sub-layers are created in the same order in which they are applied.
        expand = layers.Dense(ff_dim, activation="gelu", use_bias=False)
        project = layers.Dense(embed_dim, use_bias=False)
        drop = layers.Dropout(dropout)
        self.net = keras.Sequential([expand, project, drop])

    def call(self, x: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Run ``x`` through the MLP; ``training`` is passed on to the stack."""
        hidden = self.net(x, training=training)
        return hidden
class TransformerBlock(layers.Layer):
    """Pre-norm Transformer block.

    Data flow: RMSNorm -> attention -> residual add, then
    RMSNorm -> feed-forward -> residual add.

    Args:
        embed_dim: Width passed to both norms, the attention and the MLP.
        num_heads: Number of query heads for the attention layer.
        num_kv_heads: Number of key/value heads for the attention layer.
        ff_dim: Hidden width of the feed-forward layer.
        dropout: Dropout rate handed to the attention and feed-forward layers.
        **kwargs: Forwarded to the Keras ``Layer`` constructor.
    """

    def __init__(self, embed_dim: int, num_heads: int, num_kv_heads: int, ff_dim: int, dropout: float = 0.0, **kwargs):
        super().__init__(**kwargs)
        # Attention sub-block.
        self.ln1 = RMSNorm(embed_dim)
        self.attn = GroupedQueryAttention(embed_dim, num_heads, num_kv_heads, dropout)
        # Feed-forward sub-block.
        self.ln2 = RMSNorm(embed_dim)
        self.ff = FeedForward(embed_dim, ff_dim, dropout)

    def call(self, x: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Apply both residual sub-blocks and return the updated activations."""
        attn_out = self.attn(self.ln1(x), training=training)
        x = x + attn_out
        ff_out = self.ff(self.ln2(x), training=training)
        return x + ff_out
class GPTModel(keras.Model):
    """Decoder-only language model: embedding -> N blocks -> RMSNorm -> head.

    There is no learned positional embedding in this class; the input to the
    first block is the token embedding alone.

    Args:
        vocab_size: Number of rows in the embedding table and output width of
            the head.
        embed_dim: Embedding and hidden width.
        num_heads: Query heads per block.
        num_kv_heads: Key/value heads per block.
        num_layers: Number of ``TransformerBlock`` instances.
        ff_dim: Feed-forward hidden width per block.
        dropout: Dropout rate handed to every block.
        **kwargs: Forwarded to the ``keras.Model`` constructor.

    The hyper-parameter defaults are read from ``config`` once, when this
    class body is executed.
    """

    def __init__(
        self,
        vocab_size: int,
        embed_dim: int = config.EMBED_DIM,
        num_heads: int = config.NUM_HEADS,
        num_kv_heads: int = config.NUM_KV_HEADS,
        num_layers: int = config.NUM_LAYERS,
        ff_dim: int = config.FF_DIM,
        dropout: float = config.DROPOUT,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.token_emb = layers.Embedding(vocab_size, embed_dim, embeddings_initializer="normal")
        self.blocks = [
            TransformerBlock(embed_dim, num_heads, num_kv_heads, ff_dim, dropout)
            for _ in range(num_layers)
        ]
        self.ln_final = RMSNorm(embed_dim)
        # Bias-free projection from hidden width to vocabulary logits.
        self.head = layers.Dense(vocab_size, use_bias=False)

    def call(self, input_ids: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Map token ids to logits with a trailing axis of size ``vocab_size``.

        Args:
            input_ids: Integer token ids fed to the embedding layer.
            training: Passed to every block (controls dropout).
        """
        x = self.token_emb(input_ids)
        for block in self.blocks:
            x = block(x, training=training)
        x = self.ln_final(x)
        return self.head(x)

    def count_params(self) -> int:
        """Return the number of scalars in ``self.trainable_variables``.

        Only trainable variables are counted. Each size is converted to a
        Python ``int`` before summing so the total is an arbitrary-precision
        ``int`` (as annotated) and cannot wrap around in a fixed-width NumPy
        integer type.
        """
        return sum(int(tf.size(v).numpy()) for v in self.trainable_variables)
def build_model(vocab_size: int) -> GPTModel:
    """Create a ``GPTModel``, build its weights and initialise the head.

    The model is called once on an all-zero batch of shape
    ``(1, config.MAX_SEQ_LEN)`` so that every layer creates its variables.
    The head kernel is then overwritten with the transpose of the token
    embedding matrix, and the parameter count is printed.

    Args:
        vocab_size: Vocabulary size passed to ``GPTModel``.

    Returns:
        The built model.
    """
    model = GPTModel(vocab_size=vocab_size)

    # A forward pass on placeholder ids forces variable creation.
    warmup_ids = tf.zeros((1, config.MAX_SEQ_LEN), dtype=tf.int32)
    model(warmup_ids)

    # "Weight tying" here is a one-time copy: the head keeps its own kernel
    # variable, which merely starts out equal to the transposed embeddings.
    embedding_t = tf.transpose(model.token_emb.embeddings)
    model.head.kernel.assign(embedding_t)

    print(f"Model built: {model.count_params():,} parameters")
    return model