Update README.md
README.md CHANGED

@@ -54,25 +54,220 @@ tokenizer = AutoTokenizer.from_pretrained(model_path)
 # For convenience, the model definition is included in the training script.
 # Here we provide a minimal loading snippet assuming you have the model class.
 
-#
 class ModelConfig:
-    vocab_size = 50257
-    emb_dim = 768
-    hidden_dim = 2048
-    num_layers = 12
-    num_heads = 12
-    num_kv_heads = 4
-    max_seq_len = 1024
-    window_size = 1024
-    sliding_window_ratio = 0.75
-    rope_theta = 10000.0
-    dtype = torch.float16
-    bias = False
-    dropout = 0.0
-
-
-#
-#
 model = TinyAya(ModelConfig())
 state_dict = torch.load(os.path.join(model_path, "pytorch_model.bin"), map_location="cpu")
 model.load_state_dict(state_dict)
 # For convenience, the model definition is included in the training script.
 # Here we provide a minimal loading snippet assuming you have the model class.
 
+# ------------------------------------------------------------------------------
+# Configuration (scaled to ~150M for L4 GPU)
+# ------------------------------------------------------------------------------
 class ModelConfig:
+    vocab_size = 50257            # will be updated from tokenizer
+    emb_dim = 768                 # embedding dimension
+    hidden_dim = 2048             # intermediate size (FFN) - reduced
+    num_layers = 12               # number of transformer layers - reduced
+    num_heads = 12                # number of query heads - reduced
+    num_kv_heads = 4              # number of key/value heads (GQA)
+    max_seq_len = 1024            # shorter sequence length to save memory
+    window_size = 1024            # sliding window size (match max_seq_len)
+    sliding_window_ratio = 0.75   # fraction of layers with sliding window
+    rope_theta = 10000.0          # base for RoPE
+    dtype = torch.float16         # use mixed precision
+    bias = False                  # no bias in linear layers
+    dropout = 0.0                 # no dropout mentioned
+    gradient_checkpointing = True # enable to save memory
+
+# ------------------------------------------------------------------------------
+# Helper modules (unchanged)
+# ------------------------------------------------------------------------------
+class CohereLayerNorm(nn.Module):
+    """LayerNorm without bias (scale only)."""
+    def __init__(self, emb_dim, eps=1e-5):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(emb_dim))
+
+    def forward(self, x):
+        input_dtype = x.dtype
+        x = x.to(torch.float32)
+        mean = x.mean(dim=-1, keepdim=True)
+        variance = (x - mean).pow(2).mean(dim=-1, keepdim=True)
+        x = (x - mean) * torch.rsqrt(variance + self.eps)
+        return (self.weight.to(torch.float32) * x).to(input_dtype)
+
+
+class FeedForward(nn.Module):
+    """SwiGLU MLP."""
+    def __init__(self, config):
+        super().__init__()
+        self.fc1 = nn.Linear(config.emb_dim, config.hidden_dim, bias=config.bias)
+        self.fc2 = nn.Linear(config.emb_dim, config.hidden_dim, bias=config.bias)
+        self.fc3 = nn.Linear(config.hidden_dim, config.emb_dim, bias=config.bias)
+
+    def forward(self, x):
+        x_fc1 = self.fc1(x)
+        x_fc2 = self.fc2(x)
+        x = F.silu(x_fc1) * x_fc2
+        return self.fc3(x)
+
+
+def precompute_rope_freqs(dim, max_seq_len, theta=10000.0, dtype=torch.float32):
+    """Precompute rotary position embeddings."""
+    assert dim % 2 == 0, "dim must be even"
+    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=dtype)[:(dim // 2)] / dim))
+    t = torch.arange(max_seq_len, dtype=dtype)
+    freqs = torch.outer(t, freqs)            # shape (max_seq_len, dim//2)
+    emb = torch.cat((freqs, freqs), dim=-1)  # shape (max_seq_len, dim)
+    return emb.sin(), emb.cos()
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1, x2 = x.chunk(2, dim=-1)
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_emb(x, cos, sin):
+    """
+    Apply rotary embeddings to input tensor.
+    x: (batch, seq_len, num_heads, head_dim)
+    cos, sin: (seq_len, head_dim)
+    """
+    cos = cos.unsqueeze(0).unsqueeze(2)  # (1, seq_len, 1, head_dim)
+    sin = sin.unsqueeze(0).unsqueeze(2)  # (1, seq_len, 1, head_dim)
+    return (x * cos) + (rotate_half(x) * sin)
+
+
+class GroupedQueryAttention(nn.Module):
+    """Multi-head attention with GQA and optional sliding window mask."""
+    def __init__(self, config, layer_id):
+        super().__init__()
+        self.num_heads = config.num_heads
+        self.num_kv_heads = config.num_kv_heads
+        self.head_dim = config.emb_dim // config.num_heads
+        assert self.num_heads % self.num_kv_heads == 0
+        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
+
+        self.wq = nn.Linear(config.emb_dim, config.num_heads * self.head_dim, bias=config.bias)
+        self.wk = nn.Linear(config.emb_dim, config.num_kv_heads * self.head_dim, bias=config.bias)
+        self.wv = nn.Linear(config.emb_dim, config.num_kv_heads * self.head_dim, bias=config.bias)
+        self.wo = nn.Linear(config.num_heads * self.head_dim, config.emb_dim, bias=config.bias)
+
+        total_layers = config.num_layers
+        num_sliding = int(total_layers * config.sliding_window_ratio)
+        self.use_sliding = (layer_id < num_sliding)
+
+        self.window_size = config.window_size
+        self.max_seq_len = config.max_seq_len
+        self.rope_theta = config.rope_theta
+        self.rope_sin, self.rope_cos = None, None
+
+    def init_rope(self, max_seq_len, device):
+        if self.rope_sin is not None and self.rope_sin.shape[0] >= max_seq_len:
+            return
+        sin, cos = precompute_rope_freqs(
+            self.head_dim, max_seq_len, theta=self.rope_theta, dtype=torch.float32
+        )
+        self.rope_sin = sin.to(device)
+        self.rope_cos = cos.to(device)
+
+    def forward(self, x, mask=None):
+        batch, seq_len, _ = x.shape
+        device = x.device
+
+        if self.use_sliding:
+            self.init_rope(seq_len, device)
+
+        xq = self.wq(x)
+        xk = self.wk(x)
+        xv = self.wv(x)
+
+        xq = xq.view(batch, seq_len, self.num_heads, self.head_dim)
+        xk = xk.view(batch, seq_len, self.num_kv_heads, self.head_dim)
+        xv = xv.view(batch, seq_len, self.num_kv_heads, self.head_dim)
+
+        if self.use_sliding:
+            xq = apply_rotary_emb(xq, self.rope_cos[:seq_len], self.rope_sin[:seq_len])
+            xk = apply_rotary_emb(xk, self.rope_cos[:seq_len], self.rope_sin[:seq_len])
+
+        xk = xk.repeat_interleave(self.num_queries_per_kv, dim=2)
+        xv = xv.repeat_interleave(self.num_queries_per_kv, dim=2)
+
+        xq = xq.transpose(1, 2)
+        xk = xk.transpose(1, 2)
+        xv = xv.transpose(1, 2)
+
+        scores = torch.matmul(xq, xk.transpose(-2, -1)) / math.sqrt(self.head_dim)
+
+        if mask is not None:
+            scores = scores + mask
+        else:
+            mask = torch.full((seq_len, seq_len), float('-inf'), device=device)
+            mask = torch.triu(mask, diagonal=1)
+            if self.use_sliding:
+                for i in range(seq_len):
+                    low = max(0, i - self.window_size + 1)
+                    mask[i, :low] = float('-inf')
+            scores = scores + mask
+
+        probs = F.softmax(scores, dim=-1, dtype=torch.float32).to(xq.dtype)
+        out = torch.matmul(probs, xv)
+        out = out.transpose(1, 2).contiguous().view(batch, seq_len, -1)
+        return self.wo(out)
+
+
+class ParallelTransformerBlock(nn.Module):
+    """Decoder block with parallel attention and MLP."""
+    def __init__(self, config, layer_id):
+        super().__init__()
+        self.norm = CohereLayerNorm(config.emb_dim)
+        self.attn = GroupedQueryAttention(config, layer_id)
+        self.mlp = FeedForward(config)
+
+    def forward(self, x, mask=None):
+        residual = x
+        x = self.norm(x)
+        attn_out = self.attn(x, mask=mask)
+        mlp_out = self.mlp(x)
+        return residual + attn_out + mlp_out
+
+
+class TinyAya(nn.Module):
+    """Tiny Aya 150M model."""
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.token_embedding = nn.Embedding(config.vocab_size, config.emb_dim)
+        self.layers = nn.ModuleList([
+            ParallelTransformerBlock(config, i) for i in range(config.num_layers)
+        ])
+        self.norm = CohereLayerNorm(config.emb_dim)
+        self.lm_head = nn.Linear(config.emb_dim, config.vocab_size, bias=False)
+        self.lm_head.weight = self.token_embedding.weight
+
+        if config.gradient_checkpointing:
+            self.gradient_checkpointing_enable()
+
+    def gradient_checkpointing_enable(self):
+        self._gradient_checkpointing = True
+
+    def forward(self, input_ids, mask=None):
+        x = self.token_embedding(input_ids)
+        for layer in self.layers:
+            if self.training and getattr(self, '_gradient_checkpointing', False):
+                x = torch.utils.checkpoint.checkpoint(layer, x, mask)
+            else:
+                x = layer(x, mask=mask)
+        x = self.norm(x)
+        logits = self.lm_head(x)
+        return logits
+
+    @torch.no_grad()
+    def generate(self, input_ids, max_new_tokens=50, temperature=1.0):
+        self.eval()
+        for _ in range(max_new_tokens):
+            logits = self(input_ids[:, -self.config.max_seq_len:])
+            next_token_logits = logits[:, -1, :] / temperature
+            probs = F.softmax(next_token_logits, dim=-1)
+            next_token = torch.multinomial(probs, num_samples=1)
+            input_ids = torch.cat([input_ids, next_token], dim=-1)
+        return input_ids
 model = TinyAya(ModelConfig())
 state_dict = torch.load(os.path.join(model_path, "pytorch_model.bin"), map_location="cpu")
 model.load_state_dict(state_dict)
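
For readers who want to exercise the new snippet end to end, here is a minimal usage sketch. It assumes the imports from earlier in the README (`os`, `math`, `torch`, `torch.nn as nn`, `torch.nn.functional as F`, `AutoTokenizer`) and that the `ModelConfig` and `TinyAya` classes above are in scope; the checkpoint path and prompt string are placeholders, not values from the repository.

```python
import os

import torch
from transformers import AutoTokenizer

# Hypothetical path; point this at a directory containing pytorch_model.bin and tokenizer files.
model_path = "checkpoints/tiny-aya-150m"

tokenizer = AutoTokenizer.from_pretrained(model_path)

config = ModelConfig()
config.vocab_size = len(tokenizer)       # per the config comment; must match the checkpoint's embedding size
config.gradient_checkpointing = False    # checkpointing only helps during training

model = TinyAya(config)
state_dict = torch.load(os.path.join(model_path, "pytorch_model.bin"), map_location="cpu")
model.load_state_dict(state_dict)
model.eval()

# Quick sanity check on model size (the config targets roughly 150M parameters).
print(f"parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M")

# Sample a short continuation with the built-in generate() helper.
prompt = "The quick brown fox"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
output_ids = model.generate(input_ids, max_new_tokens=30, temperature=0.8)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```

Because `lm_head` shares its weight with `token_embedding`, the `vocab_size` you set must match the embedding size the checkpoint was trained with, otherwise `load_state_dict` will raise a shape mismatch.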