abi96062 committed
Commit 49d2fa1 · verified · 1 Parent(s): 9b57eb5

Create model.py

Files changed (1)
  1. model.py +354 -0
model.py ADDED
@@ -0,0 +1,354 @@
+ """
+ model.py
+ ========
+ Complete SmolLM2-135M model implementation
+
+ Architecture:
+ - 30 transformer blocks
+ - 576 hidden dimensions
+ - 9 query heads, 3 KV heads (Grouped Query Attention)
+ - SwiGLU feed-forward network
+ - RoPE position embeddings
+ - RMSNorm layer normalization
+ - Weight tying (embeddings = lm_head)
+
+ Total parameters: 134,515,008 (~135M)
+ """
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import math
+ from components import RMSNorm, TransformerBlock
+ from transformers import AutoConfig
+
+
+ class SmolLM2Model(nn.Module):
+     """
+     SmolLM2-135M Language Model
+
+     A decoder-only transformer based on Llama architecture with:
+     - Grouped Query Attention (memory efficient)
+     - SwiGLU FFN (improved expressiveness)
+     - RoPE position embeddings (length extrapolation)
+     - RMSNorm (faster than LayerNorm)
+
+     Model configuration:
+     - Layers: 30
+     - Hidden size: 576
+     - Attention heads: 9 (Q) / 3 (KV)
+     - FFN size: 1536
+     - Vocab size: 49,152
+     - Context length: 2048
+     """
+
+     def __init__(self, config):
+         """
+         Initialize SmolLM2 model
+
+         Args:
+             config: Model configuration object with attributes:
+                 - vocab_size: Size of vocabulary (49152)
+                 - hidden_size: Model dimension (576)
+                 - num_hidden_layers: Number of transformer blocks (30)
+                 - tie_word_embeddings: Whether to tie input/output embeddings
+                 - rms_norm_eps: Epsilon for RMSNorm
+         """
+         super().__init__()
+         self.config = config
+
+         # Token embeddings
+         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
+
+         # Transformer blocks (30 layers)
+         self.layers = nn.ModuleList([
+             TransformerBlock(config) for _ in range(config.num_hidden_layers)
+         ])
+
+         # Final layer normalization
+         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+         # Language modeling head (output projection)
+         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+         # Weight tying: share embeddings with output projection
+         if config.tie_word_embeddings:
+             self.lm_head.weight = self.embed_tokens.weight
+
+         print(f"✅ Model initialized with {config.num_hidden_layers} transformer blocks")
+         print(f"✅ Weight tying: {config.tie_word_embeddings}")
+
+     def forward(self, input_ids, attention_mask=None, position_ids=None):
+         """
+         Forward pass through the model
+
+         Args:
+             input_ids (torch.Tensor): Input token IDs [batch, seq_len]
+             attention_mask (torch.Tensor, optional): Attention mask
+             position_ids (torch.Tensor, optional): Position indices
+
+         Returns:
+             torch.Tensor: Logits over vocabulary [batch, seq_len, vocab_size]
+         """
+         batch_size, seq_len = input_ids.shape
+
+         # Create position IDs if not provided
+         if position_ids is None:
+             position_ids = torch.arange(seq_len, device=input_ids.device)
+
+         # Embed tokens
+         hidden_states = self.embed_tokens(input_ids)
+
+         # Pass through all transformer blocks
+         for layer in self.layers:
+             hidden_states = layer(hidden_states, attention_mask, position_ids)
+
+         # Final normalization
+         hidden_states = self.norm(hidden_states)
+
+         # Project to vocabulary
+         logits = self.lm_head(hidden_states)
+
+         return logits
+
+     def generate(
+         self,
+         input_ids,
+         max_new_tokens=50,
+         temperature=1.0,
+         top_p=0.9,
+         top_k=None,
+         do_sample=True
+     ):
+         """
+         Generate text autoregressively
+
+         Supports multiple sampling strategies:
+         - Greedy decoding (temperature=0)
+         - Temperature sampling
+         - Nucleus (top-p) sampling
+         - Top-k sampling
+
+         Args:
+             input_ids (torch.Tensor): Input token IDs [batch, seq_len]
+             max_new_tokens (int): Number of tokens to generate
+             temperature (float): Sampling temperature (0 = greedy, >1 = more random)
+             top_p (float): Nucleus sampling threshold (0-1)
+             top_k (int, optional): Top-k sampling threshold
+             do_sample (bool): Whether to sample or use greedy decoding
+
+         Returns:
+             torch.Tensor: Generated token IDs [batch, seq_len + max_new_tokens]
+         """
+         self.eval()
+
+         for _ in range(max_new_tokens):
+             with torch.no_grad():
+                 # Forward pass
+                 logits = self(input_ids)
+
+                 # Get next token logits
+                 next_token_logits = logits[:, -1, :]
+
+                 # Apply temperature
+                 if temperature > 0:
+                     next_token_logits = next_token_logits / temperature
+
+                 # Greedy decoding
+                 if not do_sample or temperature == 0:
+                     next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
+                 else:
+                     # Top-k sampling
+                     if top_k is not None:
+                         top_k = min(top_k, next_token_logits.size(-1))
+                         indices_to_remove = next_token_logits < torch.topk(next_token_logits, top_k)[0][..., -1, None]
+                         next_token_logits[indices_to_remove] = float('-inf')
+
+                     # Nucleus (top-p) sampling
+                     if top_p < 1.0:
+                         sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
+                         cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+
+                         # Remove tokens with cumulative probability above threshold
+                         sorted_indices_to_remove = cumulative_probs > top_p
+                         # Keep at least one token
+                         sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+                         sorted_indices_to_remove[..., 0] = False
+
+                         # Scatter to original indexing
+                         indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
+                         next_token_logits[indices_to_remove] = float('-inf')
+
+                     # Sample from distribution
+                     probs = F.softmax(next_token_logits, dim=-1)
+                     next_token = torch.multinomial(probs, num_samples=1)
+
+                 # Append to sequence
+                 input_ids = torch.cat([input_ids, next_token], dim=1)
+
+         return input_ids
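+
+     # Example call (a rough sketch; it assumes the matching tokenizer, e.g.
+     # AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M") from transformers):
+     #   ids = tokenizer("Once upon a time", return_tensors="pt").input_ids
+     #   out = model.generate(ids, max_new_tokens=30, temperature=0.7, top_p=0.9)
+     #   print(tokenizer.decode(out[0]))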
+
+     def get_num_params(self, non_embedding=False):
+         """
+         Count model parameters
+
+         Args:
+             non_embedding (bool): If True, exclude embedding parameters
+
+         Returns:
+             int: Number of parameters
+         """
+         n_params = sum(p.numel() for p in self.parameters())
+
+         if non_embedding:
+             n_params -= self.embed_tokens.weight.numel()
+             # If weights are tied, don't double-count
+             if not self.config.tie_word_embeddings:
+                 n_params -= self.lm_head.weight.numel()
+
+         return n_params
+
+
+ def initialize_weights(model, config):
+     """
+     Initialize model weights using GPT-style initialization
+
+     Strategy:
+     - All weights: Normal(0, 0.02)
+     - Residual projections: Scaled by 1/sqrt(2 * num_layers)
+     - RMSNorm: Initialized to 1.0 (PyTorch default)
+
+     The residual scaling prevents variance explosion in deep networks.
+
+     Args:
+         model (SmolLM2Model): Model to initialize
+         config: Model configuration
+     """
+     std = 0.02
+     num_layers = config.num_hidden_layers
+     # Residual scaling factor: 1/sqrt(2 * num_layers)
+     residual_scaling = 1.0 / math.sqrt(2 * num_layers)
+
+     print(f"Initializing weights with std={std}, residual_scaling={residual_scaling:.6f}")
+
+     # Initialize embeddings
+     nn.init.normal_(model.embed_tokens.weight, mean=0.0, std=std)
+
+     # Initialize each transformer block
+     for layer in model.layers:
+         # Attention projections
+         nn.init.normal_(layer.self_attn.q_proj.weight, mean=0.0, std=std)
+         nn.init.normal_(layer.self_attn.k_proj.weight, mean=0.0, std=std)
+         nn.init.normal_(layer.self_attn.v_proj.weight, mean=0.0, std=std)
+         # Output projection with residual scaling
+         nn.init.normal_(layer.self_attn.o_proj.weight, mean=0.0, std=std * residual_scaling)
+
+         # FFN projections
+         nn.init.normal_(layer.mlp.gate_proj.weight, mean=0.0, std=std)
+         nn.init.normal_(layer.mlp.up_proj.weight, mean=0.0, std=std)
+         # Output projection with residual scaling
+         nn.init.normal_(layer.mlp.down_proj.weight, mean=0.0, std=std * residual_scaling)
+
+     # RMSNorm weights are initialized to 1.0 by default (PyTorch)
+
+     print(f"✅ Initialized {sum(1 for _ in model.parameters())} weight tensors")
+
+
+ def load_pretrained_weights(our_model, official_model, device='cuda'):
+     """
+     Load weights from HuggingFace official model
+
+     Maps weight names from official model to our implementation:
+     - model.embed_tokens.weight -> embed_tokens.weight
+     - model.layers.{i}.* -> layers[i].*
+     - model.norm.weight -> norm.weight
+     - lm_head.weight (tied with embeddings)
+
+     Args:
+         our_model (SmolLM2Model): Our model to load weights into
+         official_model: HuggingFace official model
+         device (str): Device to load weights to
+
+     Returns:
+         int: Number of weight tensors loaded
+     """
+     print("=" * 70)
+     print("LOADING PRETRAINED WEIGHTS")
+     print("=" * 70)
+
+     official_state = official_model.state_dict()
+     loaded_count = 0
+
+     # 1. Load token embeddings
+     our_model.embed_tokens.weight.data = official_state['model.embed_tokens.weight'].clone().to(device)
+     loaded_count += 1
+
+     # 2. Load all transformer blocks
+     num_layers = our_model.config.num_hidden_layers
+     for layer_idx in range(num_layers):
+         prefix = f'model.layers.{layer_idx}'
+
+         # Layer norms
+         our_model.layers[layer_idx].input_layernorm.weight.data = \
+             official_state[f'{prefix}.input_layernorm.weight'].clone().to(device)
+         our_model.layers[layer_idx].post_attention_layernorm.weight.data = \
+             official_state[f'{prefix}.post_attention_layernorm.weight'].clone().to(device)
+
+         # Attention projections
+         our_model.layers[layer_idx].self_attn.q_proj.weight.data = \
+             official_state[f'{prefix}.self_attn.q_proj.weight'].clone().to(device)
+         our_model.layers[layer_idx].self_attn.k_proj.weight.data = \
+             official_state[f'{prefix}.self_attn.k_proj.weight'].clone().to(device)
+         our_model.layers[layer_idx].self_attn.v_proj.weight.data = \
+             official_state[f'{prefix}.self_attn.v_proj.weight'].clone().to(device)
+         our_model.layers[layer_idx].self_attn.o_proj.weight.data = \
+             official_state[f'{prefix}.self_attn.o_proj.weight'].clone().to(device)
+
+         # FFN projections
+         our_model.layers[layer_idx].mlp.gate_proj.weight.data = \
+             official_state[f'{prefix}.mlp.gate_proj.weight'].clone().to(device)
+         our_model.layers[layer_idx].mlp.up_proj.weight.data = \
+             official_state[f'{prefix}.mlp.up_proj.weight'].clone().to(device)
+         our_model.layers[layer_idx].mlp.down_proj.weight.data = \
+             official_state[f'{prefix}.mlp.down_proj.weight'].clone().to(device)
+
+         loaded_count += 9 # 2 norms + 4 attn + 3 ffn
+
+     # 3. Load final norm
+     our_model.norm.weight.data = official_state['model.norm.weight'].clone().to(device)
+     loaded_count += 1
+
+     print(f"\n✅ Loaded {num_layers} transformer blocks")
+     print(f"✅ Total loaded: {loaded_count} weight tensors")
+     print("=" * 70)
+
+     return loaded_count
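+
+ # Example usage (a rough sketch; it assumes the official checkpoint is loaded with
+ # transformers.AutoModelForCausalLM and that both models live on the same device):
+ #   official = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-135M")
+ #   ours = SmolLM2Model(official.config)
+ #   load_pretrained_weights(ours, official, device='cpu')
+ #   ids = torch.randint(0, official.config.vocab_size, (1, 8))
+ #   print(torch.allclose(ours(ids), official(ids).logits, atol=1e-4))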
+
+
+ if __name__ == "__main__":
+     """Test model creation and parameter count"""
+     # Load config
+     config = AutoConfig.from_pretrained("HuggingFaceTB/SmolLM2-135M")
+
+     # Create model
+     model = SmolLM2Model(config)
+
+     # Count parameters
+     total_params = model.get_num_params()
+     print(f"\nTotal parameters: {total_params:,}")
+     print(f"Expected: 134,515,008")
+     print(f"Match: {total_params == 134_515_008}")
+
+     # Test forward pass
+     test_input = torch.randint(0, config.vocab_size, (1, 10))
+     output = model(test_input)
+     print(f"\nForward pass test:")
+     print(f" Input shape: {test_input.shape}")
+     print(f" Output shape: {output.shape}")
+     print(f" Expected: torch.Size([1, 10, 49152])")
+
+     # Test generation
+     generated = model.generate(test_input, max_new_tokens=5)
+     print(f"\nGeneration test:")
+     print(f" Generated shape: {generated.shape}")
+     print(f" Expected: torch.Size([1, 15])")