ikaganacar committed on
Commit d7d2fb2 · 1 Parent(s): 9dd056d

Things got Messy

Model_Architecture/data.py ADDED
@@ -0,0 +1,188 @@
+ import tiktoken
+ import torch
+ from torch.utils.data import Dataset, DataLoader
+ from typing import Tuple, Optional, List
+ from pathlib import Path
+ from tqdm import tqdm
+ import mmap
+ import numpy as np
+
+ from model import ModelArgs
+
+ #####################################
+ # DATA
+ #####################################
+ class TextDataset(Dataset):
+     def __init__(self, txt: str, tokenizer, args: ModelArgs, stride: Optional[int] = None, max_samples: Optional[int] = None):
+         """
+         Optimized text dataset with memory-mapped reading and batched tokenization.
+
+         Args:
+             txt: Text content or path to a file
+             tokenizer: Pretrained tokenizer with an .encode() method
+             args: ModelArgs containing max_seq_len and max_batch_size
+             stride: Sliding-window stride. Defaults to max_seq_len // 2
+             max_samples: Limit the number of samples for quick testing
+         """
+         self.max_seq_len = args.max_seq_len
+         self.stride = stride if stride is not None else self.max_seq_len // 2
+
+         # Treat short inputs as candidate file paths; long strings are raw text
+         # (Path.exists() can raise ENAMETOOLONG when handed multi-KB strings)
+         if len(txt) < 4096 and Path(txt).exists():
+             text_content = self._read_file_mmap(txt)
+         else:
+             text_content = txt
+
+         # Validate input (character count is a cheap lower bound on token count)
+         if not text_content or len(text_content.strip()) < self.max_seq_len:
+             raise ValueError(f"Text too short. Need at least {self.max_seq_len} chars, got {len(text_content)}")
+
+         print(f"📝 Tokenizing {len(text_content):,} characters...")
+
+         # Tokenize with a progress bar for large texts
+         token_ids = self._tokenize_with_progress(tokenizer, text_content)
+
+         # Create sliding windows with vectorized operations
+         self.samples = self._create_sliding_windows(token_ids, max_samples)
+
+         print(f"✅ Created {len(self.samples)} training samples")
+
+     def _read_file_mmap(self, file_path: str) -> str:
+         """Memory-efficient file reading for large files"""
+         try:
+             with open(file_path, 'r', encoding='utf-8') as f:
+                 with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
+                     return mm.read().decode('utf-8', errors='ignore')
+         except Exception as e:
+             raise RuntimeError(f"Failed to read file {file_path}: {e}")
+
+     def _tokenize_with_progress(self, tokenizer, text: str) -> List[int]:
+         """Tokenize with a progress bar for large texts"""
+         # Process in chunks for memory efficiency
+         chunk_size = 10_000_000  # 10M characters per chunk
+         tokens = []
+
+         if len(text) > chunk_size:
+             # Process large texts in chunks
+             pbar = tqdm(total=len(text), desc="Tokenizing", unit="char")
+             for i in range(0, len(text), chunk_size):
+                 chunk = text[i:i + chunk_size]
+                 chunk_tokens = tokenizer.encode(chunk, allowed_special={"<|endoftext|>"})
+                 tokens.extend(chunk_tokens)
+                 pbar.update(len(chunk))
+             pbar.close()
+         else:
+             # Single pass for smaller texts
+             tokens = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
+
+         if not tokens:
+             raise ValueError("No tokens generated from input text")
+
+         return tokens
+
+     def _create_sliding_windows(self, token_ids: List[int], max_samples: Optional[int]) -> torch.Tensor:
+         """Create overlapping sequences using vectorized operations"""
+         if len(token_ids) < self.max_seq_len + 1:
+             raise ValueError(f"Not enough tokens. Need {self.max_seq_len + 1}, got {len(token_ids)}")
+
+         # Convert to numpy for faster slicing
+         tokens_array = np.array(token_ids, dtype=np.int64)
+
+         # Calculate the number of windows
+         num_windows = (len(tokens_array) - self.max_seq_len - 1) // self.stride + 1
+
+         if max_samples:
+             num_windows = min(num_windows, max_samples)
+
+         # Pre-allocate tensors
+         inputs = torch.zeros(num_windows, self.max_seq_len, dtype=torch.long)
+         targets = torch.zeros(num_windows, self.max_seq_len, dtype=torch.long)
+
+         # Fill tensors: targets are the inputs shifted one token to the right
+         for i in range(num_windows):
+             start = i * self.stride
+             inputs[i] = torch.from_numpy(tokens_array[start:start + self.max_seq_len])
+             targets[i] = torch.from_numpy(tokens_array[start + 1:start + self.max_seq_len + 1])
+
+         # Stack into (input, target) pairs — one contiguous tensor instead of two lists
+         return torch.stack([inputs, targets], dim=1)
+
+     def __len__(self):
+         return len(self.samples)
+
+     def __getitem__(self, idx) -> Tuple[torch.Tensor, torch.Tensor]:
+         """Return an (input_ids, target_ids) tuple"""
+         return self.samples[idx, 0], self.samples[idx, 1]
+
+
+ def create_dataloader(
+     txt: str,
+     args: ModelArgs,
+     stride: Optional[int] = None,
+     shuffle: bool = True,
+     drop_last: bool = True,
+     num_workers: int = 0,
+     pin_memory: bool = True,
+     persistent_workers: bool = False,
+     max_samples: Optional[int] = None
+ ) -> DataLoader:
+     """
+     Optimized DataLoader with proper memory pinning and worker settings.
+
+     Args:
+         txt: Text content or file path
+         args: ModelArgs configuration
+         stride: Sliding-window stride
+         shuffle: Whether to shuffle samples
+         drop_last: Drop incomplete batches
+         num_workers: Number of data-loading workers (0 = main process)
+         pin_memory: Pin memory for faster GPU transfer (recommended)
+         persistent_workers: Keep workers alive between epochs (if num_workers > 0)
+         max_samples: Limit samples for testing
+     """
+     # tiktoken's "gpt2" encoding is fast, well tested, and has a ~50k vocab;
+     # for multilingual or code data, consider "cl100k_base" or "o200k_base"
+     tokenizer_name = getattr(args, "tokenizer_name", "gpt2")
+     tokenizer = tiktoken.get_encoding(tokenizer_name)
+
+     # Create the dataset with size validation
+     try:
+         dataset = TextDataset(
+             txt=txt,
+             tokenizer=tokenizer,
+             args=args,
+             stride=stride,
+             max_samples=max_samples
+         )
+     except Exception as e:
+         raise RuntimeError(f"Failed to create dataset: {e}")
+
+     # Create the DataLoader with optimized settings
+     dataloader = DataLoader(
+         dataset,
+         batch_size=args.max_batch_size,
+         shuffle=shuffle,
+         drop_last=drop_last,
+         num_workers=num_workers,
+         pin_memory=pin_memory,
+         persistent_workers=persistent_workers if num_workers > 0 else False,
+         prefetch_factor=2 if num_workers > 0 else None,
+     )
+
+     return dataloader
+
+
+ # Convenience function for downloading sample data
+ def get_sample_data(url: str = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt") -> str:
+     """Download sample text data for testing"""
+     try:
+         import requests
+         response = requests.get(url, timeout=30)
+         response.raise_for_status()
+         return response.text
+     except Exception as e:
+         print(f"⚠️ Could not download sample data: {e}")
+         return ""
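
A minimal smoke test of the new pipeline (an editor's sketch, not part of this commit; the small ModelArgs values and the tiny-shakespeare sample are illustrative):

    from model import ModelArgs
    from data import create_dataloader, get_sample_data

    args = ModelArgs(max_seq_len=128, max_batch_size=4)  # deliberately small for a quick check

    text = get_sample_data()  # ~1.1M characters of tiny-shakespeare; returns "" on download failure
    loader = create_dataloader(text, args, stride=64)

    input_ids, target_ids = next(iter(loader))
    assert input_ids.shape == (4, 128)  # (max_batch_size, max_seq_len)
    assert (input_ids[:, 1:] == target_ids[:, :-1]).all()  # targets are inputs shifted by one token
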
Model_Architecture/generation.py CHANGED
@@ -1,6 +1,6 @@
  import torch
  import tiktoken
- from model import Transformer, ModelArgs
+ from model import ismail, ModelArgs
 
 
  #####################################
@@ -151,7 +151,7 @@ if __name__ == "__main__":
      # Initialize model and tokenizer
      print("Initializing model...")
      torch.manual_seed(123)
-     model = Transformer(args)
+     model = ismail(args)
      model.eval()
 
      tokenizer = tiktoken.get_encoding("gpt2")
Model_Architecture/model.py CHANGED
@@ -2,7 +2,8 @@ import tiktoken
  import torch
  import torch.nn as nn
  from torch.utils.data import Dataset, DataLoader
-
+ import torch.nn.functional as F
+ from contextlib import nullcontext
  import math
  from dataclasses import dataclass
  from typing import Tuple, Optional, Literal
@@ -18,31 +19,32 @@ from kernel import act_quant, weight_dequant, fp8_gemm
  @dataclass
  class ModelArgs:
      max_batch_size: int = 8
-     max_seq_len: int = 4096 * 4
+     max_seq_len: int = 2048
      dtype: Literal["bf16", "fp8"] = "bf16"
      scale_fmt: Optional[str] = None
+
      vocab_size: int = 102400
-     dim: int = 2048
-     inter_dim: int = 10944
-     moe_inter_dim: int = 1408
-     n_layers: int = 27
-     n_dense_layers: int = 1
-     n_heads: int = 16
+     dim: int = 1024
+     inter_dim: int = 4096
+     moe_inter_dim: int = 1024
+     n_layers: int = 20
+     n_dense_layers: int = 3
+     n_heads: int = 12
+
      # moe
-     n_routed_experts: int = 64
-     n_shared_experts: int = 2
-     n_activated_experts: int = 6
-     n_expert_groups: int = 1
-     n_limited_groups: int = 1
-     score_func: Literal["softmax", "sigmoid"] = "softmax"
+     n_routed_experts: int = 6
+     n_shared_experts: int = 1
+     n_activated_experts: int = 2
      route_scale: float = 1.
-     use_routing_bias: bool = False  # Enable routing bias for fine-tuning expert selection
+     use_routing_bias: bool = True  # Enable routing bias for fine-tuning expert selection
+
      # mla
      q_lora_rank: int = 0
      kv_lora_rank: int = 512
      qk_nope_head_dim: int = 128
      qk_rope_head_dim: int = 64
      v_head_dim: int = 128
+
      # yarn
      original_seq_len: int = 4096
      rope_theta: float = 10000.0
@@ -58,54 +60,7 @@ block_size = 128
  gemm_impl: Literal["bf16", "fp8"] = "bf16"
 
 
- #####################################
- # DATA
- #####################################
- class TextDataset(Dataset):
-     def __init__(self, txt, tokenizer, args: ModelArgs, stride: Optional[int] = None):
-         self.input_ids = []
-         self.target_ids = []
-
-         # Use max_seq_len from ModelArgs
-         max_length = args.max_seq_len
-         if stride is None:
-             stride = max_length // 2  # Default stride is half the sequence length
-
-         # Tokenize the entire text
-         token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
-
-         # Use a sliding window to chunk the book into overlapping sequences of max_length
-         for i in range(0, len(token_ids) - max_length, stride):
-             input_chunk = token_ids[i:i + max_length]
-             target_chunk = token_ids[i + 1: i + max_length + 1]
-             self.input_ids.append(torch.tensor(input_chunk))
-             self.target_ids.append(torch.tensor(target_chunk))
-
-     def __len__(self):
-         return len(self.input_ids)
-
-     def __getitem__(self, idx):
-         return self.input_ids[idx], self.target_ids[idx]
-
-
- def create_dataloader(txt, args: ModelArgs, stride: Optional[int] = None,
-                       shuffle: bool = True, drop_last: bool = True, num_workers: int = 0):
-     # Initialize the tokenizer
-     tokenizer = tiktoken.get_encoding("gpt2")
-
-     # Create dataset with ModelArgs
-     dataset = TextDataset(txt, tokenizer, args, stride)
-
-     # Create dataloader using batch_size from ModelArgs
-     dataloader = DataLoader(
-         dataset,
-         batch_size=args.max_batch_size,
-         shuffle=shuffle,
-         drop_last=drop_last,
-         num_workers=num_workers
-     )
-
-     return dataloader
 
  #####################################
  # RoPE
@@ -321,9 +276,6 @@ class Gate(nn.Module):
          self.dim = args.dim
          self.n_routed_experts = args.n_routed_experts
          self.n_activated_experts = args.n_activated_experts
-         self.n_expert_groups = args.n_expert_groups
-         self.n_limited_groups = args.n_limited_groups
-         self.score_func = args.score_func
          self.route_scale = args.route_scale
 
          # Gate weight
@@ -341,10 +293,7 @@ class Gate(nn.Module):
          scores = linear(x, self.weight)
 
          # Apply scoring function
-         if self.score_func == "softmax":
-             scores = scores.softmax(dim=-1, dtype=torch.float32)
-         else:
-             scores = scores.sigmoid()
+         scores = scores.sigmoid()
 
          original_scores = scores
 
@@ -352,17 +301,6 @@ class Gate(nn.Module):
          if self.bias is not None:
              scores = scores + self.bias
 
-         # Expert grouping for load balancing
-         if self.n_expert_groups > 1:
-             scores = scores.view(x.size(0), self.n_expert_groups, -1)
-             if self.bias is None:
-                 group_scores = scores.amax(dim=-1)
-             else:
-                 group_scores = scores.topk(2, dim=-1)[0].sum(dim=-1)
-             indices = group_scores.topk(self.n_limited_groups, dim=-1)[1]
-             mask = scores.new_ones(x.size(0), self.n_expert_groups, dtype=bool).scatter_(1, indices, False)
-             scores = scores.masked_fill_(mask.unsqueeze(-1), float("-inf")).flatten(1)
-
          # Select top-k experts
          indices = torch.topk(scores, self.n_activated_experts, dim=-1)[1]
          weights = original_scores.gather(1, indices)
@@ -391,56 +329,116 @@ class Expert(nn.Module):
 
 
  class MoE(nn.Module):
-
      def __init__(self, args: ModelArgs):
          super().__init__()
          self.dim = args.dim
          self.n_routed_experts = args.n_routed_experts
          self.n_activated_experts = args.n_activated_experts
-
-         # Gate for routing
+         self.active_expert_idx = None  # None = all active (inference mode)
+
          self.gate = Gate(args)
-
-         # Routed experts
          self.experts = nn.ModuleList([
              Expert(args.dim, args.moe_inter_dim)
              for _ in range(args.n_routed_experts)
          ])
-
-         # Shared experts (always process all tokens)
          self.shared_experts = MLP(args.dim, args.n_shared_experts * args.moe_inter_dim)
+         self.ffn_norm = RMSNorm(args.dim)
+
+         # Load-balance loss coefficient
+         self.lb_loss_coef = 0.01
+
+     def set_active_expert(self, expert_idx: Optional[int]):
+         """Freeze all but the active expert to save optimizer memory"""
+         self.active_expert_idx = expert_idx
+
+         for i, expert in enumerate(self.experts):
+             requires_grad = (expert_idx is None) or (i == expert_idx)
+             for param in expert.parameters():
+                 param.requires_grad = requires_grad
+
+     def compute_load_balance_loss(self, router_probs, expert_indices):
+         """Encourage uniform expert utilization"""
+         # router_probs: [num_tokens, n_experts]
+         # expert_indices: [num_tokens, top_k]
+
+         # Token fraction per expert
+         tokens_per_expert = torch.zeros(self.n_routed_experts, device=router_probs.device)
+         indices_flat = expert_indices.view(-1)
+         ones = torch.ones_like(indices_flat, dtype=torch.float32)
+         tokens_per_expert.scatter_add_(0, indices_flat, ones)
+         tokens_per_expert = tokens_per_expert / (indices_flat.numel() + 1e-8)
+
+         # Average routing probability per expert
+         router_prob_per_expert = router_probs.mean(dim=0)
+
+         # Load-balancing loss (minimized when utilization is uniform)
+         loss = torch.mean(tokens_per_expert * router_prob_per_expert) * self.n_routed_experts
+         return loss
+
-     def forward(self, x: torch.Tensor) -> torch.Tensor:
+     def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
          original_shape = x.size()
          x = x.view(-1, self.dim)
-
-         # Route tokens to experts
-         weights, indices = self.gate(x)
-
-         # Initialize output for routed experts
+
+         # Always compute routing (even in sequential mode, to train the gate)
+         router_logits = F.linear(x, self.gate.weight)
+         router_probs = router_logits.sigmoid()
+
+         # The routing bias steers expert selection but not the mixing weights
+         selection_scores = router_probs if self.gate.bias is None else router_probs + self.gate.bias
+
+         # Select top-k experts on the biased scores, weight with the original probabilities
+         indices = torch.topk(selection_scores, self.n_activated_experts, dim=-1)[1]
+         weights = router_probs.gather(1, indices)
+
+         # Normalize the sigmoid weights and apply the routing scale
+         weights = weights / weights.sum(dim=-1, keepdim=True)
+         weights = weights * self.gate.route_scale
+
+         # Sequential training mode
+         if self.training and self.active_expert_idx is not None:
+             y = torch.zeros_like(x)
+
+             # Only compute gradients for the active expert
+             for i in range(self.n_routed_experts):
+                 idx, top = torch.where(indices == i)
+                 if idx.numel() == 0:
+                     continue
+
+                 # Frozen experts run forward-only under no_grad
+                 grad_context = nullcontext() if i == self.active_expert_idx else torch.no_grad()
+
+                 with grad_context:
+                     expert_out = self.experts[i](x[idx])
+                     y[idx] += expert_out * weights[idx, top, None]
+
+             # Load-balance loss (still needed for gate training)
+             lb_loss = self.compute_load_balance_loss(router_probs, indices)
+
+             # Shared experts always train
+             z = self.shared_experts(x)
+
+             return (y + z).view(original_shape), lb_loss
+
+         # Normal MoE mode (inference or full training)
          y = torch.zeros_like(x)
-
-         # Process each routed expert
          for i in range(self.n_routed_experts):
-             # Find tokens routed to this expert
              idx, top = torch.where(indices == i)
              if idx.numel() == 0:
                  continue
-
-             # Process tokens with this expert
-             expert_output = self.experts[i](x[idx])
-
-             # Weight and accumulate expert outputs
-             y[idx] += expert_output * weights[idx, top, None]
-
-         # Process all tokens with shared experts
+
+             expert_out = self.experts[i](x[idx])
+             y[idx] += expert_out * weights[idx, top, None]
+
          z = self.shared_experts(x)
-
-         # Combine routed and shared expert outputs
          output = (y + z).view(original_shape)
 
-         return output
+         if self.training:
+             lb_loss = self.compute_load_balance_loss(router_probs, indices)
+             return output, lb_loss
+         else:
+             return output, None
 
 
  #####################################
@@ -482,7 +479,7 @@ class Block(nn.Module):
  # TRANSFORMER MODEL
  #####################################
 
- class Transformer(nn.Module):
+ class ismail(nn.Module):
      def __init__(self, args: ModelArgs):
          super().__init__()
          self.args = args
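
The saving behind set_active_expert has two parts: requires_grad=False keeps frozen experts out of gradient and optimizer-state memory, and running them under torch.no_grad() skips storing their activations. A toy sanity check of that pattern (an editor's sketch with plain nn.Linear stand-ins, not the Expert class above):

    import torch
    import torch.nn as nn

    experts = nn.ModuleList([nn.Linear(8, 8) for _ in range(4)])
    active = 2
    for i, expert in enumerate(experts):
        for p in expert.parameters():
            p.requires_grad = (i == active)  # mirrors MoE.set_active_expert

    x = torch.randn(16, 8)
    outs = []
    for i, expert in enumerate(experts):
        if i == active:
            outs.append(expert(x))           # tracked by autograd
        else:
            with torch.no_grad():            # forward only, no graph stored
                outs.append(expert(x))
    y = torch.stack(outs).sum(dim=0)
    y.sum().backward()

    assert experts[active].weight.grad is not None  # the active expert trains
    assert experts[0].weight.grad is None           # frozen experts accumulate nothing
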
Model_Architecture/model_size.py ADDED
@@ -0,0 +1,226 @@
+ import sys
+ from pathlib import Path
+
+ # Add the Model_Architecture directory to the path
+ sys.path.insert(0, str(Path(__file__).parent))
+
+ from model import ModelArgs
+
+ def estimate_model_size(args: ModelArgs):
+     """Calculate detailed model size and parameter counts"""
+
+     print(f"\n{'='*70}")
+     print("MODEL ARCHITECTURE ANALYSIS: ismail")
+     print(f"{'='*70}\n")
+
+     # Display configuration
+     print("📋 CONFIGURATION:")
+     print(f"   Model dimension (dim): {args.dim}")
+     print(f"   Vocabulary size: {args.vocab_size:,}")
+     print(f"   Number of layers: {args.n_layers}")
+     print(f"   Dense layers: {args.n_dense_layers}")
+     print(f"   MoE layers: {args.n_layers - args.n_dense_layers}")
+     print(f"   Attention heads: {args.n_heads}")
+     print(f"   Max sequence length: {args.max_seq_len}")
+     print(f"   Max batch size: {args.max_batch_size}")
+     print("\n   MoE Configuration:")
+     print(f"   Routed experts: {args.n_routed_experts}")
+     print(f"   Shared experts: {args.n_shared_experts}")
+     print(f"   Activated experts: {args.n_activated_experts}")
+     print("\n   MLA Configuration:")
+     print(f"   Q LoRA rank: {args.q_lora_rank}")
+     print(f"   KV LoRA rank: {args.kv_lora_rank}")
+     print(f"   QK nope head dim: {args.qk_nope_head_dim}")
+     print(f"   QK rope head dim: {args.qk_rope_head_dim}")
+     print(f"   V head dim: {args.v_head_dim}")
+
+     # Calculate parameters by component
+     print(f"\n{'='*70}")
+     print("🔢 PARAMETER COUNT BY COMPONENT:")
+     print(f"{'='*70}\n")
+
+     # 1. Embeddings
+     tok_embed_params = args.vocab_size * args.dim
+     output_params = args.vocab_size * args.dim
+     total_embed_params = tok_embed_params + output_params
+     print(f"   Token Embeddings: {tok_embed_params:>15,} params")
+     print(f"   Output Layer:     {output_params:>15,} params")
+     print(f"   {'─' * 50}")
+     print(f"   Total Embeddings: {total_embed_params:>15,} params\n")
+
+     # 2. Attention (per layer)
+     if args.q_lora_rank == 0:
+         wq_params = args.dim * args.n_heads * (args.qk_nope_head_dim + args.qk_rope_head_dim)
+         wq_norm_params = 0
+     else:
+         wq_params = args.dim * args.q_lora_rank + args.q_lora_rank * args.n_heads * (args.qk_nope_head_dim + args.qk_rope_head_dim)
+         wq_norm_params = args.q_lora_rank
+
+     wkv_a_params = args.dim * (args.kv_lora_rank + args.qk_rope_head_dim)
+     kv_norm_params = args.kv_lora_rank
+     wkv_b_params = args.kv_lora_rank * args.n_heads * (args.qk_nope_head_dim + args.v_head_dim)
+     wo_params = args.n_heads * args.v_head_dim * args.dim
+     attn_norm_params = args.dim
+
+     attn_params_per_layer = wq_params + wq_norm_params + wkv_a_params + kv_norm_params + wkv_b_params + wo_params + attn_norm_params
+
+     print("   Attention (per layer):")
+     if args.q_lora_rank > 0:
+         print(f"     WQ (LoRA): {wq_params:>15,} params")
+         print(f"     Q Norm:    {wq_norm_params:>15,} params")
+     else:
+         print(f"     WQ:        {wq_params:>15,} params")
+     print(f"     WKV_A:     {wkv_a_params:>15,} params")
+     print(f"     KV Norm:   {kv_norm_params:>15,} params")
+     print(f"     WKV_B:     {wkv_b_params:>15,} params")
+     print(f"     WO:        {wo_params:>15,} params")
+     print(f"     Attn Norm: {attn_norm_params:>15,} params")
+     print(f"   {'─' * 50}")
+     print(f"   Subtotal: {attn_params_per_layer:>15,} params\n")
+
+     # 3. Dense FFN
+     dense_w1_params = args.dim * args.inter_dim
+     dense_w2_params = args.inter_dim * args.dim
+     dense_w3_params = args.dim * args.inter_dim
+     ffn_norm_params = args.dim
+     dense_ffn_per_layer = dense_w1_params + dense_w2_params + dense_w3_params + ffn_norm_params
+
+     print("   Dense FFN (per layer):")
+     print(f"     FC1 (W1): {dense_w1_params:>15,} params")
+     print(f"     FC2 (W3): {dense_w3_params:>15,} params")
+     print(f"     FC3 (W2): {dense_w2_params:>15,} params")
+     print(f"     FFN Norm: {ffn_norm_params:>15,} params")
+     print(f"   {'─' * 50}")
+     print(f"   Subtotal: {dense_ffn_per_layer:>15,} params\n")
+
+     # 4. MoE FFN
+     gate_params = args.n_routed_experts * args.dim
+     if args.use_routing_bias:
+         gate_params += args.n_routed_experts
+
+     expert_w1_params = args.dim * args.moe_inter_dim
+     expert_w2_params = args.moe_inter_dim * args.dim
+     expert_w3_params = args.dim * args.moe_inter_dim
+     per_expert_params = expert_w1_params + expert_w2_params + expert_w3_params
+     routed_experts_params = args.n_routed_experts * per_expert_params
+
+     shared_w1_params = args.dim * (args.n_shared_experts * args.moe_inter_dim)
+     shared_w2_params = (args.n_shared_experts * args.moe_inter_dim) * args.dim
+     shared_w3_params = args.dim * (args.n_shared_experts * args.moe_inter_dim)
+     shared_experts_params = shared_w1_params + shared_w2_params + shared_w3_params
+
+     moe_ffn_per_layer = gate_params + routed_experts_params + shared_experts_params + ffn_norm_params
+
+     print("   MoE FFN (per layer):")
+     print(f"     Gate:                    {gate_params:>15,} params")
+     print(f"     Routed Experts ({args.n_routed_experts}x):    {routed_experts_params:>15,} params")
+     print(f"       Per expert:            {per_expert_params:>15,} params")
+     print(f"     Shared Experts:          {shared_experts_params:>15,} params")
+     print(f"     FFN Norm:                {ffn_norm_params:>15,} params")
+     print(f"   {'─' * 50}")
+     print(f"   Subtotal: {moe_ffn_per_layer:>15,} params\n")
+
+     # 5. Final Norm
+     final_norm_params = args.dim
+
+     # Total calculation
+     dense_layer_params = attn_params_per_layer + dense_ffn_per_layer
+     moe_layer_params = attn_params_per_layer + moe_ffn_per_layer
+
+     total_dense_params = args.n_dense_layers * dense_layer_params
+     total_moe_params = (args.n_layers - args.n_dense_layers) * moe_layer_params
+
+     total_params = total_embed_params + total_dense_params + total_moe_params + final_norm_params
+
+     print("   Layer Summary:")
+     print(f"     Dense layers ({args.n_dense_layers}x): {total_dense_params:>15,} params")
+     print(f"     MoE layers ({args.n_layers - args.n_dense_layers}x):  {total_moe_params:>15,} params")
+     print(f"     Final Norm: {final_norm_params:>15,} params")
+
+     print(f"\n{'='*70}")
+     print(f"📊 TOTAL PARAMETERS: {total_params:>15,} ({total_params/1e6:.2f}M)")
+     print(f"{'='*70}\n")
+
+     # Memory calculations
+     print(f"{'='*70}")
+     print("💾 MEMORY USAGE:")
+     print(f"{'='*70}\n")
+
+     bytes_per_param_bf16 = 2
+     bytes_per_param_fp32 = 4
+
+     # Model weights
+     weight_memory_bf16 = total_params * bytes_per_param_bf16 / (1024**3)
+     weight_memory_fp32 = total_params * bytes_per_param_fp32 / (1024**3)
+
+     print("   Model Weights:")
+     print(f"     BF16 (inference): {weight_memory_bf16:>10.3f} GB")
+     print(f"     FP32 (training):  {weight_memory_fp32:>10.3f} GB\n")
+
+     # KV cache (MLA caches the compressed KV latent plus the RoPE key per token)
+     kv_cache_per_layer = args.max_batch_size * args.max_seq_len * (args.kv_lora_rank + args.qk_rope_head_dim)
+     total_kv_cache = kv_cache_per_layer * args.n_layers * bytes_per_param_bf16 / (1024**3)
+
+     print("   KV Cache (BF16):")
+     print(f"     Per layer: {kv_cache_per_layer * bytes_per_param_bf16 / (1024**3):>10.3f} GB")
+     print(f"     Total ({args.n_layers} layers): {total_kv_cache:>10.3f} GB\n")
+
+     # Activations (rough estimate)
+     activation_memory = (args.max_batch_size * args.max_seq_len * args.dim * args.n_layers * 4) / (1024**3)
+
+     print(f"   Activations (estimate): {activation_memory:>10.3f} GB\n")
+
+     # Training overhead
+     gradients_memory = weight_memory_fp32       # Same size as the weights
+     optimizer_states = weight_memory_fp32 * 2   # Adam: momentum + variance
+     training_overhead = gradients_memory + optimizer_states
+
+     print("   Training Overhead (FP32):")
+     print(f"     Gradients:               {gradients_memory:>10.3f} GB")
+     print(f"     Optimizer states (Adam): {optimizer_states:>10.3f} GB")
+     print(f"     Total overhead:          {training_overhead:>10.3f} GB\n")
+
+     # Total estimates
+     inference_total = weight_memory_bf16 + total_kv_cache + activation_memory
+     training_total = weight_memory_fp32 + total_kv_cache + activation_memory + training_overhead
+
+     print(f"{'='*70}")
+     print(f"   INFERENCE (BF16):       {inference_total:>10.3f} GB")
+     print(f"   TRAINING (FP32 + Adam): {training_total:>10.3f} GB")
+     print(f"{'='*70}\n")
+
+     # Memory analysis
+     print(f"{'='*70}")
+     print("🎯 MEMORY ANALYSIS:")
+     print(f"{'='*70}\n")
+
+     gpu_sizes = [(8, "8GB"), (16, "16GB"), (24, "24GB"), (32, "32GB"), (40, "40GB"), (48, "48GB"), (80, "80GB")]
+
+     for threshold, name in gpu_sizes:
+         if inference_total <= threshold:
+             print(f"   ✅ Inference fits in a {name} GPU")
+             break
+     else:
+         print("   ❌ Inference requires a >80GB GPU")
+
+     for threshold, name in gpu_sizes:
+         if training_total <= threshold:
+             print(f"   ✅ Training fits in a {name} GPU")
+             break
+     else:
+         print("   ❌ Training requires a >80GB GPU")
+
+     print(f"\n{'='*70}\n")
+
+     return {
+         'total_params': total_params,
+         'weight_memory_gb': weight_memory_bf16,
+         'inference_memory_gb': inference_total,
+         'training_memory_gb': training_total
+     }
+
+
+ if __name__ == "__main__":
+     # Load the default configuration
+     args = ModelArgs()
+
+     # Run the estimation
+     results = estimate_model_size(args)
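
Spot-checking the MoE arithmetic against the new defaults in model.py (dim=1024, moe_inter_dim=1024, 6 routed and 1 shared expert; an editor's sketch mirroring the formulas in estimate_model_size):

    dim, moe_inter = 1024, 1024
    per_expert = 3 * dim * moe_inter         # w1, w2, w3 of one SwiGLU expert
    routed = 6 * per_expert                  # all routed experts in one MoE layer
    shared = 3 * dim * (1 * moe_inter)       # one shared expert
    gate = 6 * dim + 6                       # gate weight plus routing bias
    print(per_expert, routed, shared, gate)  # 3145728 18874368 3145728 6150
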
Model_Architecture/train.py ADDED
@@ -0,0 +1,483 @@
+ #!/usr/bin/env python3
+ """
+ Sequential Expert Training Script for MoE on a Single GPU
+ Memory usage: ~7.2GB (vs 10.9GB for full MoE training)
+ """
+
+ import argparse
+ import torch
+ import torch.nn.functional as F
+ from pathlib import Path
+ import json
+ import time
+ import math
+
+ # Import the model
+ from model import ismail, ModelArgs
+ from model_size import estimate_model_size
+
+ # Try to import optional dependencies
+ try:
+     import wandb
+     HAS_WANDB = True
+ except ImportError:
+     HAS_WANDB = False
+     print("⚠️ wandb not installed. Run 'pip install wandb' for experiment tracking.")
+
+ try:
+     import bitsandbytes as bnb
+     HAS_BNB = True
+ except ImportError:
+     HAS_BNB = False
+     print("⚠️ bitsandbytes not installed. Run 'pip install bitsandbytes' for a memory-efficient optimizer.")
+
+ # Configuration
+ DEFAULT_CONFIG = {
+     "model": {
+         "vocab_size": 32000,        # Reduced from 102400
+         "dim": 1024,
+         "inter_dim": 4096,
+         "moe_inter_dim": 1024,
+         "n_layers": 16,
+         "n_dense_layers": 1,        # Only the first layer is dense
+         "n_heads": 16,              # Increased for better parallelism
+         # MoE
+         "n_routed_experts": 6,
+         "n_shared_experts": 1,
+         "n_activated_experts": 2,
+         # MLA
+         "q_lora_rank": 128,         # Enable Q LoRA
+         "kv_lora_rank": 512,
+         "qk_nope_head_dim": 64,
+         "qk_rope_head_dim": 32,
+         "v_head_dim": 64,
+         # Sequence
+         "max_seq_len": 2048,        # Start shorter
+         "max_batch_size": 4,
+     },
+     "training": {
+         "learning_rate": 3e-4,
+         "weight_decay": 0.1,
+         "beta1": 0.9,
+         "beta2": 0.95,
+         "grad_clip": 1.0,
+         "warmup_steps": 1000,
+         "total_steps": 50000,
+         "expert_rotation_steps": 2000,      # Rotate the active expert every N steps
+         "gradient_accumulation_steps": 16,
+         "eval_every": 1000,
+         "save_every": 5000,
+         "save_dir": "./checkpoints",
+         "log_every": 100,
+         "dtype": "bf16",
+         "compile": True,                    # PyTorch 2.0+ compilation
+     },
+     "data": {
+         "train_file": "./data/train.txt",
+         "val_file": "./data/val.txt",
+         "stride": 512,
+     },
+     "logging": {
+         "use_wandb": HAS_WANDB,
+         "project_name": "sequential-moe",
+         "run_name": "moe-12gb-gpu",
+     }
+ }
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description="Train a MoE model with sequential experts")
+     parser.add_argument("--config", type=str, help="Path to a config JSON")
+     parser.add_argument("--train_file", type=str, help="Training text file")
+     parser.add_argument("--val_file", type=str, help="Validation text file")
+     parser.add_argument("--save_dir", type=str, default="./checkpoints")
+     parser.add_argument("--resume", type=str, help="Checkpoint to resume from")
+     parser.add_argument("--no_wandb", action="store_true", help="Disable wandb")
+     return parser.parse_args()
+
+
+ def load_config(args):
+     """Load and merge configuration"""
+     config = DEFAULT_CONFIG.copy()
+
+     if args.config and Path(args.config).exists():
+         with open(args.config) as f:
+             user_config = json.load(f)
+         # Deep merge
+         for key, value in user_config.items():
+             if key in config and isinstance(value, dict):
+                 config[key].update(value)
+             else:
+                 config[key] = value
+
+     # Override from CLI args
+     if args.train_file:
+         config["data"]["train_file"] = args.train_file
+     if args.val_file:
+         config["data"]["val_file"] = args.val_file
+     if args.save_dir:
+         config["training"]["save_dir"] = args.save_dir
+     if args.no_wandb:
+         config["logging"]["use_wandb"] = False
+
+     return config
+
+
+ def setup_model(config, device):
+     """Initialize the model and print a size estimate"""
+     args = ModelArgs(**config["model"])
+
+     print("\n" + "="*70)
+     print("MODEL INITIALIZATION")
+     print("="*70 + "\n")
+
+     # Estimate size
+     size_info = estimate_model_size(args)
+
+     model = ismail(args).to(device)
+
+     # Compile for speed (PyTorch 2.0+)
+     if config["training"]["compile"]:
+         try:
+             model = torch.compile(model)
+             print("✅ Model compiled with torch.compile()\n")
+         except Exception as e:
+             print(f"⚠️ Compilation failed: {e}\n")
+
+     return model, args
+
+
+ def setup_optimizer(model, config):
+     """Set up a memory-efficient optimizer"""
+     training_cfg = config["training"]
+
+     # Separate parameter groups
+     expert_params = []
+     base_params = []
+     router_params = []
+
+     for name, param in model.named_parameters():
+         if "experts" in name and "shared" not in name:
+             expert_params.append(param)
+         elif "gate" in name:
+             router_params.append(param)
+         else:
+             base_params.append(param)
+
+     # Use 8-bit Adam if available
+     if HAS_BNB:
+         optimizer_class = bnb.optim.AdamW8bit
+         print("✅ Using AdamW8bit for memory efficiency")
+     else:
+         optimizer_class = torch.optim.AdamW
+         print("⚠️ Using standard AdamW (install bitsandbytes for memory savings)")
+
+     optimizer = optimizer_class(
+         [
+             {"params": base_params, "weight_decay": training_cfg["weight_decay"]},
+             {"params": expert_params, "weight_decay": training_cfg["weight_decay"]},
+             {"params": router_params, "weight_decay": 0.0},  # Usually no weight decay for the router
+         ],
+         lr=training_cfg["learning_rate"],
+         betas=(training_cfg["beta1"], training_cfg["beta2"]),
+     )
+
+     return optimizer
+
+
+ def get_lr(step, config):
+     """Learning-rate schedule: linear warmup, then cosine decay"""
+     training_cfg = config["training"]
+     warmup_steps = training_cfg["warmup_steps"]
+     total_steps = training_cfg["total_steps"]
+     base_lr = training_cfg["learning_rate"]
+
+     if step < warmup_steps:
+         return base_lr * step / warmup_steps
+
+     # Cosine decay
+     progress = (step - warmup_steps) / (total_steps - warmup_steps)
+     return base_lr * 0.5 * (1 + math.cos(math.pi * progress))
+
+
+ def load_data(config):
+     """Create data loaders"""
+     data_cfg = config["data"]
+
+     print("\n" + "="*70)
+     print("DATA LOADING")
+     print("="*70 + "\n")
+
+     from data import create_dataloader
+
+     train_loader = create_dataloader(
+         txt=Path(data_cfg["train_file"]).read_text(encoding="utf-8"),
+         args=ModelArgs(**config["model"]),
+         stride=data_cfg["stride"],
+         shuffle=True,
+         drop_last=True,
+     )
+
+     val_loader = create_dataloader(
+         txt=Path(data_cfg["val_file"]).read_text(encoding="utf-8"),
+         args=ModelArgs(**config["model"]),
+         stride=data_cfg["stride"],
+         shuffle=False,
+         drop_last=True,
+     )
+
+     print(f"✅ Train batches: {len(train_loader)}")
+     print(f"✅ Val batches: {len(val_loader)}\n")
+
+     return train_loader, val_loader
+
+
+ def evaluate(model, val_loader, device, config):
+     """Evaluate the model on the validation set"""
+     model.eval()
+     total_loss = 0.0
+     total_tokens = 0
+
+     with torch.no_grad():
+         for input_ids, target_ids in val_loader:
+             input_ids = input_ids.to(device)
+             target_ids = target_ids.to(device)
+
+             logits, lb_loss = model(input_ids, start_pos=0)
+             loss = F.cross_entropy(
+                 logits.view(-1, logits.size(-1)),
+                 target_ids.view(-1),
+                 ignore_index=-1,
+             )
+
+             total_loss += loss.item() * target_ids.numel()
+             total_tokens += target_ids.numel()
+
+     model.train()
+     return total_loss / total_tokens
+
+
+ def save_checkpoint(model, optimizer, step, config, expert_idx=None):
+     """Save a model checkpoint"""
+     save_dir = Path(config["training"]["save_dir"])
+     save_dir.mkdir(parents=True, exist_ok=True)
+
+     # Create the checkpoint name
+     if expert_idx is not None:
+         ckpt_name = f"step_{step}_expert_{expert_idx}.pt"
+     else:
+         ckpt_name = f"step_{step}.pt"
+
+     ckpt_path = save_dir / ckpt_name
+
+     checkpoint = {
+         "step": step,
+         "model_state_dict": model.state_dict(),
+         "optimizer_state_dict": optimizer.state_dict(),
+         "config": config,
+     }
+
+     torch.save(checkpoint, ckpt_path)
+     print(f"💾 Checkpoint saved: {ckpt_path}")
+
+
+ def train_step(model, batch, device, config):
+     """Single training step"""
+     input_ids, target_ids = batch
+     input_ids = input_ids.to(device, non_blocking=True)
+     target_ids = target_ids.to(device, non_blocking=True)
+
+     # Forward pass (autocast must be told to use bfloat16; the CUDA default is float16)
+     with torch.autocast(device_type="cuda", dtype=torch.bfloat16,
+                         enabled=(config["training"]["dtype"] == "bf16")):
+         logits, lb_loss = model(input_ids, start_pos=0)
+
+         # Main language-modeling loss
+         lm_loss = F.cross_entropy(
+             logits.view(-1, logits.size(-1)),
+             target_ids.view(-1),
+             ignore_index=-1,
+         )
+
+         # Guard against layers that return no auxiliary loss
+         if lb_loss is None:
+             lb_loss = logits.new_zeros(())
+
+         # Total loss with load balancing
+         total_loss = lm_loss + config["training"].get("lb_loss_coef", 0.01) * lb_loss
+
+     return total_loss, lm_loss, lb_loss
+
+
+ def main():
+     args = parse_args()
+     config = load_config(args)
+
+     # Device setup
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     torch.backends.cuda.matmul.allow_tf32 = True
+     torch.backends.cudnn.allow_tf32 = True
+
+     # Wandb setup
+     if config["logging"]["use_wandb"] and HAS_WANDB:
+         wandb.init(
+             project=config["logging"]["project_name"],
+             name=config["logging"]["run_name"],
+             config=config,
+         )
+
+     # Model setup
+     model, model_args = setup_model(config, device)
+
+     # Optimizer setup
+     optimizer = setup_optimizer(model, config)
+
+     # Data setup
+     train_loader, val_loader = load_data(config)
+     train_iter = iter(train_loader)
+
+     # Training state
+     step = 0
+     best_val_loss = float("inf")
+
+     # Resume from checkpoint
+     if args.resume:
+         ckpt = torch.load(args.resume, map_location=device)
+         model.load_state_dict(ckpt["model_state_dict"])
+         optimizer.load_state_dict(ckpt["optimizer_state_dict"])
+         step = ckpt["step"]
+         print(f"✅ Resumed from step {step}\n")
+
+     # BF16 keeps FP32's exponent range, so loss scaling is unnecessary;
+     # the scaler stays as a disabled pass-through for the step/update plumbing
+     scaler = torch.cuda.amp.GradScaler(enabled=False)
+
+     # Expert rotation schedule
+     current_expert = 0
+     rotation_steps = config["training"]["expert_rotation_steps"]
+
+     # Set the initial expert
+     model.set_active_expert(current_expert)
+     print(f"🎯 Training expert {current_expert}/{model_args.n_routed_experts - 1}")
+
+     # Training loop
+     print("\n" + "="*70)
+     print("TRAINING STARTED")
+     print("="*70 + "\n")
+
+     model.train()
+
+     while step < config["training"]["total_steps"]:
+         step_start = time.time()
+
+         # Expert rotation
+         if step > 0 and step % rotation_steps == 0:
+             current_expert = (current_expert + 1) % model_args.n_routed_experts
+             model.set_active_expert(current_expert)
+             print(f"\n🔄 Rotating to expert {current_expert}/{model_args.n_routed_experts - 1}")
+
+             # Clear gradients after rotation
+             optimizer.zero_grad(set_to_none=True)
+
+         # Get a batch, cycling the loader when it is exhausted
+         try:
+             batch = next(train_iter)
+         except StopIteration:
+             train_iter = iter(train_loader)
+             batch = next(train_iter)
+
+         # Training step with gradient accumulation
+         accum_steps = config["training"]["gradient_accumulation_steps"]
+         total_loss_accum = 0.0
+         lm_loss_accum = 0.0
+         lb_loss_accum = 0.0
+
+         for accum_step in range(accum_steps):
+             # Split the batch for micro-batching (if needed);
+             # for now, the full batch is reused on every micro-step
+             loss, lm_loss, lb_loss = train_step(model, batch, device, config)
+
+             # Normalize for accumulation
+             loss = loss / accum_steps
+
+             # Backward pass (the disabled scaler is a no-op passthrough)
+             scaler.scale(loss).backward()
+
+             total_loss_accum += loss.item()
+             lm_loss_accum += lm_loss.item() / accum_steps
+             lb_loss_accum += lb_loss.item() / accum_steps
+
+         # Gradient clipping
+         if config["training"]["grad_clip"] > 0:
+             scaler.unscale_(optimizer)
+             torch.nn.utils.clip_grad_norm_(model.parameters(), config["training"]["grad_clip"])
+
+         # Optimizer step
+         scaler.step(optimizer)
+         scaler.update()
+
+         optimizer.zero_grad(set_to_none=True)
+
+         # LR scheduling
+         lr = get_lr(step, config)
+         for param_group in optimizer.param_groups:
+             param_group["lr"] = lr
+
+         # Logging
+         if step % config["training"]["log_every"] == 0:
+             step_time = time.time() - step_start
+             tokens_per_sec = (model_args.max_batch_size * model_args.max_seq_len) / step_time
+
+             print(f"Step {step:6d} | "
+                   f"Loss: {lm_loss_accum:.4f} | "
+                   f"LB Loss: {lb_loss_accum:.4f} | "
+                   f"LR: {lr:.2e} | "
+                   f"Expert: {current_expert} | "
+                   f"Tokens/s: {tokens_per_sec:.0f}")
+
+             if config["logging"]["use_wandb"] and HAS_WANDB:
+                 wandb.log({
+                     "step": step,
+                     "loss": lm_loss_accum,
+                     "load_balance_loss": lb_loss_accum,
+                     "total_loss": total_loss_accum,
+                     "learning_rate": lr,
+                     "active_expert": current_expert,
+                     "tokens_per_sec": tokens_per_sec,
+                     "gpu_memory_gb": torch.cuda.memory_allocated() / 1024**3,
+                 })
+
+         # Evaluation
+         if step % config["training"]["eval_every"] == 0 and step > 0:
+             print(f"\n📊 Evaluating at step {step}...")
+             val_loss = evaluate(model, val_loader, device, config)
+             print(f"Val Loss: {val_loss:.4f} | Perplexity: {math.exp(val_loss):.2f}\n")
+
+             if config["logging"]["use_wandb"] and HAS_WANDB:
+                 wandb.log({"val_loss": val_loss, "val_perplexity": math.exp(val_loss)})
+
+             # Save the best model
+             if val_loss < best_val_loss:
+                 best_val_loss = val_loss
+                 save_checkpoint(model, optimizer, step, config, expert_idx="best")
+
+         # Save checkpoint
+         if step % config["training"]["save_every"] == 0 and step > 0:
+             save_checkpoint(model, optimizer, step, config, expert_idx=current_expert)
+
+         step += 1
+
+     # Final save
+     save_checkpoint(model, optimizer, step, config, expert_idx="final")
+
+     if config["logging"]["use_wandb"] and HAS_WANDB:
+         wandb.finish()
+
+     print("\n" + "="*70)
+     print("TRAINING COMPLETED")
+     print("="*70)
+
+
+ if __name__ == "__main__":
+     main()
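
With the defaults above, each optimizer step sees max_batch_size × max_seq_len × gradient_accumulation_steps = 4 × 2048 × 16 = 131,072 tokens (the same batch is reused across micro-steps until micro-batching is added, as the loop comment notes). The schedule in get_lr can be sanity-checked in isolation (an editor's sketch with the default hyperparameters inlined):

    import math

    def lr_at(step, base_lr=3e-4, warmup=1000, total=50000):
        # mirrors train.py's get_lr: linear warmup, then cosine decay to zero
        if step < warmup:
            return base_lr * step / warmup
        progress = (step - warmup) / (total - warmup)
        return base_lr * 0.5 * (1 + math.cos(math.pi * progress))

    print(lr_at(500))    # 1.5e-04, halfway through warmup
    print(lr_at(1000))   # 3.0e-04, peak at the end of warmup
    print(lr_at(50000))  # ~0.0, fully decayed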