ikaganacar committed on
Commit
7557c9f
·
1 Parent(s): d7d2fb2

Better Configuration Implementation

Browse files
Model_Architecture/config.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": {
3
+ "max_batch_size": 8,
4
+ "max_seq_len": 2048,
5
+ "dtype": "bf16",
6
+ "scale_fmt": null,
7
+ "vocab_size": 102400,
8
+ "dim": 1024,
9
+ "inter_dim": 4096,
10
+ "moe_inter_dim": 1024,
11
+ "n_layers": 20,
12
+ "n_dense_layers": 3,
13
+ "n_heads": 12,
14
+ "n_routed_experts": 6,
15
+ "n_shared_experts": 1,
16
+ "n_activated_experts": 2,
17
+ "route_scale": 1.0,
18
+ "use_routing_bias": true,
19
+ "q_lora_rank": 0,
20
+ "kv_lora_rank": 512,
21
+ "qk_nope_head_dim": 128,
22
+ "qk_rope_head_dim": 64,
23
+ "v_head_dim": 128,
24
+ "original_seq_len": 4096,
25
+ "rope_theta": 10000.0,
26
+ "rope_factor": 40,
27
+ "beta_fast": 32,
28
+ "beta_slow": 1,
29
+ "mscale": 1.0,
30
+ "tokenizer_name": "gpt2"
31
+ },
32
+ "training": {
33
+ "learning_rate": 3e-4,
34
+ "weight_decay": 0.1,
35
+ "beta1": 0.9,
36
+ "beta2": 0.95,
37
+ "grad_clip": 1.0,
38
+ "warmup_steps": 1000,
39
+ "total_steps": 50000,
40
+ "expert_rotation_steps": 2000,
41
+ "gradient_accumulation_steps": 16,
42
+ "eval_every": 1000,
43
+ "save_every": 5000,
44
+ "save_dir": "./checkpoints",
45
+ "log_every": 100,
46
+ "dtype": "bf16",
47
+ "compile": true
48
+ },
49
+ "data": {
50
+ "train_file": "./data/train.txt",
51
+ "val_file": "./data/val.txt",
52
+ "stride": 512
53
+ },
54
+ "logging": {
55
+ "use_wandb": true,
56
+ "project_name": "sequential-moe",
57
+ "run_name": "moe-12gb-gpu"
58
+ }
59
+ }
Model_Architecture/generation.py CHANGED
@@ -128,25 +128,20 @@ def token_ids_to_text(token_ids, tokenizer):
128
  #####################################
129
 
130
  if __name__ == "__main__":
 
 
 
131
  # Example configuration - smaller model for testing
132
- args = ModelArgs(
133
- max_batch_size=4,
134
- max_seq_len=1024,
135
- vocab_size=50257, # GPT-2 vocab size
136
- dim=768,
137
- inter_dim=3072,
138
- moe_inter_dim=768,
139
- n_layers=12,
140
- n_dense_layers=1,
141
- n_heads=12,
142
- n_routed_experts=8,
143
- n_shared_experts=2,
144
- n_activated_experts=2,
145
- kv_lora_rank=256,
146
- qk_nope_head_dim=64,
147
- qk_rope_head_dim=32,
148
- v_head_dim=64,
149
- )
150
 
151
  # Initialize model and tokenizer
152
  print("Initializing model...")
@@ -154,7 +149,8 @@ if __name__ == "__main__":
154
  model = ismail(args)
155
  model.eval()
156
 
157
- tokenizer = tiktoken.get_encoding("gpt2")
 
158
 
159
  # Example 1: Greedy generation (argmax)
160
  print(f"\n{'='*60}")
 
128
  #####################################
129
 
130
  if __name__ == "__main__":
131
+ import json
132
+ from pathlib import Path
133
+
134
  # Example configuration - smaller model for testing
135
+ config_path = Path("config.json")
136
+ if config_path.exists():
137
+ with open(config_path) as f:
138
+ config = json.load(f)
139
+ print(f"✅ Loaded config from {config_path}")
140
+ args = ModelArgs(**config["model"])
141
+ else:
142
+ print("⚠️ config.json not found, using default ModelArgs")
143
+ args = ModelArgs()
144
+
 
 
 
 
 
 
 
 
145
 
146
  # Initialize model and tokenizer
147
  print("Initializing model...")
 
149
  model = ismail(args)
150
  model.eval()
151
 
152
+ tokenizer_name = getattr(args, "tokenizer_name", "gpt2")
153
+ tokenizer = tiktoken.get_encoding(tokenizer_name)
154
 
155
  # Example 1: Greedy generation (argmax)
156
  print(f"\n{'='*60}")
Model_Architecture/model.py CHANGED
@@ -52,6 +52,8 @@ class ModelArgs:
52
  beta_slow: int = 1
53
  mscale: float = 1.
54
 
 
 
55
  # others
56
  world_size = 1
57
  rank = 0
@@ -304,9 +306,8 @@ class Gate(nn.Module):
304
  indices = torch.topk(scores, self.n_activated_experts, dim=-1)[1]
305
  weights = original_scores.gather(1, indices)
306
 
307
- # Normalize weights if using sigmoid
308
- if self.score_func == "sigmoid":
309
- weights = weights / weights.sum(dim=-1, keepdim=True)
310
 
311
  # Apply route scaling
312
  weights = weights * self.route_scale
@@ -387,10 +388,9 @@ class MoE(nn.Module):
387
 
388
  # Select top-k experts
389
  weights, indices = torch.topk(router_probs, self.n_activated_experts, dim=-1)
390
-
391
- # Normalize weights
392
- if self.gate.score_func == "sigmoid":
393
- weights = weights / weights.sum(dim=-1, keepdim=True)
394
  weights = weights * self.gate.route_scale
395
 
396
  # Sequential Training Mode
@@ -468,10 +468,19 @@ class Block(nn.Module):
468
  self.attn_norm = RMSNorm(args.dim)
469
  self.ffn_norm = RMSNorm(args.dim)
470
 
471
- def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor]) -> torch.Tensor:
472
  x = x + self.attn(self.attn_norm(x), start_pos, freqs_cis, mask)
473
- x = x + self.ffn(self.ffn_norm(x))
474
- return x
 
 
 
 
 
 
 
 
 
475
 
476
 
477
  #####################################
@@ -492,6 +501,12 @@ class ismail(nn.Module):
492
 
493
  self.register_buffer("freqs_cis", precompute_freqs_cis(args), persistent=False)
494
 
 
 
 
 
 
 
495
  def forward(self, tokens: torch.Tensor, start_pos: int = 0) -> torch.Tensor:
496
  bsz, seqlen = tokens.shape
497
  h = self.tok_embeddings(tokens)
@@ -504,9 +519,17 @@ class ismail(nn.Module):
504
  mask = torch.triu(mask, diagonal=1)
505
  mask = torch.hstack([torch.zeros((seqlen, start_pos), device=tokens.device), mask]).type_as(h)
506
 
 
507
  for layer in self.layers:
508
- h = layer(h, start_pos, freqs_cis, mask)
 
 
 
509
  h = self.norm(h)
510
  output = self.output(h)
 
 
 
 
511
  return output
512
 
 
52
  beta_slow: int = 1
53
  mscale: float = 1.
54
 
55
+ tokenizer_name: str = "gpt2" #
56
+
57
  # others
58
  world_size = 1
59
  rank = 0
 
306
  indices = torch.topk(scores, self.n_activated_experts, dim=-1)[1]
307
  weights = original_scores.gather(1, indices)
308
 
309
+ # Normalize weights (sigmoid always needs normalization)
310
+ weights = weights / weights.sum(dim=-1, keepdim=True)
 
311
 
312
  # Apply route scaling
313
  weights = weights * self.route_scale
 
388
 
389
  # Select top-k experts
390
  weights, indices = torch.topk(router_probs, self.n_activated_experts, dim=-1)
391
+
392
+ # Normalize weights (sigmoid always needs normalization)
393
+ weights = weights / weights.sum(dim=-1, keepdim=True)
 
394
  weights = weights * self.gate.route_scale
395
 
396
  # Sequential Training Mode
 
468
  self.attn_norm = RMSNorm(args.dim)
469
  self.ffn_norm = RMSNorm(args.dim)
470
 
471
+ def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor]) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
472
  x = x + self.attn(self.attn_norm(x), start_pos, freqs_cis, mask)
473
+
474
+ # Handle both MLP (returns single output) and MoE (returns output + loss)
475
+ ffn_result = self.ffn(self.ffn_norm(x))
476
+ if isinstance(ffn_result, tuple):
477
+ ffn_out, lb_loss = ffn_result
478
+ else:
479
+ ffn_out = ffn_result
480
+ lb_loss = None
481
+
482
+ x = x + ffn_out
483
+ return x, lb_loss
484
 
485
 
486
  #####################################
 
501
 
502
  self.register_buffer("freqs_cis", precompute_freqs_cis(args), persistent=False)
503
 
504
+ def set_active_expert(self, expert_idx: Optional[int]):
505
+ """Set active expert for all MoE layers (for sequential training)"""
506
+ for layer in self.layers:
507
+ if isinstance(layer.ffn, MoE):
508
+ layer.ffn.set_active_expert(expert_idx)
509
+
510
  def forward(self, tokens: torch.Tensor, start_pos: int = 0) -> torch.Tensor:
511
  bsz, seqlen = tokens.shape
512
  h = self.tok_embeddings(tokens)
 
519
  mask = torch.triu(mask, diagonal=1)
520
  mask = torch.hstack([torch.zeros((seqlen, start_pos), device=tokens.device), mask]).type_as(h)
521
 
522
+ total_lb_loss = 0.0
523
  for layer in self.layers:
524
+ h, lb_loss = layer(h, start_pos, freqs_cis, mask)
525
+ if lb_loss is not None:
526
+ total_lb_loss += lb_loss
527
+
528
  h = self.norm(h)
529
  output = self.output(h)
530
+
531
+ # Return output and total load balancing loss if training
532
+ if self.training and total_lb_loss > 0:
533
+ return output, total_lb_loss
534
  return output
535
 
Model_Architecture/model_size.py CHANGED
@@ -219,8 +219,19 @@ def estimate_model_size(args: ModelArgs):
219
 
220
 
221
  if __name__ == "__main__":
222
- # Load default configuration
223
- args = ModelArgs()
 
 
 
 
 
 
 
 
 
 
 
224
 
225
  # Run estimation
226
  results = estimate_model_size(args)
 
219
 
220
 
221
  if __name__ == "__main__":
222
+ import json
223
+ from pathlib import Path
224
+
225
+ # Try to load from config.json, otherwise use defaults
226
+ config_path = Path(__file__).parent / "config.json"
227
+ if config_path.exists():
228
+ print(f"📄 Loading configuration from {config_path}")
229
+ with open(config_path) as f:
230
+ config = json.load(f)
231
+ args = ModelArgs(**config["model"])
232
+ else:
233
+ print("⚠️ config.json not found, using default ModelArgs")
234
+ args = ModelArgs()
235
 
236
  # Run estimation
237
  results = estimate_model_size(args)
Model_Architecture/train.py CHANGED
@@ -31,29 +31,37 @@ except ImportError:
31
  HAS_BNB = False
32
  print("⚠️ bitsandbytes not installed. Run 'pip install bitsandbytes' for memory-efficient optimizer.")
33
 
34
- # Configuration
35
  DEFAULT_CONFIG = {
36
  "model": {
37
- "vocab_size": 32000, # Reduced from 102400
 
 
 
 
38
  "dim": 1024,
39
  "inter_dim": 4096,
40
  "moe_inter_dim": 1024,
41
- "n_layers": 16,
42
- "n_dense_layers": 1, # Only first layer dense
43
- "n_heads": 16, # Increased for better parallelism
44
- # MoE
45
  "n_routed_experts": 6,
46
  "n_shared_experts": 1,
47
  "n_activated_experts": 2,
48
- # MLA
49
- "q_lora_rank": 128, # Enable Q LoRA
 
50
  "kv_lora_rank": 512,
51
- "qk_nope_head_dim": 64,
52
- "qk_rope_head_dim": 32,
53
- "v_head_dim": 64,
54
- # Sequence
55
- "max_seq_len": 2048, # Start shorter
56
- "max_batch_size": 4,
 
 
 
 
57
  },
58
  "training": {
59
  "learning_rate": 3e-4,
@@ -237,22 +245,25 @@ def evaluate(model, val_loader, device, config):
237
  model.eval()
238
  total_loss = 0.0
239
  total_tokens = 0
240
-
241
  with torch.no_grad():
242
  for input_ids, target_ids in val_loader:
243
  input_ids = input_ids.to(device)
244
  target_ids = target_ids.to(device)
245
-
246
- logits, lb_loss = model(input_ids, start_pos=0)
 
 
 
247
  loss = F.cross_entropy(
248
  logits.view(-1, logits.size(-1)),
249
  target_ids.view(-1),
250
  ignore_index=-1,
251
  )
252
-
253
  total_loss += loss.item() * target_ids.numel()
254
  total_tokens += target_ids.numel()
255
-
256
  model.train()
257
  return total_loss / total_tokens
258
 
@@ -286,22 +297,29 @@ def train_step(model, batch, device, config, scaler=None):
286
  input_ids, target_ids = batch
287
  input_ids = input_ids.to(device, non_blocking=True)
288
  target_ids = target_ids.to(device, non_blocking=True)
289
-
290
  # Forward pass
291
  with torch.cuda.amp.autocast(enabled=(config["training"]["dtype"] == "bf16")):
292
- logits, lb_loss = model(input_ids, start_pos=0)
293
-
 
 
 
 
 
 
 
294
  # Main language modeling loss
295
  lm_loss = F.cross_entropy(
296
  logits.view(-1, logits.size(-1)),
297
  target_ids.view(-1),
298
  ignore_index=-1,
299
  )
300
-
301
  # Total loss with load balancing
302
  total_loss = lm_loss + config["training"].get("lb_loss_coef", 0.01) * lb_loss
303
-
304
- return total_loss, lm_loss, lb_loss
305
 
306
 
307
  def main():
 
31
  HAS_BNB = False
32
  print("⚠️ bitsandbytes not installed. Run 'pip install bitsandbytes' for memory-efficient optimizer.")
33
 
34
+ # Configuration - matches ModelArgs defaults
35
  DEFAULT_CONFIG = {
36
  "model": {
37
+ "max_batch_size": 8,
38
+ "max_seq_len": 2048,
39
+ "dtype": "bf16",
40
+ "scale_fmt": None,
41
+ "vocab_size": 102400,
42
  "dim": 1024,
43
  "inter_dim": 4096,
44
  "moe_inter_dim": 1024,
45
+ "n_layers": 20,
46
+ "n_dense_layers": 3,
47
+ "n_heads": 12,
 
48
  "n_routed_experts": 6,
49
  "n_shared_experts": 1,
50
  "n_activated_experts": 2,
51
+ "route_scale": 1.0,
52
+ "use_routing_bias": True,
53
+ "q_lora_rank": 0,
54
  "kv_lora_rank": 512,
55
+ "qk_nope_head_dim": 128,
56
+ "qk_rope_head_dim": 64,
57
+ "v_head_dim": 128,
58
+ "original_seq_len": 4096,
59
+ "rope_theta": 10000.0,
60
+ "rope_factor": 40,
61
+ "beta_fast": 32,
62
+ "beta_slow": 1,
63
+ "mscale": 1.0,
64
+ "tokenizer_name": "gpt2",
65
  },
66
  "training": {
67
  "learning_rate": 3e-4,
 
245
  model.eval()
246
  total_loss = 0.0
247
  total_tokens = 0
248
+
249
  with torch.no_grad():
250
  for input_ids, target_ids in val_loader:
251
  input_ids = input_ids.to(device)
252
  target_ids = target_ids.to(device)
253
+
254
+ # Model returns just logits in eval mode (no lb_loss)
255
+ output = model(input_ids, start_pos=0)
256
+ logits = output if not isinstance(output, tuple) else output[0]
257
+
258
  loss = F.cross_entropy(
259
  logits.view(-1, logits.size(-1)),
260
  target_ids.view(-1),
261
  ignore_index=-1,
262
  )
263
+
264
  total_loss += loss.item() * target_ids.numel()
265
  total_tokens += target_ids.numel()
266
+
267
  model.train()
268
  return total_loss / total_tokens
269
 
 
297
  input_ids, target_ids = batch
298
  input_ids = input_ids.to(device, non_blocking=True)
299
  target_ids = target_ids.to(device, non_blocking=True)
300
+
301
  # Forward pass
302
  with torch.cuda.amp.autocast(enabled=(config["training"]["dtype"] == "bf16")):
303
+ output = model(input_ids, start_pos=0)
304
+
305
+ # Handle model output (tuple in training mode with MoE, single tensor otherwise)
306
+ if isinstance(output, tuple):
307
+ logits, lb_loss = output
308
+ else:
309
+ logits = output
310
+ lb_loss = 0.0
311
+
312
  # Main language modeling loss
313
  lm_loss = F.cross_entropy(
314
  logits.view(-1, logits.size(-1)),
315
  target_ids.view(-1),
316
  ignore_index=-1,
317
  )
318
+
319
  # Total loss with load balancing
320
  total_loss = lm_loss + config["training"].get("lb_loss_coef", 0.01) * lb_loss
321
+
322
+ return total_loss, lm_loss, lb_loss if isinstance(lb_loss, float) else lb_loss.item()
323
 
324
 
325
  def main():