Commit 8f73121 (parent: 6b1c605): "Some Fixes"

Files changed:
- Model_Architecture/diognose_weights.py (+83 / -0)
- Model_Architecture/model.py (+35 / -38)
- Model_Architecture/train.py (+37 / -18)
Model_Architecture/diognose_weights.py (ADDED)
@@ -0,0 +1,83 @@
+def diagnose_checkpoint(checkpoint_path, config, device):
+    """Diagnose if the checkpoint has actually learned anything"""
+    import torch
+    import numpy as np
+
+    print("🔍 Diagnosing checkpoint...")
+
+    # Load checkpoint
+    ckpt = torch.load(checkpoint_path, map_location=device)
+
+    # Create model with fixes
+    from model import ismail, ModelArgs
+    args = ModelArgs(**config["model"])
+    model = ismail(args).to(device)
+
+    # Load weights
+    model.load_state_dict(ckpt["model_state_dict"], strict=False)
+    model.eval()
+
+    # Check expert weight statistics
+    print("\n📊 Expert Weight Analysis:")
+    for name, param in model.named_parameters():
+        if "experts" in name and "routed" in name:
+            expert_idx = int(name.split("experts.")[1].split(".")[0])
+            weight_std = param.std().item()
+            weight_mean = param.mean().item()
+            print(f"  Expert {expert_idx}: mean={weight_mean:.6f}, std={weight_std:.6f}")
+
+    # Check router weights
+    print("\n🎯 Router Weight Analysis:")
+    for name, param in model.named_parameters():
+        if "gate.weight" in name:
+            weight_std = param.std().item()
+            weight_range = (param.max() - param.min()).item()
+            print(f"  {name}: std={weight_std:.6f}, range={weight_range:.6f}")
+
+            # Check if router has learned to differentiate
+            router_weights = param.detach().float().cpu()
+            correlations = []
+            for i in range(min(5, router_weights.shape[0])):
+                for j in range(i + 1, min(5, router_weights.shape[0])):
+                    corr = torch.corrcoef(torch.stack([router_weights[i], router_weights[j]]))[0, 1].item()
+                    correlations.append(abs(corr))
+
+            if correlations:
+                avg_correlation = np.mean(correlations)
+                print(f"  Average correlation between experts: {avg_correlation:.4f}")
+                if avg_correlation < 0.9:
+                    print("  ✅ Experts show differentiation (good!)")
+                else:
+                    print("  ⚠️ Experts are too similar (potential issue)")
+
+    # Test with random input
+    print("\n🎲 Testing with random input:")
+    with torch.no_grad():
+        test_input = torch.randint(0, config["model"]["vocab_size"], (2, 128)).to(device)
+        output = model(test_input)
+        if isinstance(output, tuple):
+            output = output[0]
+
+        # Check output statistics
+        output_std = output.std().item()
+        output_mean = output.mean().item()
+        print(f"  Output mean: {output_mean:.6f}, std: {output_std:.6f}")
+
+        if output_std > 0.1:
+            print("  ✅ Model produces varied outputs")
+        else:
+            print("  ⚠️ Model outputs might be collapsed")
+
+    return ckpt["step"]
+
+
+if __name__ == "__main__":
+    import json
+
+    # Load config
+    with open("./config.json", "r") as f:
+        config = json.load(f)
+
+    # Run diagnostic
+    current_step = diagnose_checkpoint("./checkpoints/your_latest_checkpoint.pt", config, "cuda")
+    print(f"\n📍 Current step: {current_step}")
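Note: the pairwise loop in the router analysis above caps the comparison at the first five expert rows. torch.corrcoef also accepts a full matrix (rows as variables), so the same health check can be written in one call. A minimal sketch, not part of the commit; the module path in the usage comment is hypothetical:

import torch

def avg_offdiag_correlation(gate_weight: torch.Tensor) -> float:
    """Mean absolute off-diagonal correlation between expert rows of a router weight."""
    corr = torch.corrcoef(gate_weight.detach().float().cpu())  # [n_experts, n_experts]
    n = corr.shape[0]
    off_diag = corr[~torch.eye(n, dtype=torch.bool)]           # drop the diagonal of 1.0s
    return off_diag.abs().mean().item()

# Usage (hypothetical attribute path): values well below 0.9 suggest differentiated experts.
# print(avg_offdiag_correlation(model.layers[0].ffn.gate.weight))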
Model_Architecture/model.py (CHANGED)
@@ -395,33 +395,27 @@ class MoE(nn.Module):
         original_shape = x.size()
         x = x.view(-1, self.dim)
 
-        router_logits =
+        router_logits = linear(x, self.gate.weight, self.gate.bias)
         router_probs = router_logits.sigmoid()
         weights, indices = torch.topk(router_probs, self.n_activated_experts, dim=-1)
 
         # Normalize weights
-        weights = weights / weights.sum(dim=-1, keepdim=True)
+        weights = weights / (weights.sum(dim=-1, keepdim=True) + 1e-8)  # Add epsilon for stability
         weights = weights * self.gate.route_scale
 
-        #
+        # CRITICAL FIX: Check training mode AND active expert
         if self.training and self.active_expert_idx is not None:
+            # Sequential training mode - only train one expert
            y = torch.zeros_like(x)
            i = self.active_expert_idx
 
            # Find tokens where expert i is in the top-k
-           idx = torch.where(mask.any(dim=1))[0]  # token indices
+           mask = (indices == i)
+           idx = torch.where(mask.any(dim=1))[0]
 
            if idx.numel() > 0:
-               # Get weights for expert i
-               expert_weights = weights[idx, top_positions].unsqueeze(-1)  # shape: [num_selected_tokens, 1]
-
-               # Forward pass ONLY for active expert
+               top_positions = torch.argmax(mask[idx].int(), dim=1)
+               expert_weights = weights[idx, top_positions].unsqueeze(-1)
                expert_out = self.experts[i](x[idx])
                y[idx] = expert_out * expert_weights
@@ -430,31 +424,32 @@ class MoE(nn.Module):
 
            # Shared experts
            z = self.shared_experts(x)
            return (y + z).view(original_shape), lb_loss
 
-       for i in range(self.n_routed_experts):
-           mask = (indices == i)
-           idx = torch.where(mask.any(dim=1))[0]
-
-           if idx.numel() == 0:
-               continue
+       else:
+           # Inference mode or all-experts training mode
+           y = torch.zeros_like(x)
+           for i in range(self.n_routed_experts):
+               mask = (indices == i)
+               idx = torch.where(mask.any(dim=1))[0]
+
+               if idx.numel() == 0:
+                   continue
+
+               top_positions = torch.argmax(mask[idx].int(), dim=1)
+               expert_weights = weights[idx, top_positions].unsqueeze(-1)
+               expert_out = self.experts[i](x[idx])
+               y[idx] += expert_out * expert_weights
+
+           z = self.shared_experts(x)
+           output = (y + z).view(original_shape)
+
+           # Only compute load balance loss during training
+           if self.training:
+               lb_loss = self.compute_load_balance_loss(router_probs, indices)
+               return output, lb_loss
+           else:
+               return output, None
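Note: the top_positions gather is the subtle part of the routing fix above. For each token that routed to expert i, argmax over the boolean mask finds which top-k slot holds that expert, so the matching routing weight can be pulled out with advanced indexing. A standalone toy run with assumed shapes, not taken from the repo:

import torch

# Toy routing state: 4 tokens, top-2 experts selected per token.
indices = torch.tensor([[0, 2], [1, 3], [2, 0], [3, 1]])                 # expert ids per slot
weights = torch.tensor([[0.7, 0.3], [0.6, 0.4], [0.8, 0.2], [0.5, 0.5]])

i = 2                                                 # expert under consideration
mask = (indices == i)                                 # [4, 2] bool
idx = torch.where(mask.any(dim=1))[0]                 # tokens that picked expert 2 -> tensor([0, 2])
top_positions = torch.argmax(mask[idx].int(), dim=1)  # slot holding expert 2 -> tensor([1, 0])
expert_weights = weights[idx, top_positions].unsqueeze(-1)
print(expert_weights.squeeze(-1))                     # tensor([0.3000, 0.8000])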
@@ -536,6 +531,7 @@ class ismail(nn.Module):
         h = self.tok_embeddings(tokens).to(Linear.dtype)
         freqs_cis = self.freqs_cis[start_pos:start_pos + seqlen]
 
+        # CRITICAL: Always clear caches at start_pos=0, regardless of training mode
         if start_pos == 0:
             for layer in self.layers:
                 if hasattr(layer.attn, 'kv_cache'):
@@ -545,9 +541,9 @@ class ismail(nn.Module):
 
         mask = None
         if seqlen > 1:
-            mask = torch.full((seqlen, seqlen), float("-inf"), device=tokens.device)
+            mask = torch.full((seqlen, seqlen), float("-inf"), device=tokens.device, dtype=h.dtype)
             mask = torch.triu(mask, diagonal=1)
-            mask = torch.hstack([torch.zeros((seqlen, start_pos), device=tokens.device), mask])
+            mask = torch.hstack([torch.zeros((seqlen, start_pos), device=tokens.device, dtype=h.dtype), mask])
 
         total_lb_loss = 0.0
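Note: the dtype= additions in the mask hunk above matter because h is cast to the model's compute dtype. Without them, torch.full and torch.zeros default to float32, and stacking a float32 block into the attention scores silently promotes the addition out of bf16. A quick illustration of the default behavior:

import torch

seqlen, start_pos = 4, 2

# Default: the mask comes out float32 regardless of the activations' dtype.
m32 = torch.triu(torch.full((seqlen, seqlen), float("-inf")), diagonal=1)
print(torch.hstack([torch.zeros((seqlen, start_pos)), m32]).dtype)  # torch.float32

# Pinning dtype to the activations (here bf16) keeps the attention math in one dtype.
m = torch.triu(torch.full((seqlen, seqlen), float("-inf"), dtype=torch.bfloat16), diagonal=1)
m = torch.hstack([torch.zeros((seqlen, start_pos), dtype=torch.bfloat16), m])
print(m.dtype)  # torch.bfloat16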
@@ -559,7 +555,8 @@ class ismail(nn.Module):
         h = self.norm(h)
         output = self.output(h)
 
+        # FIX: Only return load balance loss during training
         if self.training and total_lb_loss > 0:
             return output, total_lb_loss
+        else:
+            return output
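Note: the MoE else branch above calls self.compute_load_balance_loss(router_probs, indices), whose body is not part of this diff. A common choice for top-k routers is a Switch-Transformer-style auxiliary loss over per-expert load and mean router probability; a sketch under that assumption (the implementation is hypothetical, only the call signature comes from the hunk):

import torch

def compute_load_balance_loss(router_probs: torch.Tensor,
                              indices: torch.Tensor,
                              n_experts: int) -> torch.Tensor:
    """Hypothetical Switch-style auxiliary loss: n_experts * sum(load * importance)."""
    n_tokens = router_probs.shape[0]
    # Fraction of top-k assignments landing on each expert.
    one_hot = torch.zeros(n_tokens, n_experts, device=indices.device)
    one_hot.scatter_(1, indices, 1.0)
    load = one_hot.sum(dim=0) / one_hot.sum()
    # Mean router probability mass per expert.
    importance = router_probs.mean(dim=0)
    return n_experts * torch.sum(load * importance)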
Model_Architecture/train.py (CHANGED)
@@ -264,6 +264,17 @@ def evaluate(model, val_loader, device, config, tokenizer, active_expert=None):
     """
     model.eval()
 
+    # CRITICAL FIX: Store original gradient requirements for experts
+    original_expert_grads = {}
+    for name, param in model.named_parameters():
+        if "experts" in name:
+            original_expert_grads[name] = param.requires_grad
+
+    # Enable gradients for all experts during evaluation
+    for name, param in model.named_parameters():
+        if "experts" in name:
+            param.requires_grad = True
+
     # Clear caches...
     for layer in model.layers:
         if hasattr(layer.attn, 'kv_cache'):
@@ -273,11 +284,18 @@ def evaluate(model, val_loader, device, config, tokenizer, active_expert=None):
 
     # Set expert mode for validation
     if hasattr(model, 'set_active_expert'):
+        # CRITICAL: For validation, temporarily set to None (all experts)
+        # even if we're in sequential training mode
         if active_expert is not None:
             print(f" Validating with ONLY expert {active_expert}")
+            # Store the actual active expert but use all for forward pass
+            validation_expert = active_expert
         else:
             print(f" Validating with ALL experts")
+            validation_expert = None
+
+        # Always use all experts for validation forward pass
+        model.set_active_expert(None)
 
     total_loss = 0.0
     total_tokens = 0
@@ -297,21 +315,9 @@ def evaluate(model, val_loader, device, config, tokenizer, active_expert=None):
         input_ids = input_ids.to(device, non_blocking=True)
         target_ids = target_ids.to(device, non_blocking=True)
 
-        # Decode first 30 tokens (skip padding zeros)
-        non_zero_tokens = [t for t in sample_tokens[:30] if t > 0]
-        try:
-            sample_text = tokenizer.decode(non_zero_tokens)
-            # Truncate if too long
-            if len(sample_text) > 60:
-                sample_text = sample_text[:57] + "..."
-            print(f"\n📝 Sample Turkish text: '{sample_text}'")
-        except Exception as e:
-            print(f"\n⚠️ Decode failed: {e}\n   Tokens: {non_zero_tokens[:10]}...")
-
-        with torch.amp.autocast(device_type='cuda', enabled=(val_dtype == 'bf16')):
+        # CRITICAL: Use proper autocast settings based on dtype
+        use_autocast = val_dtype in ['bf16', 'fp16']
+        with torch.amp.autocast(device_type='cuda', enabled=use_autocast, dtype=torch.bfloat16 if val_dtype == 'bf16' else torch.float16):
             output = model(input_ids, start_pos=0)
             logits = output[0] if isinstance(output, tuple) else output
@@ -328,6 +334,16 @@ def evaluate(model, val_loader, device, config, tokenizer, active_expert=None):
             pbar.set_postfix({'loss': f'{loss.item():.3f}'})
 
     pbar.close()
+
+    # CRITICAL: Restore original gradient requirements
+    for name, param in model.named_parameters():
+        if name in original_expert_grads:
+            param.requires_grad = original_expert_grads[name]
+
+    # Restore the active expert if in sequential training mode
+    if hasattr(model, 'set_active_expert') and 'validation_expert' in locals():
+        model.set_active_expert(validation_expert)
+
     model.train()
 
     final_loss = total_loss / total_tokens
@@ -339,7 +355,6 @@ def evaluate(model, val_loader, device, config, tokenizer, active_expert=None):
 
     return final_loss
 
-
 def save_checkpoint(model, optimizer, step, config, expert_idx=None):
     """Save model checkpoint"""
     save_dir = Path(config["training"]["save_dir"])
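Note: evaluate only duck-types set_active_expert via hasattr; the method itself lives in model.py outside this diff. Given how active_expert_idx is read in the MoE forward, its body is probably a simple broadcast over the MoE layers. A sketch, with the body assumed rather than copied:

# Presumed helper on the model class (names taken from the MoE forward above).
def set_active_expert(self, expert_idx):
    """Select one routed expert for sequential training; None enables all experts."""
    for module in self.modules():
        if hasattr(module, "active_expert_idx"):
            module.active_expert_idx = expert_idx  # hit every MoE block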
@@ -392,7 +407,11 @@ def train_step(model, input_mb, target_mb, device, config, scaler=None):
     input_mb = input_mb.to(device, non_blocking=True)
     target_mb = target_mb.to(device, non_blocking=True)
 
+    training_dtype = config["training"]["dtype"].lower()
+    use_autocast = training_dtype in ['bf16', 'fp16']
+    autocast_dtype = torch.bfloat16 if training_dtype == 'bf16' else torch.float16
+    with torch.amp.autocast(device_type='cuda', enabled=use_autocast, dtype=autocast_dtype if use_autocast else None):
+
        output = model(input_mb, start_pos=0)
 
        if isinstance(output, tuple):
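Note: train_step accepts scaler=None, but the hunk above only shows the autocast entry. Under fp16, the backward pass normally runs through a GradScaler to avoid underflowing gradients (bf16 usually needs none). A minimal sketch of how the caller might wire this up, assuming standard torch.amp usage rather than code from this repo:

import torch

training_dtype = "fp16"  # assumed to come from config["training"]["dtype"]
scaler = torch.amp.GradScaler('cuda', enabled=(training_dtype == 'fp16'))

# Inside the training loop, after train_step returns the micro-batch loss:
# scaler.scale(loss).backward()
# scaler.step(optimizer)
# scaler.update()
# optimizer.zero_grad(set_to_none=True)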
|