ikaganacar committed on
Commit
1c3be6f
·
1 Parent(s): 431091f

Fixes are lies

Browse files
Model_Architecture/config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "model": {
3
- "max_batch_size": 2,
4
  "max_seq_len": 512,
5
  "dtype": "bf16",
6
  "scale_fmt": null,
@@ -37,7 +37,7 @@
37
  "grad_clip": 1.0,
38
  "warmup_steps": 1000,
39
  "total_steps": 100000,
40
- "use_checkpointing": true,
41
  "expert_rotation_steps": 5000,
42
  "gradient_accumulation_steps": 8,
43
  "eval_every": 1000,
@@ -45,7 +45,7 @@
45
  "save_dir": "./checkpoints",
46
  "log_every": 100,
47
  "dtype": "bf16",
48
- "compile": true
49
  },
50
  "data": {
51
  "train_file": "./data/train.txt",
 
1
  {
2
  "model": {
3
+ "max_batch_size": 8,
4
  "max_seq_len": 512,
5
  "dtype": "bf16",
6
  "scale_fmt": null,
 
37
  "grad_clip": 1.0,
38
  "warmup_steps": 1000,
39
  "total_steps": 100000,
40
+ "use_checkpointing": false,
41
  "expert_rotation_steps": 5000,
42
  "gradient_accumulation_steps": 8,
43
  "eval_every": 1000,
 
45
  "save_dir": "./checkpoints",
46
  "log_every": 100,
47
  "dtype": "bf16",
48
+ "compile": false
49
  },
50
  "data": {
51
  "train_file": "./data/train.txt",
Model_Architecture/model.py CHANGED
@@ -520,7 +520,6 @@ class ismail(nn.Module):
520
  h = self.tok_embeddings(tokens).to(Linear.dtype)
521
  freqs_cis = self.freqs_cis[start_pos:start_pos + seqlen]
522
 
523
- # Create causal mask
524
  mask = None
525
  if seqlen > 1:
526
  mask = torch.full((seqlen, seqlen), float("-inf"), device=tokens.device)
@@ -528,25 +527,16 @@ class ismail(nn.Module):
528
  mask = torch.hstack([torch.zeros((seqlen, start_pos), device=tokens.device), mask]).type_as(h)
529
 
530
  total_lb_loss = 0.0
531
-
 
532
  for layer in self.layers:
533
- layer.start_pos = start_pos
534
- layer.freqs_cis = freqs_cis
535
- layer.mask = mask
536
-
537
- if self.training and self.use_checkpointing:
538
- from torch.utils.checkpoint import checkpoint
539
- h, lb_loss = checkpoint(layer.checkpoint_forward, h, use_reentrant=False )
540
- else:
541
- h, lb_loss = layer(h, start_pos, freqs_cis, mask)
542
-
543
  if lb_loss is not None:
544
  total_lb_loss += lb_loss
545
 
546
  h = self.norm(h)
547
  output = self.output(h)
548
 
549
- # Return output and total load balancing loss if training
550
  if self.training and total_lb_loss > 0:
551
  return output, total_lb_loss
552
  return output
 
520
  h = self.tok_embeddings(tokens).to(Linear.dtype)
521
  freqs_cis = self.freqs_cis[start_pos:start_pos + seqlen]
522
 
 
523
  mask = None
524
  if seqlen > 1:
525
  mask = torch.full((seqlen, seqlen), float("-inf"), device=tokens.device)
 
527
  mask = torch.hstack([torch.zeros((seqlen, start_pos), device=tokens.device), mask]).type_as(h)
528
 
529
  total_lb_loss = 0.0
530
+
531
+ # ✅ SIMPLE forward pass - no checkpointing
532
  for layer in self.layers:
533
+ h, lb_loss = layer(h, start_pos, freqs_cis, mask)
 
 
 
 
 
 
 
 
 
534
  if lb_loss is not None:
535
  total_lb_loss += lb_loss
536
 
537
  h = self.norm(h)
538
  output = self.output(h)
539
 
 
540
  if self.training and total_lb_loss > 0:
541
  return output, total_lb_loss
542
  return output
Model_Architecture/train.py CHANGED
@@ -13,12 +13,10 @@ from pathlib import Path
13
  import json
14
  import time
15
  import math
16
- from torch.utils.checkpoint import checkpoint
17
 
18
 
19
  # Import your model
20
  from model import ismail, ModelArgs
21
- from model_size import estimate_model_size
22
 
23
  # Try to import optional dependencies
24
  try:
 
13
  import json
14
  import time
15
  import math
 
16
 
17
 
18
  # Import your model
19
  from model import ismail, ModelArgs
 
20
 
21
  # Try to import optional dependencies
22
  try: