Update supernova/train.py

supernova/train.py (CHANGED: +23 −28)

This commit drops redundant inline comments, completes the safetensors helper as save_safetensors_checkpoint, adds an export_safetensors flag to train(), and expands the best-checkpoint payload from bare model weights to full training state (optimizer, scheduler, scaler, step, best loss, config).
@@ -31,7 +31,7 @@ def compute_grad_norm(model: nn.Module, debug: bool = False) -> float:
             grad_count += 1
             param_norm = p.grad.data.float().norm(2).item()
             total += param_norm * param_norm
-            if debug and param_norm > 1e-8:
+            if debug and param_norm > 1e-8:
                 print(f"  {name}: grad_norm={param_norm:.6f}")
         elif debug:
             print(f"  {name}: NO GRAD")
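(The removed and re-added `if debug ...` lines render identically in this view, so the change is most likely whitespace-only.) Since the hunk shows only the middle of `compute_grad_norm`, here is a self-contained sketch of how the full function plausibly reads; the enclosing loop, the `p.grad is not None` guard, and the final square root are inferred, not taken from the commit:

```python
import torch.nn as nn

def compute_grad_norm(model: nn.Module, debug: bool = False) -> float:
    """Return the global L2 norm over all parameter gradients (sketch)."""
    total = 0.0
    grad_count = 0
    for name, p in model.named_parameters():
        if p.grad is not None:
            grad_count += 1
            param_norm = p.grad.data.float().norm(2).item()
            total += param_norm * param_norm
            if debug and param_norm > 1e-8:
                print(f"  {name}: grad_norm={param_norm:.6f}")
        elif debug:
            print(f"  {name}: NO GRAD")
    # Global norm = sqrt of the summed per-parameter squared norms.
    return total ** 0.5
```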
@@ -46,13 +46,13 @@ def atomic_save(obj: Dict[str, Any], path: str):
     torch.save(obj, tmp)
     os.replace(tmp, path)
 
-def 
+def save_safetensors_checkpoint(model_state_dict: Dict[str, torch.Tensor], path: str):
     """Save model weights in safetensors format."""
     try:
         tmp = path + ".tmp"
         save_file(model_state_dict, tmp)
         os.replace(tmp, path)
-        print(f"Saved safetensors to {path}")
+        print(f"✓ Saved safetensors to {path}")
     except Exception as e:
         print(f"Warning: Failed to save safetensors: {e}")
 
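(The removed signature on old line 49 is cut off in this rendering; only `def` survives.) As a usage sketch for the completed helper: `safetensors.torch.save_file` stores a flat dict of tensors, and `load_file` returns one that `load_state_dict` accepts. The file name and stand-in model below are illustrative only:

```python
import torch.nn as nn
from safetensors.torch import load_file, save_file

model = nn.Linear(8, 8)  # stand-in for SupernovaModel

# safetensors stores a flat {name: tensor} dict, so pass the state_dict
# itself, not a full checkpoint that also carries optimizer state.
save_file({k: v.contiguous() for k, v in model.state_dict().items()},
          "model.safetensors")

# load_file returns a plain dict of tensors that load_state_dict accepts.
model.load_state_dict(load_file("model.safetensors"))
```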
@@ -111,14 +111,13 @@ def train(
     num_workers: int = 4,
     pin_memory: bool = True,
     compile_model: bool = False,
-
+    export_safetensors: bool = True,
 ):
     # reproducibility
     torch.manual_seed(seed)
     torch.cuda.manual_seed_all(seed)
     import random
     random.seed(seed)
-    # performance flags
     torch.backends.cudnn.benchmark = True
 
     # device / distributed
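Two notes on the reproducibility block: the seeding covers `torch` and `random` but not NumPy, and `cudnn.benchmark = True` trades bitwise determinism for speed. If strict reproducibility were ever needed, a fuller helper might look like this (a sketch, not part of this commit):

```python
import random

import numpy as np
import torch

def set_seed(seed: int, deterministic: bool = False) -> None:
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    if deterministic:
        # Deterministic cuDNN kernels are slower; benchmark must be off.
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
```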
@@ -136,7 +135,6 @@ def train(
     assert tok.vocab_size == cfg.vocab_size, "Tokenizer vocab size mismatch."
 
     model = SupernovaModel(cfg)
-    # optional: enable gradient checkpointing for memory saving if model supports it
     if hasattr(model, "gradient_checkpointing_enable"):
         try:
             model.gradient_checkpointing_enable()
@@ -145,24 +143,19 @@ def train(
 
     model.to(device)
 
-    # double-check params
     total_params = sum(p.numel() for p in model.parameters())
     assert total_params == 25_000_000, f"Model has {total_params} params, expected 25,000,000"
 
-    # optional compile (PyTorch 2.0)
     if compile_model:
         try:
             model = torch.compile(model)
         except Exception as e:
             print("torch.compile not available/failed:", e)
 
-    # DDP wrap
     if ddp:
         model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank], find_unused_parameters=False)
 
-    # dataset and dataloader
     sources = load_sources_from_yaml(data_config_path)
-    # TODO: improve TokenChunkDataset to perform token-packing (pack multiple short examples into one sequence)
     ds = TokenChunkDataset(
         tokenizer=tok,
         sources=sources,
@@ -171,7 +164,6 @@ def train(
     )
     sampler = DistributedSampler(ds) if ddp else None
 
-    # NOTE: NO shuffle for IterableDataset!
     dl = DataLoader(
         ds,
         batch_size=batch_size,
@@ -182,7 +174,6 @@ def train(
         drop_last=True,
     )
 
-    # optimizer
     def param_groups(model):
         decay, no_decay = [], []
         for n, p in model.named_parameters():
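`param_groups` is cut off here after three lines. The usual shape of such a decay/no-decay split is shown below; the exact exclusion rule (biases and 1-D norm weights) is an assumption about this repo's version:

```python
import torch.nn as nn

def param_groups(model: nn.Module, weight_decay: float = 0.1):
    decay, no_decay = [], []
    for n, p in model.named_parameters():
        if not p.requires_grad:
            continue
        # Biases and 1-D tensors (LayerNorm/RMSNorm weights) are
        # conventionally exempt from weight decay.
        if p.ndim < 2 or n.endswith("bias"):
            no_decay.append(p)
        else:
            decay.append(p)
    return [
        {"params": decay, "weight_decay": weight_decay},
        {"params": no_decay, "weight_decay": 0.0},
    ]
```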
@@ -199,20 +190,16 @@ def train(
 
     optimizer = torch.optim.AdamW(param_groups(model), lr=lr, betas=(0.9, 0.95), eps=1e-8)
     scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=max_steps)
-    # AMP scaler
     scaler = torch.cuda.amp.GradScaler(enabled=(device.type == "cuda"))
 
-    # EMA
     ema = EMA(model if not ddp else model.module, decay=ema_decay) if use_ema else None
 
     os.makedirs(out_dir, exist_ok=True)
     writer = SummaryWriter(log_dir=os.path.join(out_dir, "runs")) if use_tensorboard and (not ddp or local_rank == 0) else None
 
-    # validation
     val_ds = None
     val_dl = None
 
-    # resume
     start_step = 0
     best_val_loss = float("inf")
     if resume_from and os.path.exists(resume_from):
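`EMA` is imported from elsewhere in the repo and is not shown in this diff. For orientation, a minimal sketch of what such a helper typically does, with all names and details assumed:

```python
import torch
import torch.nn as nn

class EMA:
    """Exponential moving average of model parameters (sketch)."""

    def __init__(self, model: nn.Module, decay: float = 0.999):
        self.decay = decay
        self.shadow = {n: p.detach().clone()
                       for n, p in model.named_parameters() if p.requires_grad}

    @torch.no_grad()
    def update(self, model: nn.Module):
        for n, p in model.named_parameters():
            if n in self.shadow:
                # shadow <- decay * shadow + (1 - decay) * param
                self.shadow[n].mul_(self.decay).add_(p, alpha=1.0 - self.decay)
```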
@@ -236,12 +223,11 @@ def train(
     running_loss = 0.0
     t0 = time.time()
     no_improve_steps = 0
-    early_stop_patience = 10_000
+    early_stop_patience = 10_000
 
-    # training loop
     while step < max_steps:
         if sampler is not None:
-            sampler.set_epoch(step)
+            sampler.set_epoch(step)
 
         for batch in dl:
             x, y = batch
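(The `-`/`+` pairs for `early_stop_patience` and `sampler.set_epoch` also render identically here, so those are likely indentation-only changes.) One caveat on the loop itself: `DistributedSampler.set_epoch` takes an epoch index and exists so that every rank reshuffles consistently once per epoch; calling it with `step` still works but reshuffles on every pass over the loader. The conventional pattern, schematically:

```python
for epoch in range(num_epochs):
    # All ranks derive their shuffle order from (seed, epoch), so the
    # order stays consistent across ranks and changes across epochs.
    sampler.set_epoch(epoch)
    for batch in dl:
        ...
```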
@@ -262,10 +248,8 @@ def train(
             scaler.unscale_(optimizer)
             torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad_norm)
 
-            # Compute gradient norm BEFORE clearing gradients (only when needed for logging)
             grad_norm = None
             if (step + 1) % 50 == 0 and (not ddp or local_rank == 0):
-                # Enable debug mode for first few steps to diagnose gradient issues
                 debug_gradients = step < 5
                 grad_norm = compute_grad_norm(model if not ddp else model.module, debug=debug_gradients)
 
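For context, this hunk sits inside the standard `GradScaler` gradient-accumulation recipe: scale and backprop each micro-batch, then unscale, clip, step, and update once per window. A schematic reconstruction (the forward/loss call is a placeholder, `micro_step` is a hypothetical counter, and only the `unscale_`/clip lines are verbatim from the diff):

```python
with torch.autocast(device_type="cuda", dtype=torch.float16):
    loss = compute_loss(model, x, y) / grad_accum  # placeholder loss fn

scaler.scale(loss).backward()

if (micro_step + 1) % grad_accum == 0:
    scaler.unscale_(optimizer)  # make grads real-valued for clipping/inspection
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad_norm)
    scaler.step(optimizer)      # skips the step if any grad overflowed
    scaler.update()
    optimizer.zero_grad(set_to_none=True)
```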
@@ -278,7 +262,6 @@ def train(
             ema.update(model if not ddp else model.module)
             step += 1
 
-            # logging
             if step % 50 == 0 and (not ddp or local_rank == 0) and grad_norm is not None:
                 avg_loss = running_loss * grad_accum / 50.0
                 running_loss = 0.0
@@ -291,11 +274,8 @@ def train(
                 writer.add_scalar("train/lr", lr_now, step)
                 t0 = time.time()
 
-            # periodic validation
             if validate_every and step % validate_every == 0:
                 if val_dl is None:
-                    # Use a proper validation dataset with wikitext-2 validation split
-                    # This provides more reliable validation than using training data subsets
                     val_sources = []
                     for source in sources[:min(3, len(sources))]:
                         val_source = DataSource(
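The validation setup is truncated at `DataSource(`. Once `val_dl` exists, an evaluation loop of the kind this code implies might look like the following sketch (`compute_loss` is a placeholder for the repo's loss function; the batch cap is an assumption):

```python
import torch

@torch.no_grad()
def evaluate(model, val_dl, device, max_batches: int = 50) -> float:
    model.eval()
    losses = []
    for i, (x, y) in enumerate(val_dl):
        if i >= max_batches:  # cap evaluation cost
            break
        x, y = x.to(device), y.to(device)
        losses.append(compute_loss(model, x, y).item())
    model.train()
    return sum(losses) / max(len(losses), 1)
```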
@@ -344,7 +324,22 @@ def train(
                 if mean_val < best_val_loss:
                     best_val_loss = mean_val
                     no_improve_steps = 0
-
+                    best_path_pt = os.path.join(out_dir, f"supernova_best_step{step}.pt")
                     model_state = model.module.state_dict() if ddp else model.state_dict()
                     ckpt = {
-                        "model_state_dict": model_state
+                        "model_state_dict": model_state,
+                        "optimizer_state_dict": optimizer.state_dict(),
+                        "scheduler_state_dict": scheduler.state_dict(),
+                        "scaler_state_dict": (scaler.state_dict() if scaler else None),
+                        "step": step,
+                        "best_val_loss": best_val_loss,
+                        "config": cfg.__dict__,
+                    }
+                    if not ddp or local_rank == 0:
+                        atomic_save(ckpt, best_path_pt)
+                        print(f"Saved best checkpoint to {best_path_pt}")
+
+                    # Save safetensors
+                    if export_safetensors:
+                        best_path_st = os.path.join(out_dir, f"supernova_best_step{step}.safetensors")
+                        save_safetensors_checkpoint(
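(The diff view is cut off mid-call to `save_safetensors_checkpoint(`.) The richer checkpoint dict is what lets the `resume_from` branch earlier in `train` restore full training state. A loading sketch consistent with the keys above (hypothetical, not the committed code):

```python
import torch

ckpt = torch.load(resume_from, map_location=device)
(model.module if ddp else model).load_state_dict(ckpt["model_state_dict"])
optimizer.load_state_dict(ckpt["optimizer_state_dict"])
scheduler.load_state_dict(ckpt["scheduler_state_dict"])
if scaler is not None and ckpt.get("scaler_state_dict") is not None:
    scaler.load_state_dict(ckpt["scaler_state_dict"])
start_step = ckpt["step"]
best_val_loss = ckpt["best_val_loss"]
```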