Update supernova/train.py
supernova/train.py CHANGED (+24 -30)
@@ -15,11 +15,11 @@ from transformers import get_cosine_schedule_with_warmup
 from .config import ModelConfig
 from .model import SupernovaModel
 from .tokenizer import load_gpt2_tokenizer
-from .data import load_sources_from_yaml, TokenChunkDataset
+from .data import load_sources_from_yaml, TokenChunkDataset, DataSource
 
-#
+# ------------------------------
 # Utilities
-#
+# ------------------------------
 def compute_grad_norm(model: nn.Module) -> float:
     total = 0.0
     for p in model.parameters():
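Note: this hunk cuts off inside compute_grad_norm. For context, a global gradient-norm helper in this style usually finishes as in the sketch below; everything after the visible for loop is an assumption, not part of the diff.

import torch
import torch.nn as nn

def compute_grad_norm(model: nn.Module) -> float:
    # Global L2 norm across all parameter gradients (sketch).
    total = 0.0
    for p in model.parameters():
        if p.grad is not None:
            # accumulate the squared L2 norm of each gradient tensor
            total += float(p.grad.detach().norm(2)) ** 2
    return total ** 0.5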
@@ -61,9 +61,9 @@ class EMA:
                 p.data.copy_(self.backup[name])
         del self.backup
 
-#
+# ------------------------------
 # Training loop
-#
+# ------------------------------
 def train(
     config_path: str,
     data_config_path: str,
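Only the tail of EMA.restore is visible above. A minimal EMA class consistent with that backup/copy_ pattern is sketched below; the decay default and the shadow/apply_shadow/update names are illustrative assumptions, since the rest of the class sits outside this diff.

import torch
import torch.nn as nn

class EMA:
    # Exponential moving average of model weights (sketch).
    def __init__(self, model: nn.Module, decay: float = 0.999):
        self.model = model
        self.decay = decay
        self.shadow = {n: p.detach().clone()
                       for n, p in model.named_parameters() if p.requires_grad}

    @torch.no_grad()
    def update(self):
        # shadow <- decay * shadow + (1 - decay) * param
        for n, p in self.model.named_parameters():
            if n in self.shadow:
                self.shadow[n].mul_(self.decay).add_(p.detach(), alpha=1.0 - self.decay)

    def apply_shadow(self):
        # stash the live weights, then swap in the averaged ones for eval
        self.backup = {n: p.detach().clone()
                       for n, p in self.model.named_parameters() if n in self.shadow}
        for n, p in self.model.named_parameters():
            if n in self.shadow:
                p.data.copy_(self.shadow[n])

    def restore(self):
        # matches the two lines visible in this hunk
        for name, p in self.model.named_parameters():
            if name in self.backup:
                p.data.copy_(self.backup[name])
        del self.backup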
@@ -153,7 +153,7 @@ def train(
         drop_last=True,
     )
 
-    # optimizer with simple parameter grouping
+    # optimizer with simple parameter grouping to avoid weight decay on norms/bias
    def param_groups(model):
         decay, no_decay = [], []
         for n, p in model.named_parameters():
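The reworded comment states the intent: biases and normalization parameters should not receive weight decay. Whether train.py splits by parameter name or by dimensionality is not visible here; this sketch uses the common ndim heuristic, and the weight_decay default is illustrative.

import torch
import torch.nn as nn

def param_groups(model: nn.Module, weight_decay: float = 0.1):
    decay, no_decay = [], []
    for n, p in model.named_parameters():
        if not p.requires_grad:
            continue
        # 1-D tensors are biases and norm scales; keep them undecayed
        (no_decay if p.ndim < 2 else decay).append(p)
    return [
        {"params": decay, "weight_decay": weight_decay},
        {"params": no_decay, "weight_decay": 0.0},
    ]

# usage: opt = torch.optim.AdamW(param_groups(model), lr=3e-4)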
@@ -213,12 +213,12 @@ def train(
     running_loss = 0.0
     t0 = time.time()
     no_improve_steps = 0
-    early_stop_patience = 10_000
+    early_stop_patience = 10_000  # you can tune this
 
     # training loop
     while step < max_steps:
         if sampler is not None:
-            sampler.set_epoch(step)
+            sampler.set_epoch(step)  # shuffle differently per epoch for DDP
 
         for batch in dl:
             x, y = batch
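The new set_epoch comment deserves one clarification: DistributedSampler derives its shuffle permutation from whatever integer it is handed, so passing the changing step value (rather than a true epoch counter) still produces a fresh order on each outer iteration. A self-contained sketch of the pattern, runnable without DDP thanks to the sampler is None fallback:

import torch
import torch.distributed as dist
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

ds = TensorDataset(torch.arange(1024))

# DistributedSampler only makes sense once the process group exists
sampler = DistributedSampler(ds) if dist.is_available() and dist.is_initialized() else None
dl = DataLoader(ds, batch_size=8, sampler=sampler, shuffle=(sampler is None))

step, max_steps = 0, 2
while step < max_steps:
    if sampler is not None:
        # any changing integer reseeds the shuffle; the diff uses `step`
        sampler.set_epoch(step)
    for (x,) in dl:
        pass
    step += 1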
@@ -266,26 +266,21 @@ def train(
             # periodic validation
             if validate_every and step % validate_every == 0:
                 if val_dl is None:
-                    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    val_sources.append(val_source)
-                    val_ds = TokenChunkDataset(val_sources, seq_len=seq_len, eos_token_id=tok.eos_token_id)
-                    val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True, drop_last=False)
-
-
+                    # Create a proper validation dataset with a small subset of training sources
+                    val_sources = []
+                    for source in sources[:min(3, len(sources))]:
+                        val_source = DataSource(
+                            name=f"{source.name}_val",
+                            hf_path="wikitext",  # Use a reliable, small dataset for validation
+                            hf_name="wikitext-2-v1",
+                            split="validation",
+                            text_field="text",
+                            weight=1,
+                            streaming=False  # Don't stream validation data
+                        )
+                        val_sources.append(val_source)
+                    val_ds = TokenChunkDataset(val_sources, seq_len=seq_len, eos_token_id=tok.eos_token_id)
+                    val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True, drop_last=False)
 
                 model.eval()
                 # optionally swap in EMA weights for evaluation
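Note that the added loop builds up to three copies of the same wikitext-2 validation split, since hf_path is hard-coded; only the names differ. Once val_dl exists, the evaluation step that follows (model.eval() plus the optional EMA swap) typically looks like this sketch; the model(x, targets=y) -> (logits, loss) signature and the apply_shadow/restore names are assumptions carried over from the EMA sketch above.

import math
import torch

@torch.no_grad()
def evaluate(model, val_dl, device, ema=None):
    if ema is not None:
        ema.apply_shadow()          # evaluate with the averaged weights
    model.eval()
    total, batches = 0.0, 0
    for x, y in val_dl:
        x, y = x.to(device), y.to(device)
        _, loss = model(x, targets=y)   # assumed signature
        total += loss.item()
        batches += 1
    model.train()
    if ema is not None:
        ema.restore()               # put the live weights back
    avg = total / max(batches, 1)
    return avg, math.exp(avg)       # mean loss and perplexity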
@@ -329,7 +324,7 @@ def train(
                     }
                     if not ddp or local_rank == 0:
                         atomic_save(ckpt, best_path)
-
+                        print(f"Saved best checkpoint to {best_path}")
                 else:
                     no_improve_steps += validate_every
                     if no_improve_steps >= early_stop_patience:
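atomic_save is defined elsewhere in the file. The conventional implementation, assumed here, writes to a temporary file in the same directory and then renames it over the target, so a crash mid-write never leaves a corrupt best checkpoint:

import os
import tempfile
import torch

def atomic_save(obj, path: str) -> None:
    # Write the checkpoint next to its destination, then rename atomically.
    d = os.path.dirname(path) or "."
    fd, tmp = tempfile.mkstemp(dir=d, suffix=".tmp")
    try:
        with os.fdopen(fd, "wb") as f:
            torch.save(obj, f)
            f.flush()
            os.fsync(f.fileno())    # make sure the bytes hit the disk
        os.replace(tmp, path)       # atomic on POSIX within one filesystem
    except BaseException:
        os.unlink(tmp)
        raise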
@@ -376,7 +371,6 @@
     if writer:
         writer.close()
 
-
 if __name__ == "__main__":
     ap = argparse.ArgumentParser()
     ap.add_argument("--config", required=True)
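The entrypoint is only partially visible. Judging from train()'s signature earlier in the diff, the main block presumably continues along these lines; the --data-config flag and the final call are assumptions for illustration.

import argparse

if __name__ == "__main__":
    ap = argparse.ArgumentParser()
    ap.add_argument("--config", required=True)
    # assumed: a second flag feeding train()'s data_config_path parameter
    ap.add_argument("--data-config", required=True)
    args = ap.parse_args()
    train(args.config, args.data_config)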