AbstractPhil committed on
Commit
cb977ae
·
verified ·
1 Parent(s): f512b47

Update trainer_v2.py

Browse files
Files changed (1) hide show
  1. trainer_v2.py +77 -44
trainer_v2.py CHANGED
@@ -189,7 +189,7 @@ class DanbooruTrainingConfig:
189
  use_gradient_checkpointing=self.use_gradient_checkpointing,
190
  share_scale_embeddings=self.share_scale_embeddings,
191
  geometric_init_method="hybrid",
192
- geometric_init_validate=False,
193
  geometric_init_seed=42
194
  )
195
 
@@ -199,7 +199,7 @@ class DanbooruTrainingConfig:
199
  # ============================================================================
200
 
201
  class CheckpointManager:
202
- """Manages checkpoints with proper naming (no step in directory name)."""
203
 
204
  def __init__(
205
  self,
@@ -210,11 +210,16 @@ class CheckpointManager:
210
  ):
211
  self.local_dir = Path(local_dir)
212
  self.hf_repo_id = hf_repo_id
213
- self.sub_name = sub_name
 
 
 
 
 
214
  self.hf_private = hf_private
215
 
216
- # Checkpoint directory structure: checkpoints/{sub_name}/{timestamp}/
217
- self.sub_checkpoint_dir = self.local_dir / sub_name
218
  self.sub_checkpoint_dir.mkdir(parents=True, exist_ok=True)
219
 
220
  self.checkpoints_file = self.sub_checkpoint_dir / "checkpoints.json"
@@ -242,6 +247,7 @@ class CheckpointManager:
242
  return json.load(f)
243
  return {
244
  "sub_name": self.sub_name,
 
245
  "checkpoints": [],
246
  "latest": None,
247
  "best": None
@@ -252,9 +258,8 @@ class CheckpointManager:
252
  json.dump(self.checkpoint_history, f, indent=2)
253
 
254
  def get_checkpoint_dir(self, step: int, epoch: int) -> Path:
255
- """Generate checkpoint directory name (timestamp-based, step in metadata)."""
256
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
257
- dirname = f"epoch{epoch}_step{step}_{timestamp}"
258
  return self.sub_checkpoint_dir / dirname
259
 
260
  def _safe_state_dict(self, model: nn.Module) -> Dict[str, torch.Tensor]:
@@ -311,11 +316,12 @@ class CheckpointManager:
311
  ckpt_dir.mkdir(parents=True, exist_ok=True)
312
 
313
  print(f"\n💾 Saving checkpoint: {self.sub_name}/{ckpt_dir.name}")
 
314
 
315
  state_dict = self._safe_state_dict(model)
316
  weights_path = ckpt_dir / "model.safetensors"
317
  save_file(state_dict, weights_path)
318
- print(f" ✓ Model weights: {weights_path.name}")
319
 
320
  training_state = {
321
  'epoch': epoch,
@@ -323,7 +329,8 @@ class CheckpointManager:
323
  'optimizer_state_dict': optimizer.state_dict(),
324
  'scheduler_state_dict': scheduler.state_dict() if scheduler else None,
325
  'val_loss': val_loss,
326
- 'sub_name': self.sub_name
 
327
  }
328
  torch.save(training_state, ckpt_dir / "training_state.pt")
329
  print(f" ✓ Training state: training_state.pt")
@@ -393,7 +400,7 @@ class CheckpointManager:
393
  traceback.print_exc()
394
 
395
  def find_latest_checkpoint(self) -> Optional[Dict]:
396
- """Find the latest checkpoint for this sub_name."""
397
  checkpoints = self.checkpoint_history.get('checkpoints', [])
398
  if checkpoints:
399
  return max(checkpoints, key=lambda x: x['step'])
@@ -409,7 +416,7 @@ class CheckpointManager:
409
  latest = self.find_latest_checkpoint()
410
 
411
  if not latest:
412
- print(f"ℹ️ No previous checkpoint found for sub_name='{self.sub_name}'")
413
  return 0, 0, float('inf')
414
 
415
  ckpt_dir = self.sub_checkpoint_dir / latest['dirname']
@@ -437,7 +444,7 @@ class CheckpointManager:
437
  print(f" ⚠️ Checkpoint directory not found: {ckpt_dir}")
438
  return 0, 0, float('inf')
439
 
440
- print(f"\n🔄 Resuming from checkpoint: {latest['dirname']}")
441
  print(f" Step: {latest['step']}, Epoch: {latest['epoch']}, Val Loss: {latest['val_loss']:.4f}")
442
 
443
  weights_path = ckpt_dir / "model.safetensors"
@@ -1117,18 +1124,30 @@ class DanbooruLiminalStaircaseTrainer:
1117
  return
1118
 
1119
  for key, value in metrics.items():
1120
- self.writer.add_scalar(f"{prefix}/{key}", value, self.global_step)
 
 
 
 
 
 
 
 
 
 
 
1121
 
1122
- current_lr = self.optimizer.param_groups[0]['lr']
1123
- self.writer.add_scalar("train/learning_rate", current_lr, self.global_step)
1124
 
1125
- # Log text modality stats
1126
- if self.global_step % self.config.log_every == 0:
1127
  total = sum(self.text_dropout_stats.values()) or 1
1128
  for mode, count in self.text_dropout_stats.items():
1129
  self.writer.add_scalar(f"text_modality/{mode}_pct", 100 * count / total, self.global_step)
1130
 
1131
- if self.global_step % (self.config.log_every * 10) == 0:
 
1132
  fusion_diag = self.get_fusion_diagnostics()
1133
 
1134
  for i, w in enumerate(fusion_diag.get('layer_weights', [])):
@@ -1142,6 +1161,8 @@ class DanbooruLiminalStaircaseTrainer:
1142
 
1143
  for i, b in enumerate(fusion_diag.get('beta_per_scale', [])):
1144
  self.writer.add_scalar(f"fusion/beta_scale_{i}", b, self.global_step)
 
 
1145
 
1146
  @torch.no_grad()
1147
  def validate(self, max_batches: int = 100) -> Dict[str, float]:
@@ -1197,14 +1218,20 @@ class DanbooruLiminalStaircaseTrainer:
1197
  continue
1198
 
1199
  if stats_with_text['count'] == 0 or stats_vision_only['count'] == 0:
1200
- return {'loss/val': float('inf'), 'acc/val': 0.0}
 
 
 
 
 
 
 
1201
 
1202
  return {
1203
- 'loss/val_with_text': stats_with_text['loss'] / stats_with_text['count'],
1204
- 'acc/val_with_text': stats_with_text['acc'] / stats_with_text['count'],
1205
- 'loss/val_vision_only': stats_vision_only['loss'] / stats_vision_only['count'],
1206
- 'acc/val_vision_only': stats_vision_only['acc'] / stats_vision_only['count'],
1207
- # Overall metric = vision-only (the real use case)
1208
  'loss/val': stats_vision_only['loss'] / stats_vision_only['count'],
1209
  'acc/val': stats_vision_only['acc'] / stats_vision_only['count'],
1210
  }
@@ -1212,7 +1239,14 @@ class DanbooruLiminalStaircaseTrainer:
1212
  except Exception as e:
1213
  print(f"\n⚠️ Validation completely failed: {e}")
1214
  traceback.print_exc()
1215
- return {'loss/val': float('inf'), 'acc/val': 0.0}
 
 
 
 
 
 
 
1216
 
1217
  def save_checkpoint_and_upload(self, epoch: int, val_loss: float = float('inf'), is_best: bool = False):
1218
  """Save checkpoint first, then optionally upload."""
@@ -1294,8 +1328,8 @@ class DanbooruLiminalStaircaseTrainer:
1294
  print("\n🔍 Running validation...")
1295
  val_metrics = self.validate(max_batches=50)
1296
  self.log_metrics(val_metrics, prefix="val")
1297
- print(f"✓ Val (with text) - Loss: {val_metrics.get('loss/val_with_text', 0):.4f}, Acc: {val_metrics.get('acc/val_with_text', 0):.4f}")
1298
- print(f"✓ Val (vision-only) - Loss: {val_metrics.get('loss/val_vision_only', 0):.4f}, Acc: {val_metrics.get('acc/val_vision_only', 0):.4f}")
1299
 
1300
  # HuggingFace upload
1301
  if (self.config.hf_repo_id and
@@ -1305,8 +1339,8 @@ class DanbooruLiminalStaircaseTrainer:
1305
  if self.accelerator.is_main_process:
1306
  print("\n🔍 Running validation for upload...")
1307
  val_metrics = self.validate(max_batches=50)
1308
- print(f"✓ Val (with text) - Loss: {val_metrics.get('loss/val_with_text', 0):.4f}, Acc: {val_metrics.get('acc/val_with_text', 0):.4f}")
1309
- print(f"✓ Val (vision-only) - Loss: {val_metrics.get('loss/val_vision_only', 0):.4f}, Acc: {val_metrics.get('acc/val_vision_only', 0):.4f}")
1310
 
1311
  if self._interrupt_received:
1312
  break
@@ -1320,11 +1354,11 @@ class DanbooruLiminalStaircaseTrainer:
1320
 
1321
  print(f"\n📊 Validation Results:")
1322
  print(f" With Text:")
1323
- print(f" Loss: {val_metrics.get('loss/val_with_text', 0):.4f}")
1324
- print(f" Acc: {val_metrics.get('acc/val_with_text', 0):.4f}")
1325
  print(f" Vision-Only (PRIMARY METRIC):")
1326
- print(f" Loss: {val_metrics.get('loss/val_vision_only', 0):.4f}")
1327
- print(f" Acc: {val_metrics.get('acc/val_vision_only', 0):.4f}")
1328
 
1329
  self.log_metrics(val_metrics, prefix="val")
1330
 
@@ -1369,7 +1403,6 @@ class DanbooruLiminalStaircaseTrainer:
1369
  if self.writer:
1370
  self.writer.close()
1371
 
1372
-
1373
  # ============================================================================
1374
  # MAIN
1375
  # ============================================================================
@@ -1377,16 +1410,16 @@ class DanbooruLiminalStaircaseTrainer:
1377
  if __name__ == "__main__":
1378
  config = DanbooruTrainingConfig(
1379
  # Run identifier
1380
- sub_name="danbooru-50k-v1-512",
1381
 
1382
  # Model architecture
1383
  num_opinion_anchors=225,
1384
- pentachoron_dim=256,
1385
  scales=[128, 256, 512, 1024],
1386
- scale_hidden_dims={128: 128, 256: 512, 512: 1024, 1024: 2048},
1387
 
1388
  # Fusion controller
1389
- alpha_init=0.1,
1390
  alpha_learnable=True,
1391
  beta_init=0.5,
1392
  beta_learnable=True,
@@ -1394,16 +1427,16 @@ if __name__ == "__main__":
1394
  learn_layer_weights=True,
1395
 
1396
  # Encoders
1397
- clip_skip=0,
1398
- siglip_layer_indices=[3, 6, 9, 12, 21, 23, 24, 25, 26],
1399
 
1400
  # Optimizations
1401
  use_gradient_checkpointing=False,
1402
  share_scale_embeddings=False,
1403
 
1404
  # Training
1405
- batch_size=32,
1406
- num_epochs=3,
1407
  learning_rate=1e-4,
1408
  save_every=500,
1409
 
@@ -1417,7 +1450,7 @@ if __name__ == "__main__":
1417
  text_dropout_end=0.5,
1418
 
1419
  # Resume
1420
- resume=True,
1421
 
1422
  # HuggingFace
1423
  hf_repo_id="AbstractPhil/liminal-staircase-v2",
 
189
  use_gradient_checkpointing=self.use_gradient_checkpointing,
190
  share_scale_embeddings=self.share_scale_embeddings,
191
  geometric_init_method="hybrid",
192
+ geometric_init_validate=True,
193
  geometric_init_seed=42
194
  )
195
 
 
199
  # ============================================================================
200
 
201
  class CheckpointManager:
202
+ """Manages checkpoints with run timestamp, simple step-based checkpoint names."""
203
 
204
  def __init__(
205
  self,
 
210
  ):
211
  self.local_dir = Path(local_dir)
212
  self.hf_repo_id = hf_repo_id
213
+ self.base_sub_name = sub_name
214
+
215
+ # ADD RUN TIMESTAMP TO SUB_NAME (once, when training starts)
216
+ run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
217
+ self.sub_name = f"{sub_name}-{run_timestamp}"
218
+
219
  self.hf_private = hf_private
220
 
221
+ # Checkpoint directory: checkpoints/{sub_name-timestamp}/
222
+ self.sub_checkpoint_dir = self.local_dir / self.sub_name
223
  self.sub_checkpoint_dir.mkdir(parents=True, exist_ok=True)
224
 
225
  self.checkpoints_file = self.sub_checkpoint_dir / "checkpoints.json"
 
247
  return json.load(f)
248
  return {
249
  "sub_name": self.sub_name,
250
+ "base_name": self.base_sub_name,
251
  "checkpoints": [],
252
  "latest": None,
253
  "best": None
 
258
  json.dump(self.checkpoint_history, f, indent=2)
259
 
260
  def get_checkpoint_dir(self, step: int, epoch: int) -> Path:
261
+ """Generate checkpoint directory name: just step{N}."""
262
+ dirname = f"step{step}"
 
263
  return self.sub_checkpoint_dir / dirname
264
 
265
  def _safe_state_dict(self, model: nn.Module) -> Dict[str, torch.Tensor]:
 
316
  ckpt_dir.mkdir(parents=True, exist_ok=True)
317
 
318
  print(f"\n💾 Saving checkpoint: {self.sub_name}/{ckpt_dir.name}")
319
+ print(f" Step: {step}, Epoch: {epoch}")
320
 
321
  state_dict = self._safe_state_dict(model)
322
  weights_path = ckpt_dir / "model.safetensors"
323
  save_file(state_dict, weights_path)
324
+ print(f" ✓ Model weights: model.safetensors")
325
 
326
  training_state = {
327
  'epoch': epoch,
 
329
  'optimizer_state_dict': optimizer.state_dict(),
330
  'scheduler_state_dict': scheduler.state_dict() if scheduler else None,
331
  'val_loss': val_loss,
332
+ 'sub_name': self.sub_name,
333
+ 'base_name': self.base_sub_name
334
  }
335
  torch.save(training_state, ckpt_dir / "training_state.pt")
336
  print(f" ✓ Training state: training_state.pt")
 
400
  traceback.print_exc()
401
 
402
  def find_latest_checkpoint(self) -> Optional[Dict]:
403
+ """Find the latest checkpoint for this training run."""
404
  checkpoints = self.checkpoint_history.get('checkpoints', [])
405
  if checkpoints:
406
  return max(checkpoints, key=lambda x: x['step'])
 
416
  latest = self.find_latest_checkpoint()
417
 
418
  if not latest:
419
+ print(f"ℹ️ No previous checkpoint found for training run '{self.sub_name}'")
420
  return 0, 0, float('inf')
421
 
422
  ckpt_dir = self.sub_checkpoint_dir / latest['dirname']
 
444
  print(f" ⚠️ Checkpoint directory not found: {ckpt_dir}")
445
  return 0, 0, float('inf')
446
 
447
+ print(f"\n🔄 Resuming from checkpoint: {self.sub_name}/{latest['dirname']}")
448
  print(f" Step: {latest['step']}, Epoch: {latest['epoch']}, Val Loss: {latest['val_loss']:.4f}")
449
 
450
  weights_path = ckpt_dir / "model.safetensors"
 
1124
  return
1125
 
1126
  for key, value in metrics.items():
1127
+ # Handle validation metrics that already have prefixes
1128
+ if prefix == "val" and key.startswith(('loss/', 'acc/')):
1129
+ # Strip the redundant prefix
1130
+ clean_key = key.replace('loss/', '').replace('acc/', '')
1131
+ self.writer.add_scalar(f"val/{clean_key}", value, self.global_step)
1132
+ else:
1133
+ self.writer.add_scalar(f"{prefix}/{key}", value, self.global_step)
1134
+
1135
+ # Log learning rate
1136
+ if prefix == "train":
1137
+ current_lr = self.optimizer.param_groups[0]['lr']
1138
+ self.writer.add_scalar("train/learning_rate", current_lr, self.global_step)
1139
 
1140
+ # Flush to disk
1141
+ self.writer.flush()
1142
 
1143
+ # Log text modality stats periodically
1144
+ if prefix == "train" and self.global_step % self.config.log_every == 0:
1145
  total = sum(self.text_dropout_stats.values()) or 1
1146
  for mode, count in self.text_dropout_stats.items():
1147
  self.writer.add_scalar(f"text_modality/{mode}_pct", 100 * count / total, self.global_step)
1148
 
1149
+ # Log fusion diagnostics periodically
1150
+ if prefix == "train" and self.global_step % (self.config.log_every * 10) == 0:
1151
  fusion_diag = self.get_fusion_diagnostics()
1152
 
1153
  for i, w in enumerate(fusion_diag.get('layer_weights', [])):
 
1161
 
1162
  for i, b in enumerate(fusion_diag.get('beta_per_scale', [])):
1163
  self.writer.add_scalar(f"fusion/beta_scale_{i}", b, self.global_step)
1164
+
1165
+ self.writer.flush()
1166
 
1167
  @torch.no_grad()
1168
  def validate(self, max_batches: int = 100) -> Dict[str, float]:
 
1218
  continue
1219
 
1220
  if stats_with_text['count'] == 0 or stats_vision_only['count'] == 0:
1221
+ return {
1222
+ 'val_with_text_loss': float('inf'),
1223
+ 'val_with_text_acc': 0.0,
1224
+ 'val_vision_only_loss': float('inf'),
1225
+ 'val_vision_only_acc': 0.0,
1226
+ 'loss/val': float('inf'),
1227
+ 'acc/val': 0.0
1228
+ }
1229
 
1230
  return {
1231
+ 'val_with_text_loss': stats_with_text['loss'] / stats_with_text['count'],
1232
+ 'val_with_text_acc': stats_with_text['acc'] / stats_with_text['count'],
1233
+ 'val_vision_only_loss': stats_vision_only['loss'] / stats_vision_only['count'],
1234
+ 'val_vision_only_acc': stats_vision_only['acc'] / stats_vision_only['count'],
 
1235
  'loss/val': stats_vision_only['loss'] / stats_vision_only['count'],
1236
  'acc/val': stats_vision_only['acc'] / stats_vision_only['count'],
1237
  }
 
1239
  except Exception as e:
1240
  print(f"\n⚠️ Validation completely failed: {e}")
1241
  traceback.print_exc()
1242
+ return {
1243
+ 'val_with_text_loss': float('inf'),
1244
+ 'val_with_text_acc': 0.0,
1245
+ 'val_vision_only_loss': float('inf'),
1246
+ 'val_vision_only_acc': 0.0,
1247
+ 'loss/val': float('inf'),
1248
+ 'acc/val': 0.0
1249
+ }
1250
 
1251
  def save_checkpoint_and_upload(self, epoch: int, val_loss: float = float('inf'), is_best: bool = False):
1252
  """Save checkpoint first, then optionally upload."""
 
1328
  print("\n🔍 Running validation...")
1329
  val_metrics = self.validate(max_batches=50)
1330
  self.log_metrics(val_metrics, prefix="val")
1331
+ print(f"✓ Val (with text) - Loss: {val_metrics['val_with_text_loss']:.4f}, Acc: {val_metrics['val_with_text_acc']:.4f}")
1332
+ print(f"✓ Val (vision-only) - Loss: {val_metrics['val_vision_only_loss']:.4f}, Acc: {val_metrics['val_vision_only_acc']:.4f}")
1333
 
1334
  # HuggingFace upload
1335
  if (self.config.hf_repo_id and
 
1339
  if self.accelerator.is_main_process:
1340
  print("\n🔍 Running validation for upload...")
1341
  val_metrics = self.validate(max_batches=50)
1342
+ print(f"✓ Val (with text) - Loss: {val_metrics['val_with_text_loss']:.4f}, Acc: {val_metrics['val_with_text_acc']:.4f}")
1343
+ print(f"✓ Val (vision-only) - Loss: {val_metrics['val_vision_only_loss']:.4f}, Acc: {val_metrics['val_vision_only_acc']:.4f}")
1344
 
1345
  if self._interrupt_received:
1346
  break
 
1354
 
1355
  print(f"\n📊 Validation Results:")
1356
  print(f" With Text:")
1357
+ print(f" Loss: {val_metrics['val_with_text_loss']:.4f}")
1358
+ print(f" Acc: {val_metrics['val_with_text_acc']:.4f}")
1359
  print(f" Vision-Only (PRIMARY METRIC):")
1360
+ print(f" Loss: {val_metrics['val_vision_only_loss']:.4f}")
1361
+ print(f" Acc: {val_metrics['val_vision_only_acc']:.4f}")
1362
 
1363
  self.log_metrics(val_metrics, prefix="val")
1364
 
 
1403
  if self.writer:
1404
  self.writer.close()
1405
 
 
1406
  # ============================================================================
1407
  # MAIN
1408
  # ============================================================================
 
1410
  if __name__ == "__main__":
1411
  config = DanbooruTrainingConfig(
1412
  # Run identifier
1413
+ sub_name="danbooru-50k-v1-512-2",
1414
 
1415
  # Model architecture
1416
  num_opinion_anchors=225,
1417
+ pentachoron_dim=512,
1418
  scales=[128, 256, 512, 1024],
1419
+ scale_hidden_dims={128: 256, 256: 512, 512: 1024, 1024: 2048},
1420
 
1421
  # Fusion controller
1422
+ alpha_init=0.125,
1423
  alpha_learnable=True,
1424
  beta_init=0.5,
1425
  beta_learnable=True,
 
1427
  learn_layer_weights=True,
1428
 
1429
  # Encoders
1430
+ clip_skip=1,
1431
+ siglip_layer_indices=[1, 2, 3, 4, 5, 6, 9, 12, 18, 21, 23, 24, 25, 26],
1432
 
1433
  # Optimizations
1434
  use_gradient_checkpointing=False,
1435
  share_scale_embeddings=False,
1436
 
1437
  # Training
1438
+ batch_size=24,
1439
+ num_epochs=20,
1440
  learning_rate=1e-4,
1441
  save_every=500,
1442
 
 
1450
  text_dropout_end=0.5,
1451
 
1452
  # Resume
1453
+ resume=False,
1454
 
1455
  # HuggingFace
1456
  hf_repo_id="AbstractPhil/liminal-staircase-v2",