AbstractPhil committed on
Commit
3cc46bf
·
verified ·
1 Parent(s): f71172e

Create trainer.py

Files changed (1): trainer.py (+1900 −0)
trainer.py ADDED
@@ -0,0 +1,1900 @@
"""
David Training Pipeline
========================
Author: AbstractPhil
Assistant: Claude Sonnet 4.5
------------------------------------------------------------
Training pipeline for the David multi-scale feature classifier.

Will be placed officially at: geovocab2/train/model/core/david_trainer.py
Or run from: scripts/train_david.py

Runs on Colab without hassle; set your repo and your HF_TOKEN as a userdata secret in Colab.

Features:
- Pure fp32 training (no mixed precision, for geometric stability)
- Mixed precision can be enabled if you want speed
- Adaptive training controller (freeze/unfreeze scales)
- Gradient analysis and scaling
- SafeTensors checkpointing and epoch control support
- Enhanced loss component tracking
- Proper weight organization: weights/model_name/timestamp/
- Accuracy in filenames and comprehensive tracking
- Saves models into a shared index (MODELS_INDEX.json) in the repo
- Parses the repo README if one exists, creates the repo if one doesn't
"""

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from datasets import load_dataset
from huggingface_hub import HfApi, create_repo, upload_folder, upload_file
import numpy as np
import os
import json
import time
import tempfile
from datetime import datetime
from tqdm.auto import tqdm
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union
from dataclasses import dataclass, field, asdict

# Import David components
from geovocab2.train.config.david_config import (
    DavidArchitectureConfig,
    DavidPresets,
    SharingMode,
    FusionMode
)

from geovocab2.train.model.core.david import (
    David,
    MultiScaleCrystalLoss,
)

# Import SimplexFactory
from geovocab2.shapes.factory import SimplexFactory


# ============================================================================
# TRAINING CONFIGURATION
# ============================================================================

@dataclass
class DavidTrainingConfig:
    """
    Complete training configuration for David.
    Separate from the model architecture config.
    """

    # Metadata
    name: str = "david_training"
    run_id: str = ""  # Auto-generated timestamp

    # Dataset
    dataset_name: str = "AbstractPhil/imagenet-clip-features-orderly"
    model_variant: str = "clip_vit_b16"
    num_classes: int = 1000

    # Model architecture (references to david_config)
    preset: Optional[str] = "balanced"  # Or None to use a custom config
    custom_config_path: Optional[str] = None  # Path to custom david_config.json

    # Architecture overrides (applied to preset or custom config)
    num_classes_override: Optional[int] = None
    use_belly_override: Optional[bool] = None
    belly_expand_override: Optional[float] = None
    progressive_training_override: Optional[bool] = None  # Override progressive training
    scale_warmup_epochs_override: Optional[Dict[int, int]] = None  # Custom warmup schedule

    # Training hyperparameters
    num_epochs: int = 50
    batch_size: int = 512
    learning_rate: float = 5e-3
    weight_decay: float = 1e-5
    warmup_epochs: int = 3

    # Loss weights
    use_rose_loss: bool = True
    rose_initial_weight: float = 0.01
    rose_max_weight: float = 0.1
    rose_weight_schedule: str = "adaptive"
    use_cayley_loss: bool = False
    cayley_weight: float = 0.001
    scale_loss_balance: Optional[Dict[int, float]] = None

    # Optimization
    use_mixed_precision: bool = False  # Keep False for stability
    gradient_clip: float = 5.0
    scheduler_type: str = "cosine_restarts"
    min_lr: float = 1e-6

    # Adaptive training (safer defaults)
    freeze_strategy: str = "never"  # "performance" or "never"
    freeze_threshold: float = 90.0  # Only freeze when a scale hits 90% accuracy
    unfreeze_on_plateau: bool = True
    patience: int = 10

    # Gradient monitoring
    track_gradients: bool = True
    gradient_scale_threshold: float = 1e-5
    gradient_scale_multiplier: float = 10.0

    # Logging
    log_interval: int = 50
    val_interval: int = 1
    save_interval: int = 5
    log_fusion_weights: bool = True
    log_loss_components: bool = True

    # Checkpointing
    save_format: str = "both"  # "pytorch", "safetensors", or "both"

    # HuggingFace Hub (optional)
    hf_repo: Optional[str] = "YourName/Repo"  # e.g. "AbstractPhil/gated-david"
    upload_to_hub: bool = False

    # Local paths
    base_dir: str = "./david_training"

    # Hardware
    num_workers: int = 10
    pin_memory: bool = True
    prefetch_factor: int = 4
    persistent_workers: bool = True

    def __post_init__(self):
        """Generate run_id if not provided."""
        if not self.run_id:
            self.run_id = datetime.now().strftime('%Y%m%d_%H%M%S')

    def to_dict(self) -> dict:
        """Convert to dictionary."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: dict) -> 'DavidTrainingConfig':
        """Create from dictionary."""
        return cls(**data)

    def to_json(self, path: str):
        """Save to JSON."""
        data = self.to_dict()
        # Convert any nested dicts with int keys to str keys (JSON requires str keys)
        if data.get('scale_loss_balance'):
            data['scale_loss_balance'] = {
                str(k): v for k, v in data['scale_loss_balance'].items()
            }
        if data.get('scale_warmup_epochs_override'):
            data['scale_warmup_epochs_override'] = {
                str(k): v for k, v in data['scale_warmup_epochs_override'].items()
            }
        with open(path, 'w') as f:
            json.dump(data, f, indent=2)

    @classmethod
    def from_json(cls, path: str) -> 'DavidTrainingConfig':
        """Load from JSON."""
        with open(path, 'r') as f:
            data = json.load(f)
        # Convert str keys back to int for scale_loss_balance
        if 'scale_loss_balance' in data and data['scale_loss_balance']:
            data['scale_loss_balance'] = {
                int(k): v for k, v in data['scale_loss_balance'].items()
            }
        # Convert str keys back to int for scale_warmup_epochs_override
        if 'scale_warmup_epochs_override' in data and data['scale_warmup_epochs_override']:
            data['scale_warmup_epochs_override'] = {
                int(k): v for k, v in data['scale_warmup_epochs_override'].items()
            }
        return cls(**data)

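The `to_json`/`from_json` pair above converts dict keys because JSON object keys are always strings: integer-keyed fields like `scale_loss_balance` must be stringified on the way out and cast back on the way in. A minimal, dependency-free sketch of that round trip (the values here are illustrative, not defaults from the trainer):

```python
import json

# Integer-keyed mapping, shaped like scale_loss_balance (illustrative values).
scale_loss_balance = {256: 1.0, 512: 0.5}

# Serialize: JSON forces string keys, so convert explicitly before dumping.
payload = json.dumps({str(k): v for k, v in scale_loss_balance.items()})

# Deserialize: keys come back as strings and must be cast to int again.
restored = {int(k): v for k, v in json.loads(payload).items()}

assert restored == scale_loss_balance  # round trip preserves int keys
```

Without the explicit cast in `from_json`, a reloaded config would silently hold `{"256": 1.0}` instead of `{256: 1.0}` and scale lookups by int would miss.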
# ============================================================================
# ADAPTIVE TRAINING CONTROLLER
# ============================================================================

class AdaptiveTrainingController:
    """Manages adaptive training strategies for multi-scale model."""

    def __init__(self, model: David, config: DavidTrainingConfig):
        self.model = model
        self.config = config

        scales = model.scales
        self.scale_history = {scale: [] for scale in scales}
        self.best_scale_acc = {scale: 0.0 for scale in scales}
        self.scales_frozen = {scale: False for scale in scales}

        self.overall_history = []
        self.plateau_counter = 0
        self.best_overall = 0.0

    def update_metrics(self, scale_accuracies: Dict[int, float], overall_accuracy: float):
        """Update metrics and best scores."""
        for scale, acc in scale_accuracies.items():
            self.scale_history[scale].append(acc)
            if acc > self.best_scale_acc[scale]:
                self.best_scale_acc[scale] = acc

        self.overall_history.append(overall_accuracy)

        if overall_accuracy > self.best_overall:
            self.best_overall = overall_accuracy
            self.plateau_counter = 0
        else:
            self.plateau_counter += 1

    def should_freeze_scale(self, scale: int, current_acc: float) -> bool:
        """Determine if a scale should be frozen."""
        if self.config.freeze_strategy == "never":
            return False

        if self.scales_frozen[scale]:
            return False

        if self.config.freeze_strategy == "performance":
            return current_acc >= self.config.freeze_threshold

        return False

    def should_unfreeze_scales(self) -> bool:
        """Check if scales should be unfrozen due to plateau."""
        if not self.config.unfreeze_on_plateau:
            return False
        return self.plateau_counter >= 5

    def apply_adaptive_strategies(self, scale_accuracies: Dict[int, float], epoch: int):
        """Apply freeze/unfreeze based on performance."""
        active_scales = self.model.get_active_scales()

        # Don't freeze scales if it would leave no trainable parameters
        for scale, acc in scale_accuracies.items():
            if self.should_freeze_scale(scale, acc):
                # Count how many active scales would remain unfrozen
                active_unfrozen = [s for s in active_scales if not self.scales_frozen.get(s, False)]

                if len(active_unfrozen) <= 1:
                    print(f"[⚠️] Skipping freeze of scale {scale} (would leave no active trainable scales)")
                    continue

                self.model.freeze_scale(scale)
                self.scales_frozen[scale] = True
                print(f"[❄️] Froze scale {scale} (acc={acc:.2f}%)")

        if self.should_unfreeze_scales() and any(self.scales_frozen.values()):
            for scale in self.model.scales:
                if self.scales_frozen[scale]:
                    self.model.unfreeze_scale(scale)
                    self.scales_frozen[scale] = False
            self.plateau_counter = 0
            print("[🔥] Unfroze all scales due to plateau")

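The plateau logic above is simple bookkeeping: the counter resets whenever a new best overall accuracy appears, increments otherwise, and an unfreeze triggers once it reaches 5. A dependency-free sketch of just that counter (the class name and accuracy values are illustrative, not part of the trainer):

```python
class PlateauCounter:
    """Tracks epochs since the last improvement, mirroring the controller's logic."""

    def __init__(self, patience: int = 5):
        self.patience = patience
        self.best = 0.0
        self.count = 0

    def update(self, accuracy: float) -> bool:
        """Record one epoch's accuracy; return True when a plateau is detected."""
        if accuracy > self.best:
            self.best = accuracy
            self.count = 0  # improvement resets the counter
        else:
            self.count += 1
        return self.count >= self.patience


counter = PlateauCounter(patience=5)
history = [10.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0]
flags = [counter.update(acc) for acc in history]
# → [False, False, False, False, False, False, True]
# The improvement at epoch 1 resets the counter; the fifth flat epoch trips the plateau.
```

Note that only a strictly better accuracy resets the counter, so a run that merely matches its best still drifts toward an unfreeze.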
# ============================================================================
# OPTIMIZER & SCHEDULER CREATION
# ============================================================================

def create_optimizer(david: David, config: DavidTrainingConfig) -> torch.optim.Optimizer:
    """Create optimizer with parameter groups."""

    param_groups = []

    # Shared parameters (if they exist)
    if hasattr(david, 'shared_extractor'):
        param_groups.append({
            'params': david.shared_extractor.parameters(),
            'lr': config.learning_rate,
            'name': 'shared'
        })
    elif hasattr(david, 'shared_base'):
        param_groups.append({
            'params': david.shared_base.parameters(),
            'lr': config.learning_rate,
            'name': 'shared'
        })

    # Scale-specific parameters
    for scale in david.scales:
        scale_params = []
        if david.sharing_mode == SharingMode.HIERARCHICAL:
            head = getattr(david, f'head_{scale}', None)
            if head is not None:
                scale_params.extend(head.parameters())
            refine = getattr(david, f'refine_{scale}', None)
            if refine is not None:
                scale_params.extend(refine.parameters())
        else:
            scale_params.extend(david.heads[str(scale)].parameters())

        if scale_params:
            param_groups.append({
                'params': scale_params,
                'lr': config.learning_rate,
                'name': f'scale_{scale}'
            })

    # Fusion parameters (trained at half the base learning rate)
    if hasattr(david, 'fusion'):
        param_groups.append({
            'params': david.fusion.parameters(),
            'lr': config.learning_rate * 0.5,
            'name': 'fusion'
        })
    elif hasattr(david, 'fusion_weights'):
        param_groups.append({
            'params': [david.fusion_weights],
            'lr': config.learning_rate * 0.5,
            'name': 'fusion'
        })

    return torch.optim.AdamW(param_groups, weight_decay=config.weight_decay)


def create_scheduler(optimizer: torch.optim.Optimizer,
                     config: DavidTrainingConfig) -> Optional[torch.optim.lr_scheduler._LRScheduler]:
    """Create learning rate scheduler (returns None for unrecognized scheduler_type)."""

    if config.scheduler_type == "cosine_restarts":
        return torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
            optimizer, T_0=10, T_mult=2, eta_min=config.min_lr
        )
    elif config.scheduler_type == "cosine":
        return torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=config.num_epochs, eta_min=config.min_lr
        )
    else:
        return None


# ============================================================================
# GRADIENT ANALYSIS
# ============================================================================

def analyze_gradients(model: David, config: DavidTrainingConfig) -> Dict[str, float]:
    """Analyze gradient magnitudes for debugging."""
    grad_stats = {
        'mean': 0.0,
        'max': 0.0,
        'min': float('inf'),
        'num_zero': 0,
        'num_small': 0,
        'total': 0
    }

    for name, param in model.named_parameters():
        if param.grad is not None:
            grad_norm = param.grad.norm().item()
            grad_stats['mean'] += grad_norm
            grad_stats['max'] = max(grad_stats['max'], grad_norm)
            grad_stats['min'] = min(grad_stats['min'], grad_norm)
            grad_stats['total'] += 1

            if grad_norm < 1e-10:
                grad_stats['num_zero'] += 1
            elif grad_norm < config.gradient_scale_threshold:
                grad_stats['num_small'] += 1

    if grad_stats['total'] > 0:
        grad_stats['mean'] /= grad_stats['total']

    return grad_stats


def scale_small_gradients(model: David, config: DavidTrainingConfig):
    """Scale up very small gradients to prevent vanishing."""
    if not config.track_gradients:
        return

    for param in model.parameters():
        if param.grad is not None:
            grad_norm = param.grad.norm()
            if 0 < grad_norm < config.gradient_scale_threshold:
                param.grad.mul_(config.gradient_scale_multiplier)

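`scale_small_gradients` applies one rule per parameter: a nonzero gradient whose norm falls below `gradient_scale_threshold` is multiplied by `gradient_scale_multiplier`; zero gradients and healthy gradients are left alone. The rule itself can be sketched without torch on plain norm values (the threshold and multiplier mirror the config defaults; the function name is illustrative):

```python
THRESHOLD = 1e-5    # mirrors gradient_scale_threshold
MULTIPLIER = 10.0   # mirrors gradient_scale_multiplier

def rescue_small_norms(norms):
    """Boost nonzero norms below THRESHOLD; leave zeros and healthy norms alone."""
    return [n * MULTIPLIER if 0 < n < THRESHOLD else n for n in norms]

norms = [0.0, 1e-7, 5e-6, 1e-3]
rescued = rescue_small_norms(norms)
# 0.0 stays zero (a truly dead gradient can't be rescued by scaling),
# 1e-7 and 5e-6 are boosted tenfold, and 1e-3 is untouched.
```

The strict `> 0` guard matters: multiplying an exactly zero gradient would still yield zero, so excluding it keeps the `num_zero` statistic in `analyze_gradients` meaningful.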
398
+ # ============================================================================
399
+ # HUGGINGFACE HUB UTILITIES
400
+ # ============================================================================
401
+
402
+ def generate_model_readme(
403
+ config: DavidTrainingConfig,
404
+ david_config: DavidArchitectureConfig,
405
+ best_metrics: Dict,
406
+ run_id: str
407
+ ) -> str:
408
+ """Generate README.md for model card."""
409
+
410
+ readme = f"""---
411
+ language: en
412
+ license: mit
413
+ tags:
414
+ - image-classification
415
+ - imagenet
416
+ - multi-scale
417
+ - feature-geometry
418
+ - david
419
+ datasets:
420
+ - imagenet-1k
421
+ metrics:
422
+ - accuracy
423
+ model-index:
424
+ - name: David-{david_config.sharing_mode}-{david_config.fusion_mode}
425
+ results:
426
+ - task:
427
+ type: image-classification
428
+ dataset:
429
+ name: ImageNet-1K
430
+ type: imagenet-1k
431
+ metrics:
432
+ - type: accuracy
433
+ value: {best_metrics.get('best_val_acc', 0.0):.2f}
434
+ ---
435
+
436
+ # David: Multi-Scale Feature Classifier
437
+
438
+ **David** is a multi-scale deep learning classifier that uses feature geometry (pentachora/4-simplexes)
439
+ as class prototypes with role-weighted similarity computation (Rose Loss).
440
+
441
+ ## Model Details
442
+
443
+ ### Architecture
444
+ - **Preset**: {config.preset}
445
+ - **Sharing Mode**: {david_config.sharing_mode}
446
+ - **Fusion Mode**: {david_config.fusion_mode}
447
+ - **Scales**: {david_config.scales}
448
+ - **Feature Dim**: {david_config.feature_dim}
449
+ - **Parameters**: {best_metrics.get('parameters', 0):,}
450
+
451
+ ### Training Configuration
452
+ - **Dataset**: {config.dataset_name}
453
+ - **Model Variant**: {config.model_variant}
454
+ - **Epochs**: {config.num_epochs}
455
+ - **Batch Size**: {config.batch_size}
456
+ - **Learning Rate**: {config.learning_rate}
457
+ - **Rose Loss Weight**: {config.rose_initial_weight} → {config.rose_max_weight}
458
+ - **Cayley Loss**: {config.use_cayley_loss}
459
+
460
+ ## Performance
461
+
462
+ ### Best Results
463
+ - **Validation Accuracy**: {best_metrics.get('best_val_acc', 0.0):.2f}%
464
+ - **Best Epoch**: {best_metrics.get('best_epoch', 0)}
465
+ - **Final Train Accuracy**: {best_metrics.get('final_train_acc', 0.0):.2f}%
466
+
467
+ ### Per-Scale Performance
468
+ """
469
+
470
+ if 'scale_accuracies' in best_metrics:
471
+ for scale, acc in best_metrics['scale_accuracies'].items():
472
+ readme += f"- **Scale {scale}**: {acc:.2f}%\n"
473
+
474
+ readme += f"""
475
+
476
+ ## Usage
477
+
478
+ ### Quick Model Lookup
479
+
480
+ **Check `MODELS_INDEX.json` in the repo root** - it lists all trained models sorted by accuracy with links to weights and configs.
481
+
482
+ ### Repository Structure
483
+
484
+ ```
485
+ {config.hf_repo if config.hf_repo else 'AbstractPhil/david'}/
486
+ ├── MODELS_INDEX.json # 📊 Master index of all models (sorted by accuracy)
487
+ ├── README.md # This file
488
+ ├── best_model.json # Latest best model info
489
+ ├── weights/
490
+ │ └── {david_config.name}/
491
+ │ └── {run_id}/
492
+ │ ├── MODEL_SUMMARY.txt # 🎯 Human-readable performance summary
493
+ │ ├── training_history.json # 📈 Epoch-by-epoch training curve
494
+ │ ├── best_model_acc{best_metrics.get('best_val_acc', 0.0):.2f}.safetensors # ⭐ Accuracy in filename!
495
+ │ ├── best_model_acc{best_metrics.get('best_val_acc', 0.0):.2f}_metadata.json
496
+ │ ├── final_model.safetensors
497
+ │ ├── checkpoint_epoch_X_accYY.YY.safetensors
498
+ │ ├── david_config.json
499
+ │ └── train_config.json
500
+ └── runs/
501
+ └── {david_config.name}/
502
+ └── {run_id}/
503
+ └── events.out.tfevents.* # TensorBoard logs
504
+ ```
505
+
506
+ ### Loading the Model
507
+
508
+ ```python
509
+ from geovocab2.train.model.core.david import David, DavidArchitectureConfig
510
+ from huggingface_hub import hf_hub_download
511
+
512
+ # Browse available models in MODELS_INDEX.json first!
513
+
514
+ # Specify model variant and run
515
+ model_name = "{david_config.name}"
516
+ run_id = "{run_id}"
517
+ accuracy = "{best_metrics.get('best_val_acc', 0.0):.2f}" # From MODELS_INDEX.json
518
+
519
+ # Download config
520
+ config_path = hf_hub_download(
521
+ repo_id="{config.hf_repo if config.hf_repo else 'AbstractPhil/david'}",
522
+ filename=f"weights/{{model_name}}/{{run_id}}/david_config.json"
523
+ )
524
+ config = DavidArchitectureConfig.from_json(config_path)
525
+
526
+ # Download weights (accuracy in filename!)
527
+ weights_path = hf_hub_download(
528
+ repo_id="{config.hf_repo if config.hf_repo else 'AbstractPhil/david'}",
529
+ filename=f"weights/{{model_name}}/{{run_id}}/best_model_acc{{accuracy}}.safetensors"
530
+ )
531
+
532
+ # Download training history (optional - see full training curve)
533
+ history_path = hf_hub_download(
534
+ repo_id="{config.hf_repo if config.hf_repo else 'AbstractPhil/david'}",
535
+ filename=f"weights/{{model_name}}/{{run_id}}/training_history.json"
536
+ )
537
+
538
+ # Load model
539
+ from safetensors.torch import load_file
540
+ david = David.from_config(config)
541
+ david.load_state_dict(load_file(weights_path))
542
+ david.eval()
543
+ ```
544
+
545
+ ### Inference
546
+
547
+ ```python
548
+ import torch
549
+ import torch.nn.functional as F
550
+
551
+ # Assuming you have CLIP features (512-dim for ViT-B/16)
552
+ features = get_clip_features(image) # [1, 512]
553
+
554
+ # Load anchors
555
+ anchors_dict = torch.load("anchors.pth")
556
+
557
+ # Forward pass
558
+ with torch.no_grad():
559
+ logits, _ = david(features, anchors_dict)
560
+ predictions = logits.argmax(dim=-1)
561
+ ```
562
+
563
+ ## Architecture Overview
564
+
565
+ ### Multi-Scale Processing
566
+ David processes inputs at multiple scales ({', '.join(map(str, david_config.scales))}),
567
+ allowing it to capture both coarse and fine-grained features.
568
+
569
+ ### Feature Geometry
570
+ Each class is represented by a pentachoron (4-simplex) in embedding space with 5 vertices:
571
+ - **Anchor**: Primary class representative
572
+ - **Need**: Complementary direction
573
+ - **Relation**: Contextual alignment
574
+ - **Purpose**: Functional direction
575
+ - **Observer**: Meta-perspective
576
+
577
+ ### Rose Loss
578
+ Similarity computation uses role-weighted cosine similarities:
579
+ ```
580
+ score = w_anchor * sim(z, anchor) + w_need * sim(z, need) + ...
581
+ ```
582
+
583
+ ### Fusion Strategy
584
+ **{david_config.fusion_mode}**: Intelligently combines predictions from multiple scales.
585
+
586
+ ## Training Details
587
+
588
+ ### Loss Components
589
+ - **Cross-Entropy**: Standard classification loss
590
+ - **Rose Loss**: Pentachora role-weighted margin loss (weight: {config.rose_initial_weight}→{config.rose_max_weight})
591
+ - **Cayley Loss**: Geometric regularization ({'enabled' if config.use_cayley_loss else 'disabled'})
592
+
593
+ ### Optimization
594
+ - **Optimizer**: AdamW
595
+ - **Weight Decay**: {config.weight_decay}
596
+ - **Scheduler**: {config.scheduler_type}
597
+ - **Gradient Clip**: {config.gradient_clip}
598
+ - **Mixed Precision**: {config.use_mixed_precision}
599
+
600
+ ## Citation
601
+
602
+ ```bibtex
603
+ @software{{david_classifier_2025,
604
+ title = {{David: Multi-Scale Feature Classifier}},
605
+ author = {{AbstractPhil}},
606
+ year = {{2025}},
607
+ url = {{https://huggingface.co/{config.hf_repo if config.hf_repo else 'AbstractPhil/david'}}},
608
+ note = {{Run ID: {run_id}}}
609
+ }}
610
+ ```
611
+
612
+ ## License
613
+
614
+ MIT License
615
+
616
+ ## Acknowledgments
617
+
618
+ Built with lattice geometry and multi-scale deep learning.
619
+ Special thanks to Claude (Anthropic) for debugging assistance.
620
+
621
+ ---
622
+
623
+ *Generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*
624
+ """
625
+
626
+ return readme
627
+
628
+
629
+ def save_best_model_json(
630
+ filepath: str,
631
+ metrics: Dict,
632
+ config: DavidTrainingConfig,
633
+ david_config: DavidArchitectureConfig
634
+ ):
635
+ """Save best_model.json with comprehensive metrics."""
636
+
637
+ model_name = f"David-{david_config.sharing_mode}-{david_config.fusion_mode}"
638
+
639
+ best_model_info = {
640
+ "model_name": model_name,
641
+ "run_id": config.run_id,
642
+ "timestamp": datetime.now().isoformat(),
643
+
644
+ # Best metrics
645
+ "best_val_acc": metrics.get('best_val_acc', 0.0),
646
+ "best_epoch": metrics.get('best_epoch', 0),
647
+ "final_train_acc": metrics.get('final_train_acc', 0.0),
648
+ "final_train_loss": metrics.get('final_train_loss', 0.0),
649
+
650
+ # Per-scale performance
651
+ "scale_accuracies": metrics.get('scale_accuracies', {}),
652
+
653
+ # Architecture
654
+ "architecture": {
655
+ "preset": config.preset,
656
+ "sharing_mode": david_config.sharing_mode,
657
+ "fusion_mode": david_config.fusion_mode,
658
+ "scales": david_config.scales,
659
+ "feature_dim": david_config.feature_dim,
660
+ "num_classes": david_config.num_classes,
661
+ "use_belly": david_config.use_belly,
662
+ "belly_expand": david_config.belly_expand,
663
+ },
664
+
665
+ # Training config
666
+ "training": {
667
+ "dataset": config.dataset_name,
668
+ "model_variant": config.model_variant,
669
+ "num_epochs": config.num_epochs,
670
+ "batch_size": config.batch_size,
671
+ "learning_rate": config.learning_rate,
672
+ "rose_weight": f"{config.rose_initial_weight}→{config.rose_max_weight}",
673
+ "cayley_loss": config.use_cayley_loss,
674
+ "optimizer": "AdamW",
675
+ "scheduler": config.scheduler_type,
676
+ },
677
+
678
+ # Files (organized by model/run)
679
+ "files": {
680
+ "weights_safetensors": f"weights/{model_name}/{config.run_id}/best_model_acc{metrics.get('best_val_acc', 0.0):.2f}.safetensors",
681
+ "weights_pytorch": f"weights/{model_name}/{config.run_id}/best_model.pth",
682
+ "config": f"weights/{model_name}/{config.run_id}/david_config.json",
683
+ "training_config": f"weights/{model_name}/{config.run_id}/train_config.json",
684
+ "tensorboard": f"runs/{model_name}/{config.run_id}/"
685
+ }
686
+ }
687
+
688
+ with open(filepath, 'w') as f:
689
+ json.dump(best_model_info, f, indent=2)
690
+
691
+ print(f"[📄] Saved best_model.json: {filepath}")
692
+
693
+
694
+ def create_model_summary(
695
+ weights_dir: str,
696
+ config: DavidTrainingConfig,
697
+ david_config: DavidArchitectureConfig,
698
+ best_metrics: Dict,
699
+ model_name: str
700
+ ):
701
+ """Create prominent model summary with accuracy front and center."""
702
+
703
+ summary_path = os.path.join(weights_dir, 'MODEL_SUMMARY.txt')
704
+
705
+ best_acc = best_metrics.get('best_val_acc', 0.0)
706
+ training_history = best_metrics.get('training_history', {})
707
+
708
+ summary = f"""
709
+ ╔══════════════════════════════════════════════════════════════╗
710
+ ║ DAVID MODEL SUMMARY ║
711
+ ╠══════════════════════════════════════════════════════════════╣
712
+ ║ ║
713
+ ║ 🎯 VALIDATION ACCURACY: {best_acc:.2f}% ║
714
+ ║ ║
715
+ ╚══════════════════════════════════════════��═══════════════════╝
716
+
717
+ MODEL: {model_name}
718
+ RUN ID: {config.run_id}
719
+ BEST EPOCH: {best_metrics.get('best_epoch', 0) + 1}/{config.num_epochs}
720
+
721
+ ═══════════════════════════════════════════════════════════════
722
+
723
+ 📊 PERFORMANCE BREAKDOWN
724
+
725
+ Final Training Accuracy: {best_metrics.get('final_train_acc', 0.0):.2f}%
726
+ Best Validation Accuracy: {best_acc:.2f}%
727
+
728
+ Per-Scale Accuracies:
729
+ """
730
+
731
+ scale_accs = best_metrics.get('scale_accuracies', {})
732
+ for scale in sorted(scale_accs.keys()):
733
+ acc = scale_accs[scale]
734
+ summary += f" • Scale {scale:4d}: {acc:.2f}%\n"
735
+
736
+ summary += f"""
737
+ ═══════════════════════════════════════════════════════════════
738
+
739
+ 🏗️ ARCHITECTURE
740
+
741
+ Preset: {config.preset}
742
+ Sharing Mode: {david_config.sharing_mode}
743
+ Fusion Mode: {david_config.fusion_mode}
744
+ Scales: {len(david_config.scales)} scales - {david_config.scales}
745
+ Feature Dim: {david_config.feature_dim}
746
+ Parameters: {best_metrics.get('parameters', 0):,}
747
+
748
+ ═══════════════════════════════════════════════════════════════
749
+
750
+ 📈 TRAINING CURVE
751
+
752
+ """
753
+
754
+ if training_history and 'val_acc' in training_history:
755
+ summary += "Epoch | Train Acc | Val Acc | Learning Rate\n"
756
+ summary += "------|-----------|----------|--------------\n"
757
+
758
+ for i, epoch in enumerate(training_history.get('epochs', [])):
759
+ train_acc = training_history['train_acc'][i] if i < len(training_history['train_acc']) else 0
760
+ val_acc = training_history['val_acc'][i] if i < len(training_history['val_acc']) else 0
761
+ lr = training_history['lr'][i] if i < len(training_history['lr']) else 0
762
+
763
+ marker = " 👑" if val_acc == best_acc else ""
764
+ summary += f"{epoch:5d} | {train_acc:8.2f}% | {val_acc:7.2f}%{marker} | {lr:.2e}\n"
765
+
766
+ summary += f"""
767
+ ═══════════════════════════════════════════════════════════════
768
+
769
+ 📁 FILES
770
+
771
+ Best Model: best_model_acc{best_acc:.2f}.safetensors
772
+ Config: david_config.json
773
+ Training Cfg: train_config.json
774
+ History: training_history.json
775
+
776
+ ═══════════════════════════════════════════════════════════════
777
+
778
+ Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
779
+ """
780
+
781
+ with open(summary_path, 'w') as f:
782
+ f.write(summary)
783
+
784
+ print(f"[📄] Created MODEL_SUMMARY.txt")
785
+ return summary_path
786
+
787
+
788
+ def update_models_index(
+     config: DavidTrainingConfig,
+     david_config: DavidArchitectureConfig,
+     best_metrics: Dict,
+     model_name: str
+ ):
+     """Update the master models index file tracking all trained models."""
+
+     if not config.upload_to_hub or not config.hf_repo:
+         return
+
+     try:
+         from huggingface_hub import hf_hub_download
+         api = HfApi()
+
+         # Try to download the existing index
+         try:
+             index_path = hf_hub_download(
+                 repo_id=config.hf_repo,
+                 filename="MODELS_INDEX.json",
+                 repo_type="model"
+             )
+             with open(index_path, 'r') as f:
+                 models_index = json.load(f)
+         except Exception:
+             # Create a new index if one doesn't exist yet
+             models_index = {
+                 "repository": config.hf_repo,
+                 "updated": datetime.now().isoformat(),
+                 "models": []
+             }
+
+         # Add the current model entry
+         model_entry = {
+             "model_name": model_name,
+             "run_id": config.run_id,
+             "timestamp": datetime.now().isoformat(),
+             "best_val_acc": best_metrics.get('best_val_acc', 0.0),
+             "best_epoch": best_metrics.get('best_epoch', 0),
+             "num_scales": len(david_config.scales),
+             "scales": david_config.scales,
+             "parameters": best_metrics.get('parameters', 0),
+             "sharing_mode": david_config.sharing_mode,
+             "fusion_mode": david_config.fusion_mode,
+             "preset": config.preset,
+             "weights_path": f"weights/{model_name}/{config.run_id}/best_model_acc{best_metrics.get('best_val_acc', 0.0):.2f}.safetensors",
+             "config_path": f"weights/{model_name}/{config.run_id}/david_config.json",
+             "history_path": f"weights/{model_name}/{config.run_id}/training_history.json"
+         }
+
+         # Remove any old entry for the same run_id (this is an update)
+         models_index["models"] = [m for m in models_index["models"] if m.get("run_id") != config.run_id]
+         models_index["models"].append(model_entry)
+
+         # Sort by accuracy (descending)
+         models_index["models"].sort(key=lambda x: x.get("best_val_acc", 0), reverse=True)
+         models_index["updated"] = datetime.now().isoformat()
+         models_index["total_models"] = len(models_index["models"])
+
+         # Save locally
+         with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as f:
+             json.dump(models_index, f, indent=2)
+             temp_path = f.name
+
+         # Upload to the hub root
+         api.upload_file(
+             path_or_fileobj=temp_path,
+             path_in_repo="MODELS_INDEX.json",
+             repo_id=config.hf_repo,
+             commit_message=f"Update models index - {model_name} @ {best_metrics.get('best_val_acc', 0.0):.2f}%"
+         )
+
+         os.unlink(temp_path)
+         print(f"[📊] Updated MODELS_INDEX.json - {len(models_index['models'])} models tracked")
+
+     except Exception as e:
+         print(f"[⚠️] Failed to update models index: {e}")
+
+
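The index update is an upsert: drop any stale entry for the same `run_id`, append the fresh one, and keep the list sorted by best validation accuracy. A torch-free sketch of just that logic, with invented entries:

```python
# Hypothetical distillation of the MODELS_INDEX.json upsert-and-sort step.
def upsert_model_entry(index: dict, entry: dict) -> dict:
    # Remove any old entry with the same run_id, then re-add the new one
    index["models"] = [m for m in index["models"] if m.get("run_id") != entry["run_id"]]
    index["models"].append(entry)
    # Best accuracy first
    index["models"].sort(key=lambda m: m.get("best_val_acc", 0), reverse=True)
    index["total_models"] = len(index["models"])
    return index

index = {"models": [{"run_id": "a", "best_val_acc": 61.0}]}
index = upsert_model_entry(index, {"run_id": "b", "best_val_acc": 70.5})
index = upsert_model_entry(index, {"run_id": "a", "best_val_acc": 64.2})  # replaces old "a"
print([m["run_id"] for m in index["models"]])  # best accuracy first
```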
+ def upload_to_huggingface(
+     local_dir: str,
+     repo_id: str,
+     commit_message: str,
+     path_in_repo: Optional[str] = None,
+     patterns: Optional[List[str]] = None
+ ):
+     """Upload a directory to the HuggingFace Hub."""
+
+     try:
+         api = HfApi()
+
+         # Create the repo if it doesn't exist
+         try:
+             create_repo(repo_id, exist_ok=True, repo_type="model")
+             print(f"[🤗] Repo ready: {repo_id}")
+         except Exception as e:
+             print(f"[⚠️] Repo exists or creation failed: {e}")
+
+         # Upload folder
+         if patterns:
+             # Upload only files matching the given patterns
+             for pattern in patterns:
+                 matching_files = list(Path(local_dir).rglob(pattern))
+                 for file_path in matching_files:
+                     rel_path = file_path.relative_to(local_dir)
+                     if path_in_repo:
+                         repo_path = f"{path_in_repo}/{rel_path}"
+                     else:
+                         repo_path = str(rel_path)
+
+                     api.upload_file(
+                         path_or_fileobj=str(file_path),
+                         path_in_repo=repo_path,
+                         repo_id=repo_id,
+                         commit_message=commit_message
+                     )
+         else:
+             # Upload the entire folder
+             api.upload_folder(
+                 folder_path=local_dir,
+                 repo_id=repo_id,
+                 path_in_repo=path_in_repo,
+                 commit_message=commit_message
+             )
+
+         print(f"[✅] Uploaded to Hub: https://huggingface.co/{repo_id}")
+
+     except Exception as e:
+         print(f"[❌] Hub upload failed: {e}")
+         print("    Continuing training (files saved locally)")
+
+
+ def prepare_hub_upload(
+     weights_dir: str,
+     runs_dir: str,
+     config: DavidTrainingConfig,
+     david_config: DavidArchitectureConfig,
+     best_metrics: Dict,
+     model_name: str
+ ):
+     """Prepare and upload all artifacts to the HuggingFace Hub."""
+
+     if not config.upload_to_hub or not config.hf_repo:
+         return
+
+     print("\n[🤗] Preparing HuggingFace Hub upload...")
+
+     # Create the model summary file
+     summary_path = create_model_summary(weights_dir, config, david_config, best_metrics, model_name)
+
+     # Update the master models index
+     update_models_index(config, david_config, best_metrics, model_name)
+
+     api = HfApi()
+     try:
+         create_repo(config.hf_repo, exist_ok=True, repo_type="model")
+     except Exception:
+         pass  # Repo already exists
+
+     # Create a temporary directory for root files
+     with tempfile.TemporaryDirectory() as temp_dir:
+         # Generate README at the repo root
+         readme_path = os.path.join(temp_dir, "README.md")
+         readme_content = generate_model_readme(config, david_config, best_metrics, config.run_id)
+         with open(readme_path, 'w') as f:
+             f.write(readme_content)
+         print(f"[📝] Generated README.md")
+
+         # Save best_model.json at the repo root
+         best_json_path = os.path.join(temp_dir, "best_model.json")
+         save_best_model_json(best_json_path, best_metrics, config, david_config)
+
+         # Upload root files (README.md, best_model.json)
+         print(f"[📤] Uploading root files...")
+
+         api.upload_file(
+             path_or_fileobj=readme_path,
+             path_in_repo="README.md",
+             repo_id=config.hf_repo,
+             commit_message=f"Update README - Run {config.run_id}"
+         )
+
+         api.upload_file(
+             path_or_fileobj=best_json_path,
+             path_in_repo="best_model.json",
+             repo_id=config.hf_repo,
+             commit_message=f"Update metrics - Run {config.run_id}"
+         )
+
+     # Upload ONLY the essential weight files (not the entire directory!)
+     weights_repo_path = f"weights/{model_name}/{config.run_id}"
+     best_acc = best_metrics.get('best_val_acc', 0.0)
+
+     print(f"[📤] Uploading essential files to {weights_repo_path}...")
+
+     # Specific files to upload (not the entire directory)
+     files_to_upload = [
+         ('MODEL_SUMMARY.txt', 'MODEL_SUMMARY.txt'),
+         ('training_history.json', 'training_history.json'),
+         ('david_config.json', 'david_config.json'),
+         ('train_config.json', 'train_config.json'),
+         (f'best_model_acc{best_acc:.2f}.safetensors', f'best_model_acc{best_acc:.2f}.safetensors'),
+         (f'best_model_acc{best_acc:.2f}_metadata.json', f'best_model_acc{best_acc:.2f}_metadata.json'),
+     ]
+
+     for local_filename, repo_filename in files_to_upload:
+         local_path = os.path.join(weights_dir, local_filename)
+         if os.path.exists(local_path):
+             try:
+                 api.upload_file(
+                     path_or_fileobj=local_path,
+                     path_in_repo=f"{weights_repo_path}/{repo_filename}",
+                     repo_id=config.hf_repo,
+                     commit_message=f"Update {repo_filename} - Run {config.run_id}"
+                 )
+             except Exception as e:
+                 print(f"[⚠️] Failed to upload {repo_filename}: {e}")
+
+     print(f"[✅] Uploaded to Hub: https://huggingface.co/{config.hf_repo}")
+
+     # Upload TensorBoard logs (only if they exist and it's the final upload).
+     # Skipped during training to avoid huge uploads every epoch:
+     # if os.path.exists(runs_dir):
+     #     runs_repo_path = f"runs/{model_name}/{config.run_id}"
+     #     print(f"[📤] Uploading TensorBoard logs to {runs_repo_path}...")
+     #     upload_to_huggingface(
+     #         local_dir=runs_dir,
+     #         repo_id=config.hf_repo,
+     #         commit_message=f"Upload TensorBoard logs - {model_name} - Run {config.run_id}",
+     #         path_in_repo=runs_repo_path
+     #     )
+
+
+ # ============================================================================
+ # CHECKPOINT UTILITIES
+ # ============================================================================
+
+ def save_checkpoint(
+     filepath: str,
+     david: David,
+     optimizer: torch.optim.Optimizer,
+     scheduler: Optional[torch.optim.lr_scheduler._LRScheduler],
+     epoch: int,
+     metrics: Dict,
+     train_config: DavidTrainingConfig
+ ):
+     """Save checkpoint in PyTorch and/or SafeTensors format."""
+
+     checkpoint = {
+         'epoch': epoch,
+         'model_state_dict': david.state_dict(),
+         'optimizer_state_dict': optimizer.state_dict(),
+         'scheduler_state_dict': scheduler.state_dict() if scheduler else None,
+         'metrics': metrics,
+         'train_config': train_config.to_dict(),
+     }
+
+     # Add accuracy to the filename if available
+     val_acc = metrics.get('best_val_acc') or metrics.get('val_acc')
+     if val_acc:
+         acc_suffix = f"_acc{val_acc:.2f}"
+         filepath = filepath + acc_suffix
+
+     if train_config.save_format in ['pytorch', 'both']:
+         torch.save(checkpoint, filepath + '.pth')
+         print(f"[💾] Saved PyTorch: {filepath}.pth")
+
+     if train_config.save_format in ['safetensors', 'both']:
+         try:
+             from safetensors.torch import save_file
+
+             # Save model state
+             model_state = {k: v.contiguous() for k, v in david.state_dict().items()}
+             save_file(model_state, filepath + '.safetensors')
+
+             # Save metadata separately (includes the full training history).
+             # Optimizer/scheduler states hold tensors that are not
+             # JSON-serializable, so they are kept out of the JSON metadata.
+             metadata = {k: v for k, v in checkpoint.items()
+                         if k not in ['model_state_dict', 'optimizer_state_dict', 'scheduler_state_dict']}
+             with open(filepath + '_metadata.json', 'w') as f:
+                 json.dump(metadata, f, indent=2, default=str)
+
+             print(f"[💾] Saved SafeTensors: {filepath}.safetensors")
+         except ImportError:
+             print(f"[⚠️] SafeTensors not available, skipping")
+
+
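One detail in the filename logic above is worth calling out: `metrics.get('best_val_acc') or metrics.get('val_acc')` falls through on *falsy* values, so an accuracy of exactly 0.0 skips to the next key. A hypothetical standalone sketch of just the suffix step (the dicts are invented):

```python
# How the checkpoint path gets its "_accXX.XX" suffix, isolated for clarity.
def suffixed_path(base: str, metrics: dict) -> str:
    # NOTE: `or` treats 0.0 as missing, so a genuine 0.0 best_val_acc
    # falls through to val_acc. Harmless here, but easy to trip over.
    val_acc = metrics.get('best_val_acc') or metrics.get('val_acc')
    if val_acc:
        return base + f"_acc{val_acc:.2f}"
    return base

print(suffixed_path("best_model", {"best_val_acc": 73.456}))
print(suffixed_path("best_model", {"best_val_acc": 0.0, "val_acc": 12.5}))
```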
+ def load_checkpoint(
+     checkpoint_path: str,
+     david: David,
+     optimizer: Optional[torch.optim.Optimizer] = None,
+     scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None,
+     device: str = "cuda"
+ ) -> Tuple[int, Dict]:
+     """Load a checkpoint and return (epoch, metrics)."""
+
+     if checkpoint_path.endswith('.safetensors'):
+         # Load SafeTensors format
+         try:
+             from safetensors.torch import load_file
+
+             model_state = load_file(checkpoint_path, device=device)
+             david.load_state_dict(model_state)
+
+             # Load metadata
+             metadata_path = checkpoint_path.replace('.safetensors', '_metadata.json')
+             with open(metadata_path, 'r') as f:
+                 metadata = json.load(f)
+
+             epoch = metadata.get('epoch', 0)
+             metrics = metadata.get('metrics', {})
+
+             if optimizer and 'optimizer_state_dict' in metadata:
+                 optimizer.load_state_dict(metadata['optimizer_state_dict'])
+
+             if scheduler and 'scheduler_state_dict' in metadata and metadata['scheduler_state_dict']:
+                 scheduler.load_state_dict(metadata['scheduler_state_dict'])
+
+             print(f"[✅] Loaded from SafeTensors: {checkpoint_path}")
+             return epoch, metrics
+
+         except ImportError:
+             raise ImportError("safetensors not installed")
+
+     else:
+         # Load PyTorch format
+         checkpoint = torch.load(checkpoint_path, map_location=device)
+
+         david.load_state_dict(checkpoint['model_state_dict'])
+
+         if optimizer and 'optimizer_state_dict' in checkpoint:
+             optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
+
+         if scheduler and 'scheduler_state_dict' in checkpoint and checkpoint['scheduler_state_dict']:
+             scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
+
+         print(f"[✅] Loaded from PyTorch: {checkpoint_path}")
+         return checkpoint['epoch'], checkpoint.get('metrics', {})
+
+
+ # ============================================================================
+ # DATASET
+ # ============================================================================
+
+ class ImageNetHFDataset(Dataset):
+     """PyTorch Dataset wrapper for HuggingFace ImageNet features."""
+
+     def __init__(self, dataset_name: str, model_variant: str, split: str = "train"):
+         # Load only the requested split to avoid downloading all the data
+         print(f"[📥] Loading {split} split for {model_variant}...")
+         self.dataset = load_dataset(
+             dataset_name,
+             name=model_variant,  # Dataset configuration/variant name
+             split=split  # Only load this specific split
+         )
+         self.length = len(self.dataset)
+         print(f"[✅] Loaded {self.length:,} samples from {split} split")
+
+     def __len__(self):
+         return self.length
+
+     def __getitem__(self, idx):
+         item = self.dataset[idx]
+         features = torch.tensor(item['clip_features'], dtype=torch.float32)
+         label = torch.tensor(item['label'], dtype=torch.long)
+         return features, label
+
+
+ def create_dataloaders(config: DavidTrainingConfig):
+     """Create train and validation dataloaders."""
+
+     train_dataset = ImageNetHFDataset(
+         config.dataset_name, config.model_variant, "train"
+     )
+     val_dataset = ImageNetHFDataset(
+         config.dataset_name, config.model_variant, "validation"
+     )
+
+     train_loader = DataLoader(
+         train_dataset,
+         batch_size=config.batch_size,
+         shuffle=True,
+         num_workers=config.num_workers,
+         pin_memory=config.pin_memory,
+         prefetch_factor=config.prefetch_factor,
+         persistent_workers=config.persistent_workers
+     )
+
+     val_loader = DataLoader(
+         val_dataset,
+         batch_size=config.batch_size * 2,  # no gradients in eval, so larger batches fit
+         shuffle=False,
+         num_workers=config.num_workers,
+         pin_memory=config.pin_memory,
+         prefetch_factor=config.prefetch_factor,
+         persistent_workers=config.persistent_workers
+     )
+
+     return train_loader, val_loader
+
+
+ # ============================================================================
+ # CRYSTAL GENERATOR
+ # ============================================================================
+
+ class CrystalGenerator:
+     """Generate crystals for all scales."""
+
+     def __init__(self, num_classes: int, scales: List[int], device: str = "cuda"):
+         self.num_classes = num_classes
+         self.scales = scales
+         self.device = device
+         self.factories = {
+             scale: SimplexFactory(k=4, embed_dim=scale, method="random")
+             for scale in scales
+         }
+
+     def generate(self, seed: int = 42) -> Tuple[Dict[int, torch.Tensor], Dict[int, torch.Tensor]]:
+         """Generate anchors and crystals for all scales."""
+
+         anchors_dict = {}
+         crystals_dict = {}
+
+         for scale in tqdm(self.scales, desc="Generating crystals"):
+             factory = self.factories[scale]
+             batch_crystals = []
+
+             for class_idx in range(self.num_classes):
+                 crystal = factory.build(
+                     backend="torch",
+                     device=self.device,
+                     dtype=torch.float32,
+                     seed=seed + class_idx,
+                     validate=True
+                 )
+                 batch_crystals.append(crystal)
+
+             crystals = torch.stack(batch_crystals)
+             anchors = F.normalize(crystals[:, 0, :], dim=-1)
+
+             # Verify anchor diversity
+             anchor_sims = anchors @ anchors.T
+             off_diag = anchor_sims[~torch.eye(self.num_classes, dtype=bool, device=anchors.device)]
+             max_sim = off_diag.max().item()
+             mean_sim = off_diag.mean().item()
+
+             print(f"  Scale {scale}: max_sim={max_sim:.4f}, mean_sim={mean_sim:.4f}")
+
+             if max_sim > 0.99:
+                 print(f"  ⚠️ WARNING: Anchors too similar at scale {scale}!")
+
+             anchors_dict[scale] = anchors
+             crystals_dict[scale] = crystals
+
+         return anchors_dict, crystals_dict
+
+
+ # ============================================================================
+ # TRAINING LOOP
+ # ============================================================================
+
+ def train_epoch(
+     david: David,
+     train_loader: DataLoader,
+     optimizer: torch.optim.Optimizer,
+     criterion: MultiScaleCrystalLoss,
+     anchors_dict: Dict[int, torch.Tensor],
+     crystals_dict: Dict[int, torch.Tensor],
+     epoch: int,
+     config: DavidTrainingConfig,
+     writer: Optional[SummaryWriter],
+     global_step: int
+ ) -> Tuple[float, float, int, Dict]:
+     """Train for one epoch - pure FP32."""
+
+     david.train()
+     david.update_epoch(epoch)
+
+     total_loss = 0
+     correct = 0
+     total = 0
+     loss_components_sum = {}
+
+     active_scales = david.get_active_scales()
+
+     pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{config.num_epochs}")
+
+     for batch_idx, (features, labels) in enumerate(pbar):
+         features = features.cuda(non_blocking=True)
+         labels = labels.cuda(non_blocking=True)
+
+         # Zero gradients
+         optimizer.zero_grad()
+
+         # Forward pass - pure FP32, no autocast
+         combined, logits_list, features_list, fusion_weights = david(
+             features, anchors_dict, return_all_scales=True
+         )
+
+         # Compute loss
+         losses = criterion(
+             combined, logits_list, features_list,
+             labels, crystals_dict, epoch
+         )
+
+         # Backward
+         losses['total'].backward()
+
+         # Gradient analysis (global_step already advances once per batch)
+         if config.track_gradients and batch_idx % config.log_interval == 0:
+             grad_stats = analyze_gradients(david, config)
+             if writer:
+                 step = global_step
+                 writer.add_scalar('train/grad_mean', grad_stats['mean'], step)
+                 writer.add_scalar('train/grad_max', grad_stats['max'], step)
+                 writer.add_scalar('train/grad_num_small', grad_stats['num_small'], step)
+
+         # Scale small gradients
+         scale_small_gradients(david, config)
+
+         # Gradient clipping
+         torch.nn.utils.clip_grad_norm_(david.parameters(), config.gradient_clip)
+
+         # Optimizer step
+         optimizer.step()
+
+         # Metrics
+         total_loss += losses['total'].item()
+         _, predicted = torch.max(combined, 1)
+         total += labels.size(0)
+         correct += (predicted == labels).sum().item()
+
+         # Accumulate loss components
+         for key, value in losses.items():
+             if key not in loss_components_sum:
+                 loss_components_sum[key] = 0.0
+             loss_components_sum[key] += value.item()
+
+         # Logging
+         if writer and batch_idx % config.log_interval == 0:
+             step = global_step
+             writer.add_scalar('train/loss_batch', losses['total'].item(), step)
+             writer.add_scalar('train/acc_batch', 100 * correct / total, step)
+
+             if config.log_loss_components:
+                 for key, value in losses.items():
+                     if key != 'total':
+                         writer.add_scalar(f'train/loss_{key}', value.item(), step)
+
+             if config.log_fusion_weights and fusion_weights is not None:
+                 if fusion_weights.dim() == 2:
+                     mean_weights = fusion_weights.mean(dim=0)
+                     for i, w in enumerate(mean_weights):
+                         if i < len(active_scales):
+                             writer.add_scalar(
+                                 f'train/fusion_weight_{active_scales[i]}',
+                                 w.item(), step
+                             )
+
+             writer.add_scalar('train/lr', optimizer.param_groups[0]['lr'], step)
+
+         pbar.set_postfix({
+             'loss': f'{total_loss / (batch_idx + 1):.4f}',
+             'acc': f'{100 * correct / total:.2f}%'
+         })
+
+         global_step += 1
+
+     # Average the loss components
+     avg_components = {k: v / len(train_loader) for k, v in loss_components_sum.items()}
+
+     return (
+         total_loss / len(train_loader),
+         100 * correct / total,
+         global_step,
+         avg_components
+     )
+
+
+ @torch.no_grad()
+ def validate(
+     david: David,
+     val_loader: DataLoader,
+     anchors_dict: Dict[int, torch.Tensor],
+     config: DavidTrainingConfig
+ ) -> Tuple[float, Dict[int, float]]:
+     """Validate the model - pure FP32."""
+
+     david.eval()
+
+     correct = 0
+     total = 0
+     active_scales = david.get_active_scales()
+     scale_correct = {scale: 0 for scale in active_scales}
+
+     for features, labels in tqdm(val_loader, desc="Validation", leave=False):
+         features = features.cuda(non_blocking=True)
+         labels = labels.cuda(non_blocking=True)
+
+         # Forward pass - no autocast
+         combined, logits_list, _, _ = david(
+             features, anchors_dict, return_all_scales=True
+         )
+
+         _, predicted = torch.max(combined, 1)
+         total += labels.size(0)
+         correct += (predicted == labels).sum().item()
+
+         for i, scale in enumerate(active_scales):
+             if i < len(logits_list):
+                 _, scale_pred = torch.max(logits_list[i], 1)
+                 scale_correct[scale] += (scale_pred == labels).sum().item()
+
+     accuracy = 100 * correct / total
+     scale_accs = {s: 100 * scale_correct[s] / total for s in scale_correct}
+
+     return accuracy, scale_accs
+
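`validate` keeps one correct-count per scale alongside the fused prediction, then converts counts to percentages at the end. A torch-free sketch of that bookkeeping, with made-up prediction lists standing in for the per-scale argmax outputs:

```python
# Hypothetical distillation of validate()'s per-scale accuracy tallies.
def scale_accuracies(preds_per_scale: dict, labels: list) -> dict:
    total = len(labels)
    return {
        scale: 100 * sum(p == y for p, y in zip(preds, labels)) / total
        for scale, preds in preds_per_scale.items()
    }

labels = [0, 1, 1, 2]
preds = {256: [0, 1, 0, 2], 512: [0, 1, 1, 2]}  # invented scales/predictions
print(scale_accuracies(preds, labels))  # {256: 75.0, 512: 100.0}
```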
+
+ # ============================================================================
+ # MAIN TRAINING FUNCTION
+ # ============================================================================
+
+ def train_david(config: DavidTrainingConfig):
+     """Main training pipeline."""
+
+     # Enable TensorFloat32 for better performance on Ampere+ GPUs
+     torch.set_float32_matmul_precision('high')
+
+     print("="*80)
+     print("🌟 DAVID TRAINING PIPELINE")
+     print("="*80)
+     print(f"Run ID: {config.run_id}")
+     print(f"Preset: {config.preset}")
+     print(f"Batch Size: {config.batch_size}")
+     print(f"Learning Rate: {config.learning_rate}")
+     print(f"Mixed Precision: {config.use_mixed_precision}")
+     print(f"TensorFloat32: Enabled (high precision)")
+     print("="*80)
+
+     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+     # Load or create the David config FIRST (needed for model_name)
+     if config.custom_config_path:
+         david_config = DavidArchitectureConfig.from_json(config.custom_config_path)
+         print(f"[📁] Loaded custom config: {config.custom_config_path}")
+     elif config.preset:
+         david_config = DavidPresets.get_preset(config.preset)
+         print(f"[⚙️] Using preset: {config.preset}")
+     else:
+         raise ValueError("Must specify either preset or custom_config_path")
+
+     # Create the model name from the architecture
+     model_name = f"David-{david_config.sharing_mode}-{david_config.fusion_mode}"
+     print(f"[🏷️] Model: {model_name}")
+
+     # Set up directories with the hierarchy: weights/model_name/timestamp/
+     weights_dir = os.path.join(config.base_dir, "weights", model_name, config.run_id)
+     runs_dir = os.path.join(config.base_dir, "runs", model_name, config.run_id)
+     os.makedirs(weights_dir, exist_ok=True)
+     os.makedirs(runs_dir, exist_ok=True)
+
+     print(f"[📁] Weights: {weights_dir}")
+     print(f"[📁] Logs: {runs_dir}")
+
+     writer = SummaryWriter(runs_dir)
+
+     # Apply overrides
+     if config.num_classes_override:
+         david_config.num_classes = config.num_classes_override
+     if config.use_belly_override is not None:
+         david_config.use_belly = config.use_belly_override
+     if config.belly_expand_override is not None:
+         david_config.belly_expand = config.belly_expand_override
+     if config.progressive_training_override is not None:
+         david_config.progressive_training = config.progressive_training_override
+         if not david_config.progressive_training:
+             # Disable warmup if progressive training is disabled
+             david_config.scale_warmup_epochs = {s: 0 for s in david_config.scales}
+
+     # Override the scale warmup schedule if provided
+     if config.scale_warmup_epochs_override is not None:
+         david_config.scale_warmup_epochs = config.scale_warmup_epochs_override
+         # Enable progressive training if a custom schedule was provided
+         if not david_config.progressive_training:
+             print(f"[⚙️] Enabling progressive training (custom warmup schedule provided)")
+             david_config.progressive_training = True
+
+     print(f"[⚙️] Progressive training: {david_config.progressive_training}")
+     if david_config.progressive_training:
+         print(f"  Scale warmup schedule: {david_config.scale_warmup_epochs}")
+
+     # Save configs
+     david_config_path = os.path.join(weights_dir, "david_config.json")
+     david_config.to_json(david_config_path)
+     print(f"[💾] Saved David config: {david_config_path}")
+
+     train_config_path = os.path.join(weights_dir, "train_config.json")
+     config.to_json(train_config_path)
+     print(f"[💾] Saved training config: {train_config_path}")
+
+     # Initialize David
+     david = David.from_config(david_config).cuda()
+     print(f"\n{david}\n")
+
+     # Count parameters
+     total_params = sum(p.numel() for p in david.parameters())
+     trainable_params = sum(p.numel() for p in david.parameters() if p.requires_grad)
+     print(f"[📊] Total Parameters: {total_params:,}")
+     print(f"[📊] Trainable Parameters: {trainable_params:,}")
+
+     # Load data
+     train_loader, val_loader = create_dataloaders(config)
+
+     # Generate crystals
+     crystal_gen = CrystalGenerator(
+         david_config.num_classes,
+         david_config.scales,
+         str(device)
+     )
+     anchors_dict, crystals_dict = crystal_gen.generate()
+
+
+     # Set up training
+     criterion = MultiScaleCrystalLoss(
+         scales=david_config.scales,
+         num_classes=david_config.num_classes,
+         use_rose_loss=config.use_rose_loss,
+         use_cayley_loss=config.use_cayley_loss,
+         rose_initial_weight=config.rose_initial_weight,
+         rose_max_weight=config.rose_max_weight,
+         cayley_weight=config.cayley_weight,
+         scale_loss_balance=config.scale_loss_balance
+     ).cuda()
+
+     optimizer = create_optimizer(david, config)
+     scheduler = create_scheduler(optimizer, config)
+
+     controller = AdaptiveTrainingController(david, config)
+
+     # Tracking
+     best_val_acc = 0.0
+     best_epoch = 0
+     best_scale_accs = {}
+     global_step = 0
+     final_train_acc = 0.0
+     final_train_loss = 0.0
+
+     # Training history for epoch-by-epoch tracking
+     training_history = {
+         'epochs': [],
+         'train_loss': [],
+         'train_acc': [],
+         'val_acc': [],
+         'scale_accs': {},
+         'lr': []
+     }
+
+     # DIAGNOSTIC: Test one forward/backward pass before training
+     print("\n[🔍] Running diagnostic forward/backward pass...")
+     # david.compile()
+     david.train()
+
+     # Grab a small batch
+     for features_test, labels_test in train_loader:
+         features_test = features_test.cuda(non_blocking=True)[:8]  # Just 8 samples
+         labels_test = labels_test.cuda(non_blocking=True)[:8]
+
+         # Forward
+         combined_test, logits_test, features_test_out, _ = david(
+             features_test, anchors_dict, return_all_scales=True
+         )
+
+         # Loss
+         losses_test = criterion(
+             combined_test, logits_test, features_test_out,
+             labels_test, crystals_dict, epoch=0
+         )
+
+         print(f"  Initial loss: {losses_test['total'].item():.6f}")
+         print(f"  Loss components:")
+         for key, value in losses_test.items():
+             if key != 'total':
+                 print(f"    {key}: {value.item():.6f}")
+
+         # Backward
+         optimizer.zero_grad()
+         losses_test['total'].backward()
+
+         # Check gradients
+         grad_count = sum(1 for p in david.parameters() if p.grad is not None and p.grad.norm() > 0)
+         total_grad_params = sum(1 for p in david.parameters() if p.requires_grad)
+         print(f"  Parameters with non-zero gradients: {grad_count}/{total_grad_params}")
+
+         if grad_count == 0:
+             print(f"  ❌ ERROR: No gradients! Training will not work.")
+             return None, 0.0
+         elif grad_count < total_grad_params * 0.5:
+             print(f"  ⚠️ WARNING: Less than 50% of parameters have gradients")
+         else:
+             print(f"  ✅ Gradients look good")
+
+         break  # Only test one batch
+
+     print("\n[🚀] Starting training...\n")
+
+ for epoch in range(config.num_epochs):
1593
+ epoch_start = time.time()
1594
+
1595
+ # Train
1596
+ train_loss, train_acc, global_step, loss_components = train_epoch(
1597
+ david, train_loader, optimizer, criterion,
1598
+ anchors_dict, crystals_dict, epoch, config,
1599
+ writer, global_step
1600
+ )
1601
+
1602
+ # Validate
1603
+ val_acc, scale_accs = validate(david, val_loader, anchors_dict, config)
1604
+
1605
+ # Update controller
1606
+ controller.update_metrics(scale_accs, val_acc)
1607
+ controller.apply_adaptive_strategies(scale_accs, epoch)
1608
+
1609
+ # Step scheduler
1610
+ if scheduler:
1611
+ scheduler.step()
1612
+
1613
+ epoch_time = time.time() - epoch_start
1614
+
1615
+ # Print
1616
+ print(f"\n📊 Epoch {epoch+1}/{config.num_epochs} ({epoch_time:.1f}s)")
1617
+ print(f" Train: Loss={train_loss:.4f}, Acc={train_acc:.2f}%")
1618
+ print(f" Val: Acc={val_acc:.2f}% (Best: {best_val_acc:.2f}%)")
1619
+ print(f" Active scales: {david.get_active_scales()}")
1620
+ print(f" LR: {optimizer.param_groups[0]['lr']:.2e}")
1621
+
1622
+ if config.log_loss_components and loss_components:
1623
+ print(f" Loss breakdown:")
1624
+ for key, value in sorted(loss_components.items()):
1625
+ if key != 'total':
1626
+ print(f" {key:20s}: {value:.6f}")
1627
+
1628
+ for scale, acc in scale_accs.items():
1629
+ frozen = "❄️" if controller.scales_frozen.get(scale, False) else "🔥"
1630
+ print(f" {frozen} Scale {scale}: {acc:.2f}%")
1631
+
1632
+ # Update tracking
1633
+ final_train_acc = train_acc
1634
+ final_train_loss = train_loss
1635
+
1636
+ # Record training history
1637
+        training_history['epochs'].append(epoch + 1)
+        training_history['train_loss'].append(train_loss)
+        training_history['train_acc'].append(train_acc)
+        training_history['val_acc'].append(val_acc)
+        training_history['lr'].append(optimizer.param_groups[0]['lr'])
+
+        # Record per-scale accuracies
+        for scale, acc in scale_accs.items():
+            if scale not in training_history['scale_accs']:
+                training_history['scale_accs'][scale] = []
+            training_history['scale_accs'][scale].append(acc)
+
+        # TensorBoard
+        writer.add_scalar('train/loss', train_loss, epoch)
+        writer.add_scalar('train/acc', train_acc, epoch)
+        writer.add_scalar('val/acc', val_acc, epoch)
+
+        for scale, acc in scale_accs.items():
+            writer.add_scalar(f'val/acc_scale_{scale}', acc, epoch)
+
+        # Save best
+        if val_acc > best_val_acc:
+            best_val_acc = val_acc
+            best_epoch = epoch
+            best_scale_accs = scale_accs.copy()
+
+            # Save training history alongside best model
+            history_path = os.path.join(weights_dir, 'training_history.json')
+            with open(history_path, 'w') as f:
+                json.dump(training_history, f, indent=2)
+
+            save_checkpoint(
+                os.path.join(weights_dir, 'best_model'),
+                david, optimizer, scheduler, epoch,
+                {
+                    'best_val_acc': best_val_acc,
+                    'best_epoch': best_epoch,
+                    'scale_accuracies': best_scale_accs,
+                    'training_history': training_history
+                },
+                config
+            )
+
+            # Upload to hub when best model improves
+            if config.upload_to_hub:
+                best_metrics = {
+                    'best_val_acc': best_val_acc,
+                    'best_epoch': best_epoch,
+                    'scale_accuracies': best_scale_accs,
+                    'final_train_acc': train_acc,
+                    'final_train_loss': train_loss,
+                    'training_history': training_history,
+                    'parameters': total_params
+                }
+                prepare_hub_upload(weights_dir, runs_dir, config, david_config, best_metrics, model_name)
+
+        # Periodic save
+        if (epoch + 1) % config.save_interval == 0:
+            save_checkpoint(
+                os.path.join(weights_dir, f'checkpoint_epoch_{epoch+1}'),
+                david, optimizer, scheduler, epoch,
+                {'val_acc': val_acc},
+                config
+            )
+
+    # Final save
+    save_checkpoint(
+        os.path.join(weights_dir, 'final_model'),
+        david, optimizer, scheduler, config.num_epochs - 1,
+        {'final_val_acc': val_acc},
+        config
+    )
+
+    writer.close()
+
+    # Final hub upload with all artifacts
+    if config.upload_to_hub:
+        print("\n[🤗] Performing final HuggingFace Hub upload...")
+        final_metrics = {
+            'best_val_acc': best_val_acc,
+            'best_epoch': best_epoch,
+            'scale_accuracies': best_scale_accs,
+            'final_train_acc': final_train_acc,
+            'final_train_loss': final_train_loss,
+            'training_history': training_history,
+            'parameters': total_params
+        }
+        prepare_hub_upload(weights_dir, runs_dir, config, david_config, final_metrics, model_name)
+
+    print("\n" + "="*80)
+    print("🎉 Training Complete!")
+    print(f"   Best Val Acc: {best_val_acc:.2f}% (Epoch {best_epoch+1})")
+    print(f"   Final Train Acc: {final_train_acc:.2f}%")
+    print(f"   Weights: {weights_dir}")
+    if config.upload_to_hub:
+        print(f"   Hub: https://huggingface.co/{config.hf_repo}")
+    print("="*80)
+
+    return david, best_val_acc
+
+
+# ============================================================================
+# USAGE EXAMPLE
+# ============================================================================
+
+if __name__ == "__main__":
+    configs = []
+    config = DavidTrainingConfig(
+        preset="clip_vit_bigg14",  # Uses progressive training by default
+        model_variant="clip_vit_laion_bigg14",
+
+        num_epochs=10,
+        batch_size=1024,
+        learning_rate=1e-3,
+
+        use_mixed_precision=False,  # Leave off: mixed precision kills accuracy with Rose
+        gradient_clip=10.0,
+
+        use_rose_loss=True,
+        rose_initial_weight=0.1,
+        rose_max_weight=0.5,
+        use_cayley_loss=False,
+        progressive_training_override=False,
+
+        # Adaptive training (disabled by default for stability)
+        freeze_strategy="none",  # Set to "performance" to enable
+        freeze_threshold=90.0,
+
+        save_format="safetensors",
+
+        # HuggingFace Hub upload
+        # DO NOT put your HF token in the config; load it from os.environ or other means.
+        upload_to_hub=False,  # Set to True to upload to your repo
+        hf_repo="YourName/Repo"  # e.g. "AbstractPhil/gated-david"
+    )
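The config comment above says to load the Hub token from `os.environ` rather than hard-coding it. A minimal sketch of that pattern (the helper name `resolve_hf_token` and the `HF_TOKEN` variable name are assumptions for illustration, not part of this trainer):

```python
import os
from typing import Optional


def resolve_hf_token(env_var: str = "HF_TOKEN") -> Optional[str]:
    """Read the HuggingFace Hub token from the environment.

    Returns None when the variable is unset, so callers can skip the
    upload step instead of shipping a hard-coded credential in config.
    """
    return os.environ.get(env_var)
```

A caller would then gate uploads on `resolve_hf_token()` returning a value, keeping the token out of any serialized config or checkpoint metadata.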
+    #configs.append(DavidTrainingConfig(
+    #    preset="high_accuracy",  # Uses progressive training by default
+    #    model_variant="clip_vit_laion_b32",
+    #
+    #    num_epochs=20,
+    #    batch_size=1024,
+    #    learning_rate=1e-3,
+    #
+    #    use_mixed_precision=False,
+    #    gradient_clip=10.0,
+    #
+    #    use_rose_loss=True,
+    #    rose_initial_weight=0.1,
+    #    rose_max_weight=0.5,
+    #    use_cayley_loss=False,
+    #
+    #    # Adaptive training (disabled by default for stability)
+    #    freeze_strategy="never",  # Set to "performance" to enable
+    #    freeze_threshold=90.0,
+    #
+    #    save_format="safetensors",
+    #
+    #    # HuggingFace Hub upload
+    #    upload_to_hub=True,
+    #    hf_repo="AbstractPhil/gated-david",
+    #))
+    #configs.append(DavidTrainingConfig(
+    #    preset="balanced",  # Uses progressive training by default
+    #    model_variant="clip_vit_laion_b32",
+    #
+    #    num_epochs=20,
+    #    batch_size=1024,
+    #    learning_rate=1e-3,
+    #
+    #    # Custom scale warmup schedule (overrides preset)
+    #    #scale_warmup_epochs_override={
+    #    #    384: 0,   # Scale 384 active from epoch 0
+    #    #    #512: 1,  # Scale 512 active from epoch 1
+    #    #    768: 1,   # Scale 768 active from epoch 1
+    #    #    1024: 2,  # Scale 1024 active from epoch 2
+    #    #    1280: 3   # Scale 1280 active from epoch 3
+    #    #},
+    #    #scale_warmup_epochs_override={
+    #    #    256: 0,
+    #    #    512: 1,
+    #    #    768: 2,
+    #    #    1024: 3,
+    #    #    1280: 4,
+    #    #    1536: 5,
+    #    #    1792: 6,
+    #    #    2048: 7,
+    #    #    2304: 8,
+    #    #    2560: 9
+    #    #},
+    #
+    #    use_mixed_precision=False,
+    #    gradient_clip=10.0,
+    #
+    #    use_rose_loss=True,
+    #    rose_initial_weight=0.1,
+    #    rose_max_weight=0.5,
+    #    use_cayley_loss=False,
+    #
+    #    # Adaptive training (disabled by default for stability)
+    #    freeze_strategy="never",  # Set to "performance" to enable
+    #    freeze_threshold=90.0,
+    #
+    #    save_format="safetensors",
+    #
+    #    # HuggingFace Hub upload
+    #    upload_to_hub=True,
+    #    hf_repo="AbstractPhil/gated-david",
+    #))
+    #
+    #configs.append(DavidTrainingConfig(
+    #    preset="clip_vit_l14_ultra_deep",  # Uses progressive training by default
+    #    model_variant="clip_vit_l14",
+    #
+    #    num_epochs=10,
+    #    batch_size=1024,
+    #    learning_rate=1e-3,
+    #
+    #    # Custom scale warmup schedule (overrides preset)
+    #    #scale_warmup_epochs_override={
+    #    #    384: 0,   # Scale 384 active from epoch 0
+    #    #    #512: 1,  # Scale 512 active from epoch 1
+    #    #    768: 1,   # Scale 768 active from epoch 1
+    #    #    1024: 2,  # Scale 1024 active from epoch 2
+    #    #    1280: 3   # Scale 1280 active from epoch 3
+    #    #},
+    #    #scale_warmup_epochs_override={
+    #    #    256: 0,
+    #    #    512: 1,
+    #    #    768: 2,
+    #    #    1024: 3,
+    #    #    1280: 4,
+    #    #    1536: 5,
+    #    #    1792: 6,
+    #    #    2048: 7,
+    #    #    2304: 8,
+    #    #    2560: 9
+    #    #},
+    #
+    #    use_mixed_precision=False,
+    #    gradient_clip=10.0,
+    #
+    #    use_rose_loss=True,
+    #    rose_initial_weight=0.1,
+    #    rose_max_weight=0.5,
+    #    use_cayley_loss=False,
+    #
+    #    # Adaptive training (disabled by default for stability)
+    #    freeze_strategy="never",  # Set to "performance" to enable
+    #    freeze_threshold=90.0,
+    #
+    #    save_format="safetensors",
+    #
+    #    # HuggingFace Hub upload
+    #    upload_to_hub=True,
+    #    hf_repo="AbstractPhil/gated-david",
+    #))
+
+    #for config in configs:
+    #    print("Starting train")
+    #    try:
+    david, best_acc = train_david(config)
+    #    except Exception as e:
+    #        print(f"Error during training: {e}")
+    #print("train complete")