"""
David Training Pipeline
========================
Training pipeline for the David multi-scale crystal classifier.

Should be placed at: geovocab2/train/model/core/david_trainer.py
Or run from: scripts/train_david.py

Features:
- Pure fp32 training (no mixed precision for geometric stability)
- Adaptive training controller (freeze/unfreeze scales)
- Gradient analysis and scaling
- SafeTensors checkpoint support
- Enhanced loss component tracking
- Proper weight organization: weights/model_name/timestamp/
- Accuracy in filenames and comprehensive tracking
- Master models index (MODELS_INDEX.json)
"""
| |
|
| | import torch |
| | import torch.nn.functional as F |
| | from torch.utils.data import Dataset, DataLoader |
| | from torch.utils.tensorboard import SummaryWriter |
| | from datasets import load_dataset |
| | from huggingface_hub import HfApi, create_repo, upload_folder, upload_file |
| | import numpy as np |
| | import os |
| | import json |
| | import time |
| | import tempfile |
| | from datetime import datetime |
| | from tqdm.auto import tqdm |
| | from pathlib import Path |
| | from typing import Dict, List, Optional, Tuple, Union |
| | from dataclasses import dataclass, field, asdict |
| |
|
| | |
| | from geovocab2.train.config.david_config import ( |
| | DavidArchitectureConfig, |
| | DavidPresets, |
| | SharingMode, |
| | FusionMode |
| | ) |
| |
|
| | from geovocab2.train.model.core.david import ( |
| | David, |
| | MultiScaleCrystalLoss, |
| | ) |
| |
|
| | |
| | from geovocab2.shapes.factory import SimplexFactory |
| |
|
| |
|
| | |
| | |
| | |
| |
|
@dataclass
class DavidTrainingConfig:
    """
    Complete training configuration for David.
    Separate from model architecture config.
    """

    # Run identity
    name: str = "david_training"
    run_id: str = ""  # auto-filled with a timestamp when left empty

    # Dataset / task
    dataset_name: str = "AbstractPhil/imagenet-clip-features-orderly"
    model_variant: Union[str, List[str]] = "clip_vit_b16"
    num_classes: int = 1000

    # Architecture selection
    preset: Optional[str] = "balanced"
    custom_config_path: Optional[str] = None

    # Architecture overrides (None -> keep preset value)
    num_classes_override: Optional[int] = None
    use_belly_override: Optional[bool] = None
    belly_expand_override: Optional[float] = None
    progressive_training_override: Optional[bool] = True
    scale_warmup_epochs_override: Optional[Dict[int, int]] = None

    # Optimization schedule
    num_epochs: int = 50
    batch_size: int = 512
    learning_rate: float = 5e-3
    weight_decay: float = 1e-5
    warmup_epochs: int = 3

    # Loss composition
    use_rose_loss: bool = True
    rose_initial_weight: float = 0.01
    rose_max_weight: float = 0.1
    rose_weight_schedule: str = "adaptive"
    use_cayley_loss: bool = False
    cayley_weight: float = 0.001
    scale_loss_balance: Optional[Dict[int, float]] = None

    # Numerics / LR schedule
    use_mixed_precision: bool = False
    gradient_clip: float = 5.0
    scheduler_type: str = "cosine_restarts"
    min_lr: float = 1e-6

    # Adaptive freeze/unfreeze strategy
    freeze_strategy: str = "never"
    freeze_threshold: float = 90.0
    unfreeze_on_plateau: bool = True
    patience: int = 10

    # Gradient diagnostics
    track_gradients: bool = True
    gradient_scale_threshold: float = 1e-5
    gradient_scale_multiplier: float = 10.0

    # Logging cadence
    log_interval: int = 50
    val_interval: int = 1
    save_interval: int = 5
    log_fusion_weights: bool = True
    log_loss_components: bool = True

    # Checkpoint format: 'pytorch', 'safetensors' or 'both'
    save_format: str = "both"

    # HuggingFace Hub
    hf_repo: Optional[str] = ""
    upload_to_hub: bool = False

    # Local output root
    base_dir: str = "./david_training"

    # DataLoader knobs
    num_workers: int = 10
    pin_memory: bool = True
    prefetch_factor: int = 4
    persistent_workers: bool = True

    def __post_init__(self):
        """Stamp run_id with the current time when none was supplied."""
        if not self.run_id:
            self.run_id = datetime.now().strftime('%Y%m%d_%H%M%S')

    def to_dict(self) -> dict:
        """Return the configuration as a plain dictionary."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: dict) -> 'DavidTrainingConfig':
        """Build a configuration from a plain dictionary."""
        return cls(**data)

    def to_json(self, path: str):
        """Serialize to JSON, stringifying int dict keys for validity."""
        data = self.to_dict()
        # JSON object keys must be strings; these two fields are int-keyed.
        for key in ('scale_loss_balance', 'scale_warmup_epochs_override'):
            mapping = data.get(key)
            if mapping:
                data[key] = {str(k): v for k, v in mapping.items()}
        with open(path, 'w') as f:
            json.dump(data, f, indent=2)

    @classmethod
    def from_json(cls, path: str) -> 'DavidTrainingConfig':
        """Load a configuration from JSON, restoring int dict keys."""
        with open(path, 'r') as f:
            data = json.load(f)
        # Undo the str() conversion applied by to_json().
        for key in ('scale_loss_balance', 'scale_warmup_epochs_override'):
            if data.get(key):
                data[key] = {int(k): v for k, v in data[key].items()}
        return cls(**data)
| |
|
| |
|
| | |
| | |
| | |
| |
|
class AdaptiveTrainingController:
    """Manages adaptive training strategies for multi-scale model."""

    def __init__(self, model: David, config: DavidTrainingConfig):
        self.model = model
        self.config = config

        # Per-scale bookkeeping keyed by the model's scale sizes.
        self.scale_history = {s: [] for s in model.scales}
        self.best_scale_acc = dict.fromkeys(model.scales, 0.0)
        self.scales_frozen = dict.fromkeys(model.scales, False)

        # Overall-accuracy bookkeeping for plateau detection.
        self.overall_history = []
        self.plateau_counter = 0
        self.best_overall = 0.0

    def update_metrics(self, scale_accuracies: Dict[int, float], overall_accuracy: float):
        """Record per-scale and overall accuracy; track bests and plateau length."""
        for scale, acc in scale_accuracies.items():
            self.scale_history[scale].append(acc)
            self.best_scale_acc[scale] = max(self.best_scale_acc[scale], acc)

        self.overall_history.append(overall_accuracy)

        if overall_accuracy > self.best_overall:
            self.best_overall = overall_accuracy
            self.plateau_counter = 0
        else:
            self.plateau_counter += 1

    def should_freeze_scale(self, scale: int, current_acc: float) -> bool:
        """Decide whether a (not yet frozen) scale qualifies for freezing."""
        strategy = self.config.freeze_strategy
        if strategy == "never" or self.scales_frozen[scale]:
            return False
        if strategy == "performance":
            return current_acc >= self.config.freeze_threshold
        return False

    def should_unfreeze_scales(self) -> bool:
        """True when a long enough plateau warrants unfreezing everything."""
        return self.config.unfreeze_on_plateau and self.plateau_counter >= 5

    def apply_adaptive_strategies(self, scale_accuracies: Dict[int, float], epoch: int):
        """Freeze high-performing scales; unfreeze all of them on plateau."""
        active_scales = self.model.get_active_scales()

        for scale, acc in scale_accuracies.items():
            if not self.should_freeze_scale(scale, acc):
                continue
            # Never freeze the last remaining trainable active scale.
            still_trainable = [
                s for s in active_scales if not self.scales_frozen.get(s, False)
            ]
            if len(still_trainable) <= 1:
                print(f"[⚠️] Skipping freeze of scale {scale} (would leave no active trainable scales)")
                continue
            self.model.freeze_scale(scale)
            self.scales_frozen[scale] = True
            print(f"[❄️] Froze scale {scale} (acc={acc:.2f}%)")

        if self.should_unfreeze_scales() and any(self.scales_frozen.values()):
            for scale in self.model.scales:
                if self.scales_frozen[scale]:
                    self.model.unfreeze_scale(scale)
                    self.scales_frozen[scale] = False
            self.plateau_counter = 0
            print(f"[🔥] Unfroze all scales due to plateau")
| |
|
| |
|
| | |
| | |
| | |
| |
|
def create_optimizer(david: David, config: DavidTrainingConfig) -> torch.optim.Optimizer:
    """Create an AdamW optimizer with named parameter groups.

    Groups:
      - 'shared': the shared extractor/base trunk (full LR)
      - 'scale_<N>': per-scale heads (and refiners in hierarchical mode)
      - 'fusion': fusion module or raw fusion weights (half LR for stability)

    Args:
        david: The David model whose parameters are being optimized.
        config: Training configuration providing learning_rate/weight_decay.

    Returns:
        AdamW optimizer over the collected parameter groups.
    """
    param_groups = []

    # Shared trunk — attribute name depends on the sharing mode.
    if hasattr(david, 'shared_extractor'):
        param_groups.append({
            'params': david.shared_extractor.parameters(),
            'lr': config.learning_rate,
            'name': 'shared'
        })
    elif hasattr(david, 'shared_base'):
        param_groups.append({
            'params': david.shared_base.parameters(),
            'lr': config.learning_rate,
            'name': 'shared'
        })

    # Per-scale parameters.
    for scale in david.scales:
        scale_params = []
        if david.sharing_mode == SharingMode.HIERARCHICAL:
            # FIX: use `is not None` instead of truthiness. Container modules
            # (e.g. nn.Sequential / nn.ModuleList) define __len__, so an
            # empty module is falsy and `if head:` would silently drop it.
            head = getattr(david, f'head_{scale}', None)
            if head is not None:
                scale_params.extend(head.parameters())
            refine = getattr(david, f'refine_{scale}', None)
            if refine is not None:
                scale_params.extend(refine.parameters())
        else:
            scale_params.extend(david.heads[str(scale)].parameters())

        if scale_params:
            param_groups.append({
                'params': scale_params,
                'lr': config.learning_rate,
                'name': f'scale_{scale}'
            })

    # Fusion — learned module or a raw weight tensor, at half LR.
    if hasattr(david, 'fusion'):
        param_groups.append({
            'params': david.fusion.parameters(),
            'lr': config.learning_rate * 0.5,
            'name': 'fusion'
        })
    elif hasattr(david, 'fusion_weights'):
        param_groups.append({
            'params': [david.fusion_weights],
            'lr': config.learning_rate * 0.5,
            'name': 'fusion'
        })

    return torch.optim.AdamW(param_groups, weight_decay=config.weight_decay)
| |
|
| |
|
def create_scheduler(optimizer: torch.optim.Optimizer,
                     config: DavidTrainingConfig) -> torch.optim.lr_scheduler._LRScheduler:
    """Build the LR scheduler named by ``config.scheduler_type``.

    Returns None for unrecognized scheduler types (constant LR).
    """
    kind = config.scheduler_type
    if kind == "cosine_restarts":
        return torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
            optimizer, T_0=10, T_mult=2, eta_min=config.min_lr
        )
    if kind == "cosine":
        return torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=config.num_epochs, eta_min=config.min_lr
        )
    return None
| |
|
| |
|
| | |
| | |
| | |
| |
|
def analyze_gradients(model: David, config: DavidTrainingConfig) -> Dict[str, float]:
    """Summarize per-parameter gradient norms for debugging.

    Returns a dict with mean/max/min norm plus counts of (near-)zero and
    suspiciously small gradients. 'min' stays +inf when no grads exist.
    """
    stats = {
        'mean': 0.0,
        'max': 0.0,
        'min': float('inf'),
        'num_zero': 0,
        'num_small': 0,
        'total': 0
    }

    for _name, param in model.named_parameters():
        if param.grad is None:
            continue
        norm = param.grad.norm().item()
        stats['mean'] += norm  # accumulated sum; averaged below
        stats['max'] = max(stats['max'], norm)
        stats['min'] = min(stats['min'], norm)
        stats['total'] += 1

        if norm < 1e-10:
            stats['num_zero'] += 1
        elif norm < config.gradient_scale_threshold:
            stats['num_small'] += 1

    if stats['total'] > 0:
        stats['mean'] /= stats['total']

    return stats
| |
|
| |
|
def scale_small_gradients(model: David, config: DavidTrainingConfig):
    """Multiply tiny (but nonzero) gradients upward to fight vanishing updates."""
    if not config.track_gradients:
        return

    threshold = config.gradient_scale_threshold
    factor = config.gradient_scale_multiplier
    for param in model.parameters():
        if param.grad is None:
            continue
        norm = param.grad.norm()
        # Strictly positive but below threshold -> boost in place.
        if 0 < norm < threshold:
            param.grad.mul_(factor)
| |
|
| |
|
| | |
| | |
| | |
| |
|
def generate_model_readme(
    config: DavidTrainingConfig,
    david_config: DavidArchitectureConfig,
    best_metrics: Dict,
    run_id: str
) -> str:
    """Generate README.md for model card.

    Builds the full HuggingFace model card: YAML front matter with metadata
    and reported accuracy, architecture/training summaries, the expected
    repo layout, and copy-paste usage snippets.

    Args:
        config: Training configuration (dataset, hyperparameters, hub repo).
        david_config: Architecture configuration of the trained model.
        best_metrics: Aggregated results; reads 'best_val_acc', 'best_epoch',
            'final_train_acc', 'parameters', 'scale_accuracies'.
            NOTE(review): assumed key set — confirm against the trainer.
        run_id: Timestamp-style run identifier used in repo paths.

    Returns:
        Complete README content as a single string.
    """

    # NOTE: literal braces inside the embedded code snippets are escaped
    # as {{ }} because this whole template is an f-string.
    readme = f"""---
language: en
license: mit
tags:
- image-classification
- imagenet
- multi-scale
- feature-geometry
- david
datasets:
- imagenet-1k
metrics:
- accuracy
model-index:
- name: David-{david_config.sharing_mode}-{david_config.fusion_mode}
  results:
  - task:
      type: image-classification
    dataset:
      name: ImageNet-1K
      type: imagenet-1k
    metrics:
    - type: accuracy
      value: {best_metrics.get('best_val_acc', 0.0):.2f}
---

# David: Multi-Scale Feature Classifier

**David** is a multi-scale deep learning classifier that uses feature geometry (pentachora/4-simplexes)
as class prototypes with role-weighted similarity computation (Rose Loss).

This version is using multiple variations of clip-vit inputs simultaneously into shared space.
The experiment will determine if entirely deviant variations such as clip-vit-b-patch32 and patch16 can
exist simultaneously in the same shared space with the correct checks and spacings applied.

## Model Details

### Architecture
- **Preset**: {config.preset}
- **Sharing Mode**: {david_config.sharing_mode}
- **Fusion Mode**: {david_config.fusion_mode}
- **Scales**: {david_config.scales}
- **Feature Dim**: {david_config.feature_dim}
- **Parameters**: {best_metrics.get('parameters', 0):,}

### Training Configuration
- **Dataset**: {config.dataset_name}
- **Model Variant**: {config.model_variant}
- **Epochs**: {config.num_epochs}
- **Batch Size**: {config.batch_size}
- **Learning Rate**: {config.learning_rate}
- **Rose Loss Weight**: {config.rose_initial_weight} → {config.rose_max_weight}
- **Cayley Loss**: {config.use_cayley_loss}

## Performance

### Best Results
- **Validation Accuracy**: {best_metrics.get('best_val_acc', 0.0):.2f}%
- **Best Epoch**: {best_metrics.get('best_epoch', 0)}
- **Final Train Accuracy**: {best_metrics.get('final_train_acc', 0.0):.2f}%

### Per-Scale Performance
"""

    # One bullet per scale with its best validation accuracy.
    if 'scale_accuracies' in best_metrics:
        for scale, acc in best_metrics['scale_accuracies'].items():
            readme += f"- **Scale {scale}**: {acc:.2f}%\n"

    readme += f"""

## Usage

### Quick Model Lookup

**Check `MODELS_INDEX.json` in the repo root** - it lists all trained models sorted by accuracy with links to weights and configs.

### Repository Structure

```
{config.hf_repo if config.hf_repo else 'AbstractPhil/david'}/
├── MODELS_INDEX.json              # 📊 Master index of all models (sorted by accuracy)
├── README.md                      # This file
├── best_model.json                # Latest best model info
├── weights/
│   └── {david_config.name}/
│       └── {run_id}/
│           ├── MODEL_SUMMARY.txt           # 🎯 Human-readable performance summary
│           ├── training_history.json       # 📈 Epoch-by-epoch training curve
│           ├── best_model_acc{best_metrics.get('best_val_acc', 0.0):.2f}.safetensors  # ⭐ Accuracy in filename!
│           ├── best_model_acc{best_metrics.get('best_val_acc', 0.0):.2f}_metadata.json
│           ├── final_model.safetensors
│           ├── checkpoint_epoch_X_accYY.YY.safetensors
│           ├── david_config.json
│           └── train_config.json
└── runs/
    └── {david_config.name}/
        └── {run_id}/
            └── events.out.tfevents.*       # TensorBoard logs
```

### Loading the Model

```python
from geovocab2.train.model.core.david import David, DavidArchitectureConfig
from huggingface_hub import hf_hub_download

# Browse available models in MODELS_INDEX.json first!

# Specify model variant and run
model_name = "{david_config.name}"
run_id = "{run_id}"
accuracy = "{best_metrics.get('best_val_acc', 0.0):.2f}"  # From MODELS_INDEX.json

# Download config
config_path = hf_hub_download(
    repo_id="{config.hf_repo if config.hf_repo else 'AbstractPhil/david'}",
    filename=f"weights/{{model_name}}/{{run_id}}/david_config.json"
)
config = DavidArchitectureConfig.from_json(config_path)

# Download weights (accuracy in filename!)
weights_path = hf_hub_download(
    repo_id="{config.hf_repo if config.hf_repo else 'AbstractPhil/david'}",
    filename=f"weights/{{model_name}}/{{run_id}}/best_model_acc{{accuracy}}.safetensors"
)

# Download training history (optional - see full training curve)
history_path = hf_hub_download(
    repo_id="{config.hf_repo if config.hf_repo else 'AbstractPhil/david'}",
    filename=f"weights/{{model_name}}/{{run_id}}/training_history.json"
)

# Load model
from safetensors.torch import load_file
david = David.from_config(config)
david.load_state_dict(load_file(weights_path))
david.eval()
```

### Inference

```python
import torch
import torch.nn.functional as F

# Assuming you have CLIP features (512-dim for ViT-B/16)
features = get_clip_features(image)  # [1, 512]

# Load anchors
anchors_dict = torch.load("anchors.pth")

# Forward pass
with torch.no_grad():
    logits, _ = david(features, anchors_dict)
    predictions = logits.argmax(dim=-1)
```

## Architecture Overview

### Multi-Scale Processing
David processes inputs at multiple scales ({', '.join(map(str, david_config.scales))}),
allowing it to capture both coarse and fine-grained features.

### Shared Representation Space
This variation shares multiple versions of clip-vit models in the same representation space.

### Feature Geometry
Each class is represented by a pentachoron (4-simplex) in embedding space with 5 vertices:
- **Anchor**: Primary class representative
- **Need**: Complementary direction
- **Relation**: Contextual alignment
- **Purpose**: Functional direction
- **Observer**: Meta-perspective

### Rose Loss
Similarity computation uses role-weighted cosine similarities:
```
score = w_anchor * sim(z, anchor) + w_need * sim(z, need) + ...
```

### Fusion Strategy
**{david_config.fusion_mode}**: Intelligently combines predictions from multiple scales.

## Training Details

### Loss Components
- **Cross-Entropy**: Standard classification loss
- **Rose Loss**: Pentachora role-weighted margin loss (weight: {config.rose_initial_weight}→{config.rose_max_weight})
- **Cayley Loss**: Geometric regularization ({'enabled' if config.use_cayley_loss else 'disabled'})

### Optimization
- **Optimizer**: AdamW
- **Weight Decay**: {config.weight_decay}
- **Scheduler**: {config.scheduler_type}
- **Gradient Clip**: {config.gradient_clip}
- **Mixed Precision**: {config.use_mixed_precision}

## Citation

```bibtex
@software{{david_classifier_2025,
  title = {{David: Multi-Scale Feature Classifier}},
  author = {{AbstractPhil}},
  year = {{2025}},
  url = {{https://huggingface.co/{config.hf_repo if config.hf_repo else 'AbstractPhil/david'}}},
  note = {{Run ID: {run_id}}}
}}
```

## License

MIT License

## Acknowledgments

Built with feature lattice geometry and multi-scale deep learning.
Special thanks to Claude (Anthropic) for debugging assistance.

---

*Generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*
"""

    return readme
| |
|
| |
|
def save_best_model_json(
    filepath: str,
    metrics: Dict,
    config: DavidTrainingConfig,
    david_config: DavidArchitectureConfig
):
    """Write a best_model.json summary covering metrics, architecture and file paths."""

    model_name = f"David-{david_config.sharing_mode}-{david_config.fusion_mode}"
    # All per-run artifacts live under this repo-relative prefix.
    run_root = f"weights/{model_name}/{config.run_id}"

    best_model_info = {
        "model_name": model_name,
        "run_id": config.run_id,
        "timestamp": datetime.now().isoformat(),

        # Headline metrics
        "best_val_acc": metrics.get('best_val_acc', 0.0),
        "best_epoch": metrics.get('best_epoch', 0),
        "final_train_acc": metrics.get('final_train_acc', 0.0),
        "final_train_loss": metrics.get('final_train_loss', 0.0),

        # Per-scale breakdown
        "scale_accuracies": metrics.get('scale_accuracies', {}),

        # Architecture snapshot
        "architecture": {
            "preset": config.preset,
            "sharing_mode": david_config.sharing_mode,
            "fusion_mode": david_config.fusion_mode,
            "scales": david_config.scales,
            "feature_dim": david_config.feature_dim,
            "num_classes": david_config.num_classes,
            "use_belly": david_config.use_belly,
            "belly_expand": david_config.belly_expand,
        },

        # Training snapshot
        "training": {
            "dataset": config.dataset_name,
            "model_variant": config.model_variant,
            "num_epochs": config.num_epochs,
            "batch_size": config.batch_size,
            "learning_rate": config.learning_rate,
            "rose_weight": f"{config.rose_initial_weight}→{config.rose_max_weight}",
            "cayley_loss": config.use_cayley_loss,
            "optimizer": "AdamW",
            "scheduler": config.scheduler_type,
        },

        # Repo-relative artifact locations
        "files": {
            "weights_safetensors": f"{run_root}/best_model_acc{metrics.get('best_val_acc', 0.0):.2f}.safetensors",
            "weights_pytorch": f"{run_root}/best_model.pth",
            "config": f"{run_root}/david_config.json",
            "training_config": f"{run_root}/train_config.json",
            "tensorboard": f"runs/{model_name}/{config.run_id}/"
        }
    }

    with open(filepath, 'w') as f:
        json.dump(best_model_info, f, indent=2)

    print(f"[📄] Saved best_model.json: {filepath}")
| |
|
| |
|
def create_model_summary(
    weights_dir: str,
    config: DavidTrainingConfig,
    david_config: DavidArchitectureConfig,
    best_metrics: Dict,
    model_name: str
):
    """Create prominent model summary with accuracy front and center.

    Writes MODEL_SUMMARY.txt into ``weights_dir`` with headline accuracy,
    per-scale breakdown, architecture info and (when a history dict is
    present in ``best_metrics``) an epoch-by-epoch training table.

    Returns:
        Path of the written summary file.
    """

    summary_path = os.path.join(weights_dir, 'MODEL_SUMMARY.txt')

    best_acc = best_metrics.get('best_val_acc', 0.0)
    training_history = best_metrics.get('training_history', {})

    summary = f"""
╔══════════════════════════════════════════════════════════════╗
║                     DAVID MODEL SUMMARY                      ║
╠══════════════════════════════════════════════════════════════╣
║                                                              ║
║   🎯 VALIDATION ACCURACY: {best_acc:.2f}%                         ║
║                                                              ║
╚══════════════════════════════════════════════════════════════╝

MODEL: {model_name}
RUN ID: {config.run_id}
BEST EPOCH: {best_metrics.get('best_epoch', 0) + 1}/{config.num_epochs}

═══════════════════════════════════════════════════════════════

📊 PERFORMANCE BREAKDOWN

Final Training Accuracy: {best_metrics.get('final_train_acc', 0.0):.2f}%
Best Validation Accuracy: {best_acc:.2f}%

Per-Scale Accuracies:
"""

    # One line per scale, sorted for stable output.
    scale_accs = best_metrics.get('scale_accuracies', {})
    for scale in sorted(scale_accs.keys()):
        acc = scale_accs[scale]
        summary += f"  • Scale {scale:4d}: {acc:.2f}%\n"

    summary += f"""
═══════════════════════════════════════════════════════════════

🏗️ ARCHITECTURE

Preset: {config.preset}
Sharing Mode: {david_config.sharing_mode}
Fusion Mode: {david_config.fusion_mode}
Scales: {len(david_config.scales)} scales - {david_config.scales}
Feature Dim: {david_config.feature_dim}
Parameters: {best_metrics.get('parameters', 0):,}

═══════════════════════════════════════════════════════════════

📈 TRAINING CURVE

"""

    # Epoch table only when a history dict with 'val_acc' was collected.
    if training_history and 'val_acc' in training_history:
        summary += "Epoch | Train Acc | Val Acc | Learning Rate\n"
        summary += "------|-----------|----------|--------------\n"

        for i, epoch in enumerate(training_history.get('epochs', [])):
            # Index guards: history lists may be ragged if a run was interrupted.
            train_acc = training_history['train_acc'][i] if i < len(training_history['train_acc']) else 0
            val_acc = training_history['val_acc'][i] if i < len(training_history['val_acc']) else 0
            lr = training_history['lr'][i] if i < len(training_history['lr']) else 0

            # Crown marker on the best-accuracy epoch.
            marker = " 👑" if val_acc == best_acc else ""
            summary += f"{epoch:5d} | {train_acc:8.2f}% | {val_acc:7.2f}%{marker} | {lr:.2e}\n"

    summary += f"""
═══════════════════════════════════════════════════════════════

📁 FILES

Best Model: best_model_acc{best_acc:.2f}.safetensors
Config: david_config.json
Training Cfg: train_config.json
History: training_history.json

═══════════════════════════════════════════════════════════════

Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
"""

    with open(summary_path, 'w') as f:
        f.write(summary)

    print(f"[📄] Created MODEL_SUMMARY.txt")
    return summary_path
| |
|
| |
|
def update_models_index(
    config: DavidTrainingConfig,
    david_config: DavidArchitectureConfig,
    best_metrics: Dict,
    model_name: str
):
    """Update the master MODELS_INDEX.json on the Hub with this run's entry.

    Downloads the existing index (or starts a fresh one), replaces any prior
    entry for this run_id, re-sorts by accuracy, and uploads the result.
    No-op unless Hub upload is enabled and a repo is configured.
    Failures are reported but never interrupt training.
    """

    if not config.upload_to_hub or not config.hf_repo:
        return

    try:
        from huggingface_hub import hf_hub_download
        api = HfApi()

        # Fetch the current index; a missing/unreadable file just means
        # this is the first tracked run.
        try:
            index_path = hf_hub_download(
                repo_id=config.hf_repo,
                filename="MODELS_INDEX.json",
                repo_type="model"
            )
            with open(index_path, 'r') as f:
                models_index = json.load(f)
        except Exception:
            # FIX: was a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit.
            models_index = {
                "repository": config.hf_repo,
                "updated": datetime.now().isoformat(),
                "models": []
            }

        model_entry = {
            "model_name": model_name,
            "run_id": config.run_id,
            "timestamp": datetime.now().isoformat(),
            "best_val_acc": best_metrics.get('best_val_acc', 0.0),
            "best_epoch": best_metrics.get('best_epoch', 0),
            "num_scales": len(david_config.scales),
            "scales": david_config.scales,
            "parameters": best_metrics.get('parameters', 0),
            "sharing_mode": david_config.sharing_mode,
            "fusion_mode": david_config.fusion_mode,
            "preset": config.preset,
            "weights_path": f"weights/{model_name}/{config.run_id}/best_model_acc{best_metrics.get('best_val_acc', 0.0):.2f}.safetensors",
            "config_path": f"weights/{model_name}/{config.run_id}/david_config.json",
            "history_path": f"weights/{model_name}/{config.run_id}/training_history.json"
        }

        # Replace any previous entry for this run, then re-rank by accuracy.
        models_index["models"] = [m for m in models_index["models"] if m.get("run_id") != config.run_id]
        models_index["models"].append(model_entry)

        models_index["models"].sort(key=lambda x: x.get("best_val_acc", 0), reverse=True)
        models_index["updated"] = datetime.now().isoformat()
        models_index["total_models"] = len(models_index["models"])

        # Write to a temp file, upload, and always clean up.
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as f:
            json.dump(models_index, f, indent=2)
            temp_path = f.name

        try:
            api.upload_file(
                path_or_fileobj=temp_path,
                path_in_repo="MODELS_INDEX.json",
                repo_id=config.hf_repo,
                commit_message=f"Update models index - {model_name} @ {best_metrics.get('best_val_acc', 0.0):.2f}%"
            )
        finally:
            # FIX: previously leaked the temp file when upload_file raised.
            os.unlink(temp_path)

        print(f"[📊] Updated MODELS_INDEX.json - {len(models_index['models'])} models tracked")

    except Exception as e:
        print(f"[⚠️] Failed to update models index: {e}")
| |
|
| |
|
def upload_to_huggingface(
    local_dir: str,
    repo_id: str,
    commit_message: str,
    path_in_repo: Optional[str] = None,
    patterns: Optional[List[str]] = None
):
    """Upload a directory (optionally filtered by glob patterns) to the Hub.

    All failures are reported but swallowed so training can continue with
    artifacts saved locally.
    """

    try:
        api = HfApi()

        # Ensure the target repo exists (idempotent).
        try:
            create_repo(repo_id, exist_ok=True, repo_type="model")
            print(f"[🤗] Repo ready: {repo_id}")
        except Exception as e:
            print(f"[⚠️] Repo exists or creation failed: {e}")

        if patterns:
            # Selective upload: push every file matching any glob pattern.
            root = Path(local_dir)
            for pattern in patterns:
                for file_path in root.rglob(pattern):
                    rel_path = file_path.relative_to(local_dir)
                    if path_in_repo:
                        repo_path = f"{path_in_repo}/{rel_path}"
                    else:
                        repo_path = str(rel_path)

                    api.upload_file(
                        path_or_fileobj=str(file_path),
                        path_in_repo=repo_path,
                        repo_id=repo_id,
                        commit_message=commit_message
                    )
        else:
            # Bulk upload of the whole folder.
            api.upload_folder(
                folder_path=local_dir,
                repo_id=repo_id,
                path_in_repo=path_in_repo,
                commit_message=commit_message
            )

        print(f"[✅] Uploaded to Hub: https://huggingface.co/{repo_id}")

    except Exception as e:
        print(f"[❌] Hub upload failed: {e}")
        print(f"    Continuing training (files saved locally)")
| |
|
| |
|
def prepare_hub_upload(
    weights_dir: str,
    runs_dir: str,
    config: DavidTrainingConfig,
    david_config: DavidArchitectureConfig,
    best_metrics: Dict,
    model_name: str
):
    """Prepare and upload all artifacts to HuggingFace Hub.

    Generates the model-card README and best_model.json, refreshes
    MODELS_INDEX.json, then uploads root metadata plus the essential
    weight/config files for this run. No-op unless hub upload is enabled
    and a repo is configured.

    Args:
        weights_dir: Local directory holding this run's weights/configs.
        runs_dir: Local TensorBoard run directory (not uploaded here).
        config: Training configuration (provides hf_repo, run_id, flags).
        david_config: Architecture configuration for the model card.
        best_metrics: Aggregated results for README/metrics files.
        model_name: Model identifier used for repo paths.
    """

    if not config.upload_to_hub or not config.hf_repo:
        return

    print("\n[🤗] Preparing HuggingFace Hub upload...")

    # Refresh the local summary and the global index before pushing files.
    create_model_summary(weights_dir, config, david_config, best_metrics, model_name)
    update_models_index(config, david_config, best_metrics, model_name)

    api = HfApi()
    try:
        create_repo(config.hf_repo, exist_ok=True, repo_type="model")
    except Exception as e:
        # FIX: was a bare `except: pass`, which also swallowed
        # KeyboardInterrupt/SystemExit and hid real auth/config errors.
        # The repo usually already exists, so report and continue.
        print(f"[⚠️] Repo exists or creation failed: {e}")

    with tempfile.TemporaryDirectory() as temp_dir:
        # Model card, regenerated fresh for each upload.
        readme_path = os.path.join(temp_dir, "README.md")
        readme_content = generate_model_readme(config, david_config, best_metrics, config.run_id)
        with open(readme_path, 'w') as f:
            f.write(readme_content)
        print(f"[📝] Generated README.md")

        # Run summary JSON.
        best_json_path = os.path.join(temp_dir, "best_model.json")
        save_best_model_json(best_json_path, best_metrics, config, david_config)

        # Root-level metadata files.
        print(f"[📤] Uploading root files...")

        api.upload_file(
            path_or_fileobj=readme_path,
            path_in_repo="README.md",
            repo_id=config.hf_repo,
            commit_message=f"Update README - Run {config.run_id}"
        )

        api.upload_file(
            path_or_fileobj=best_json_path,
            path_in_repo="best_model.json",
            repo_id=config.hf_repo,
            commit_message=f"Update metrics - Run {config.run_id}"
        )

        # Per-run weight/config files (accuracy embedded in filenames).
        weights_repo_path = f"weights/{model_name}/{config.run_id}"
        best_acc = best_metrics.get('best_val_acc', 0.0)

        print(f"[📤] Uploading essential files to {weights_repo_path}...")

        files_to_upload = [
            ('MODEL_SUMMARY.txt', 'MODEL_SUMMARY.txt'),
            ('training_history.json', 'training_history.json'),
            ('david_config.json', 'david_config.json'),
            ('train_config.json', 'train_config.json'),
            (f'best_model_acc{best_acc:.2f}.safetensors', f'best_model_acc{best_acc:.2f}.safetensors'),
            (f'best_model_acc{best_acc:.2f}_metadata.json', f'best_model_acc{best_acc:.2f}_metadata.json'),
        ]

        for local_filename, repo_filename in files_to_upload:
            local_path = os.path.join(weights_dir, local_filename)
            if not os.path.exists(local_path):
                continue  # optional artifact not produced for this run
            try:
                api.upload_file(
                    path_or_fileobj=local_path,
                    path_in_repo=f"{weights_repo_path}/{repo_filename}",
                    repo_id=config.hf_repo,
                    commit_message=f"Update {repo_filename} - Run {config.run_id}"
                )
            except Exception as e:
                print(f"[⚠️] Failed to upload {repo_filename}: {e}")

        print(f"[✅] Uploaded to Hub: https://huggingface.co/{config.hf_repo}")
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| |
|
| | |
| | |
| | |
| |
|
def save_checkpoint(
    filepath: str,
    david: David,
    optimizer: torch.optim.Optimizer,
    scheduler: Optional[torch.optim.lr_scheduler._LRScheduler],
    epoch: int,
    metrics: Dict,
    train_config: DavidTrainingConfig
):
    """Save checkpoint in PyTorch and/or SafeTensors format.

    The base ``filepath`` (no extension) gets an ``_acc<val>`` suffix when an
    accuracy is found in ``metrics``; the extension(s) written depend on
    ``train_config.save_format`` ('pytorch', 'safetensors', or 'both').

    Args:
        filepath: Base path without extension.
        david: Model whose state_dict is saved.
        optimizer: Optimizer whose state_dict is saved (PyTorch format only).
        scheduler: Optional LR scheduler; its state is saved when present.
        epoch: Epoch index stored in the checkpoint.
        metrics: Metric dict; may contain 'best_val_acc' or 'val_acc'.
        train_config: Training config; serialized into the checkpoint.
    """

    checkpoint = {
        'epoch': epoch,
        'model_state_dict': david.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict() if scheduler else None,
        'metrics': metrics,
        'train_config': train_config.to_dict(),
    }

    # Fix: explicit None checks so a legitimate 0.0 accuracy still produces a
    # filename suffix (the old truthiness test `if val_acc:` silently dropped it).
    val_acc = metrics.get('best_val_acc')
    if val_acc is None:
        val_acc = metrics.get('val_acc')
    if val_acc is not None:
        filepath = filepath + f"_acc{val_acc:.2f}"

    if train_config.save_format in ['pytorch', 'both']:
        torch.save(checkpoint, filepath + '.pth')
        print(f"[💾] Saved PyTorch: {filepath}.pth")

    if train_config.save_format in ['safetensors', 'both']:
        try:
            from safetensors.torch import save_file

            # SafeTensors requires contiguous tensors.
            model_state = {k: v.contiguous() for k, v in david.state_dict().items()}
            save_file(model_state, filepath + '.safetensors')

            # Sidecar JSON with only JSON-friendly entries. Fix: optimizer and
            # scheduler states contain tensors, which `default=str` would mangle
            # into huge, unloadable strings — exclude them alongside the model
            # weights instead of writing corrupt data.
            metadata = {k: v for k, v in checkpoint.items()
                        if k not in ('model_state_dict',
                                     'optimizer_state_dict',
                                     'scheduler_state_dict')}
            with open(filepath + '_metadata.json', 'w') as f:
                json.dump(metadata, f, indent=2, default=str)

            print(f"[💾] Saved SafeTensors: {filepath}.safetensors")
        except ImportError:
            print(f"[⚠️] SafeTensors not available, skipping")
| |
|
| |
|
def load_checkpoint(
    checkpoint_path: str,
    david: David,
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None,
    device: str = "cuda"
) -> Tuple[int, Dict]:
    """Load checkpoint and return epoch and metrics.

    Supports both formats produced by ``save_checkpoint``:
      * ``.safetensors`` — model weights only, with epoch/metrics read from an
        optional ``*_metadata.json`` sidecar.
      * anything else — a full ``torch.save`` checkpoint.

    Args:
        checkpoint_path: Path to the checkpoint file.
        david: Model to load the state dict into.
        optimizer: Optional optimizer to restore.
        scheduler: Optional scheduler to restore.
        device: Device to map tensors onto.

    Returns:
        (epoch, metrics); defaults to (0, {}) when no metadata is available.
    """

    if checkpoint_path.endswith('.safetensors'):
        # Keep the try minimal: only the import can raise ImportError here.
        try:
            from safetensors.torch import load_file
        except ImportError:
            raise ImportError("safetensors not installed")

        model_state = load_file(checkpoint_path, device=device)
        david.load_state_dict(model_state)

        # Fix: the metadata sidecar is optional — weights copied without it
        # previously crashed with FileNotFoundError.
        metadata_path = checkpoint_path.replace('.safetensors', '_metadata.json')
        if os.path.exists(metadata_path):
            with open(metadata_path, 'r') as f:
                metadata = json.load(f)
        else:
            print(f"[⚠️] No metadata sidecar at {metadata_path}, using defaults")
            metadata = {}

        epoch = metadata.get('epoch', 0)
        metrics = metadata.get('metrics', {})

        if optimizer and 'optimizer_state_dict' in metadata:
            optimizer.load_state_dict(metadata['optimizer_state_dict'])

        if scheduler and 'scheduler_state_dict' in metadata and metadata['scheduler_state_dict']:
            scheduler.load_state_dict(metadata['scheduler_state_dict'])

        print(f"[✅] Loaded from SafeTensors: {checkpoint_path}")
        return epoch, metrics

    else:
        checkpoint = torch.load(checkpoint_path, map_location=device)

        david.load_state_dict(checkpoint['model_state_dict'])

        if optimizer and 'optimizer_state_dict' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

        if scheduler and 'scheduler_state_dict' in checkpoint and checkpoint['scheduler_state_dict']:
            scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

        print(f"[✅] Loaded from PyTorch: {checkpoint_path}")
        return checkpoint['epoch'], checkpoint.get('metrics', {})
| |
|
| |
|
| | |
| | |
| | |
| |
|
class ImageNetHFDataset(Dataset):
    """Thin PyTorch Dataset over a precomputed HuggingFace CLIP-feature split."""

    def __init__(self, dataset_name: str, model_variant: str, split: str = "train"):
        print(f"[📥] Loading {split} split for {model_variant}...")
        # Each row carries 'clip_features' and an integer 'label'.
        self.dataset = load_dataset(dataset_name, name=model_variant, split=split)
        self.length = len(self.dataset)
        print(f"[✅] Loaded {self.length:,} samples from {split} split")

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        sample = self.dataset[idx]
        return (
            torch.tensor(sample['clip_features'], dtype=torch.float32),
            torch.tensor(sample['label'], dtype=torch.long),
        )
| |
|
| |
|
class MergedImageNetDataset(Dataset):
    """
    Merge multiple CLIP variants into a single dataset.
    Perfect for testing if David can unify different encoder spaces!

    Samples from all variants are concatenated, then accessed through a
    deterministic global shuffle so batches interleave encoders.
    """

    def __init__(
        self,
        dataset_name: str,
        model_variants: List[str],
        split: str = "train",
        shuffle_seed: int = 42
    ):
        print(f"[🔀] Creating merged dataset from {len(model_variants)} variants...")

        self.datasets = []
        # cumulative_lengths[i] = total samples in datasets[:i]; ascending,
        # so a global index can be located by binary search.
        self.cumulative_lengths = [0]

        for variant in model_variants:
            print(f"[📥] Loading {split} split for {variant}...")
            ds = load_dataset(
                dataset_name,
                name=variant,
                split=split
            )
            self.datasets.append(ds)
            self.cumulative_lengths.append(self.cumulative_lengths[-1] + len(ds))
            print(f"[✅] Loaded {len(ds):,} samples from {variant}")

        self.total_length = self.cumulative_lengths[-1]

        # Fixed-seed permutation keeps the "shuffle" reproducible across runs.
        print(f"[🎲] Shuffling {self.total_length:,} samples (seed={shuffle_seed})...")
        rng = np.random.RandomState(shuffle_seed)
        self.shuffle_indices = rng.permutation(self.total_length)

        print(f"[✅] Merged dataset ready: {self.total_length:,} samples from {len(model_variants)} encoders")

    def __len__(self):
        return self.total_length

    def __getitem__(self, idx):
        # Local import keeps the file-level import block untouched.
        from bisect import bisect_right

        actual_idx = int(self.shuffle_indices[idx])

        # Fix: binary search over cumulative lengths (O(log k)) replaces the
        # previous linear scan per sample; same result for any valid index.
        dataset_idx = bisect_right(self.cumulative_lengths, actual_idx) - 1

        local_idx = actual_idx - self.cumulative_lengths[dataset_idx]
        item = self.datasets[dataset_idx][local_idx]

        features = torch.tensor(item['clip_features'], dtype=torch.float32)
        label = torch.tensor(item['label'], dtype=torch.long)

        return features, label
| |
|
| |
|
def create_dataloaders(config: DavidTrainingConfig):
    """Build the train and validation DataLoaders.

    A list-valued ``config.model_variant`` selects the merged multi-encoder
    dataset; a single string selects the plain per-variant dataset.
    """

    if isinstance(config.model_variant, list):
        # Multi-encoder path: concatenate + shuffle several CLIP variants.
        print(f"[🧪] MULTI-ENCODER EXPERIMENT: Merging {len(config.model_variant)} variants")
        train_dataset = MergedImageNetDataset(config.dataset_name, config.model_variant, "train")
        val_dataset = MergedImageNetDataset(config.dataset_name, config.model_variant, "validation")
    else:
        train_dataset = ImageNetHFDataset(config.dataset_name, config.model_variant, "train")
        val_dataset = ImageNetHFDataset(config.dataset_name, config.model_variant, "validation")

    # Worker/memory settings shared by both loaders.
    common = dict(
        num_workers=config.num_workers,
        pin_memory=config.pin_memory,
        prefetch_factor=config.prefetch_factor,
        persistent_workers=config.persistent_workers,
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=True,
        **common,
    )

    # Validation keeps no gradients, so a larger batch fits in memory.
    val_loader = DataLoader(
        val_dataset,
        batch_size=config.batch_size * 2,
        shuffle=False,
        **common,
    )

    return train_loader, val_loader
| |
|
| |
|
| | |
| | |
| | |
| |
|
class CrystalGenerator:
    """Build per-scale class crystals (simplices) and their anchor vectors."""

    def __init__(self, num_classes: int, scales: List[int], device: str = "cuda"):
        self.num_classes = num_classes
        self.scales = scales
        self.device = device
        # One factory per embedding dimension.
        self.factories = {
            s: SimplexFactory(k=4, embed_dim=s, method="random") for s in scales
        }

    def generate(self, seed: int = 42) -> Tuple[Dict[int, torch.Tensor], Dict[int, torch.Tensor]]:
        """Generate anchors and crystals for all scales.

        Returns (anchors_dict, crystals_dict), each keyed by scale. Anchors
        are the L2-normalized first vertex of each class crystal.
        """

        anchors_dict: Dict[int, torch.Tensor] = {}
        crystals_dict: Dict[int, torch.Tensor] = {}

        for scale in tqdm(self.scales, desc="Generating crystals"):
            factory = self.factories[scale]

            # One crystal per class; per-class seed keeps them distinct yet reproducible.
            crystals = torch.stack([
                factory.build(
                    backend="torch",
                    device=self.device,
                    dtype=torch.float32,
                    seed=seed + class_idx,
                    validate=True
                )
                for class_idx in range(self.num_classes)
            ])
            anchors = F.normalize(crystals[:, 0, :], dim=-1)

            # Sanity check: anchors that are nearly parallel make classes
            # indistinguishable, so report the off-diagonal cosine stats.
            anchor_sims = anchors @ anchors.T
            off_diag = anchor_sims[~torch.eye(self.num_classes, dtype=bool, device=anchors.device)]
            max_sim = off_diag.max().item()
            mean_sim = off_diag.mean().item()

            print(f" Scale {scale}: max_sim={max_sim:.4f}, mean_sim={mean_sim:.4f}")

            if max_sim > 0.99:
                print(f" ⚠️ WARNING: Anchors too similar at scale {scale}!")

            anchors_dict[scale] = anchors
            crystals_dict[scale] = crystals

        return anchors_dict, crystals_dict
| |
|
| |
|
| | |
| | |
| | |
| |
|
def train_epoch(
    david: David,
    train_loader: DataLoader,
    optimizer: torch.optim.Optimizer,
    criterion: MultiScaleCrystalLoss,
    anchors_dict: Dict[int, torch.Tensor],
    crystals_dict: Dict[int, torch.Tensor],
    epoch: int,
    config: DavidTrainingConfig,
    writer: Optional[SummaryWriter],
    global_step: int
) -> Tuple[float, float, int, Dict]:
    """Train for one epoch - Pure FP32.

    Runs forward/backward over every batch, applies gradient diagnostics,
    small-gradient scaling and norm clipping, and logs batch metrics to
    TensorBoard.

    Args:
        david: Multi-scale model; told the current epoch so it can activate
            scales per its warmup schedule.
        train_loader: Batches of (features, labels).
        optimizer: Optimizer stepping all trainable parameters.
        criterion: Multi-scale loss returning a dict with key 'total' plus
            individual components.
        anchors_dict: Per-scale class anchors passed to the model.
        crystals_dict: Per-scale class crystals passed to the loss.
        epoch: Current epoch index (0-based).
        config: Training hyperparameters and logging switches.
        writer: Optional TensorBoard writer.
        global_step: Step counter carried across epochs.

    Returns:
        (avg_loss, accuracy_pct, updated_global_step, avg_loss_components)
    """

    david.train()
    # Let the model update which scales are active for this epoch.
    david.update_epoch(epoch)

    total_loss = 0
    correct = 0
    total = 0
    loss_components_sum = {}

    active_scales = david.get_active_scales()

    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{config.num_epochs}")

    for batch_idx, (features, labels) in enumerate(pbar):
        features = features.cuda(non_blocking=True)
        labels = labels.cuda(non_blocking=True)

        optimizer.zero_grad()

        # Forward pass over all active scales.
        combined, logits_list, features_list, fusion_weights = david(
            features, anchors_dict, return_all_scales=True
        )

        losses = criterion(
            combined, logits_list, features_list,
            labels, crystals_dict, epoch
        )

        losses['total'].backward()

        # Periodic gradient diagnostics (helper defined elsewhere in this file).
        if config.track_gradients and batch_idx % config.log_interval == 0:
            grad_stats = analyze_gradients(david, config)
            if writer:
                # NOTE(review): step = global_step + batch_idx while global_step
                # itself also increments once per batch, so TensorBoard steps
                # skip ahead within an epoch and jump backwards at epoch
                # boundaries — consider logging at global_step alone.
                step = global_step + batch_idx
                writer.add_scalar('train/grad_mean', grad_stats['mean'], step)
                writer.add_scalar('train/grad_max', grad_stats['max'], step)
                writer.add_scalar('train/grad_num_small', grad_stats['num_small'], step)

        # Boost vanishing gradients before clipping (helper defined elsewhere).
        scale_small_gradients(david, config)

        torch.nn.utils.clip_grad_norm_(david.parameters(), config.gradient_clip)

        optimizer.step()

        # Running loss/accuracy over the epoch.
        total_loss += losses['total'].item()
        _, predicted = torch.max(combined, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        # Accumulate each loss component for epoch-level averaging.
        for key, value in losses.items():
            if key not in loss_components_sum:
                loss_components_sum[key] = 0.0
            loss_components_sum[key] += value.item()

        # Batch-level TensorBoard logging.
        if writer and batch_idx % config.log_interval == 0:
            step = global_step + batch_idx
            writer.add_scalar('train/loss_batch', losses['total'].item(), step)
            writer.add_scalar('train/acc_batch', 100 * correct / total, step)

            if config.log_loss_components:
                for key, value in losses.items():
                    if key != 'total':
                        writer.add_scalar(f'train/loss_{key}', value.item(), step)

            # Mean fusion weight per active scale (only when weights are 2-D,
            # i.e. per-sample).
            if config.log_fusion_weights and fusion_weights is not None:
                if fusion_weights.dim() == 2:
                    mean_weights = fusion_weights.mean(dim=0)
                    for i, w in enumerate(mean_weights):
                        if i < len(active_scales):
                            writer.add_scalar(
                                f'train/fusion_weight_{active_scales[i]}',
                                w.item(), step
                            )

            writer.add_scalar('train/lr', optimizer.param_groups[0]['lr'], step)

        pbar.set_postfix({
            'loss': f'{total_loss / (batch_idx + 1):.4f}',
            'acc': f'{100 * correct / total:.2f}%'
        })

        global_step += 1

    # Average each loss component over the number of batches.
    avg_components = {k: v / len(train_loader) for k, v in loss_components_sum.items()}

    return (
        total_loss / len(train_loader),
        100 * correct / total,
        global_step,
        avg_components
    )
| |
|
| |
|
@torch.no_grad()
def validate(
    david: David,
    val_loader: DataLoader,
    anchors_dict: Dict[int, torch.Tensor],
    config: DavidTrainingConfig
) -> Tuple[float, Dict[int, float]]:
    """Evaluate in pure FP32; returns overall accuracy and per-scale accuracy."""

    david.eval()

    total = 0
    correct = 0
    active_scales = david.get_active_scales()
    scale_correct = dict.fromkeys(active_scales, 0)

    for features, labels in tqdm(val_loader, desc="Validation", leave=False):
        features = features.cuda(non_blocking=True)
        labels = labels.cuda(non_blocking=True)

        combined, logits_list, _, _ = david(
            features, anchors_dict, return_all_scales=True
        )

        # Fused prediction accuracy.
        predicted = combined.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        # Per-scale accuracy from each head's own logits.
        for i, scale in enumerate(active_scales):
            if i < len(logits_list):
                scale_pred = logits_list[i].argmax(dim=1)
                scale_correct[scale] += (scale_pred == labels).sum().item()

    overall = 100 * correct / total
    per_scale = {s: 100 * hits / total for s, hits in scale_correct.items()}

    return overall, per_scale
| |
|
| |
|
| | |
| | |
| | |
| |
|
def train_david(config: DavidTrainingConfig):
    """Main training pipeline.

    Orchestrates a full run: resolves the architecture config, builds the
    model/loss/optimizer/controller, generates per-scale crystals, runs a
    diagnostic forward/backward pass, then loops over epochs (train,
    validate, adaptive control, checkpointing) with optional HuggingFace
    Hub uploads.

    Args:
        config: Complete training configuration.

    Returns:
        (trained David model, best validation accuracy), or (None, 0.0) if
        the diagnostic pass finds no gradients at all.
    """

    # TF32 matmuls: faster on supporting GPUs while keeping fp32 tensors.
    torch.set_float32_matmul_precision('high')

    print("="*80)
    print("🌟 DAVID TRAINING PIPELINE")
    print("="*80)
    print(f"Run ID: {config.run_id}")
    print(f"Preset: {config.preset}")
    print(f"Batch Size: {config.batch_size}")
    print(f"Learning Rate: {config.learning_rate}")
    print(f"Mixed Precision: {config.use_mixed_precision}")
    print(f"TensorFloat32: Enabled (high precision)")
    print("="*80)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Resolve architecture config: an explicit JSON path wins over a preset.
    if config.custom_config_path:
        david_config = DavidArchitectureConfig.from_json(config.custom_config_path)
        print(f"[📁] Loaded custom config: {config.custom_config_path}")
    elif config.preset:
        david_config = DavidPresets.get_preset(config.preset)
        print(f"[⚙️] Using preset: {config.preset}")
    else:
        raise ValueError("Must specify either preset or custom_config_path")

    model_name = f"David-{david_config.sharing_mode}-{david_config.fusion_mode}"
    print(f"[🏷️] Model: {model_name}")

    # Directory layout: <base>/weights/<model>/<run_id> and <base>/runs/<model>/<run_id>.
    weights_dir = os.path.join(config.base_dir, "weights", model_name, config.run_id)
    runs_dir = os.path.join(config.base_dir, "runs", model_name, config.run_id)
    os.makedirs(weights_dir, exist_ok=True)
    os.makedirs(runs_dir, exist_ok=True)

    print(f"[📁] Weights: {weights_dir}")
    print(f"[📁] Logs: {runs_dir}")

    writer = SummaryWriter(runs_dir)

    # Apply training-config overrides onto the architecture config.
    if config.num_classes_override:
        david_config.num_classes = config.num_classes_override
    if config.use_belly_override is not None:
        david_config.use_belly = config.use_belly_override
    if config.belly_expand_override is not None:
        david_config.belly_expand = config.belly_expand_override
    if config.progressive_training_override is not None:
        david_config.progressive_training = config.progressive_training_override
    if not david_config.progressive_training:
        # All scales active from epoch 0 when progressive training is off.
        david_config.scale_warmup_epochs = {s: 0 for s in david_config.scales}

    # A custom warmup schedule implies progressive training should be on.
    if config.scale_warmup_epochs_override is not None:
        david_config.scale_warmup_epochs = config.scale_warmup_epochs_override

        if not david_config.progressive_training:
            print(f"[⚙️] Enabling progressive training (custom warmup schedule provided)")
            david_config.progressive_training = True

    print(f"[⚙️] Progressive training: {david_config.progressive_training}")
    if david_config.progressive_training:
        print(f" Scale warmup schedule: {david_config.scale_warmup_epochs}")

    # Persist both configs next to the weights for reproducibility.
    david_config_path = os.path.join(weights_dir, "david_config.json")
    david_config.to_json(david_config_path)
    print(f"[💾] Saved David config: {david_config_path}")

    train_config_path = os.path.join(weights_dir, "train_config.json")
    config.to_json(train_config_path)
    print(f"[💾] Saved training config: {train_config_path}")

    # Build model and report parameter counts.
    david = David.from_config(david_config).cuda()
    print(f"\n{david}\n")

    total_params = sum(p.numel() for p in david.parameters())
    trainable_params = sum(p.numel() for p in david.parameters() if p.requires_grad)
    print(f"[📊] Total Parameters: {total_params:,}")
    print(f"[📊] Trainable Parameters: {trainable_params:,}")

    train_loader, val_loader = create_dataloaders(config)

    # Per-scale class anchors/crystals consumed by the model and the loss.
    crystal_gen = CrystalGenerator(
        david_config.num_classes,
        david_config.scales,
        str(device)
    )
    anchors_dict, crystals_dict = crystal_gen.generate()

    criterion = MultiScaleCrystalLoss(
        scales=david_config.scales,
        num_classes=david_config.num_classes,
        use_rose_loss=config.use_rose_loss,
        use_cayley_loss=config.use_cayley_loss,
        rose_initial_weight=config.rose_initial_weight,
        rose_max_weight=config.rose_max_weight,
        cayley_weight=config.cayley_weight,
        scale_loss_balance=config.scale_loss_balance
    ).cuda()

    optimizer = create_optimizer(david, config)
    scheduler = create_scheduler(optimizer, config)

    # Controller can freeze/unfreeze scales based on validation metrics.
    controller = AdaptiveTrainingController(david, config)

    best_val_acc = 0.0
    best_epoch = 0
    best_scale_accs = {}
    global_step = 0
    final_train_acc = 0.0
    final_train_loss = 0.0

    # Per-epoch history, saved alongside best checkpoints.
    training_history = {
        'epochs': [],
        'train_loss': [],
        'train_acc': [],
        'val_acc': [],
        'scale_accs': {},
        'lr': []
    }

    # One small forward/backward pass on 8 samples to verify gradients flow
    # before committing to a full run.
    print("\n[🔍] Running diagnostic forward/backward pass...")
    david.train()

    for features_test, labels_test in train_loader:
        features_test = features_test.cuda(non_blocking=True)[:8]
        labels_test = labels_test.cuda(non_blocking=True)[:8]

        combined_test, logits_test, features_test_out, _ = david(
            features_test, anchors_dict, return_all_scales=True
        )

        losses_test = criterion(
            combined_test, logits_test, features_test_out,
            labels_test, crystals_dict, epoch=0
        )

        print(f" Initial loss: {losses_test['total'].item():.6f}")
        print(f" Loss components:")
        for key, value in losses_test.items():
            if key != 'total':
                print(f" {key}: {value.item():.6f}")

        optimizer.zero_grad()
        losses_test['total'].backward()

        # Count parameters that actually received gradients.
        grad_count = sum(1 for p in david.parameters() if p.grad is not None and p.grad.norm() > 0)
        total_grad_params = sum(1 for p in david.parameters() if p.requires_grad)
        print(f" Parameters with non-zero gradients: {grad_count}/{total_grad_params}")

        if grad_count == 0:
            print(f" ❌ ERROR: No gradients! Training will not work.")
            return None, 0.0
        elif grad_count < total_grad_params * 0.5:
            print(f" ⚠️ WARNING: Less than 50% of parameters have gradients")
        else:
            print(f" ✅ Gradients look good")

        break

    print("\n[🚀] Starting training...\n")

    for epoch in range(config.num_epochs):
        epoch_start = time.time()

        train_loss, train_acc, global_step, loss_components = train_epoch(
            david, train_loader, optimizer, criterion,
            anchors_dict, crystals_dict, epoch, config,
            writer, global_step
        )

        val_acc, scale_accs = validate(david, val_loader, anchors_dict, config)

        # Adaptive control reacts to overall and per-scale validation accuracy.
        controller.update_metrics(scale_accs, val_acc)
        controller.apply_adaptive_strategies(scale_accs, epoch)

        if scheduler:
            scheduler.step()

        epoch_time = time.time() - epoch_start

        # Console summary for this epoch.
        print(f"\n📊 Epoch {epoch+1}/{config.num_epochs} ({epoch_time:.1f}s)")
        print(f" Train: Loss={train_loss:.4f}, Acc={train_acc:.2f}%")
        print(f" Val: Acc={val_acc:.2f}% (Best: {best_val_acc:.2f}%)")
        print(f" Active scales: {david.get_active_scales()}")
        print(f" LR: {optimizer.param_groups[0]['lr']:.2e}")

        if config.log_loss_components and loss_components:
            print(f" Loss breakdown:")
            for key, value in sorted(loss_components.items()):
                if key != 'total':
                    print(f" {key:20s}: {value:.6f}")

        for scale, acc in scale_accs.items():
            frozen = "❄️" if controller.scales_frozen.get(scale, False) else "🔥"
            print(f" {frozen} Scale {scale}: {acc:.2f}%")

        # Track last-epoch stats for the final report/upload.
        final_train_acc = train_acc
        final_train_loss = train_loss

        training_history['epochs'].append(epoch + 1)
        training_history['train_loss'].append(train_loss)
        training_history['train_acc'].append(train_acc)
        training_history['val_acc'].append(val_acc)
        training_history['lr'].append(optimizer.param_groups[0]['lr'])

        for scale, acc in scale_accs.items():
            if scale not in training_history['scale_accs']:
                training_history['scale_accs'][scale] = []
            training_history['scale_accs'][scale].append(acc)

        # Epoch-level TensorBoard logging.
        writer.add_scalar('train/loss', train_loss, epoch)
        writer.add_scalar('train/acc', train_acc, epoch)
        writer.add_scalar('val/acc', val_acc, epoch)

        for scale, acc in scale_accs.items():
            writer.add_scalar(f'val/acc_scale_{scale}', acc, epoch)

        # New best: persist history + checkpoint, and optionally upload.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_epoch = epoch
            best_scale_accs = scale_accs.copy()

            history_path = os.path.join(weights_dir, 'training_history.json')
            with open(history_path, 'w') as f:
                json.dump(training_history, f, indent=2)

            save_checkpoint(
                os.path.join(weights_dir, 'best_model'),
                david, optimizer, scheduler, epoch,
                {
                    'best_val_acc': best_val_acc,
                    'best_epoch': best_epoch,
                    'scale_accuracies': best_scale_accs,
                    'training_history': training_history
                },
                config
            )

            if config.upload_to_hub:
                best_metrics = {
                    'best_val_acc': best_val_acc,
                    'best_epoch': best_epoch,
                    'scale_accuracies': best_scale_accs,
                    'final_train_acc': train_acc,
                    'final_train_loss': train_loss,
                    'training_history': training_history,
                    'parameters': total_params
                }
                prepare_hub_upload(weights_dir, runs_dir, config, david_config, best_metrics, model_name)

        # Periodic checkpoint every save_interval epochs.
        if (epoch + 1) % config.save_interval == 0:
            save_checkpoint(
                os.path.join(weights_dir, f'checkpoint_epoch_{epoch+1}'),
                david, optimizer, scheduler, epoch,
                {'val_acc': val_acc},
                config
            )

    # Final checkpoint after the last epoch (val_acc is from the last epoch).
    save_checkpoint(
        os.path.join(weights_dir, 'final_model'),
        david, optimizer, scheduler, config.num_epochs - 1,
        {'final_val_acc': val_acc},
        config
    )

    writer.close()

    # Final Hub upload, including TensorBoard logs.
    if config.upload_to_hub:
        print("\n[🤗] Performing final HuggingFace Hub upload...")
        final_metrics = {
            'best_val_acc': best_val_acc,
            'best_epoch': best_epoch,
            'scale_accuracies': best_scale_accs,
            'final_train_acc': final_train_acc,
            'final_train_loss': final_train_loss,
            'training_history': training_history,
            'parameters': total_params
        }
        prepare_hub_upload(weights_dir, runs_dir, config, david_config, final_metrics, model_name)

        if os.path.exists(runs_dir):
            runs_repo_path = f"runs/{model_name}/{config.run_id}"
            print(f"[📤] Uploading TensorBoard logs to {runs_repo_path}...")
            upload_to_huggingface(
                local_dir=runs_dir,
                repo_id=config.hf_repo,
                commit_message=f"Upload TensorBoard logs - {model_name} - Run {config.run_id}",
                path_in_repo=runs_repo_path
            )

    print("\n" + "="*80)
    print(f"🎉 Training Complete!")
    print(f" Best Val Acc: {best_val_acc:.2f}% (Epoch {best_epoch+1})")
    print(f" Final Train Acc: {final_train_acc:.2f}%")
    print(f" Weights: {weights_dir}")
    if config.upload_to_hub:
        print(f" Hub: https://huggingface.co/{config.hf_repo}")
    print("="*80)

    return david, best_val_acc
| |
|
| |
|
| | |
| | |
| | |
| |
|
if __name__ == "__main__":
    # Example entry point: the "unified space" experiment — train David on
    # features from several different CLIP encoders merged into one dataset.

    config = DavidTrainingConfig(
        preset="balanced",

        # A list of variants triggers MergedImageNetDataset (multi-encoder).
        model_variant=["clip_vit_b16", "clip_vit_laion_b32"],

        num_epochs=10,
        batch_size=1024,
        learning_rate=1e-2,

        # Progressive scale activation: epoch at which each scale turns on.
        scale_warmup_epochs_override={
            256: 0,
            512: 2,
            768: 5,
            1024: 8
        },

        # Rose loss ramps from its initial to max weight over training.
        use_rose_loss=True,
        rose_initial_weight=0.2,
        rose_max_weight=0.8,

        use_cayley_loss=True,
        cayley_weight=0.01,

        # Never freeze scales; rely on a generous gradient clip instead.
        freeze_strategy="never",
        gradient_clip=10.0,

        save_format="safetensors",
        upload_to_hub=False,
        hf_repo="YourName/YourRepoHere"
    )

    print("="*80)
    print("🧪 UNIFIED SPACE EXPERIMENT")
    print("="*80)
    print(f"Testing if David can unify:")
    if isinstance(config.model_variant, list):
        for variant in config.model_variant:
            print(f" • {variant}")
    print("="*80)

    david, best_acc = train_david(config)