Merge remote-tracking branch 'origin/main' into dev-patch

Files changed:
- README.md +5 -1
- forecasting/models/__init__.py +0 -1
- forecasting/models/vit_patch_model_local.py +4 -5
- forecasting/training/callback.py +113 -60
- forecasting/training/localpatch.yaml +13 -8
- forecasting/training/train.py +46 -63
README.md
CHANGED
@@ -1 +1,5 @@
-
+<<<<<<< HEAD
+# 2025-HL-flaring-intelligence
+=======
+# FOXES
+>>>>>>> origin/main

forecasting/models/__init__.py
CHANGED
@@ -1,2 +1 @@
-from .fusion_vit_hybrid import FusionViTHybrid
 

forecasting/models/vit_patch_model_local.py
CHANGED
@@ -108,7 +108,7 @@ class ViTLocal(pl.LightningModule):
         if self.global_step % 200 == 0:
             multipliers = self.adaptive_loss.get_current_multipliers()
             for key, value in multipliers.items():
-                self.log(f"adaptive/{key}", value, on_step=True, on_epoch=False)
+                self.log(f"adaptive/{key}", value, on_step=True, on_epoch=False, sync_dist=True)

         if mode == "val":
             # Validation: typically only log epoch aggregates

@@ -177,9 +177,8 @@ class VisionTransformerLocal(nn.Module):
         self.mlp_head = nn.Sequential(nn.LayerNorm(embed_dim), nn.Linear(embed_dim, 1))
         self.dropout = nn.Dropout(dropout)

-        # Parameters/Embeddings
-
-        self.pos_embedding = nn.Parameter(torch.randn(1, 1 + num_patches, embed_dim))
+        # Parameters/Embeddings - using 2D positional encoding for local attention
+        # No CLS token needed for local attention architecture
         self.grid_h = int(math.sqrt(num_patches))
         self.grid_w = int(math.sqrt(num_patches))
         self.pos_embedding_2d = nn.Parameter(torch.randn(1, self.grid_h, self.grid_w, embed_dim))

@@ -289,7 +288,7 @@ class AttentionBlock(nn.Module):


 class LocalAttentionBlock(nn.Module):
-    def __init__(self, embed_dim, hidden_dim, num_heads, num_patches, dropout=0.0, local_window=
+    def __init__(self, embed_dim, hidden_dim, num_heads, num_patches, dropout=0.0, local_window=9):
         super().__init__()
         self.embed_dim = embed_dim
         self.num_heads = num_heads

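Note on the LocalAttentionBlock change: the model now relies on a 2D positional embedding over the grid_h x grid_w patch grid (no CLS token) and a default local_window of 9. The helper below is a minimal, hypothetical sketch (not code from this repo) of how a local attention mask over such a patch grid can be built; the function name and shapes are illustrative assumptions only.

```python
import torch

def build_local_attention_mask(grid_h: int, grid_w: int, local_window: int = 9) -> torch.Tensor:
    """Boolean mask [N, N] (N = grid_h * grid_w): True where patch j lies inside the
    local_window x local_window neighbourhood of patch i on the 2D patch grid."""
    ys, xs = torch.meshgrid(torch.arange(grid_h), torch.arange(grid_w), indexing="ij")
    coords = torch.stack([ys.flatten(), xs.flatten()], dim=1)   # [N, 2] (row, col) per patch
    dist = (coords[:, None, :] - coords[None, :, :]).abs()      # [N, N, 2] per-axis distances
    half = local_window // 2
    return (dist[..., 0] <= half) & (dist[..., 1] <= half)

# Example: a 4x4 patch grid with a 3x3 window; interior patches see 9 neighbours,
# edge patches 6, corner patches 4.
mask = build_local_attention_mask(4, 4, local_window=3)
print(mask.sum(dim=1))
```
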
forecasting/training/callback.py
CHANGED
@@ -90,7 +90,7 @@ class ImagePredictionLogger_SXR(Callback):
 
 
 class AttentionMapCallback(Callback):
-    def __init__(self, log_every_n_epochs=1, num_samples=4, save_dir="attention_maps", patch_size=8):
+    def __init__(self, log_every_n_epochs=1, num_samples=4, save_dir="attention_maps", patch_size=8, use_local_attention=False):
         """
         Callback to visualize attention maps during training.
 
@@ -99,12 +99,14 @@ class AttentionMapCallback(Callback):
             num_samples: Number of samples to visualize
             save_dir: Directory to save attention maps
             patch_size: Size of patches used in the model
+            use_local_attention: If True, visualize local attention patterns instead of CLS token attention
         """
         super().__init__()
         self.patch_size = patch_size
         self.log_every_n_epochs = log_every_n_epochs
         self.num_samples = num_samples
         self.save_dir = save_dir
+        self.use_local_attention = use_local_attention
 
     def on_validation_epoch_end(self, trainer, pl_module):
         if trainer.current_epoch % self.log_every_n_epochs == 0:
@@ -125,8 +127,8 @@ class AttentionMapCallback(Callback):
             # Move to device
             imgs = imgs[:self.num_samples].to(pl_module.device)
 
-            # Get predictions with attention weights
-
+            # Get predictions with attention weights and patch contributions
+            patch_flux_raw = None
             try:
                 outputs, attention_weights = pl_module(imgs, return_attention=True)
             except:
@@ -134,7 +136,7 @@ class AttentionMapCallback(Callback):
                 if hasattr(pl_module, 'model') and hasattr(pl_module.model, 'forward'):
                     try:
                         print("Using model's forward method")
-                        outputs, attention_weights,
+                        outputs, attention_weights, patch_flux_raw = pl_module.model(imgs, pl_module.sxr_norm, return_attention=True)
                     except:
                         print("Using model's forward method failed")
                         outputs, attention_weights = pl_module.forward_for_callback(imgs, return_attention=True)
@@ -149,12 +151,13 @@ class AttentionMapCallback(Callback):
                     attention_weights,
                     sample_idx,
                     trainer.current_epoch,
-                    patch_size=self.patch_size
+                    patch_size=self.patch_size,
+                    patch_flux=patch_flux_raw[sample_idx] if patch_flux_raw is not None else None
                 )
                 trainer.logger.experiment.log({"Attention plots": wandb.Image(map)})
                 plt.close(map)
 
-    def _plot_attention_map(self, image, attention_weights, sample_idx, epoch, patch_size):
+    def _plot_attention_map(self, image, attention_weights, sample_idx, epoch, patch_size, patch_flux=None):
         """
         Plot attention map for a single image.
 
@@ -164,50 +167,44 @@ class AttentionMapCallback(Callback):
             sample_idx: Index of the sample in the batch
             epoch: Current epoch number
             patch_size: Size of patches
+            patch_flux: Optional tensor of patch flux contributions [num_patches]
         """
         # Convert image to numpy and transpose
         img_np = image.cpu().numpy()
         if len(img_np.shape) == 3 and img_np.shape[0] in [1, 3]:  # Check if channels first
             img_np = np.transpose(img_np, (1, 2, 0))
 
+        # Calculate grid size
+        H, W = img_np.shape[:2]
+        grid_h, grid_w = H // patch_size, W // patch_size
 
         # Get attention from the last layer
         last_layer_attention = attention_weights[-1]  # [B, num_heads, seq_len, seq_len]
-
+
         # Extract attention for this sample
         sample_attention = last_layer_attention[sample_idx]  # [num_heads, seq_len, seq_len]
-
+
         # Average across heads
         avg_attention = sample_attention.mean(dim=0)  # [seq_len, seq_len]
 
-
-
-
-
-
-
+        if self.use_local_attention:
+            # For local attention: visualize attention patterns from center patch
+            # and average attention across all patches
+            center_patch_idx = (grid_h * grid_w) // 2  # Center patch
+            center_attention = avg_attention[center_patch_idx, :].cpu()  # [num_patches]
+
+            # Average attention pattern (how much each patch attends to others on average)
+            avg_attention_map = avg_attention.mean(dim=0).cpu()  # [num_patches]
+
+            attention_map = avg_attention_map.reshape(grid_h, grid_w)
+            center_map = center_attention.reshape(grid_h, grid_w)
+        else:
+            # For CLS token attention: visualize attention from CLS to patches
+            cls_attention = avg_attention[0, 1:].cpu()  # [num_patches]
+            attention_map = cls_attention.reshape(grid_h, grid_w)
+            center_map = None
 
-        #
-        attention_map = cls_attention.reshape(grid_h, grid_w)
-
-        # Create figure with subplots
-        fig, axes = plt.subplots(1, 3, figsize=(15, 5))
-
-        # Plot 1: Original image
-        # if img_np.shape[2] == 1:  # Grayscale
-        #     img_display = (img_np[:, :, 0] + 1) / 2
-        #     axes[0].imshow(img_display, cmap='gray')
-        # elif img_np.shape[2] == 3:  # RGB
-        #     # Normalize RGB image properly
-        #     img_display = (img_np + 1) / 2  # Assuming images are in [-1, 1] range
-        #     img_display = np.clip(img_display, 0, 1)  # Ensure valid range
-        #     axes[0].imshow(img_display)
-        # else:  # Multi-channel (6 channels in your case)
-        #     # Option 1: Display first channel as grayscale
-        #     img_display = (img_np[:, :, 0] + 1) / 2
-        #     axes[0].imshow(img_display, cmap='gray')
-
-        # Option 2: Create RGB composite from 3 channels (uncomment if preferred)
+        # Prepare image display
         if len(img_np[0,0,:]) >= 6:  # Ensure we have enough channels
             rgb_channels = [0, 2, 4]  # Select which channels to use for R, G, B
             img_display = np.stack([(img_np[:, :, i] + 1) / 2 for i in rgb_channels], axis=2)
@@ -216,32 +213,88 @@ class AttentionMapCallback(Callback):
             # If not enough channels, use grayscale
             img_display = (img_np[:, :, 0] + 1) / 2
             img_display = np.stack([img_display] * 3, axis=2)
-        axes[0].imshow(img_display)
-        axes[0].set_title(f'Original Image (Epoch {epoch})')
-        axes[0].axis('off')
-
-        # Plot 2: Attention heatmap
-        attention_np = np.log1p(attention_map.numpy())
-        # Resize attention map to match image size
-        attention_resized = zoom(attention_np, (H / grid_h, W / grid_w), order=1)
-
-        # Create colormap for attention - FIX: Use the scalar values, not RGB
-        im = axes[1].imshow(attention_resized, cmap='hot')
-        axes[1].set_title(f'Attention Map (Sample {sample_idx})')
-        axes[1].axis('off')
-        # FIXED: Create colorbar from the scalar image, not RGB
-        plt.colorbar(im, ax=axes[1])
-
-        # Plot 3: Overlay attention on image
-        #img_display_overlay = (img_np[:, :, 0] + 1) / 2
-        axes[2].imshow(img_display)
-
-        # Overlay attention with proper alpha blending
-        axes[2].imshow(attention_resized, cmap='hot', alpha=0.5)
-        axes[2].set_title(f'Log-Scaled Attention Overlay (Sample {sample_idx})')
-        axes[2].axis('off')
 
-
+        # Create figure with appropriate number of subplots
+        if self.use_local_attention and patch_flux is not None:
+            # Show: Original, Avg Attention, Center Attention, Patch Flux
+            fig, axes = plt.subplots(1, 4, figsize=(20, 5))
+
+            # Plot 1: Original image
+            axes[0].imshow(img_display)
+            axes[0].set_title(f'Original Image (Epoch {epoch})')
+            axes[0].axis('off')
+
+            # Plot 2: Average attention pattern
+            attention_np = np.log1p(attention_map.numpy())
+            attention_resized = zoom(attention_np, (H / grid_h, W / grid_w), order=1)
+            im1 = axes[1].imshow(attention_resized, cmap='hot')
+            axes[1].set_title('Avg Attention (All Patches)')
+            axes[1].axis('off')
+            plt.colorbar(im1, ax=axes[1])
+
+            # Plot 3: Center patch attention
+            center_np = np.log1p(center_map.numpy())
+            center_resized = zoom(center_np, (H / grid_h, W / grid_w), order=1)
+            im2 = axes[2].imshow(center_resized, cmap='viridis')
+            axes[2].set_title('Center Patch Attention')
+            axes[2].axis('off')
+            plt.colorbar(im2, ax=axes[2])
+
+            # Plot 4: Patch flux contributions
+            flux_map = patch_flux.cpu().reshape(grid_h, grid_w)
+            flux_np = np.log1p(flux_map.numpy())
+            flux_resized = zoom(flux_np, (H / grid_h, W / grid_w), order=1)
+            im3 = axes[3].imshow(flux_resized, cmap='plasma')
+            axes[3].set_title('Log Patch Flux Contributions')
+            axes[3].axis('off')
+            plt.colorbar(im3, ax=axes[3])
+
+        elif self.use_local_attention:
+            # Show: Original, Avg Attention, Center Attention
+            fig, axes = plt.subplots(1, 3, figsize=(15, 5))
+
+            # Plot 1: Original image
+            axes[0].imshow(img_display)
+            axes[0].set_title(f'Original Image (Epoch {epoch})')
+            axes[0].axis('off')
+
+            # Plot 2: Average attention pattern
+            attention_np = np.log1p(attention_map.numpy())
+            attention_resized = zoom(attention_np, (H / grid_h, W / grid_w), order=1)
+            im1 = axes[1].imshow(attention_resized, cmap='hot')
+            axes[1].set_title('Avg Attention (All Patches)')
+            axes[1].axis('off')
+            plt.colorbar(im1, ax=axes[1])
+
+            # Plot 3: Center patch attention
+            center_np = np.log1p(center_map.numpy())
+            center_resized = zoom(center_np, (H / grid_h, W / grid_w), order=1)
+            im2 = axes[2].imshow(center_resized, cmap='viridis')
+            axes[2].set_title('Center Patch Attention')
+            axes[2].axis('off')
+            plt.colorbar(im2, ax=axes[2])
+        else:
+            # Original CLS token visualization
+            fig, axes = plt.subplots(1, 3, figsize=(15, 5))
+
+            # Plot 1: Original image
+            axes[0].imshow(img_display)
+            axes[0].set_title(f'Original Image (Epoch {epoch})')
+            axes[0].axis('off')
+
+            # Plot 2: Attention heatmap
+            attention_np = np.log1p(attention_map.numpy())
+            attention_resized = zoom(attention_np, (H / grid_h, W / grid_w), order=1)
+            im = axes[1].imshow(attention_resized, cmap='hot')
+            axes[1].set_title(f'Attention Map (Sample {sample_idx})')
+            axes[1].axis('off')
+            plt.colorbar(im, ax=axes[1])
+
+            # Plot 3: Overlay attention on image
+            axes[2].imshow(img_display)
+            axes[2].imshow(attention_resized, cmap='hot', alpha=0.5)
+            axes[2].set_title(f'Log-Scaled Attention Overlay (Sample {sample_idx})')
+            axes[2].axis('off')
 
         plt.tight_layout()
         return fig

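For reference, the resize step used in _plot_attention_map (log-scale the per-patch map, then bilinearly upsample it to image resolution with scipy.ndimage.zoom) can be exercised in isolation. The sketch below uses illustrative sizes (512x512 image, 16-pixel patches), not values taken from this repository.

```python
import numpy as np
from scipy.ndimage import zoom

H, W, patch_size = 512, 512, 16                  # illustrative image/patch sizes
grid_h, grid_w = H // patch_size, W // patch_size

attention_map = np.random.rand(grid_h, grid_w)   # stand-in for the per-patch attention grid
attention_np = np.log1p(attention_map)           # log scaling, as in the callback
attention_resized = zoom(attention_np, (H / grid_h, W / grid_w), order=1)  # bilinear upsample

print(attention_resized.shape)  # (512, 512), ready to overlay on the original image
```
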
forecasting/training/localpatch.yaml
CHANGED
@@ -1,14 +1,19 @@
 
 #Base directories - change these to switch datasets
-base_data_dir: "/mnt/data/
-base_checkpoint_dir: "/mnt/data/
+base_data_dir: "/mnt/data/PAPER_DATA_B" # Change this line for different datasets
+base_checkpoint_dir: "/mnt/data/PAPER_DATA_B" # Change this line for different datasets
 wavelengths: [94, 131, 171, 193, 211, 304] # AIA wavelengths in Angstroms
 
 # GPU configuration
-
+# Options:
+# - Single GPU: gpu_ids: 0 or gpu_ids: [0]
+# - Multi GPU: gpu_ids: [0, 1] (uses both GPU 0 and 1)
+# - All GPUs: gpu_ids: "all" (uses all available GPUs)
+# - CPU only: gpu_ids: -1
+gpu_ids: "all" # Use both GPUs
 # Model configuration
 selected_model: "ViTLocal" # Options: "hybrid", "vit", "fusion", "vitpatch"
-batch_size:
+batch_size: 48
 epochs: 250
 oversample: false
 balance_strategy: "upsample_minority"
@@ -20,9 +25,9 @@ vit_custom:
   num_classes: 1
   patch_size: 16
   num_patches: 1024
-  hidden_dim:
+  hidden_dim: 2048
   num_heads: 8
-  num_layers:
+  num_layers: 10
   dropout: 0.1
   lr: 0.0001
@@ -40,11 +45,11 @@ data:
 
 wandb:
   entity: jayantbiradar619-university-of-arizona # Use your exact W&B username
-  project:
+  project: Paper
   job_type: training
   tags:
    - aia
    - sxr
    - regression
-  wb_name:
+  wb_name: paper-testing-16-patch-deeper-model-9x9-attention
   notes: Regression from AIA images (6 channels) to GOES SXR flux

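The data paths in this config are built from ${base_data_dir} references that train.py expands via resolve_config_variables ("Resolve variables like ${base_data_dir}"). The resolver's body is not part of this diff; the snippet below is only a plausible sketch of that idea (recursive substitution of ${key} against top-level keys), with a made-up sxr_dir entry for the usage example.

```python
import re
import yaml

def resolve_config_variables(config_dict):
    """Illustrative sketch: recursively substitute ${key} with top-level config values."""
    def resolve(value):
        if isinstance(value, str):
            return re.sub(r"\$\{(\w+)\}",
                          lambda m: str(config_dict.get(m.group(1), m.group(0))), value)
        if isinstance(value, dict):
            return {k: resolve(v) for k, v in value.items()}
        if isinstance(value, list):
            return [resolve(v) for v in value]
        return value
    return resolve(config_dict)

cfg = yaml.safe_load("""
base_data_dir: "/mnt/data/PAPER_DATA_B"
data:
  sxr_dir: "${base_data_dir}/sxr"   # hypothetical entry for illustration
""")
print(resolve_config_variables(cfg)["data"]["sxr_dir"])  # /mnt/data/PAPER_DATA_B/sxr
```
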
forecasting/training/train.py
CHANGED
@@ -13,7 +13,6 @@ import numpy as np
 from pytorch_lightning import Trainer
 from pytorch_lightning.loggers import WandbLogger
 from pytorch_lightning.callbacks import ModelCheckpoint
-from torch.nn import MSELoss, HuberLoss
 from pathlib import Path
 import sys
 # Add project root to Python path
@@ -21,33 +20,13 @@ PROJECT_ROOT = Path(__file__).parent.parent.parent.absolute()
 sys.path.insert(0, str(PROJECT_ROOT))
 
 from forecasting.data_loaders.SDOAIA_dataloader import AIA_GOESDataModule
-
-from forecasting.models.linear_and_hybrid import LinearIrradianceModel, HybridIrradianceModel
-from forecasting.models.vit_patch_model import ViT as ViTPatch
-from forecasting.models.vit_patch_model_uncertainty import ViTUncertainty
-from forecasting.models import FusionViTHybrid
-from forecasting.models.CNN_Patch import CNNPatch
+
 from forecasting.models.vit_patch_model_local import ViTLocal
 from callback import ImagePredictionLogger_SXR, AttentionMapCallback
 
 from pytorch_lightning.callbacks import Callback
 
-from forecasting.models.FastSpectralNet import FastViTFlaringModel
-
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
-os.environ["NCCL_DEBUG"] = "WARN"
-# Shared memory optimizations
-os.environ["OMP_NUM_THREADS"] = "1"  # Limit OpenMP threads
-os.environ["MKL_NUM_THREADS"] = "1"  # Limit MKL threads
 
-def print_gpu_memory(stage=""):
-    """Print GPU memory usage for monitoring"""
-    if torch.cuda.is_available():
-        allocated = torch.cuda.memory_allocated() / 1e9
-        reserved = torch.cuda.memory_reserved() / 1e9
-        print(f"GPU Memory {stage} - Allocated: {allocated:.2f}GB, Reserved: {reserved:.2f}GB")
-    else:
-        print(f"No GPU available for memory monitoring {stage}")
 
 def resolve_config_variables(config_dict):
     """Recursively resolve ${variable} references within the config"""
@@ -91,27 +70,6 @@ with open(args.config, 'r') as stream:
     # Resolve variables like ${base_data_dir}
     config_data = resolve_config_variables(config_data)
 
-    # GPU Memory Isolation for Multi-GPU Systems
-    gpu_id = config_data.get('gpu_id', 0)
-    if gpu_id != -1:  # Only if using GPU
-        # Set CUDA device visibility to only the specified GPU
-        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
-        print(f"Set CUDA_VISIBLE_DEVICES to GPU {gpu_id}")
-
-        # Clear any existing CUDA cache
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-            print(f"Cleared CUDA cache for GPU {gpu_id}")
-
-        # Set memory allocation strategy for better isolation
-        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,roundup_power2_divisions:16"
-
-        # Disable memory sharing between processes
-        os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
-
-        print(f"GPU Memory Isolation configured for GPU {gpu_id}")
-    else:
-        print("Using CPU - no GPU memory isolation needed")
 
 # Debug: Print resolved paths
 print("Resolved paths:")
@@ -120,12 +78,6 @@ print(f"SXR dir: {config_data['data']['sxr_dir']}")
 print(f"Checkpoints dir: {config_data['data']['checkpoints_dir']}")
 
 sxr_norm = np.load(config_data['data']['sxr_norm_path'])
-
-n = 0
-
-torch.manual_seed(config_data['megsai']['seed'])
-np.random.seed(config_data['megsai']['seed'])
-
 training_wavelengths = config_data['wavelengths']
 
 
@@ -145,10 +97,6 @@ data_loader = AIA_GOESDataModule(
     balance_strategy=config_data['balance_strategy'],
 )
 data_loader.setup()
-
-# Monitor memory after data loading
-print_gpu_memory("after data loading")
-
 # Logger
 #wb_name = f"{instrument}_{n}" if len(combined_parameters) > 1 else "aia_sxr_model"
 wandb_logger = WandbLogger(
@@ -158,7 +106,7 @@ wandb_logger = WandbLogger(
     tags=config_data['wandb']['tags'],
     name=config_data['wandb']['wb_name'],
     notes=config_data['wandb']['notes'],
-    config=config_data
+    config=config_data
 )
 
 # Logging callback
@@ -169,8 +117,8 @@ plot_samples = plot_data  # Keep as list of ((aia, sxr), target)
 
 sxr_plot_callback = ImagePredictionLogger_SXR(plot_samples, sxr_norm)
 # Attention map callback - get patch size from config
-patch_size = config_data.get('vit_custom', {}).get('patch_size',
-attention = AttentionMapCallback(patch_size=patch_size)
+patch_size = config_data.get('vit_custom', {}).get('patch_size', 16)
+attention = AttentionMapCallback(patch_size=patch_size, use_local_attention=True)
 
 
 class PTHCheckpointCallback(Callback):
@@ -323,27 +271,63 @@ else:
     raise NotImplementedError(f"Architecture {config_data['selected_model']} not supported.")
 
 # Set device based on config
-
-
+# Support both old 'gpu_id' and new 'gpu_ids' config keys for backward compatibility
+gpu_config = config_data.get('gpu_ids', config_data.get('gpu_id', 0))
+
+if gpu_config == -1:
+    # CPU only
     accelerator = "cpu"
     devices = 1
+    strategy = "auto"
     print("Using CPU for training")
+elif gpu_config == "all":
+    # Use all available GPUs
+    if torch.cuda.is_available():
+        accelerator = "gpu"
+        devices = -1  # -1 means use all available GPUs
+        num_gpus = torch.cuda.device_count()
+        strategy = "auto"
+        print(f"Using all available GPUs ({num_gpus} GPUs)")
+        if num_gpus > 1:
+            print(f"Multi-GPU training with DDP: Effective batch size = {config_data['batch_size']} x {num_gpus} GPUs = {config_data['batch_size'] * num_gpus}")
+    else:
+        accelerator = "cpu"
+        devices = 1
+        strategy = "auto"
+        print("No GPUs available, falling back to CPU")
+elif isinstance(gpu_config, list):
+    # Multiple specific GPUs
+    if torch.cuda.is_available():
+        accelerator = "gpu"
+        devices = gpu_config
+        strategy = "auto"
+        print(f"Using GPUs: {gpu_config}")
+        if len(gpu_config) > 1:
+            print(f"Multi-GPU training with DDP: Effective batch size = {config_data['batch_size']} x {len(gpu_config)} GPUs = {config_data['batch_size'] * len(gpu_config)}")
+    else:
+        accelerator = "cpu"
+        devices = 1
+        strategy = "auto"
+        print("No GPUs available, falling back to CPU")
 else:
+    # Single GPU (integer)
     if torch.cuda.is_available():
         accelerator = "gpu"
-
-
-        print(f"Using GPU {
+        devices = [gpu_config]
+        strategy = "auto"
+        print(f"Using GPU {gpu_config}")
     else:
         accelerator = "cpu"
         devices = 1
-
+        strategy = "auto"
+        print(f"GPU {gpu_config} not available, falling back to CPU")
 
 # Trainer
 trainer = Trainer(
     default_root_dir=config_data['data']['checkpoints_dir'],
     accelerator=accelerator,
     devices=devices,
+    strategy=strategy,
     max_epochs=config_data['epochs'],
     callbacks=[attention, checkpoint_callback],
     logger=wandb_logger,
@@ -359,6 +343,5 @@ torch.save({
     'state_dict': model.state_dict()
 }, final_checkpoint_path)
 print(f"Saved final PyTorch checkpoint: {final_checkpoint_path}")
-n += 1
 # Finalize
 wandb.finish()

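A note on the final checkpoint: train.py saves only the model weights wrapped as {'state_dict': model.state_dict()}, so it is restored with load_state_dict rather than through Lightning's checkpoint loading. Below is a minimal round-trip sketch using a stand-in nn.Linear instead of the actual ViTLocal (whose constructor arguments come from the config).

```python
import torch
import torch.nn as nn

model = nn.Linear(8, 1)                      # stand-in for the trained ViTLocal model

# Save in the same format train.py uses for the final .pth checkpoint.
final_checkpoint_path = "final_model.pth"
torch.save({"state_dict": model.state_dict()}, final_checkpoint_path)

# Later (e.g., for inference): rebuild the architecture and restore the weights.
restored = nn.Linear(8, 1)
checkpoint = torch.load(final_checkpoint_path, map_location="cpu")
restored.load_state_dict(checkpoint["state_dict"])
restored.eval()
```
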