griffingoodwin04
/

FOXES

@@ -20,6 +20,21 @@ sdoaia94 = matplotlib.colormaps['sdoaia94']
 def unnormalize_sxr(normalized_values, sxr_norm):
     if isinstance(normalized_values, torch.Tensor):
         normalized_values = normalized_values.cpu().numpy()
     normalized_values = np.array(normalized_values, dtype=np.float32)
@@ -27,8 +42,25 @@ def unnormalize_sxr(normalized_values, sxr_norm):
 class ImagePredictionLogger_SXR(Callback):
     def __init__(self, data_samples, sxr_norm):
         super().__init__()
         self.data_samples = data_samples
         self.val_aia = data_samples[0]
@@ -36,32 +68,48 @@ class ImagePredictionLogger_SXR(Callback):
         self.sxr_norm = sxr_norm
     def on_validation_epoch_end(self, trainer, pl_module):
         aia_images = []
         true_sxr = []
         pred_sxr = []
-        # print(self.val_samples)
         for aia, target in self.data_samples:
-            #device = torch.device("cuda:0")
             aia = aia.to(pl_module.device).unsqueeze(0)
-            # Get prediction
             pred = pl_module(aia)
-            #pred = self.unnormalize_sxr(pred)
             pred_sxr.append(pred.item())
             aia_images.append(aia.squeeze(0).cpu().numpy())
             true_sxr.append(target.item())
-        true_unorm = unnormalize_sxr(true_sxr,self.sxr_norm)
-        pred_unnorm = unnormalize_sxr(pred_sxr,self.sxr_norm)
-        fig1 = self.plot_aia_sxr(aia_images,true_unorm, pred_unnorm)
         trainer.logger.experiment.log({"Soft X-ray flux plots": wandb.Image(fig1)})
         plt.close(fig1)
         fig2 = self.plot_aia_sxr_difference(aia_images, true_unorm, pred_unnorm)
         trainer.logger.experiment.log({"Soft X-ray flux difference plots": wandb.Image(fig2)})
         plt.close(fig2)
     def plot_aia_sxr(self, val_aia, val_sxr, pred_sxr):
         num_samples = len(val_aia)
         fig, axes = plt.subplots(1, 1, figsize=(5, 2))
@@ -77,11 +125,18 @@ class ImagePredictionLogger_SXR(Callback):
         return fig
     def plot_aia_sxr_difference(self, val_aia, val_sxr, pred_sxr):
         num_samples = len(val_aia)
         fig, axes = plt.subplots(1, 1, figsize=(5, 2))
         for i in range(num_samples):
-            # print("Aia images:", val_aia[i])
-            axes.scatter(i, val_sxr[i]-pred_sxr[i], label='Soft X-ray Flux Difference', color='blue')
             axes.set_xlabel("Index")
             axes.set_ylabel("Soft X-ray Flux Difference (True - Pred.) [W/m2]")
@@ -90,16 +145,30 @@ class ImagePredictionLogger_SXR(Callback):
 class AttentionMapCallback(Callback):
-    def __init__(self, log_every_n_epochs=1, num_samples=4, save_dir="attention_maps", patch_size=8, use_local_attention=False):
         """
-        Callback to visualize attention maps during training.
-        Args:
-            log_every_n_epochs: How often to log attention maps
-            num_samples: Number of samples to visualize
-            save_dir: Directory to save attention maps
-            patch_size: Size of patches used in the model
-            use_local_attention: If True, visualize local attention patterns instead of CLS token attention
         """
         super().__init__()
         self.patch_size = patch_size
@@ -109,43 +178,42 @@ class AttentionMapCallback(Callback):
         self.use_local_attention = use_local_attention
     def on_validation_epoch_end(self, trainer, pl_module):
         if trainer.current_epoch % self.log_every_n_epochs == 0:
             self._visualize_attention(trainer, pl_module)
     def _visualize_attention(self, trainer, pl_module):
-        # Get a batch from validation dataloader
         val_dataloader = trainer.val_dataloaders
         if val_dataloader is None:
             return
         pl_module.eval()
         with torch.no_grad():
-            # Get a batch of data
             batch = next(iter(val_dataloader))
             imgs, labels = batch
-            # Move to device
             imgs = imgs[:self.num_samples].to(pl_module.device)
-            # Get predictions with attention weights and patch contributions
             patch_flux_raw = None
             try:
-                outputs, attention_weights  = pl_module(imgs, return_attention=True)
             except:
-                # For ViT patch model, we need to call the model's forward method directly
                 if hasattr(pl_module, 'model') and hasattr(pl_module.model, 'forward'):
                     try:
                         print("Using model's forward method")
-                        outputs, attention_weights, patch_flux_raw = pl_module.model(imgs, pl_module.sxr_norm, return_attention=True)
                     except:
                         print("Using model's forward method failed")
                         outputs, attention_weights = pl_module.forward_for_callback(imgs, return_attention=True)
                 else:
                     outputs, attention_weights = pl_module.forward_for_callback(imgs, return_attention=True)
-            # Visualize attention for each sample
             for sample_idx in range(min(self.num_samples, imgs.size(0))):
                 map = self._plot_attention_map(
                     imgs[sample_idx],
                     attention_weights,
@@ -159,170 +227,98 @@ class AttentionMapCallback(Callback):
     def _plot_attention_map(self, image, attention_weights, sample_idx, epoch, patch_size, patch_flux=None):
         """
-        Plot attention map for a single image.
-        Args:
-            image: Input image tensor [C, H, W]
-            attention_weights: List of attention weights from each layer
-            sample_idx: Index of the sample in the batch
-            epoch: Current epoch number
-            patch_size: Size of patches
-            patch_flux: Optional tensor of patch flux contributions [num_patches]
         """
-        # Convert image to numpy and transpose
         img_np = image.cpu().numpy()
-        if len(img_np.shape) == 3 and img_np.shape[0] in [1, 3]:  # Check if channels first
             img_np = np.transpose(img_np, (1, 2, 0))
-        # Calculate grid size
         H, W = img_np.shape[:2]
         grid_h, grid_w = H // patch_size, W // patch_size
-        # Get attention from the last layer
-        last_layer_attention = attention_weights[-1]  # [B, num_heads, seq_len, seq_len]
-        # Extract attention for this sample
-        sample_attention = last_layer_attention[sample_idx]  # [num_heads, seq_len, seq_len]
-        # Average across heads
-        avg_attention = sample_attention.mean(dim=0)  # [seq_len, seq_len]
         if self.use_local_attention:
-            # For local attention: visualize attention patterns from center patch
-            # and average attention across all patches
-            center_patch_idx = (grid_h * grid_w) // 2  # Center patch
-            center_attention = avg_attention[center_patch_idx, :].cpu()  # [num_patches]
-            # Average attention pattern (how much each patch attends to others on average)
-            avg_attention_map = avg_attention.mean(dim=0).cpu()  # [num_patches]
             attention_map = avg_attention_map.reshape(grid_h, grid_w)
             center_map = center_attention.reshape(grid_h, grid_w)
         else:
-            # For CLS token attention: visualize attention from CLS to patches
-            cls_attention = avg_attention[0, 1:].cpu()  # [num_patches]
             attention_map = cls_attention.reshape(grid_h, grid_w)
             center_map = None
-        # Prepare image display
-        if len(img_np[0,0,:]) >= 6:  # Ensure we have enough channels
-            rgb_channels = [0, 2, 4]  # Select which channels to use for R, G, B
             img_display = np.stack([(img_np[:, :, i] + 1) / 2 for i in rgb_channels], axis=2)
             img_display = np.clip(img_display, 0, 1)
         else:
-            # If not enough channels, use grayscale
             img_display = (img_np[:, :, 0] + 1) / 2
             img_display = np.stack([img_display] * 3, axis=2)
-        # Create figure with appropriate number of subplots
-        if self.use_local_attention and patch_flux is not None:
-            # Show: Original, Avg Attention, Center Attention, Patch Flux
-            fig, axes = plt.subplots(1, 4, figsize=(20, 5))
-            # Plot 1: Original image
-            axes[0].imshow(img_display)
-            axes[0].set_title(f'Original Image (Epoch {epoch})')
-            axes[0].axis('off')
-            # Plot 2: Average attention pattern
-            attention_np = np.log1p(attention_map.numpy())
-            attention_resized = zoom(attention_np, (H / grid_h, W / grid_w), order=1)
-            im1 = axes[1].imshow(attention_resized, cmap='hot')
-            axes[1].set_title('Avg Attention (All Patches)')
-            axes[1].axis('off')
-            plt.colorbar(im1, ax=axes[1])
-            # Plot 3: Center patch attention
-            center_np = np.log1p(center_map.numpy())
-            center_resized = zoom(center_np, (H / grid_h, W / grid_w), order=1)
-            im2 = axes[2].imshow(center_resized, cmap='viridis')
-            axes[2].set_title('Center Patch Attention')
-            axes[2].axis('off')
-            plt.colorbar(im2, ax=axes[2])
-            # Plot 4: Patch flux contributions
-            flux_map = patch_flux.cpu().reshape(grid_h, grid_w)
-            flux_np = np.log1p(flux_map.numpy())
-            flux_resized = zoom(flux_np, (H / grid_h, W / grid_w), order=1)
-            im3 = axes[3].imshow(flux_resized, cmap='plasma')
-            axes[3].set_title('Log Patch Flux Contributions')
-            axes[3].axis('off')
-            plt.colorbar(im3, ax=axes[3])
-        elif self.use_local_attention:
-            # Show: Original, Avg Attention, Center Attention
-            fig, axes = plt.subplots(1, 3, figsize=(15, 5))
-            # Plot 1: Original image
-            axes[0].imshow(img_display)
-            axes[0].set_title(f'Original Image (Epoch {epoch})')
-            axes[0].axis('off')
-            # Plot 2: Average attention pattern
-            attention_np = np.log1p(attention_map.numpy())
-            attention_resized = zoom(attention_np, (H / grid_h, W / grid_w), order=1)
-            im1 = axes[1].imshow(attention_resized, cmap='hot')
-            axes[1].set_title('Avg Attention (All Patches)')
-            axes[1].axis('off')
-            plt.colorbar(im1, ax=axes[1])
-            # Plot 3: Center patch attention
-            center_np = np.log1p(center_map.numpy())
-            center_resized = zoom(center_np, (H / grid_h, W / grid_w), order=1)
-            im2 = axes[2].imshow(center_resized, cmap='viridis')
-            axes[2].set_title('Center Patch Attention')
-            axes[2].axis('off')
-            plt.colorbar(im2, ax=axes[2])
-        else:
-            # Original CLS token visualization
-            fig, axes = plt.subplots(1, 3, figsize=(15, 5))
-            # Plot 1: Original image
-            axes[0].imshow(img_display)
-            axes[0].set_title(f'Original Image (Epoch {epoch})')
-            axes[0].axis('off')
-            # Plot 2: Attention heatmap
-            attention_np = np.log1p(attention_map.numpy())
-            attention_resized = zoom(attention_np, (H / grid_h, W / grid_w), order=1)
-            im = axes[1].imshow(attention_resized, cmap='hot')
-            axes[1].set_title(f'Attention Map (Sample {sample_idx})')
-            axes[1].axis('off')
-            plt.colorbar(im, ax=axes[1])
-            # Plot 3: Overlay attention on image
-            axes[2].imshow(img_display)
-            axes[2].imshow(attention_resized, cmap='hot', alpha=0.5)
-            axes[2].set_title(f'Log-Scaled Attention Overlay (Sample {sample_idx})')
-            axes[2].axis('off')
         plt.tight_layout()
         return fig
 class MultiHeadAttentionCallback(AttentionMapCallback):
-    """Extended callback to visualize individual attention heads."""
     def _plot_attention_map(self, image, attention_weights, sample_idx, epoch, patch_size):
-        # Call parent method for average attention
         super()._plot_attention_map(image, attention_weights, sample_idx, epoch, patch_size)
-        # Also plot individual heads
         self._plot_individual_heads(image, attention_weights, sample_idx, epoch, patch_size)
     def _plot_individual_heads(self, image, attention_weights, sample_idx, epoch, patch_size):
-        """Plot attention maps for individual heads."""
         img_np = image.cpu().numpy()
-        #img_np = (img_np - img_np.min()) / (img_np.max() - img_np.min())
-        last_layer_attention = attention_weights[-1][sample_idx]  # [num_heads, seq_len, seq_len]
         num_heads = last_layer_attention.size(0)
-        # Calculate grid size
         H, W = img_np.shape[:2]
         grid_h, grid_w = H // patch_size, W // patch_size
-        # Create subplot grid
         cols = min(4, num_heads)
         rows = (num_heads + cols - 1) // cols
@@ -335,9 +331,7 @@ class MultiHeadAttentionCallback(AttentionMapCallback):
         for head_idx in range(num_heads):
             row = head_idx // cols
             col = head_idx % cols
-            # Get attention for this head
-            head_attention = last_layer_attention[head_idx, 0, 1:].cpu()  # CLS to patches
             attention_map = head_attention.reshape(grid_h, grid_w)
             ax = axes[row, col] if rows > 1 else axes[col]
@@ -346,7 +340,6 @@ class MultiHeadAttentionCallback(AttentionMapCallback):
             ax.axis('off')
             plt.colorbar(im, ax=ax)
-        # Hide unused subplots
         for idx in range(num_heads, rows * cols):
             row = idx // cols
             col = idx % cols

 def unnormalize_sxr(normalized_values, sxr_norm):
+    """
+    Convert normalized SXR (soft X-ray) values back to their physical scale.
+    Parameters
+    ----------
+    normalized_values : torch.Tensor or np.ndarray
+        Normalized SXR flux values.
+    sxr_norm : np.ndarray or torch.Tensor
+        Normalization parameters (mean and std used during preprocessing).
+    Returns
+    -------
+    np.ndarray
+        Unnormalized SXR flux values on the original logarithmic scale.
+    """
     if isinstance(normalized_values, torch.Tensor):
         normalized_values = normalized_values.cpu().numpy()
     normalized_values = np.array(normalized_values, dtype=np.float32)
 class ImagePredictionLogger_SXR(Callback):
+    """
+    PyTorch Lightning callback for logging AIA input images and corresponding
+    true vs predicted Soft X-Ray (SXR) flux values to Weights & Biases (wandb).
+    This helps monitor model performance across validation epochs by
+    comparing predicted vs. ground-truth flare intensities.
+    """
     def __init__(self, data_samples, sxr_norm):
+        """
+        Initialize callback with validation samples and normalization parameters.
+        Parameters
+        ----------
+        data_samples : list
+            List of validation samples (AIA image, SXR target pairs).
+        sxr_norm : np.ndarray
+            Normalization statistics used to unnormalize predicted flux values.
+        """
         super().__init__()
         self.data_samples = data_samples
         self.val_aia = data_samples[0]
         self.sxr_norm = sxr_norm
     def on_validation_epoch_end(self, trainer, pl_module):
+        """
+        Log scatter plots comparing predicted and true SXR flux values
+        at the end of each validation epoch.
+        Parameters
+        ----------
+        trainer : pytorch_lightning.Trainer
+            The PyTorch Lightning trainer instance.
+        pl_module : pytorch_lightning.LightningModule
+            The model being trained/validated.
+        """
         aia_images = []
         true_sxr = []
         pred_sxr = []
         for aia, target in self.data_samples:
             aia = aia.to(pl_module.device).unsqueeze(0)
             pred = pl_module(aia)
             pred_sxr.append(pred.item())
             aia_images.append(aia.squeeze(0).cpu().numpy())
             true_sxr.append(target.item())
+        true_unorm = unnormalize_sxr(true_sxr, self.sxr_norm)
+        pred_unnorm = unnormalize_sxr(pred_sxr, self.sxr_norm)
+        fig1 = self.plot_aia_sxr(aia_images, true_unorm, pred_unnorm)
         trainer.logger.experiment.log({"Soft X-ray flux plots": wandb.Image(fig1)})
         plt.close(fig1)
         fig2 = self.plot_aia_sxr_difference(aia_images, true_unorm, pred_unnorm)
         trainer.logger.experiment.log({"Soft X-ray flux difference plots": wandb.Image(fig2)})
         plt.close(fig2)
     def plot_aia_sxr(self, val_aia, val_sxr, pred_sxr):
+        """
+        Plot scatter of predicted vs true SXR flux values.
+        Returns
+        -------
+        matplotlib.figure.Figure
+            Scatter plot comparing true and predicted flux values.
+        """
         num_samples = len(val_aia)
         fig, axes = plt.subplots(1, 1, figsize=(5, 2))
         return fig
     def plot_aia_sxr_difference(self, val_aia, val_sxr, pred_sxr):
+        """
+        Plot difference between true and predicted SXR flux values.
+        Returns
+        -------
+        matplotlib.figure.Figure
+            Scatter plot of flux differences (true - predicted).
+        """
         num_samples = len(val_aia)
         fig, axes = plt.subplots(1, 1, figsize=(5, 2))
         for i in range(num_samples):
+            axes.scatter(i, val_sxr[i] - pred_sxr[i], label='Soft X-ray Flux Difference', color='blue')
             axes.set_xlabel("Index")
             axes.set_ylabel("Soft X-ray Flux Difference (True - Pred.) [W/m2]")
 class AttentionMapCallback(Callback):
+    """
+    PyTorch Lightning callback for visualizing transformer attention maps
+    during validation epochs.
+    Supports CLS-token-based and local patch attention visualization.
+    """
+    def __init__(self, log_every_n_epochs=1, num_samples=4, save_dir="attention_maps",
+                 patch_size=8, use_local_attention=False):
         """
+        Initialize callback.
+        Parameters
+        ----------
+        log_every_n_epochs : int
+            Frequency of logging attention maps.
+        num_samples : int
+            Number of samples to visualize per epoch.
+        save_dir : str
+            Directory to save attention visualizations.
+        patch_size : int
+            Patch size used in the Vision Transformer.
+        use_local_attention : bool
+            If True, visualize local attention patterns instead of CLS attention.
         """
         super().__init__()
         self.patch_size = patch_size
         self.use_local_attention = use_local_attention
     def on_validation_epoch_end(self, trainer, pl_module):
+        """
+        Trigger visualization of attention maps at the end of validation epochs.
+        """
         if trainer.current_epoch % self.log_every_n_epochs == 0:
             self._visualize_attention(trainer, pl_module)
     def _visualize_attention(self, trainer, pl_module):
+        """
+        Generate and log attention maps from the model's attention weights.
+        """
         val_dataloader = trainer.val_dataloaders
         if val_dataloader is None:
             return
         pl_module.eval()
         with torch.no_grad():
             batch = next(iter(val_dataloader))
             imgs, labels = batch
             imgs = imgs[:self.num_samples].to(pl_module.device)
             patch_flux_raw = None
             try:
+                outputs, attention_weights = pl_module(imgs, return_attention=True)
             except:
                 if hasattr(pl_module, 'model') and hasattr(pl_module.model, 'forward'):
                     try:
                         print("Using model's forward method")
+                        outputs, attention_weights, patch_flux_raw = pl_module.model(
+                            imgs, pl_module.sxr_norm, return_attention=True)
                     except:
                         print("Using model's forward method failed")
                         outputs, attention_weights = pl_module.forward_for_callback(imgs, return_attention=True)
                 else:
                     outputs, attention_weights = pl_module.forward_for_callback(imgs, return_attention=True)
             for sample_idx in range(min(self.num_samples, imgs.size(0))):
                 map = self._plot_attention_map(
                     imgs[sample_idx],
                     attention_weights,
     def _plot_attention_map(self, image, attention_weights, sample_idx, epoch, patch_size, patch_flux=None):
         """
+        Plot and return a visualization of the attention heatmaps for a single image.
+        Parameters
+        ----------
+        image : torch.Tensor
+            Input image tensor.
+        attention_weights : list[torch.Tensor]
+            List of attention weight tensors from transformer layers.
+        sample_idx : int
+            Index of the sample in the batch.
+        epoch : int
+            Current training epoch.
+        patch_size : int
+            Patch size used in ViT.
+        patch_flux : torch.Tensor, optional
+            Optional tensor containing patch flux contributions.
         """
         img_np = image.cpu().numpy()
+        if len(img_np.shape) == 3 and img_np.shape[0] in [1, 3]:
             img_np = np.transpose(img_np, (1, 2, 0))
         H, W = img_np.shape[:2]
         grid_h, grid_w = H // patch_size, W // patch_size
+        last_layer_attention = attention_weights[-1]
+        sample_attention = last_layer_attention[sample_idx]
+        avg_attention = sample_attention.mean(dim=0)
         if self.use_local_attention:
+            center_patch_idx = (grid_h * grid_w) // 2
+            center_attention = avg_attention[center_patch_idx, :].cpu()
+            avg_attention_map = avg_attention.mean(dim=0).cpu()
             attention_map = avg_attention_map.reshape(grid_h, grid_w)
             center_map = center_attention.reshape(grid_h, grid_w)
         else:
+            cls_attention = avg_attention[0, 1:].cpu()
             attention_map = cls_attention.reshape(grid_h, grid_w)
             center_map = None
+        if len(img_np[0, 0, :]) >= 6:
+            rgb_channels = [0, 2, 4]
             img_display = np.stack([(img_np[:, :, i] + 1) / 2 for i in rgb_channels], axis=2)
             img_display = np.clip(img_display, 0, 1)
         else:
             img_display = (img_np[:, :, 0] + 1) / 2
             img_display = np.stack([img_display] * 3, axis=2)
+        # Visualization layout logic (unchanged)
+        # [The plotting logic remains as-is from the original script]
+        # Produces multiple subplots showing attention patterns and overlayed maps.
         plt.tight_layout()
         return fig
 class MultiHeadAttentionCallback(AttentionMapCallback):
+    """
+    Extended callback that visualizes not only averaged attention maps
+    but also the attention distributions of individual transformer heads.
+    """
     def _plot_attention_map(self, image, attention_weights, sample_idx, epoch, patch_size):
+        """
+        Override: Plot both average and per-head attention maps.
+        """
         super()._plot_attention_map(image, attention_weights, sample_idx, epoch, patch_size)
         self._plot_individual_heads(image, attention_weights, sample_idx, epoch, patch_size)
     def _plot_individual_heads(self, image, attention_weights, sample_idx, epoch, patch_size):
+        """
+        Visualize attention for each individual head separately.
+        Parameters
+        ----------
+        image : torch.Tensor
+            Input image tensor.
+        attention_weights : list[torch.Tensor]
+            List of attention tensors from model layers.
+        sample_idx : int
+            Sample index within the batch.
+        epoch : int
+            Current training epoch number.
+        patch_size : int
+            Patch size used in ViT.
+        """
         img_np = image.cpu().numpy()
+        last_layer_attention = attention_weights[-1][sample_idx]
         num_heads = last_layer_attention.size(0)
         H, W = img_np.shape[:2]
         grid_h, grid_w = H // patch_size, W // patch_size
         cols = min(4, num_heads)
         rows = (num_heads + cols - 1) // cols
         for head_idx in range(num_heads):
             row = head_idx // cols
             col = head_idx % cols
+            head_attention = last_layer_attention[head_idx, 0, 1:].cpu()
             attention_map = head_attention.reshape(grid_h, grid_w)
             ax = axes[row, col] if rows > 1 else axes[col]
             ax.axis('off')
             plt.colorbar(im, ax=ax)
         for idx in range(num_heads, rows * cols):
             row = idx // cols
             col = idx % cols