griffingoodwin04 committed on
Commit
1aeb490
·
1 Parent(s): f6b8791

Refactor SXRRegressionDynamicLoss weights and performance multipliers; adjust model configuration for ViTPatch with reduced patch size and increased number of heads; implement GPU memory isolation and monitoring in training script; enhance AttentionMapCallback to accept dynamic patch size.

Browse files
forecasting/models/vit_patch_model.py CHANGED
@@ -341,10 +341,10 @@ class SXRRegressionDynamicLoss:
341
  def _get_base_weights(self):
342
  #Calculate the base weights based on the number of samples in each class within training data
343
  return {
344
- 'quiet': 1.0,
345
- 'c_class': 2.0,
346
- 'm_class': 10.0,
347
- 'x_class': 20.0
348
  }
349
 
350
  def calculate_loss(self, preds_norm, sxr_norm, sxr_un):
@@ -360,16 +360,16 @@ class SXRRegressionDynamicLoss:
360
 
361
  # Get continuous multipliers per class with custom params
362
  quiet_mult = self._get_performance_multiplier(
363
- self.quiet_errors, max_multiplier=1.5, min_multiplier=0.6, sensitivity=0.2, sxrclass='quiet'
364
  )
365
  c_mult = self._get_performance_multiplier(
366
- self.c_errors, max_multiplier=2, min_multiplier=0.7, sensitivity=0.3, sxrclass='c_class'
367
  )
368
  m_mult = self._get_performance_multiplier(
369
- self.m_errors, max_multiplier=5.0, min_multiplier=0.8, sensitivity=0.4, sxrclass='m_class'
370
  )
371
  x_mult = self._get_performance_multiplier(
372
- self.x_errors, max_multiplier=8.0, min_multiplier=0.8, sensitivity=.5, sxrclass='x_class'
373
  )
374
 
375
  quiet_weight = self.base_weights['quiet'] * quiet_mult
 
341
  def _get_base_weights(self):
342
  #Calculate the base weights based on the number of samples in each class within training data
343
  return {
344
+ 'quiet': 1.2110,
345
+ 'c_class': 1.2110,
346
+ 'm_class': 6.3106,
347
+ 'x_class': 63.4350
348
  }
349
 
350
  def calculate_loss(self, preds_norm, sxr_norm, sxr_un):
 
360
 
361
  # Get continuous multipliers per class with custom params
362
  quiet_mult = self._get_performance_multiplier(
363
+ self.quiet_errors, max_multiplier=1.5, min_multiplier=0.6, sensitivity=0.05, sxrclass='quiet' # Was 0.2
364
  )
365
  c_mult = self._get_performance_multiplier(
366
+ self.c_errors, max_multiplier=2, min_multiplier=0.7, sensitivity=0.08, sxrclass='c_class' # Was 0.3
367
  )
368
  m_mult = self._get_performance_multiplier(
369
+ self.m_errors, max_multiplier=5.0, min_multiplier=0.8, sensitivity=0.1, sxrclass='m_class' # Was 0.4
370
  )
371
  x_mult = self._get_performance_multiplier(
372
+ self.x_errors, max_multiplier=8.0, min_multiplier=0.8, sensitivity=0.12, sxrclass='x_class' # Was 0.5
373
  )
374
 
375
  quiet_weight = self.base_weights['quiet'] * quiet_mult
forecasting/training/callback.py CHANGED
@@ -90,7 +90,7 @@ class ImagePredictionLogger_SXR(Callback):
90
 
91
 
92
  class AttentionMapCallback(Callback):
93
- def __init__(self, log_every_n_epochs=1, num_samples=4, save_dir="attention_maps"):
94
  """
95
  Callback to visualize attention maps during training.
96
 
@@ -98,8 +98,10 @@ class AttentionMapCallback(Callback):
98
  log_every_n_epochs: How often to log attention maps
99
  num_samples: Number of samples to visualize
100
  save_dir: Directory to save attention maps
 
101
  """
102
  super().__init__()
 
103
  self.log_every_n_epochs = log_every_n_epochs
104
  self.num_samples = num_samples
105
  self.save_dir = save_dir
@@ -142,7 +144,7 @@ class AttentionMapCallback(Callback):
142
  attention_weights,
143
  sample_idx,
144
  trainer.current_epoch,
145
- patch_size=16
146
  )
147
  trainer.logger.experiment.log({"Attention plots": wandb.Image(map)})
148
  plt.close(map)
 
90
 
91
 
92
  class AttentionMapCallback(Callback):
93
+ def __init__(self, log_every_n_epochs=1, num_samples=4, save_dir="attention_maps", patch_size=8):
94
  """
95
  Callback to visualize attention maps during training.
96
 
 
98
  log_every_n_epochs: How often to log attention maps
99
  num_samples: Number of samples to visualize
100
  save_dir: Directory to save attention maps
101
+ patch_size: Size of patches used in the model
102
  """
103
  super().__init__()
104
+ self.patch_size = patch_size
105
  self.log_every_n_epochs = log_every_n_epochs
106
  self.num_samples = num_samples
107
  self.save_dir = save_dir
 
144
  attention_weights,
145
  sample_idx,
146
  trainer.current_epoch,
147
+ patch_size=self.patch_size
148
  )
149
  trainer.logger.experiment.log({"Attention plots": wandb.Image(map)})
150
  plt.close(map)
forecasting/training/config.yaml CHANGED
@@ -25,11 +25,11 @@ vit_custom:
25
  embed_dim: 512
26
  num_channels: 6
27
  num_classes: 1
28
- patch_size: 16
29
- num_patches: 1024
30
  hidden_dim: 512
31
- num_heads: 8
32
- num_layers: 6
33
  dropout: 0.1
34
  lr: 0.0001
35
 
@@ -67,5 +67,5 @@ wandb:
67
  - aia
68
  - sxr
69
  - regression
70
- wb_name: baseline-model-more-complex
71
  notes: Regression from AIA images (6 channels) to GOES SXR flux
 
25
  embed_dim: 512
26
  num_channels: 6
27
  num_classes: 1
28
+ patch_size: 8
29
+ num_patches: 4096
30
  hidden_dim: 512
31
+ num_heads: 12 # Increased from 8
32
+ num_layers: 4 # Reduced from 6
33
  dropout: 0.1
34
  lr: 0.0001
35
 
 
67
  - aia
68
  - sxr
69
  - regression
70
+ wb_name:
71
  notes: Regression from AIA images (6 channels) to GOES SXR flux
forecasting/training/config2.yaml CHANGED
@@ -3,12 +3,16 @@
3
  base_data_dir: "/mnt/data/COMBINED" # Change this line for different datasets
4
  base_checkpoint_dir: "/mnt/data/COMBINED" # Change this line for different datasets
5
  wavelengths: [94, 131, 171, 193, 211, 304] # AIA wavelengths in Angstroms
 
 
 
6
  # Model configuration
7
  selected_model: "ViTPatch" # Options: "hybrid", "vit", "fusion", "vitpatch"
8
  batch_size: 64
9
  epochs: 250
10
  oversample: false
11
  balance_strategy: "upsample_minority"
 
12
 
13
  megsai:
14
  architecture: "cnn"
@@ -67,5 +71,5 @@ wandb:
67
  - aia
68
  - sxr
69
  - regression
70
- wb_name: vit-patch-model-2d-embeddings
71
  notes: Regression from AIA images (6 channels) to GOES SXR flux
 
3
  base_data_dir: "/mnt/data/COMBINED" # Change this line for different datasets
4
  base_checkpoint_dir: "/mnt/data/COMBINED" # Change this line for different datasets
5
  wavelengths: [94, 131, 171, 193, 211, 304] # AIA wavelengths in Angstroms
6
+
7
+ # GPU configuration
8
+ gpu_id: 0 # GPU device ID to use (0, 1, 2, etc.) or -1 for CPU only
9
  # Model configuration
10
  selected_model: "ViTPatch" # Options: "hybrid", "vit", "fusion", "vitpatch"
11
  batch_size: 64
12
  epochs: 250
13
  oversample: false
14
  balance_strategy: "upsample_minority"
15
+ calculate_base_weights: false # Whether to calculate class-based weights for loss function
16
 
17
  megsai:
18
  architecture: "cnn"
 
71
  - aia
72
  - sxr
73
  - regression
74
+ wb_name: vit-patch-model-2d-embeddings-reduced-sensitivity
75
  notes: Regression from AIA images (6 channels) to GOES SXR flux
forecasting/training/config4.yaml ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Base directories - change these to switch datasets
2
+ base_data_dir: "/mnt/data/COMBINED" # Change this line for different datasets
3
+ base_checkpoint_dir: "/mnt/data/COMBINED" # Change this line for different datasets
4
+ wavelengths: [171, 193, 211, 304] # AIA wavelengths in Angstroms
5
+
6
+ # GPU configuration
7
+ gpu_id: 1 # GPU device ID to use (0, 1, 2, etc.) or -1 for CPU only
8
+ # Model configuration
9
+ selected_model: "ViTPatch" # Options: "hybrid", "vit", "fusion", "vitpatch"
10
+ batch_size: 64
11
+ epochs: 250
12
+ oversample: false
13
+ balance_strategy: "upsample_minority"
14
+ calculate_base_weights: false # Whether to calculate class-based weights for loss function
15
+
16
+ megsai:
17
+ architecture: "cnn"
18
+ seed: 42
19
+ lr: 0.0001
20
+ cnn_model: "updated"
21
+ cnn_dp: 0.2
22
+ weight_decay: 1e-5
23
+ cosine_restart_T0: 50
24
+ cosine_restart_Tmult: 2
25
+ cosine_eta_min: 1e-7
26
+
27
+ vit_custom:
28
+ embed_dim: 512
29
+ num_channels: 4
30
+ num_classes: 1
31
+ patch_size: 16
32
+ num_patches: 1024
33
+ hidden_dim: 512
34
+ num_heads: 8
35
+ num_layers: 6
36
+ dropout: 0.1
37
+ lr: 0.0001
38
+
39
+
40
+ fusion:
41
+ scalar_branch: "hybrid" # or "linear"
42
+ lr: 0.0001
43
+ lambda_vit_to_target: 0.3
44
+ lambda_scalar_to_target: 0.1
45
+ learnable_gate: true
46
+ gate_init_bias: 5.0
47
+ scalar_kwargs:
48
+ d_input: 6
49
+ d_output: 1
50
+ cnn_model: "updated"
51
+ cnn_dp: 0.75
52
+
53
+
54
+ # Data paths (automatically constructed from base directories)
55
+ data:
56
+ aia_dir:
57
+ "${base_data_dir}/AIA-SPLIT"
58
+ sxr_dir:
59
+ "${base_data_dir}/SXR-SPLIT"
60
+ sxr_norm_path:
61
+ "${base_data_dir}/SXR-SPLIT/normalized_sxr.npy"
62
+ checkpoints_dir:
63
+ "${base_checkpoint_dir}/new-checkpoint/"
64
+
65
+ wandb:
66
+ entity: jayantbiradar619-university-of-arizona # Use your exact W&B username
67
+ project: Model Testing
68
+ job_type: training
69
+ tags:
70
+ - aia
71
+ - sxr
72
+ - regression
73
+ wb_name: vit-patch-model-2d-embeddings-reduced-sensitivity-STEREO
74
+ notes: Regression from AIA images (4 channels) to GOES SXR flux
forecasting/training/config5.yaml ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ #Base directories - change these to switch datasets
3
+ base_data_dir: "/mnt/data/COMBINED" # Change this line for different datasets
4
+ base_checkpoint_dir: "/mnt/data/COMBINED" # Change this line for different datasets
5
+ wavelengths: [94, 131, 171, 193, 211, 304] # AIA wavelengths in Angstroms
6
+
7
+ # GPU configuration
8
+ gpu_id: 2 # GPU device ID to use (0, 1, 2, etc.) or -1 for CPU only
9
+ # Model configuration
10
+ selected_model: "vit" # Options: "hybrid", "vit", "fusion", "vitpatch"
11
+ batch_size: 64
12
+ epochs: 250
13
+ oversample: false
14
+ balance_strategy: "upsample_minority"
15
+ calculate_base_weights: false # Whether to calculate class-based weights for loss function
16
+
17
+ megsai:
18
+ architecture: "cnn"
19
+ seed: 42
20
+ lr: 0.0001
21
+ cnn_model: "updated"
22
+ cnn_dp: 0.2
23
+ weight_decay: 1e-5
24
+ cosine_restart_T0: 50
25
+ cosine_restart_Tmult: 2
26
+ cosine_eta_min: 1e-7
27
+
28
+ vit_custom:
29
+ embed_dim: 512
30
+ num_channels: 6
31
+ num_classes: 1
32
+ patch_size: 16
33
+ num_patches: 1024
34
+ hidden_dim: 512
35
+ num_heads: 8
36
+ num_layers: 6
37
+ dropout: 0.1
38
+ lr: 0.0001
39
+
40
+
41
+ fusion:
42
+ scalar_branch: "hybrid" # or "linear"
43
+ lr: 0.0001
44
+ lambda_vit_to_target: 0.3
45
+ lambda_scalar_to_target: 0.1
46
+ learnable_gate: true
47
+ gate_init_bias: 5.0
48
+ scalar_kwargs:
49
+ d_input: 6
50
+ d_output: 1
51
+ cnn_model: "updated"
52
+ cnn_dp: 0.75
53
+
54
+
55
+ # Data paths (automatically constructed from base directories)
56
+ data:
57
+ aia_dir:
58
+ "${base_data_dir}/AIA-SPLIT"
59
+ sxr_dir:
60
+ "${base_data_dir}/SXR-SPLIT"
61
+ sxr_norm_path:
62
+ "${base_data_dir}/SXR-SPLIT/normalized_sxr.npy"
63
+ checkpoints_dir:
64
+ "${base_checkpoint_dir}/new-checkpoint/"
65
+
66
+ wandb:
67
+ entity: jayantbiradar619-university-of-arizona # Use your exact W&B username
68
+ project: Model Testing
69
+ job_type: training
70
+ tags:
71
+ - aia
72
+ - sxr
73
+ - regression
74
+ wb_name: vit-patch-model-2d-embeddings-reduced-sensitivity
75
+ notes: Regression from AIA images (6 channels) to GOES SXR flux
forecasting/training/config6.yaml ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ #Base directories - change these to switch datasets
3
+ base_data_dir: "/mnt/data/COMBINED" # Change this line for different datasets
4
+ base_checkpoint_dir: "/mnt/data/COMBINED" # Change this line for different datasets
5
+ wavelengths: [94, 131, 171, 193, 211, 304] # AIA wavelengths in Angstroms
6
+
7
+ # GPU configuration
8
+ gpu_id: 3 # GPU device ID to use (0, 1, 2, etc.) or -1 for CPU only
9
+ # Model configuration
10
+ selected_model: "ViTPatch" # Options: "hybrid", "vit", "fusion", "vitpatch"
11
+ batch_size: 64
12
+ epochs: 250
13
+ oversample: false
14
+ balance_strategy: "upsample_minority"
15
+ calculate_base_weights: false # Whether to calculate class-based weights for loss function
16
+
17
+ megsai:
18
+ architecture: "cnn"
19
+ seed: 42
20
+ lr: 0.0001
21
+ cnn_model: "updated"
22
+ cnn_dp: 0.2
23
+ weight_decay: 1e-5
24
+ cosine_restart_T0: 50
25
+ cosine_restart_Tmult: 2
26
+ cosine_eta_min: 1e-7
27
+
28
+ vit_custom:
29
+ embed_dim: 512
30
+ num_channels: 6
31
+ num_classes: 1
32
+ patch_size: 16
33
+ num_patches: 1024
34
+ hidden_dim: 512
35
+ num_heads: 8
36
+ num_layers: 6
37
+ dropout: 0.1
38
+ lr: 0.001
39
+
40
+
41
+ fusion:
42
+ scalar_branch: "hybrid" # or "linear"
43
+ lr: 0.0001
44
+ lambda_vit_to_target: 0.3
45
+ lambda_scalar_to_target: 0.1
46
+ learnable_gate: true
47
+ gate_init_bias: 5.0
48
+ scalar_kwargs:
49
+ d_input: 6
50
+ d_output: 1
51
+ cnn_model: "updated"
52
+ cnn_dp: 0.75
53
+
54
+
55
+ # Data paths (automatically constructed from base directories)
56
+ data:
57
+ aia_dir:
58
+ "${base_data_dir}/AIA-SPLIT"
59
+ sxr_dir:
60
+ "${base_data_dir}/SXR-SPLIT"
61
+ sxr_norm_path:
62
+ "${base_data_dir}/SXR-SPLIT/normalized_sxr.npy"
63
+ checkpoints_dir:
64
+ "${base_checkpoint_dir}/new-checkpoint/"
65
+
66
+ wandb:
67
+ entity: jayantbiradar619-university-of-arizona # Use your exact W&B username
68
+ project: Model Testing
69
+ job_type: training
70
+ tags:
71
+ - aia
72
+ - sxr
73
+ - regression
74
+ wb_name: vit-patch-model-2d-embeddings-reduced-sensitivity-higher-lr
75
+ notes: Regression from AIA images (6 channels) to GOES SXR flux
forecasting/training/train.py CHANGED
@@ -32,6 +32,18 @@ from forecasting.models.FastSpectralNet import FastViTFlaringModel
32
 
33
  os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
34
  os.environ["NCCL_DEBUG"] = "WARN"
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  def resolve_config_variables(config_dict):
37
  """Recursively resolve ${variable} references within the config"""
@@ -75,11 +87,27 @@ with open(args.config, 'r') as stream:
75
  # Resolve variables like ${base_data_dir}
76
  config_data = resolve_config_variables(config_data)
77
 
78
- # Debug: Print resolved paths
79
- print("Resolved paths:")
80
- print(f"AIA dir: {config_data['data']['aia_dir']}")
81
- print(f"SXR dir: {config_data['data']['sxr_dir']}")
82
- print(f"Checkpoints dir: {config_data['data']['checkpoints_dir']}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
  # Debug: Print resolved paths
85
  print("Resolved paths:")
@@ -106,7 +134,7 @@ data_loader = AIA_GOESDataModule(
106
  sxr_val_dir=config_data['data']['sxr_dir']+"/val",
107
  sxr_test_dir=config_data['data']['sxr_dir']+"/test",
108
  batch_size=config_data['batch_size'],
109
- num_workers=os.cpu_count(),
110
  sxr_norm=sxr_norm,
111
  wavelengths=training_wavelengths,
112
  oversample=config_data['oversample'],
@@ -114,6 +142,9 @@ data_loader = AIA_GOESDataModule(
114
  )
115
  data_loader.setup()
116
 
 
 
 
117
  # Logger
118
  #wb_name = f"{instrument}_{n}" if len(combined_parameters) > 1 else "aia_sxr_model"
119
  wandb_logger = WandbLogger(
@@ -133,8 +164,9 @@ plot_samples = plot_data # Keep as list of ((aia, sxr), target)
133
  #sxr_callback = SXRPredictionLogger(plot_samples)
134
 
135
  sxr_plot_callback = ImagePredictionLogger_SXR(plot_samples, sxr_norm)
136
- # Attention map callback
137
- attention = AttentionMapCallback()
 
138
 
139
 
140
  class PTHCheckpointCallback(Callback):
@@ -308,7 +340,9 @@ elif config_data['selected_model'] == 'ViT':
308
  model = ViT(model_kwargs=config_data['vit_custom'], sxr_norm = sxr_norm)
309
 
310
  elif config_data['selected_model'] == 'ViTPatch':
311
- model = ViTPatch(model_kwargs=config_data['vit_custom'], sxr_norm = sxr_norm, base_weights=get_base_weights(data_loader, sxr_norm))
 
 
312
 
313
  elif config_data['selected_model'] == 'FusionViTHybrid':
314
  # Expect a 'fusion' section in YAML
@@ -338,12 +372,32 @@ elif config_data['selected_model'] == 'FusionViTHybrid':
338
  else:
339
  raise NotImplementedError(f"Architecture {config_data['selected_model']} not supported.")
340
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
  # Trainer
342
  if config_data['selected_model'] == 'ViT' or config_data['selected_model'] == 'ViTPatch' or config_data['selected_model'] == 'FusionViTHybrid':
343
  trainer = Trainer(
344
  default_root_dir=config_data['data']['checkpoints_dir'],
345
- accelerator="gpu" if torch.cuda.is_available() else "cpu",
346
- devices=1,
347
  max_epochs=config_data['epochs'],
348
  callbacks=[attention, checkpoint_callback],
349
  logger=wandb_logger,
@@ -352,8 +406,8 @@ if config_data['selected_model'] == 'ViT' or config_data['selected_model'] == 'V
352
  else:
353
  trainer = Trainer(
354
  default_root_dir=config_data['data']['checkpoints_dir'],
355
- accelerator="gpu" if torch.cuda.is_available() else "cpu",
356
- devices=1,
357
  max_epochs=config_data['epochs'],
358
  callbacks=[checkpoint_callback],
359
  logger=wandb_logger,
 
32
 
33
  os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
34
  os.environ["NCCL_DEBUG"] = "WARN"
35
+ # Shared memory optimizations
36
+ os.environ["OMP_NUM_THREADS"] = "1" # Limit OpenMP threads
37
+ os.environ["MKL_NUM_THREADS"] = "1" # Limit MKL threads
38
+
39
+ def print_gpu_memory(stage=""):
40
+ """Print GPU memory usage for monitoring"""
41
+ if torch.cuda.is_available():
42
+ allocated = torch.cuda.memory_allocated() / 1e9
43
+ reserved = torch.cuda.memory_reserved() / 1e9
44
+ print(f"GPU Memory {stage} - Allocated: {allocated:.2f}GB, Reserved: {reserved:.2f}GB")
45
+ else:
46
+ print(f"No GPU available for memory monitoring {stage}")
47
 
48
  def resolve_config_variables(config_dict):
49
  """Recursively resolve ${variable} references within the config"""
 
87
  # Resolve variables like ${base_data_dir}
88
  config_data = resolve_config_variables(config_data)
89
 
90
+ # GPU Memory Isolation for Multi-GPU Systems
91
+ gpu_id = config_data.get('gpu_id', 0)
92
+ if gpu_id != -1: # Only if using GPU
93
+ # Set CUDA device visibility to only the specified GPU
94
+ os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
95
+ print(f"Set CUDA_VISIBLE_DEVICES to GPU {gpu_id}")
96
+
97
+ # Clear any existing CUDA cache
98
+ if torch.cuda.is_available():
99
+ torch.cuda.empty_cache()
100
+ print(f"Cleared CUDA cache for GPU {gpu_id}")
101
+
102
+ # Set memory allocation strategy for better isolation
103
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,roundup_power2_divisions:16"
104
+
105
+ # Disable memory sharing between processes
106
+ os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
107
+
108
+ print(f"GPU Memory Isolation configured for GPU {gpu_id}")
109
+ else:
110
+ print("Using CPU - no GPU memory isolation needed")
111
 
112
  # Debug: Print resolved paths
113
  print("Resolved paths:")
 
134
  sxr_val_dir=config_data['data']['sxr_dir']+"/val",
135
  sxr_test_dir=config_data['data']['sxr_dir']+"/test",
136
  batch_size=config_data['batch_size'],
137
+ num_workers=min(8, os.cpu_count()), # Limit workers to prevent shm issues
138
  sxr_norm=sxr_norm,
139
  wavelengths=training_wavelengths,
140
  oversample=config_data['oversample'],
 
142
  )
143
  data_loader.setup()
144
 
145
+ # Monitor memory after data loading
146
+ print_gpu_memory("after data loading")
147
+
148
  # Logger
149
  #wb_name = f"{instrument}_{n}" if len(combined_parameters) > 1 else "aia_sxr_model"
150
  wandb_logger = WandbLogger(
 
164
  #sxr_callback = SXRPredictionLogger(plot_samples)
165
 
166
  sxr_plot_callback = ImagePredictionLogger_SXR(plot_samples, sxr_norm)
167
+ # Attention map callback - get patch size from config
168
+ patch_size = config_data.get('vit_custom', {}).get('patch_size', 8)
169
+ attention = AttentionMapCallback(patch_size=patch_size)
170
 
171
 
172
  class PTHCheckpointCallback(Callback):
 
340
  model = ViT(model_kwargs=config_data['vit_custom'], sxr_norm = sxr_norm)
341
 
342
  elif config_data['selected_model'] == 'ViTPatch':
343
+ # Calculate base weights only if configured to do so
344
+ base_weights = get_base_weights(data_loader, sxr_norm) if config_data.get('calculate_base_weights', True) else None
345
+ model = ViTPatch(model_kwargs=config_data['vit_custom'], sxr_norm = sxr_norm, base_weights=base_weights)
346
 
347
  elif config_data['selected_model'] == 'FusionViTHybrid':
348
  # Expect a 'fusion' section in YAML
 
372
  else:
373
  raise NotImplementedError(f"Architecture {config_data['selected_model']} not supported.")
374
 
375
+ # Monitor memory after model creation
376
+ print_gpu_memory("after model creation")
377
+
378
+ # Set device based on config
379
+ gpu_id = config_data.get('gpu_id', 0)
380
+ if gpu_id == -1:
381
+ accelerator = "cpu"
382
+ devices = 1
383
+ print("Using CPU for training")
384
+ else:
385
+ if torch.cuda.is_available():
386
+ accelerator = "gpu"
387
+ # When CUDA_VISIBLE_DEVICES is set, PyTorch Lightning only sees GPU 0
388
+ devices = [0] # Always use device 0 since we've isolated to specific GPU
389
+ print(f"Using GPU {gpu_id} for training (mapped to device 0 after CUDA_VISIBLE_DEVICES)")
390
+ else:
391
+ accelerator = "cpu"
392
+ devices = 1
393
+ print(f"GPU {gpu_id} not available, falling back to CPU")
394
+
395
  # Trainer
396
  if config_data['selected_model'] == 'ViT' or config_data['selected_model'] == 'ViTPatch' or config_data['selected_model'] == 'FusionViTHybrid':
397
  trainer = Trainer(
398
  default_root_dir=config_data['data']['checkpoints_dir'],
399
+ accelerator=accelerator,
400
+ devices=devices,
401
  max_epochs=config_data['epochs'],
402
  callbacks=[attention, checkpoint_callback],
403
  logger=wandb_logger,
 
406
  else:
407
  trainer = Trainer(
408
  default_root_dir=config_data['data']['checkpoints_dir'],
409
+ accelerator=accelerator,
410
+ devices=devices,
411
  max_epochs=config_data['epochs'],
412
  callbacks=[checkpoint_callback],
413
  logger=wandb_logger,