griffingoodwin04 commited on
Commit
130c9b3
·
1 Parent(s): c14af1c

Made evaluation easier by adding an auto-evaluate script that generates the inference and evaluation configs on its own.

Browse files
data/iti_data_processing.py CHANGED
@@ -144,7 +144,7 @@ else:
144
  print(f"Processing {len(unprocessed_indices)} unprocessed samples")
145
 
146
  if unprocessed_indices:
147
- with Pool(processes=90) as pool:
148
  list(tqdm(pool.imap(save_sample, unprocessed_indices), total=len(unprocessed_indices)))
149
  print("AIA data processing completed.")
150
  else:
 
144
  print(f"Processing {len(unprocessed_indices)} unprocessed samples")
145
 
146
  if unprocessed_indices:
147
+ with Pool(processes=os.cpu_count()) as pool:
148
  list(tqdm(pool.imap(save_sample, unprocessed_indices), total=len(unprocessed_indices)))
149
  print("AIA data processing completed.")
150
  else:
forecasting/inference/auto_evaluate.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Automated Evaluation Script for Solar Flare Models
4
+
5
+ This script automatically generates inference and evaluation configs
6
+ and runs the complete evaluation pipeline based on a directory input.
7
+
8
+ Usage:
9
+ python auto_evaluate.py -checkpoint_dir /path/to/checkpoint/dir -model_name my_model
10
+ python auto_evaluate.py -checkpoint_path /path/to/checkpoint.pth -model_name my_model
11
+ """
12
+
13
+ import argparse
14
+ import os
15
+ import subprocess
16
+ import sys
17
+ import yaml
18
+ from pathlib import Path
19
+ from datetime import datetime
20
+ import glob
21
+
22
+ # Add project root to Python path
23
+ PROJECT_ROOT = Path(__file__).parent.parent.parent.absolute()
24
+ sys.path.insert(0, str(PROJECT_ROOT))
25
+
26
+ def find_checkpoint_files(checkpoint_dir):
27
+ """Find checkpoint files in directory"""
28
+ patterns = ['*.pth', '*.ckpt', '*.pt']
29
+ checkpoints = []
30
+
31
+ for pattern in patterns:
32
+ checkpoints.extend(glob.glob(str(Path(checkpoint_dir) / pattern)))
33
+ checkpoints.extend(glob.glob(str(Path(checkpoint_dir) / '**' / pattern), recursive=True))
34
+
35
+ return sorted(checkpoints)
36
+
37
+ def detect_model_type(checkpoint_path):
38
+ """Detect model type from checkpoint filename or content"""
39
+ filename = Path(checkpoint_path).name.lower()
40
+
41
+ if 'local' in filename or 'localized' in filename:
42
+ return 'vitlocal'
43
+ elif 'patch' in filename:
44
+ return 'vitpatch'
45
+ elif 'fusion' in filename:
46
+ return 'fusion'
47
+ elif 'hybrid' in filename:
48
+ return 'hybrid'
49
+ elif 'linear' in filename:
50
+ return 'linear'
51
+ else:
52
+ # Default to vit for backward compatibility
53
+ return 'vit'
54
+
55
+ def create_inference_config(checkpoint_path, model_name, base_data_dir="/mnt/data/COMBINED"):
56
+ """Create inference config for checkpoint"""
57
+
58
+ # Detect model type
59
+ model_type = detect_model_type(checkpoint_path)
60
+
61
+ # Create output directory
62
+ output_dir = f"/mnt/data/batch_results/{model_name}"
63
+ os.makedirs(output_dir, exist_ok=True)
64
+ os.makedirs(f"{output_dir}/weights", exist_ok=True)
65
+
66
+ # Generate config
67
+ config = {
68
+ 'SolO': 'false',
69
+ 'Stereo': 'false',
70
+ 'base_data_dir': base_data_dir,
71
+ 'data': {
72
+ 'aia_dir': f"{base_data_dir}/AIA-SPLIT/",
73
+ 'checkpoint_path': checkpoint_path,
74
+ 'sxr_dir': f"{base_data_dir}/SXR-SPLIT/",
75
+ 'sxr_norm_path': f"{base_data_dir}/SXR-SPLIT/normalized_sxr.npy"
76
+ },
77
+ 'model': model_type,
78
+ 'wavelengths': [94, 131, 171, 193, 211, 304],
79
+ 'mc': {
80
+ 'active': 'false',
81
+ 'runs': 5
82
+ },
83
+ 'model_params': {
84
+ 'batch_size': 16,
85
+ 'input_size': 512,
86
+ 'no_weights': False,
87
+ 'patch_size': 16
88
+ },
89
+ 'vit_custom': {
90
+ 'embed_dim': 512,
91
+ 'hidden_dim': 512,
92
+ 'num_channels': 6,
93
+ 'num_classes': 1,
94
+ 'patch_size': 16,
95
+ 'num_patches': 1024,
96
+ 'num_heads': 8,
97
+ 'num_layers': 6,
98
+ 'dropout': 0.1
99
+ },
100
+ 'megsai': {
101
+ 'cnn_model': 'updated',
102
+ 'cnn_dp': 0.2,
103
+ 'weight_decay': 1e-5,
104
+ 'cosine_restart_T0': 50,
105
+ 'cosine_restart_Tmult': 2,
106
+ 'cosine_eta_min': 1e-7
107
+ },
108
+ 'output_path': f"{output_dir}/{model_name}_predictions.csv",
109
+ 'weight_path': f"{output_dir}/weights"
110
+ }
111
+
112
+ # Add model-specific configs
113
+ if model_type == 'fusion':
114
+ config['fusion'] = {
115
+ 'scalar_branch': 'hybrid',
116
+ 'lr': 0.0001,
117
+ 'lambda_vit_to_target': 0.3,
118
+ 'lambda_scalar_to_target': 0.1,
119
+ 'learnable_gate': True,
120
+ 'gate_init_bias': 5.0,
121
+ 'scalar_kwargs': {
122
+ 'd_input': 6,
123
+ 'd_output': 1,
124
+ 'cnn_model': 'updated',
125
+ 'cnn_dp': 0.75
126
+ }
127
+ }
128
+
129
+ return config, output_dir
130
+
131
+ def create_evaluation_config(model_name, output_dir, base_data_dir="/mnt/data/COMBINED"):
132
+ """Create evaluation config"""
133
+
134
+ config = {
135
+ 'base_data_dir': base_data_dir,
136
+ 'output_base_dir': f"{base_data_dir}/solar_flare_comparison_results",
137
+ 'data': {
138
+ 'aia_dir': f"{base_data_dir}/AIA-SPLIT/test/",
139
+ 'weight_path': f"{output_dir}/weights"
140
+ },
141
+ 'model_predictions': {
142
+ 'main_model_csv': f"{output_dir}/{model_name}_predictions.csv",
143
+ 'baseline_csv': ''
144
+ },
145
+ 'evaluation': {
146
+ 'output_dir': output_dir,
147
+ 'sxr_cutoff': 1e-7
148
+ },
149
+ 'time_range': {
150
+ 'start_time': '2023-08-05T00:00:00',
151
+ 'end_time': '2023-08-07T23:59:00',
152
+ 'interval_minutes': 1
153
+ },
154
+ 'plotting': {
155
+ 'figure_size': [12, 8],
156
+ 'dpi': 300,
157
+ 'colormap': 'sdoaia171'
158
+ },
159
+ 'metrics': {
160
+ 'include_rmse': True,
161
+ 'include_mae': True,
162
+ 'include_r2': True,
163
+ 'include_correlation': True
164
+ }
165
+ }
166
+
167
+ return config
168
+
169
+ def run_inference(inference_config_path):
170
+ """Run inference with the generated config"""
171
+ print(f"Running inference with config: {inference_config_path}")
172
+
173
+ cmd = [
174
+ sys.executable,
175
+ str(PROJECT_ROOT / "forecasting/inference/inference.py"),
176
+ "-config", inference_config_path
177
+ ]
178
+
179
+ result = subprocess.run(cmd, capture_output=True, text=True)
180
+
181
+ if result.returncode != 0:
182
+ print(f"Error running inference: {result.stderr}")
183
+ return False
184
+
185
+ print("Inference completed successfully!")
186
+ return True
187
+
188
+ def run_evaluation(evaluation_config_path):
189
+ """Run evaluation with the generated config"""
190
+ print(f"Running evaluation with config: {evaluation_config_path}")
191
+
192
+ cmd = [
193
+ sys.executable,
194
+ str(PROJECT_ROOT / "forecasting/inference/evaluation.py"),
195
+ "-config", evaluation_config_path
196
+ ]
197
+
198
+ result = subprocess.run(cmd, capture_output=True, text=True)
199
+
200
+ if result.returncode != 0:
201
+ print(f"Error running evaluation: {result.stderr}")
202
+ return False
203
+
204
+ print("Evaluation completed successfully!")
205
+ return True
206
+
207
+ def main():
208
+ parser = argparse.ArgumentParser(description='Automated evaluation for solar flare models')
209
+ parser.add_argument('-checkpoint_dir', type=str, help='Directory containing checkpoint files')
210
+ parser.add_argument('-checkpoint_path', type=str, help='Specific checkpoint file path')
211
+ parser.add_argument('-model_name', type=str, required=True, help='Name for the model (used for output naming)')
212
+ parser.add_argument('-base_data_dir', type=str, default='/mnt/data/COMBINED', help='Base data directory')
213
+ parser.add_argument('-skip_inference', action='store_true', help='Skip inference and only run evaluation')
214
+ parser.add_argument('-skip_evaluation', action='store_true', help='Skip evaluation and only run inference')
215
+
216
+ args = parser.parse_args()
217
+
218
+ # Determine checkpoint path
219
+ if args.checkpoint_path:
220
+ checkpoint_path = args.checkpoint_path
221
+ if not os.path.exists(checkpoint_path):
222
+ print(f"Error: Checkpoint file not found: {checkpoint_path}")
223
+ sys.exit(1)
224
+ elif args.checkpoint_dir:
225
+ checkpoints = find_checkpoint_files(args.checkpoint_dir)
226
+ if not checkpoints:
227
+ print(f"Error: No checkpoint files found in {args.checkpoint_dir}")
228
+ sys.exit(1)
229
+ elif len(checkpoints) > 1:
230
+ print(f"Found multiple checkpoints: {checkpoints}")
231
+ print("Using the first one. Use -checkpoint_path to specify a specific file.")
232
+ checkpoint_path = checkpoints[0]
233
+ else:
234
+ print("Error: Must specify either -checkpoint_dir or -checkpoint_path")
235
+ sys.exit(1)
236
+
237
+ print(f"Using checkpoint: {checkpoint_path}")
238
+ print(f"Model name: {args.model_name}")
239
+
240
+ # Create configs
241
+ inference_config, output_dir = create_inference_config(checkpoint_path, args.model_name, args.base_data_dir)
242
+ evaluation_config = create_evaluation_config(args.model_name, output_dir, args.base_data_dir)
243
+
244
+ # Save configs
245
+ inference_config_path = f"/tmp/inference_config_{args.model_name}.yaml"
246
+ evaluation_config_path = f"/tmp/evaluation_config_{args.model_name}.yaml"
247
+
248
+ with open(inference_config_path, 'w') as f:
249
+ yaml.dump(inference_config, f, default_flow_style=False)
250
+
251
+ with open(evaluation_config_path, 'w') as f:
252
+ yaml.dump(evaluation_config, f, default_flow_style=False)
253
+
254
+ print(f"Configs saved to:")
255
+ print(f" Inference: {inference_config_path}")
256
+ print(f" Evaluation: {evaluation_config_path}")
257
+ print(f" Output directory: {output_dir}")
258
+
259
+ # Run inference
260
+ if not args.skip_inference:
261
+ if not run_inference(inference_config_path):
262
+ print("Inference failed. Stopping.")
263
+ sys.exit(1)
264
+ else:
265
+ print("Skipping inference...")
266
+
267
+ # Run evaluation
268
+ if not args.skip_evaluation:
269
+ if not run_evaluation(evaluation_config_path):
270
+ print("Evaluation failed. Stopping.")
271
+ sys.exit(1)
272
+ else:
273
+ print("Skipping evaluation...")
274
+
275
+ print(f"\n✅ Complete! Results saved to: {output_dir}")
276
+ print(f"📊 Check the plots and metrics in: {output_dir}")
277
+
278
+ if __name__ == '__main__':
279
+ main()
forecasting/inference/checkpoint_list.yaml CHANGED
@@ -8,10 +8,10 @@ checkpoints:
8
  # checkpoint_path: "/mnt/data/COMBINED/new-checkpoint/vit-patch-model-2d-embeddings-reduced-sensitivity-final-20250921_185953.pth"
9
  # - name: "baseweights-final"
10
  # checkpoint_path: "/mnt/data/COMBINED/new-checkpoint/vit-patch-model-2d-embeddings-reduced-sensitivity-changed-base-weights-final-20250921_223323.pth"
11
- - name: "claude-mse"
12
- checkpoint_path: "/mnt/data/COMBINED/new-checkpoint/vit-mse-claude-epoch=62-val_total_loss=0.1904.ckpt"
13
- - name: "baseweights-mse"
14
- checkpoint_path: /mnt/data/COMBINED/new-checkpoint/vit-mse-base-weights-epoch=62-val_total_loss=0.2893.ckpt"
15
  # - name: "stereo-final"
16
  # checkpoint_path: "/mnt/data/COMBINED/new-checkpoint/vit-patch-model-2d-embeddings-reduced-sensitivity-STEREO-final-20250921_183739.pth"
17
 
 
8
  # checkpoint_path: "/mnt/data/COMBINED/new-checkpoint/vit-patch-model-2d-embeddings-reduced-sensitivity-final-20250921_185953.pth"
9
  # - name: "baseweights-final"
10
  # checkpoint_path: "/mnt/data/COMBINED/new-checkpoint/vit-patch-model-2d-embeddings-reduced-sensitivity-changed-base-weights-final-20250921_223323.pth"
11
+ - name: "claude-localized"
12
+ checkpoint_path: "/mnt/data/COMBINED/new-checkpoint/vit-local-patch-mse-claude-final-20250929_050650.pth"
13
+ # - name: "baseweights-mse"
14
+ # checkpoint_path: "/mnt/data/COMBINED/new-checkpoint/vit-mse-base-weights-epoch=62-val_total_loss=0.2893.ckpt"
15
  # - name: "stereo-final"
16
  # checkpoint_path: "/mnt/data/COMBINED/new-checkpoint/vit-patch-model-2d-embeddings-reduced-sensitivity-STEREO-final-20250921_183739.pth"
17
 
forecasting/inference/evaluation.py CHANGED
@@ -622,7 +622,7 @@ class SolarFlareEvaluator:
622
  return None, None, None
623
 
624
  def generate_frame_worker(self, timestamp):
625
- """Worker function to generate a single frame with uncertainty bands"""
626
  try:
627
  print(f"Worker {os.getpid()}: Processing {timestamp}")
628
 
@@ -665,9 +665,7 @@ class SolarFlareEvaluator:
665
 
666
  ax.imshow(aia_img, cmap=cm.cmlist['sdoaia131'], origin='lower')
667
  ax.imshow(attention_data, cmap='hot', origin='lower', alpha=0.5,norm=att_norm)
668
- # Plot star at maximum attention location
669
- # ax.plot(max_x, max_y, marker='*', markersize=10, color='cyan',
670
- # markeredgecolor='white', markeredgewidth=1)
671
  ax.set_title(f'AIA {wavs[1]} Å', fontsize=12, fontfamily='Barlow', color='white')
672
  ax.axis('off')
673
 
@@ -687,67 +685,29 @@ class SolarFlareEvaluator:
687
  gt = sxr_window['groundtruth'].values
688
  uncertainties = sxr_window['groundtruth_uncertainty'].values
689
 
690
- # Create upper and lower bounds (assuming uncertainty is standard deviation)
691
- upper_bound = gt + uncertainties
692
- lower_bound = gt - uncertainties
693
-
694
  # Ensure bounds are positive for log scale
695
  lower_bound = np.maximum(lower_bound, 1e-12)
696
 
697
- #sxr_ax.fill_between(sxr_window['timestamp'], lower_bound, upper_bound,
698
- #alpha=0.3, color="#F78E69")
699
-
700
  # Plot model predictions with uncertainty bands
701
  model_label = 'Baseline Model' if self.baseline_only_mode else 'FOXES Model'
702
  model_color = "#94ECBE" if self.baseline_only_mode else "#C0B9DD"
703
- vit_prediction_line = sxr_ax.plot(sxr_window['timestamp'], sxr_window['predictions'],
704
  label=model_label, linewidth=2.5, alpha=1, markersize=5,
705
  color=model_color)
706
 
707
- # Add uncertainty bands for model if available
708
- if 'uncertainty' in sxr_window.columns and sxr_window['uncertainty'].notna().any():
709
- predictions = sxr_window['predictions'].values
710
- uncertainties = sxr_window['uncertainty'].values
711
-
712
- # Create upper and lower bounds (assuming uncertainty is standard deviation)
713
- upper_bound = predictions + uncertainties
714
- lower_bound = predictions - uncertainties
715
-
716
- # Ensure bounds are positive for log scale
717
- lower_bound = np.maximum(lower_bound, 1e-12)
718
-
719
- sxr_ax.fill_between(sxr_window['timestamp'], lower_bound, upper_bound,
720
- alpha=0.3, color=model_color)
721
-
722
- # Plot baseline predictions with uncertainty bands if available and not in baseline-only mode
723
  if not self.baseline_only_mode and 'baseline_predictions' in sxr_window.columns and sxr_window[
724
  'baseline_predictions'].notna().any():
725
  baseline_line = sxr_ax.plot(sxr_window['timestamp'], sxr_window['baseline_predictions'],
726
  label='Baseline Model', linewidth=1.5, alpha=1, markersize=5,
727
  color="#94ECBE")
728
 
729
- # Add uncertainty bands for baseline model if available
730
- if 'baseline_uncertainty' in sxr_window.columns and sxr_window[
731
- 'baseline_uncertainty'].notna().any():
732
- baseline_predictions = sxr_window['baseline_predictions'].values
733
- baseline_uncertainties = sxr_window['baseline_uncertainty'].values
734
-
735
- # Create upper and lower bounds
736
- baseline_upper = baseline_predictions + baseline_uncertainties
737
- baseline_lower = baseline_predictions - baseline_uncertainties
738
-
739
- # Ensure bounds are positive for log scale
740
- baseline_lower = np.maximum(baseline_lower, 1e-12)
741
-
742
- sxr_ax.fill_between(sxr_window['timestamp'], baseline_lower, baseline_upper,
743
- alpha=0.3, color="#94ECBE")
744
-
745
  # Mark current time
746
  if sxr_current is not None:
747
  sxr_ax.axvline(target_time, color='black', linestyle='--',
748
  linewidth=2, alpha=0.4, label='Current Time')
749
 
750
- # Create info text with all available values including uncertainties
751
  model_name = 'Baseline' if self.baseline_only_mode else 'FOXES'
752
  info_lines = ["Current Values:",
753
  f"Ground Truth: {sxr_current['groundtruth']:.2e}",
@@ -812,12 +772,6 @@ class SolarFlareEvaluator:
812
  transform=sxr_ax.transAxes, fontsize=12, fontfamily='Barlow',
813
  horizontalalignment='center', verticalalignment='center')
814
  sxr_ax.set_title('SXR Data Comparison with Uncertainties', fontsize=12, fontfamily='Barlow')
815
- #
816
- # for spine in sxr_ax.spines.values():
817
- # spine.set_color('white')
818
-
819
- #plt.suptitle(f'Timestamp: {timestamp}', fontsize=14)
820
- #plt.tight_layout()
821
  plt.savefig(save_path, dpi=500, facecolor='none',bbox_inches='tight')
822
  plt.close()
823
 
 
622
  return None, None, None
623
 
624
  def generate_frame_worker(self, timestamp):
625
+ """Worker function to generate a single frame"""
626
  try:
627
  print(f"Worker {os.getpid()}: Processing {timestamp}")
628
 
 
665
 
666
  ax.imshow(aia_img, cmap=cm.cmlist['sdoaia131'], origin='lower')
667
  ax.imshow(attention_data, cmap='hot', origin='lower', alpha=0.5,norm=att_norm)
668
+
 
 
669
  ax.set_title(f'AIA {wavs[1]} Å', fontsize=12, fontfamily='Barlow', color='white')
670
  ax.axis('off')
671
 
 
685
  gt = sxr_window['groundtruth'].values
686
  uncertainties = sxr_window['groundtruth_uncertainty'].values
687
 
 
 
 
 
688
  # Ensure bounds are positive for log scale
689
  lower_bound = np.maximum(lower_bound, 1e-12)
690
 
 
 
 
691
  # Plot model predictions with uncertainty bands
692
  model_label = 'Baseline Model' if self.baseline_only_mode else 'FOXES Model'
693
  model_color = "#94ECBE" if self.baseline_only_mode else "#C0B9DD"
694
+ sxr_ax.plot(sxr_window['timestamp'], sxr_window['predictions'],
695
  label=model_label, linewidth=2.5, alpha=1, markersize=5,
696
  color=model_color)
697
 
698
+ # Plot baseline predictions if available and not in baseline-only mode
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
699
  if not self.baseline_only_mode and 'baseline_predictions' in sxr_window.columns and sxr_window[
700
  'baseline_predictions'].notna().any():
701
  baseline_line = sxr_ax.plot(sxr_window['timestamp'], sxr_window['baseline_predictions'],
702
  label='Baseline Model', linewidth=1.5, alpha=1, markersize=5,
703
  color="#94ECBE")
704
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
705
  # Mark current time
706
  if sxr_current is not None:
707
  sxr_ax.axvline(target_time, color='black', linestyle='--',
708
  linewidth=2, alpha=0.4, label='Current Time')
709
 
710
+ # Create info text with all available values
711
  model_name = 'Baseline' if self.baseline_only_mode else 'FOXES'
712
  info_lines = ["Current Values:",
713
  f"Ground Truth: {sxr_current['groundtruth']:.2e}",
 
772
  transform=sxr_ax.transAxes, fontsize=12, fontfamily='Barlow',
773
  horizontalalignment='center', verticalalignment='center')
774
  sxr_ax.set_title('SXR Data Comparison with Uncertainties', fontsize=12, fontfamily='Barlow')
 
 
 
 
 
 
775
  plt.savefig(save_path, dpi=500, facecolor='none',bbox_inches='tight')
776
  plt.close()
777
 
forecasting/inference/evaluation_config.yaml CHANGED
@@ -27,8 +27,8 @@ evaluation:
27
  # interval_minutes: 1
28
 
29
  time_range:
30
- start_time: "2014-08-01T00:00:00"
31
- end_time: "2014-08-31T23:59:00"
32
  interval_minutes: 1
33
 
34
  # Plotting parameters
 
27
  # interval_minutes: 1
28
 
29
  time_range:
30
+ start_time: "2023-08-05T00:00:00"
31
+ end_time: "2023-08-07T23:59:00"
32
  interval_minutes: 1
33
 
34
  # Plotting parameters
forecasting/inference/inference.py CHANGED
@@ -15,7 +15,9 @@ sys.path.insert(0, str(PROJECT_ROOT))
15
 
16
  from forecasting.data_loaders.SDOAIA_dataloader import AIA_GOESDataset
17
  import forecasting.models as models
18
- from forecasting.models.vision_transformer_custom import ViT
 
 
19
  from forecasting.models.linear_and_hybrid import HybridIrradianceModel, LinearIrradianceModel # Add your hybrid and linear model imports
20
  from torch.nn import HuberLoss
21
  from forecasting.training.callback import unnormalize_sxr
@@ -30,58 +32,11 @@ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
30
 
31
  def has_attention_weights(model):
32
  """Check if model supports attention weights"""
33
- return hasattr(model, 'attention') or isinstance(model, ViT)
34
-
35
- #Does not return SXR data or use Dataloader for solo dataset
36
- def evaluate_solo_dataset(model, dataset, batch_size=16, times=None, config_data=None, save_weights=True, input_size = 512, patch_size = 16):
37
- """Optimized generator for SolO dataset without Dataloader"""
38
- model.eval()
39
- supports_attention = has_attention_weights(model) and save_weights
40
-
41
- with torch.no_grad():
42
- for batch_idx, batch in enumerate(dataset):
43
- # Correct unpacking based on your data structure
44
- aia_imgs = batch[0] # Get aia_img from inputs
45
- # Move to device (it's already a tensor)
46
- aia_imgs = aia_imgs.to(device, non_blocking=True)
47
-
48
- # Get model predictions for entire batch
49
- pred = model(aia_imgs)
50
-
51
- # Handle different model output formats
52
- if isinstance(pred, tuple) and len(pred) > 1:
53
- predictions = pred[0] # Shape: [batch_size, ...]
54
- weights = pred[1] if supports_attention else None # Shape: [batch_size, heads, L, S ...]
55
- else:
56
- predictions = pred
57
- weights = None
58
-
59
- # Process entire batch at once for weights if needed
60
- batch_weights = []
61
- if supports_attention and weights is not None:
62
- current_batch_size = predictions.shape[0]
63
- for i in range(current_batch_size):
64
- last_layer_attention = weights[-1][i] # Get i-th item from batch [num_heads, seq_len, seq_len]
65
- avg_attention = last_layer_attention.mean(dim=0) # [seq_len, seq_len]
66
-
67
- cls_attention = avg_attention[0, 1:].cpu() # [num_patches] - 1D array
68
-
69
- grid_h, grid_w = input_size // patch_size, input_size // patch_size # Should be 64, 64
70
-
71
- attention_map = cls_attention.reshape(grid_h, grid_w) # [64, 64]
72
-
73
- batch_weights.append(attention_map.numpy())
74
-
75
- if config_data and 'weight_path' in config_data:
76
- save_batch_weights(batch_weights, batch_idx, batch_size, times, config_data['weight_path'])
77
-
78
- current_batch_size = predictions.shape[0]
79
- for i in range(current_batch_size):
80
- global_idx = batch_idx * batch_size + i
81
- weight_data = batch_weights[i] if (supports_attention and batch_weights) else None
82
- yield (predictions[i].cpu().numpy(),
83
- weight_data, global_idx)
84
 
 
 
 
85
 
86
 
87
  def evaluate_model_on_dataset(model, dataset, batch_size=16, times=None, config_data=None, save_weights=True, input_size = 512, patch_size = 16):
@@ -101,7 +56,10 @@ def evaluate_model_on_dataset(model, dataset, batch_size=16, times=None, config_
101
  aia_imgs = aia_imgs.to(device, non_blocking=True)
102
 
103
  # Get model predictions for entire batch
104
- pred = model(aia_imgs)
 
 
 
105
 
106
  # Handle different model output formats
107
  if isinstance(pred, tuple) and len(pred) > 1:
@@ -115,23 +73,49 @@ def evaluate_model_on_dataset(model, dataset, batch_size=16, times=None, config_
115
  batch_weights = []
116
  if supports_attention and weights is not None:
117
  current_batch_size = predictions.shape[0]
 
 
118
  for i in range(current_batch_size):
119
- # Process attention weights for this item - matching callback approach
120
- #select last layer and appropriate item from batch
121
- last_layer_attention = weights[-1][i] # Get i-th item from batch [num_heads, seq_len, seq_len]
122
- # Average across attention heads
123
- avg_attention = last_layer_attention.mean(dim=0) # [seq_len, seq_len]
124
-
125
- # Get attention from CLS token to patches (exclude CLS->CLS)
126
- cls_attention = avg_attention[0, 1:].cpu() # [num_patches] - 1D array
127
-
128
- # Calculate grid size based on patch size (assuming 8x8 patches)
129
- grid_h, grid_w = input_size // patch_size, input_size // patch_size # Should be 64, 64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
 
131
- # Reshape CLS attention to spatial grid
132
- attention_map = cls_attention.reshape(grid_h, grid_w) # [64, 64]
133
 
134
- batch_weights.append(attention_map.numpy())
 
 
 
 
 
 
 
135
 
136
  # Save all weights in this batch at once
137
  if config_data and 'weight_path' in config_data:
@@ -145,98 +129,6 @@ def evaluate_model_on_dataset(model, dataset, batch_size=16, times=None, config_
145
  yield (predictions[i].cpu().numpy(), sxr[i].cpu().numpy(),
146
  weight_data, global_idx)
147
 
148
- #Evaluate model with batches using mc dropout
149
- def evaluate_model_on_dataset_mc_dropout(model, dataset, batch_size=16, times=None, config_data=None, save_weights=True,
150
- input_size=512, patch_size=16, runs=100, sxr_norm=None):
151
- """Streaming MC Dropout - processes each batch with multiple forward passes without loading all data"""
152
-
153
- loader = DataLoader(dataset, batch_size=batch_size, num_workers=4, pin_memory=True)
154
- supports_attention = has_attention_weights(model) and save_weights
155
-
156
- print(f"Starting streaming MC Dropout with {runs} forward passes per batch...")
157
-
158
- for batch_idx, batch in enumerate(loader):
159
- aia_imgs = batch[0] # Shape: [batch_size, ...]
160
- sxr = batch[1]
161
- aia_imgs = aia_imgs.to(device, non_blocking=True)
162
- current_batch_size = aia_imgs.shape[0]
163
-
164
- if (batch_idx * batch_size) % 100 == 0:
165
- print(
166
- f"Processing batch {batch_idx + 1}, samples {batch_idx * batch_size + 1}-{batch_idx * batch_size + current_batch_size}")
167
-
168
- # Storage for this batch's MC predictions
169
- # Shape: [runs, batch_size, prediction_dims...]
170
- batch_predictions = []
171
- batch_weights = [] if supports_attention else None
172
-
173
- # Perform MC dropout runs for this batch
174
- for run in range(runs):
175
- #Set seed based on run
176
- torch.manual_seed(run) # Ensure different dropout masks for each run
177
-
178
- model.train() # Enable dropout
179
-
180
- with torch.no_grad():
181
- pred = model(aia_imgs)
182
-
183
- if isinstance(pred, tuple) and len(pred) > 1:
184
- predictions = pred[0] # [batch_size, ...]
185
- weights = pred[1] if supports_attention else None
186
- else:
187
- predictions = pred
188
- weights = None
189
-
190
- # Store predictions for this run
191
- batch_predictions.append(predictions.cpu().numpy())
192
-
193
- # Process attention weights for this run
194
- if supports_attention and weights is not None:
195
- run_weights = []
196
- for i in range(current_batch_size):
197
- last_layer_attention = weights[-1][i] # [num_heads, seq_len, seq_len]
198
- avg_attention = last_layer_attention.mean(dim=0) # [seq_len, seq_len]
199
- cls_attention = avg_attention[0, 1:].cpu() # [num_patches]
200
-
201
- grid_h, grid_w = input_size // patch_size, input_size // patch_size
202
- attention_map = cls_attention.reshape(grid_h, grid_w)
203
- run_weights.append(attention_map.numpy())
204
-
205
- if batch_weights is None:
206
- batch_weights = []
207
- batch_weights.append(run_weights) # [runs, batch_size, grid_h, grid_w]
208
-
209
- # Convert to numpy and compute statistics
210
- # batch_predictions: [runs, batch_size, prediction_dims...]
211
- batch_predictions = np.array(batch_predictions)
212
-
213
- # Compute mean and std across runs (axis=0)
214
- # Result shapes: [batch_size, prediction_dims...]
215
- mean_predictions = np.mean(unnormalize_sxr(batch_predictions,sxr_norm=sxr_norm), axis=0)
216
- uncertainties = np.std(unnormalize_sxr(batch_predictions,sxr_norm=sxr_norm), axis=0)
217
-
218
- # Process attention weights if available
219
- mean_weights = None
220
- if supports_attention and batch_weights:
221
- # batch_weights: [runs, batch_size, grid_h, grid_w]
222
- batch_weights = np.array(batch_weights)
223
- # mean_weights: [batch_size, grid_h, grid_w]
224
- mean_weights = np.mean(batch_weights, axis=0)
225
-
226
- # Save weights if required
227
- if config_data and 'weight_path' in config_data:
228
- save_batch_weights(list(mean_weights), batch_idx, batch_size, times, config_data['weight_path'])
229
-
230
- # Yield results for each sample in the batch
231
- for i in range(current_batch_size):
232
- global_idx = batch_idx * batch_size + i
233
- weight_data = mean_weights[i] if mean_weights is not None else None
234
-
235
- yield (mean_predictions[i], # Mean prediction across MC runs
236
- sxr[i].cpu().numpy(), # Ground truth
237
- uncertainties[i], # Uncertainty (std) across MC runs
238
- weight_data, # Mean attention weights
239
- global_idx) # Sample index
240
 
241
  def save_batch_weights(batch_weights, batch_idx, batch_size, times, weight_path):
242
  """Save all weights in a batch efficiently"""
@@ -252,8 +144,9 @@ def save_batch_weights(batch_weights, batch_idx, batch_size, times, weight_path)
252
  save_args = []
253
  for i, weight in enumerate(batch_weights):
254
  global_idx = batch_idx * batch_size + i
255
- if global_idx < len(times): # Make sure we don't go out of bounds
256
- filepath = weight_path + f"{times[global_idx]}"
 
257
  save_args.append((weight, filepath))
258
 
259
  # Save all weights in this batch in parallel
@@ -283,7 +176,11 @@ def load_model_from_config(config_data):
283
  if ".ckpt" in checkpoint_path:
284
  # Lightning checkpoint format
285
  if model_type.lower() == 'vit':
286
- model = ViT.load_from_checkpoint(checkpoint_path)
 
 
 
 
287
  elif model_type.lower() == 'hybrid' or model_type.lower() == 'hybridirradiancemodel':
288
  # Try to load with saved hyperparameters first, then fall back to config parameters
289
  try:
@@ -427,113 +324,51 @@ def main():
427
 
428
  print(f"Processing {total_samples} samples with batch size {batch_size}...")
429
 
430
- if config_data['mc']['active'] == "false":
431
- print("Running inference without MC Dropout")
432
- for prediction, sxr, weight, idx in evaluate_model_on_dataset(
433
- model, dataset, batch_size, times, config_data, save_weights, input_size, patch_size
434
- ):
435
- # Unnormalize prediction
436
  pred = unnormalize_sxr(prediction, sxr_norm)
 
 
437
 
438
- # Store results
439
- predictions.append(pred.item() if hasattr(pred, 'item') else float(pred))
440
- ground.append(sxr.item() if hasattr(sxr, 'item') else float(sxr))
441
- timestamp.append(str(times[idx]))
442
 
443
- # Progress update
444
- if (idx + 1) % 50 == 0:
445
- print(f"Processed {idx + 1}/{total_samples}")
446
 
447
- if save_weights:
448
- print("All weights saved during batch processing!")
449
- else:
450
- print("Inference completed (no weights saved)!")
451
-
452
- # Create and save results DataFrame
453
- print("Creating output DataFrame...")
454
- output_df = pd.DataFrame({
455
- 'timestamp': timestamp,
456
- 'predictions': predictions,
457
- 'groundtruth': ground
458
- })
459
-
460
- print(output_df.head())
461
- #Make output directory if it doesn't exist
462
- output_dir = Path(config_data['output_path']).parent
463
- output_dir.mkdir(parents=True, exist_ok=True)
464
- output_df.to_csv(config_data['output_path'], index=False)
465
- print(f"Predictions saved to {config_data['output_path']}")
466
  else:
467
- print("Running inference with MC Dropout")
468
- if config_data['mc']['active'] == "false":
469
- print("Running inference without MC Dropout")
470
- for prediction, sxr, weight, idx in evaluate_model_on_dataset(
471
- model, dataset, batch_size, times, config_data, save_weights, input_size, patch_size
472
- ):
473
- # Unnormalize prediction
474
- pred = unnormalize_sxr(prediction, sxr_norm)
475
-
476
- # Store results
477
- predictions.append(pred.item() if hasattr(pred, 'item') else float(pred))
478
- ground.append(sxr.item() if hasattr(sxr, 'item') else float(sxr))
479
- timestamp.append(str(times[idx]))
480
-
481
- # Progress update
482
- if (idx + 1) % 50 == 0:
483
- print(f"Processed {idx + 1}/{total_samples}")
484
-
485
- # Create and save results DataFrame
486
- print("Creating output DataFrame...")
487
- output_df = pd.DataFrame({
488
- 'timestamp': timestamp,
489
- 'predictions': predictions,
490
- 'groundtruth': ground
491
- })
492
-
493
- else:
494
- #print("Running inference with MC Dropout")
495
- uncertainties = [] # Add this to store uncertainties
496
- mc_runs = config_data['mc']['runs'] # Allow configurable MC runs
497
-
498
- # Choose between batch processing or single-sample processing
499
- # Use single-sample for very large datasets or memory constraints
500
-
501
- print(f"Using batch MC Dropout with {mc_runs} runs per batch")
502
- mc_generator = evaluate_model_on_dataset_mc_dropout(
503
- model, dataset, batch_size, times, config_data, save_weights,
504
- input_size, patch_size, runs=mc_runs, sxr_norm=sxr_norm
505
- )
506
-
507
- for prediction, sxr, uncertainty, weight, idx in mc_generator:
508
- # Unnormalize prediction and uncertainty
509
- #pred = unnormalize_sxr(prediction, sxr_norm)
510
- #unc = unnormalize_sxr(uncertainty, sxr_norm)
511
-
512
- # Store results
513
- predictions.append(prediction.item() if hasattr(prediction, 'item') else float(prediction))
514
- ground.append(sxr.item() if hasattr(sxr, 'item') else float(sxr))
515
- uncertainties.append(uncertainty.item() if hasattr(uncertainty, 'item') else float(uncertainty))
516
- timestamp.append(str(times[idx]))
517
-
518
- # Progress update
519
- if (idx + 1) % 50 == 0:
520
- print(f"Processed {idx + 1}/{total_samples}")
521
-
522
- # Create and save results DataFrame with uncertainty
523
- print("Creating output DataFrame with uncertainty...")
524
- output_df = pd.DataFrame({
525
- 'timestamp': timestamp,
526
- 'predictions': predictions,
527
- 'groundtruth': ground,
528
- 'uncertainty': uncertainties # Add uncertainty column
529
- })
530
-
531
- print(output_df.head())
532
- # Make output directory if it doesn't exist
533
- output_dir = Path(config_data['output_path']).parent
534
- output_dir.mkdir(parents=True, exist_ok=True)
535
- output_df.to_csv(config_data['output_path'], index=False)
536
- print(f"Predictions saved to {config_data['output_path']}")
537
 
538
 
539
  if __name__ == '__main__':
 
15
 
16
  from forecasting.data_loaders.SDOAIA_dataloader import AIA_GOESDataset
17
  import forecasting.models as models
18
+ from forecasting.models.vision_transformer_custom import ViT as ViTCustom
19
+ from forecasting.models.vit_patch_model import ViT as ViTPatch
20
+ from forecasting.models.vit_patch_model_local import ViTLocal
21
  from forecasting.models.linear_and_hybrid import HybridIrradianceModel, LinearIrradianceModel # Add your hybrid and linear model imports
22
  from torch.nn import HuberLoss
23
  from forecasting.training.callback import unnormalize_sxr
 
32
 
33
  def has_attention_weights(model):
34
  """Check if model supports attention weights"""
35
+ return hasattr(model, 'attention') or isinstance(model, ViTCustom) or isinstance(model, ViTPatch) or isinstance(model, ViTLocal)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
+ def is_localized_attention_model(model):
38
+ """Check if model uses localized attention (no CLS token)"""
39
+ return isinstance(model, ViTLocal)
40
 
41
 
42
  def evaluate_model_on_dataset(model, dataset, batch_size=16, times=None, config_data=None, save_weights=True, input_size = 512, patch_size = 16):
 
56
  aia_imgs = aia_imgs.to(device, non_blocking=True)
57
 
58
  # Get model predictions for entire batch
59
+ if supports_attention:
60
+ pred = model(aia_imgs, return_attention=True)
61
+ else:
62
+ pred = model(aia_imgs)
63
 
64
  # Handle different model output formats
65
  if isinstance(pred, tuple) and len(pred) > 1:
 
73
  batch_weights = []
74
  if supports_attention and weights is not None:
75
  current_batch_size = predictions.shape[0]
76
+ is_localized = is_localized_attention_model(model)
77
+
78
  for i in range(current_batch_size):
79
+ try:
80
+ # Process attention weights for this item
81
+ last_layer_attention = weights[-1][i] # Get i-th item from batch [num_heads, seq_len, seq_len]
82
+
83
+ # Check for None or invalid values
84
+ if last_layer_attention is None:
85
+ print(f"Warning: last_layer_attention is None for sample {i}")
86
+ continue
87
+
88
+ # Average across attention heads
89
+ avg_attention = last_layer_attention.mean(dim=0) # [seq_len, seq_len]
90
+
91
+ # Check for NaN or invalid values
92
+ if torch.isnan(avg_attention).any():
93
+ print(f"Warning: NaN values in avg_attention for sample {i}")
94
+ continue
95
+
96
+ if is_localized:
97
+ # For ViTLocal (no CLS token), create attention map by averaging attention TO each patch
98
+ # This gives us how much each patch is "attended to" by its neighbors
99
+ patch_attention = avg_attention.mean(dim=0).cpu() # [num_patches] - average attention received by each patch
100
+ else:
101
+ # For regular ViT (with CLS token), get attention from CLS token to patches
102
+ cls_attention = avg_attention[0, 1:].cpu() # [num_patches] - CLS token attention to patches
103
+ patch_attention = cls_attention
104
+
105
+ # Calculate grid size based on patch size
106
+ grid_h, grid_w = input_size // patch_size, input_size // patch_size
107
 
108
+ # Reshape patch attention to spatial grid
109
+ attention_map = patch_attention.reshape(grid_h, grid_w)
110
 
111
+ batch_weights.append(attention_map.numpy())
112
+
113
+ except Exception as e:
114
+ print(f"Error processing attention weights for sample {i}: {e}")
115
+ # Add a zero attention map as fallback
116
+ grid_h, grid_w = input_size // patch_size, input_size // patch_size
117
+ fallback_map = torch.zeros(grid_h * grid_w).reshape(grid_h, grid_w).numpy()
118
+ batch_weights.append(fallback_map)
119
 
120
  # Save all weights in this batch at once
121
  if config_data and 'weight_path' in config_data:
 
129
  yield (predictions[i].cpu().numpy(), sxr[i].cpu().numpy(),
130
  weight_data, global_idx)
131
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
  def save_batch_weights(batch_weights, batch_idx, batch_size, times, weight_path):
134
  """Save all weights in a batch efficiently"""
 
144
  save_args = []
145
  for i, weight in enumerate(batch_weights):
146
  global_idx = batch_idx * batch_size + i
147
+ if global_idx < len(times):# Make sure we don't go out of bounds
148
+ #Save to weight path using os join
149
+ filepath = os.path.join(weight_path, f"{times[global_idx]}")
150
  save_args.append((weight, filepath))
151
 
152
  # Save all weights in this batch in parallel
 
176
  if ".ckpt" in checkpoint_path:
177
  # Lightning checkpoint format
178
  if model_type.lower() == 'vit':
179
+ model = ViTCustom.load_from_checkpoint(checkpoint_path)
180
+ elif model_type.lower() == 'vitpatch':
181
+ model = ViTPatch.load_from_checkpoint(checkpoint_path)
182
+ elif model_type.lower() == 'vitlocal':
183
+ model = ViTLocal.load_from_checkpoint(checkpoint_path)
184
  elif model_type.lower() == 'hybrid' or model_type.lower() == 'hybridirradiancemodel':
185
  # Try to load with saved hyperparameters first, then fall back to config parameters
186
  try:
 
324
 
325
  print(f"Processing {total_samples} samples with batch size {batch_size}...")
326
 
327
+ print("Running inference...")
328
+ for prediction, sxr, weight, idx in evaluate_model_on_dataset(
329
+ model, dataset, batch_size, times, config_data, save_weights, input_size, patch_size
330
+ ):
331
+ # Unnormalize prediction only if not ViTPatch / ViTLocal
332
+ if not isinstance(model, ViTPatch) and not isinstance(model, ViTLocal):
333
  pred = unnormalize_sxr(prediction, sxr_norm)
334
+ else:
335
+ pred = prediction
336
 
337
+ # Store results
338
+ predictions.append(pred.item() if hasattr(pred, 'item') else float(pred))
339
+ ground.append(sxr.item() if hasattr(sxr, 'item') else float(sxr))
340
+ timestamp.append(str(times[idx]))
341
 
342
+ # Progress update
343
+ if (idx + 1) % 50 == 0:
344
+ print(f"Processed {idx + 1}/{total_samples}")
345
 
346
+ if save_weights:
347
+ print("All weights saved during batch processing!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
348
  else:
349
+ print("Inference completed (no weights saved)!")
350
+
351
+ # Create and save results DataFrame
352
+ print("Creating output DataFrame...")
353
+ output_df = pd.DataFrame({
354
+ 'timestamp': timestamp,
355
+ 'predictions': predictions,
356
+ 'groundtruth': ground
357
+ })
358
+
359
+ print(output_df.head())
360
+ #Make output directory if it doesn't exist
361
+ output_dir = Path(config_data['output_path']).parent
362
+ output_dir.mkdir(parents=True, exist_ok=True)
363
+ output_df.to_csv(config_data['output_path'], index=False)
364
+ print(f"Predictions saved to {config_data['output_path']}")
365
+
366
+ print(output_df.head())
367
+ # Make output directory if it doesn't exist
368
+ output_dir = Path(config_data['output_path']).parent
369
+ output_dir.mkdir(parents=True, exist_ok=True)
370
+ output_df.to_csv(config_data['output_path'], index=False)
371
+ print(f"Predictions saved to {config_data['output_path']}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
372
 
373
 
374
  if __name__ == '__main__':
forecasting/inference/inference_config.yaml DELETED
@@ -1,45 +0,0 @@
1
- # Base directories - change these to switch datasets
2
- base_data_dir: "/mnt/data/ML-READY/" # Change this line for different datasets
3
- output_path: "${base_data_dir}/output/baseline-model-more-complex-STEREO.csv"
4
- weight_path: "${base_data_dir}/baseline-model/"
5
- mc:
6
- active: "false"
7
- runs: 5
8
- # Enable or disable MC Dropout
9
- # Data paths (automatically constructed from base directories)
10
- Stereo: "false"
11
- Stereo_data:
12
- stereo_img_dir: "/mnt/data/ML-Ready-mixed/STEREO_processed"
13
- sxr_dir: "/mnt/data/ML-Ready-mixed/ML-Ready-mixed/SXR"
14
- sxr_norm_path: "/mnt/data/ML-READY/SXR/normalized_sxr.npy"
15
- SolO: "false"
16
- SolO_data:
17
- solo_img_dir: "/mnt/data/ML-Ready_clean/SolO/SolO/ML-Ready-SolO"
18
- sxr_dir: "${base_data_dir}/SXR"
19
- sxr_norm_path: "${base_data_dir}/SolO/SXR/normalized_sxr.npy"
20
-
21
- model: "hybrid" # Options: "vit", "hybrid"
22
- wavelengths: [171, 193, 211, 304] # AIA wavelengths in Angstroms
23
-
24
- # Model parameters
25
- model_params:
26
- input_size: 512
27
- patch_size: 16
28
- batch_size: 100
29
- no_weights: false # Set to true to skip saving attention weights
30
-
31
- megsai:
32
- cnn_model: "updated" # Must match the training config
33
- cnn_dp: 0.2
34
-
35
- data:
36
- aia_dir:
37
- "${base_data_dir}/AIA"
38
- sxr_dir:
39
- "${base_data_dir}/SXR"
40
- sxr_norm_path:
41
- "/mnt/data/ML-READY/SXR/normalized_sxr.npy"
42
- checkpoint_path:
43
- "/mnt/data/COMBINED/new-checkpoint/vit-patch-model-2d-embeddings-claude-suggested-weights-epoch=30-val_total_loss=0.0385.ckpt"
44
-
45
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
forecasting/inference/inference_on_patch_config.yaml DELETED
@@ -1,32 +0,0 @@
1
- base_data_dir: "/mnt/data/COMBINED/" # Change this line for different datasets
2
- output_path: "${base_data_dir}/output/rs.csv"
3
- weight_path: "${base_data_dir}/rs_weights/"
4
- flux_path: "${base_data_dir}/rs_flux/"
5
- mc:
6
- active: "false"
7
- runs: 5
8
- # Enable or disable MC Dropout
9
- # Data paths (automatically constructed from base directories)
10
- Stereo: "false"
11
- Stereo_data:
12
- stereo_img_dir: "/mnt/data/ML-Ready-mixed/STEREO_processed"
13
- sxr_dir: "/mnt/data/ML-Ready-mixed/ML-Ready-mixed/SXR"
14
- sxr_norm_path: "/mnt/data/ML-Ready-mixed/ML-Ready-mixed/SXR/normalized_sxr.npy"
15
- SolO: "false"
16
- SolO_data:
17
- solo_img_dir: "/mnt/data/ML-Ready_clean/SolO/SolO/ML-Ready-SolO"
18
- sxr_dir: "${base_data_dir}/SXR"
19
- sxr_norm_path: "${base_data_dir}/SolO/SXR/normalized_sxr.npy"
20
- model: "vit" # Options: "cnn", "vit", "ViT Custom"
21
- wavelengths: [94,131,171, 193, 211, 304] # AIA wavelengths in Angstroms
22
- data:
23
- aia_dir:
24
- "${base_data_dir}/AIA-SPLIT"
25
- sxr_dir:
26
- "${base_data_dir}/SXR-SPLIT"
27
- sxr_norm_path:
28
- "${base_data_dir}/SXR-SPLIT/normalized_sxr.npy"
29
- checkpoint_path:
30
- "/mnt/data/COMBINED/new-checkpoint/vit-patch-model-2d-embeddings-reduced-sensitivity-epoch=42-val_total_loss=0.0393.ckpt"
31
-
32
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
forecasting/inference/patch_analysis_config.yaml DELETED
@@ -1,42 +0,0 @@
1
- # Analysis configuration
2
- base_data_dir: "/mnt/data/COMBINED"
3
- output_path: "${base_data_dir}/output/patch.csv"
4
- aia_path: "${base_data_dir}/AIA-SPLIT/train/"
5
- weight_path: "${base_data_dir}/patch_weights/"
6
- flux_path: "${base_data_dir}/patch_flux/"
7
- attention_path: "${base_data_dir}/patch_attention/"
8
-
9
- data:
10
- aia_dir:
11
- "${base_data_dir}/AIA-SPLIT"
12
- sxr_dir:
13
- "${base_data_dir}/SXR-SPLIT"
14
- sxr_norm_path:
15
- "${base_data_dir}/SXR-SPLIT/normalized_sxr.npy"
16
-
17
-
18
-
19
- analysis:
20
- # Time period selection for testing analysis
21
- time_period:
22
- start_time: "2023-08-05 00:00:00" # Start time for analysis
23
- end_time: "2023-08-06 00:00:00" # End time for analysis
24
- # Set to null to analyze all available data
25
- # start_time: null
26
- # end_time: null
27
-
28
- # Flare detection parameters
29
- flare_detection:
30
- threshold_percentile: 97.0
31
- min_patches: 2
32
- max_patches: 50
33
- simultaneous_flare_threshold: 0.000005 # Threshold for detecting simultaneous flares
34
-
35
- # Output configuration
36
- output:
37
- output_dir: "${base_data_dir}/flux_analysis_output"
38
- create_visualizations: true
39
- max_visualizations: 100
40
- visualization_threshold: 0.00005 # Only save figures for predictions above this threshold (5e-5)
41
- create_movie: true
42
- movie_fps: 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
forecasting/models/vit_patch_model_local.py CHANGED
@@ -200,8 +200,7 @@ class VisionTransformerLocal(nn.Module):
200
  B, T, _ = x.shape
201
  x = self.input_layer(x)
202
 
203
- # Add CLS token and positional encoding
204
- #x = x + self.pos_embedding[:, : T + 1]
205
  x = self._add_2d_positional_encoding(x)
206
 
207
  # Apply Transformer blocks
@@ -237,7 +236,7 @@ class VisionTransformerLocal(nn.Module):
237
  def _add_2d_positional_encoding(self, x):
238
  """Add learned 2D positional encoding to patch embeddings"""
239
  B, T, embed_dim = x.shape
240
- num_patches = T # Exclude CLS token
241
 
242
  # Reshape patches to 2D grid: [B, grid_h, grid_w, embed_dim]
243
  patch_embeddings = x.reshape(B, self.grid_h, self.grid_w, embed_dim)
@@ -247,9 +246,9 @@ class VisionTransformerLocal(nn.Module):
247
  patch_embeddings = patch_embeddings + self.pos_embedding_2d
248
 
249
  # Reshape back to sequence format: [B, num_patches, embed_dim]
250
- patch_embeddings = patch_embeddings.reshape(B, num_patches, embed_dim)
251
 
252
- return patch_embeddings
253
 
254
  def forward_for_callback(self, x, return_attention=True):
255
  """Forward method compatible with AttentionMapCallback"""
@@ -329,7 +328,10 @@ class LocalAttentionBlock(nn.Module):
329
  num_patches = self.num_patches # 32x32 patches
330
  grid_size = int(math.sqrt(num_patches))
331
 
 
332
  mask = torch.zeros(num_patches, num_patches)
 
 
333
  for i in range(num_patches):
334
  row_i, col_i = i // grid_size, i % grid_size
335
  for j in range(num_patches):
 
200
  B, T, _ = x.shape
201
  x = self.input_layer(x)
202
 
203
+ # Add positional encoding (no CLS token for local attention)
 
204
  x = self._add_2d_positional_encoding(x)
205
 
206
  # Apply Transformer blocks
 
236
  def _add_2d_positional_encoding(self, x):
237
  """Add learned 2D positional encoding to patch embeddings"""
238
  B, T, embed_dim = x.shape
239
+ num_patches = T # All tokens are patches (no CLS token)
240
 
241
  # Reshape patches to 2D grid: [B, grid_h, grid_w, embed_dim]
242
  patch_embeddings = x.reshape(B, self.grid_h, self.grid_w, embed_dim)
 
246
  patch_embeddings = patch_embeddings + self.pos_embedding_2d
247
 
248
  # Reshape back to sequence format: [B, num_patches, embed_dim]
249
+ x = patch_embeddings.reshape(B, num_patches, embed_dim)
250
 
251
+ return x
252
 
253
  def forward_for_callback(self, x, return_attention=True):
254
  """Forward method compatible with AttentionMapCallback"""
 
328
  num_patches = self.num_patches # 32x32 patches
329
  grid_size = int(math.sqrt(num_patches))
330
 
331
+ # Create mask for patches only: [num_patches, num_patches]
332
  mask = torch.zeros(num_patches, num_patches)
333
+
334
+ # Patches can only attend to nearby patches
335
  for i in range(num_patches):
336
  row_i, col_i = i // grid_size, i % grid_size
337
  for j in range(num_patches):