griffingoodwin04 committed 23 days ago

Commit

ec2b4e7

1 Parent(s): 10ad6fc

Refactor pipeline configuration and update data processing scripts

Browse files

Files changed (18) hide show

.gitignore +1 -1
.dockerignore → Untracked/.dockerignore +1 -1
QUICKSTART_DOCKER.md → Untracked/QUICKSTART_DOCKER.md +0 -0
data/align_data.py +10 -29
data/euv_data_cleaning.py +48 -62
data/iti_data_processing.py +61 -81
data/pipeline_config.py +106 -0
data/pipeline_config.yaml +47 -0
data/process_data_pipeline.py +5 -13
data/sxr_data_processing.py +29 -29
download/download_sdo.py +27 -50
download/sxr_downloader.py +1 -3
forecasting/data_loaders/SDOAIA_dataloader.py +13 -3
forecasting/training/train.py +122 -212
forecasting/training/train_config.yaml +5 -5
pipeline_config.yaml +87 -0
requirements.txt +1 -0
run_pipeline.py +364 -0

.gitignore CHANGED Viewed

@@ -154,4 +154,4 @@ wandb/
 *.code-workspace
 .claude/
-misc/

 *.code-workspace
 .claude/
+Untracked/

.dockerignore → Untracked/.dockerignore RENAMED Viewed

@@ -24,7 +24,7 @@ venv.bak/
 # IDE
 .vscode/
-.idea/
 *.swp
 *.swo
 *~

 # IDE
 .vscode/
+../.idea/
 *.swp
 *.swo
 *~

QUICKSTART_DOCKER.md → Untracked/QUICKSTART_DOCKER.md RENAMED Viewed

File without changes

data/align_data.py CHANGED Viewed

@@ -38,9 +38,8 @@ def load_config():
         'alignment': {
             'goes_data_dir': "/mnt/data/PAPER/GOES-timespan/combined",
             'aia_processed_dir': "/mnt/data/PAPER/SDOITI",
-            'output_sxr_a_dir': "/mnt/data/PAPER/GOES-SXR-A",
-            'output_sxr_b_dir': "/mnt/data/PAPER/GOES-SXR-B",
-            'aia_missing_dir': "/mnt/data/PAPER/AIA_ITI_MISSING"
         },
         'processing': {
             'batch_size_multiplier': 4,
@@ -56,8 +55,7 @@ GOES_DATA_DIR = config['alignment']['goes_data_dir']
 AIA_PROCESSED_DIR = config['alignment']['aia_processed_dir']
 # Output directories
-OUTPUT_SXR_A_DIR = config['alignment']['output_sxr_a_dir']
-OUTPUT_SXR_B_DIR = config['alignment']['output_sxr_b_dir']
 AIA_MISSING_DIR = config['alignment']['aia_missing_dir']
 # Processing configuration
@@ -136,7 +134,6 @@ def create_combined_lookup_table(goes_data_dict, target_timestamps):
     # For each target timestamp, average over all available instruments at that time
     for target_time in tqdm(target_times, desc="Building lookup table"):
-        sxr_a_values = []
         sxr_b_values = []
         available_instruments = []
@@ -144,22 +141,15 @@ def create_combined_lookup_table(goes_data_dict, target_timestamps):
             goes_data = goes_data_dict[g_number]
             if target_time in goes_data.index:
                 row = goes_data.loc[target_time]
-                sxr_a = row['xrsa_flux']
                 sxr_b = row['xrsb_flux']
-                # Only care about xrsb_flux for validity
                 if not pd.isna(sxr_b):
                     sxr_b_values.append(float(sxr_b))
-                    if not pd.isna(sxr_a):
-                        sxr_a_values.append(float(sxr_a))
                     available_instruments.append(f"GOES-{g_number}")
         if sxr_b_values:
-            avg_sxr_b = float(np.mean(sxr_b_values))
-            avg_sxr_a = float(np.mean(sxr_a_values)) if sxr_a_values else float('nan')
             lookup_data.append({
                 'timestamp': target_time.strftime('%Y-%m-%dT%H:%M:%S'),
-                'sxr_a': avg_sxr_a,
-                'sxr_b': avg_sxr_b,
                 'instrument': ",".join(available_instruments)
             })
@@ -179,21 +169,14 @@ def process_batch(batch_data):
     for data in batch_data:
         try:
             timestamp = data['timestamp']
-            sxr_a = data['sxr_a']
             sxr_b = data['sxr_b']
             instrument = data['instrument']
-            # Create arrays
-            sxr_a_data = np.array([sxr_a], dtype=np.float32)
-            sxr_b_data = np.array([sxr_b], dtype=np.float32)
-            # Save data to disk using configured directories
-            np.save(f"{OUTPUT_SXR_A_DIR}/{timestamp}.npy", sxr_a_data)
-            np.save(f"{OUTPUT_SXR_B_DIR}/{timestamp}.npy", sxr_b_data)
             successful_count += 1
             results.append((timestamp, True, f"Success using {instrument}"))
         except Exception as e:
             failed_count += 1
             results.append((timestamp, False, f"Error processing timestamp {timestamp}: {e}"))
@@ -213,14 +196,12 @@ def main():
     print("=" * 60)
     print(f"GOES data directory: {GOES_DATA_DIR}")
     print(f"AIA processed directory: {AIA_PROCESSED_DIR}")
-    print(f"Output SXR-A directory: {OUTPUT_SXR_A_DIR}")
-    print(f"Output SXR-B directory: {OUTPUT_SXR_B_DIR}")
     print(f"AIA missing directory: {AIA_MISSING_DIR}")
     print("=" * 60)
     # Make output directories if they don't exist
-    os.makedirs(OUTPUT_SXR_A_DIR, exist_ok=True)
-    os.makedirs(OUTPUT_SXR_B_DIR, exist_ok=True)
     os.makedirs(AIA_MISSING_DIR, exist_ok=True)
     # Load and prepare GOES data with optimizations

         'alignment': {
             'goes_data_dir': "/mnt/data/PAPER/GOES-timespan/combined",
             'aia_processed_dir': "/mnt/data/PAPER/SDOITI",
+            'output_sxr_dir':  "/Volumes/T9/Data_FOXES/SXR_processed",
+            'aia_missing_dir': "/Volumes/T9/Data_FOXES/AIA_missing"
         },
         'processing': {
             'batch_size_multiplier': 4,
 AIA_PROCESSED_DIR = config['alignment']['aia_processed_dir']
 # Output directories
+OUTPUT_SXR_DIR = config['alignment']['output_sxr_dir']
 AIA_MISSING_DIR = config['alignment']['aia_missing_dir']
 # Processing configuration
     # For each target timestamp, average over all available instruments at that time
     for target_time in tqdm(target_times, desc="Building lookup table"):
         sxr_b_values = []
         available_instruments = []
             goes_data = goes_data_dict[g_number]
             if target_time in goes_data.index:
                 row = goes_data.loc[target_time]
                 sxr_b = row['xrsb_flux']
                 if not pd.isna(sxr_b):
                     sxr_b_values.append(float(sxr_b))
                     available_instruments.append(f"GOES-{g_number}")
         if sxr_b_values:
             lookup_data.append({
                 'timestamp': target_time.strftime('%Y-%m-%dT%H:%M:%S'),
+                'sxr_b': float(np.mean(sxr_b_values)),
                 'instrument': ",".join(available_instruments)
             })
     for data in batch_data:
         try:
             timestamp = data['timestamp']
             sxr_b = data['sxr_b']
             instrument = data['instrument']
+            np.save(f"{OUTPUT_SXR_DIR}/{timestamp}.npy", np.array([sxr_b], dtype=np.float32))
             successful_count += 1
             results.append((timestamp, True, f"Success using {instrument}"))
         except Exception as e:
             failed_count += 1
             results.append((timestamp, False, f"Error processing timestamp {timestamp}: {e}"))
     print("=" * 60)
     print(f"GOES data directory: {GOES_DATA_DIR}")
     print(f"AIA processed directory: {AIA_PROCESSED_DIR}")
+    print(f"Output SXR directory: {OUTPUT_SXR_DIR}")
     print(f"AIA missing directory: {AIA_MISSING_DIR}")
     print("=" * 60)
     # Make output directories if they don't exist
+    os.makedirs(OUTPUT_SXR_DIR, exist_ok=True)
     os.makedirs(AIA_MISSING_DIR, exist_ok=True)
     # Load and prepare GOES data with optimizations

data/euv_data_cleaning.py CHANGED Viewed

@@ -14,36 +14,18 @@ collections.MutableMapping = collections.abc.MutableMapping
 from itipy.data.dataset import get_intersecting_files
 from astropy.io import fits
-# Configuration for all wavelengths to process
-# Load configuration from environment or use defaults
-import os
 import json
 def load_config():
     """Load configuration from environment or use defaults."""
-    if 'PIPELINE_CONFIG' in os.environ:
-        try:
-            config = json.loads(os.environ['PIPELINE_CONFIG'])
-            return config
-        except:
-            pass
-    # Default configuration
-    return {
-        'euv': {
-            'wavelengths': [94, 131, 171, 193, 211, 304],
-            'input_folder': '/mnt/data/PAPER/SDOData',
-            'bad_files_dir': '/mnt/data/PAPER/SDO-AIA_bad'
-        }
-    }
-config = load_config()
-wavelengths = config['euv']['wavelengths']
-base_input_folder = config['euv']['input_folder']
-aia_files = get_intersecting_files(base_input_folder, wavelengths)
-# Function to process a single file
 def process_fits_file(file_path):
     try:
         with fits.open(file_path) as hdu:
@@ -59,39 +41,43 @@ def process_fits_file(file_path):
         print(f"Error processing {file_path}: {e}")
         return None
-file_list = aia_files[0]  # List of FITS file paths
-with Pool(processes=os.cpu_count()) as pool:
-    results = list(tqdm(pool.imap(process_fits_file, file_list), total=len(file_list)))
-# Filter out None results (in case of failed files)
-results = [r for r in results if r is not None]
-# Convert to DataFrame
-aia_header = pd.DataFrame(results)
-# Ensure DATE-OBS is datetime (already timezone-naive from processing)
-aia_header['DATE-OBS'] = pd.to_datetime(aia_header['DATE-OBS'])
-# add a column for date difference between DATE-OBS and FILENAME
-aia_header['DATE_DIFF'] = (
-            pd.to_datetime(aia_header['FILENAME']) - pd.to_datetime(aia_header['DATE-OBS'])).dt.total_seconds()
-# remove rows where DATE_DIFF is greater than plus or minus 60 seconds in a list
-files_to_remove = aia_header[(aia_header['DATE_DIFF'] <= -60) | (aia_header['DATE_DIFF'] >= 60)]
-print(len(files_to_remove))
-# Loop through each wavelength
-for wavelength in wavelengths:
-    #print(f"\nProcessing wavelength: {wavelength}")
-    for names in files_to_remove['FILENAME'].to_numpy():
-        # Construct file path
-        filename = pd.to_datetime(names).strftime('%Y-%m-%dT%H:%M:%S') + ".fits"
-        file_path = os.path.join(base_input_folder, f"{wavelength}/{filename}")
-        # Destination path
-        destination_folder = os.path.join(config['euv']['bad_files_dir'], str(wavelength))
-        os.makedirs(destination_folder, exist_ok=True)
-        # Move or report missing
-        if os.path.exists(file_path):
-            shutil.move(file_path, destination_folder)
-            print(f"Removed file: {file_path}")
-        else:
-            print(f"File not found: {file_path}")

 from itipy.data.dataset import get_intersecting_files
 from astropy.io import fits
 import json
 def load_config():
+    """Load the pipeline configuration from the PIPELINE_CONFIG environment variable.
+
+    PIPELINE_CONFIG is expected to hold the whole configuration as a JSON
+    string. There are no built-in defaults in this function, so a missing or
+    malformed variable is reported here instead of being swallowed and
+    surfacing later as a TypeError on ``config['euv']``.
+
+    Returns:
+        The decoded configuration object.
+
+    Raises:
+        RuntimeError: If PIPELINE_CONFIG is unset or is not valid JSON.
+    """
+    raw_config = os.environ.get('PIPELINE_CONFIG')
+    if raw_config is None:
+        raise RuntimeError("PIPELINE_CONFIG environment variable is not set")
+    try:
+        return json.loads(raw_config)
+    except json.JSONDecodeError as e:
+        raise RuntimeError(f"PIPELINE_CONFIG is not valid JSON: {e}") from e
 def process_fits_file(file_path):
     try:
         with fits.open(file_path) as hdu:
         print(f"Error processing {file_path}: {e}")
         return None
+if __name__ == '__main__':
+    # Script entry point: find AIA FITS files whose filename timestamp
+    # disagrees with the DATE-OBS value and move them to the bad-files directory.
+    config = load_config()
+    wavelengths = config['euv']['wavelengths']
+    base_input_folder = config['euv']['input_folder']
+    aia_files = get_intersecting_files(base_input_folder, wavelengths)
+    # Only the first entry of aia_files is scanned; the per-wavelength paths
+    # are rebuilt from the flagged timestamps further down.
+    file_list = aia_files[0]  # List of FITS file paths
+    with Pool(processes=os.cpu_count()) as pool:
+        results = list(tqdm(pool.imap(process_fits_file, file_list), total=len(file_list)))
+    # Filter out None results (in case of failed files)
+    results = [r for r in results if r is not None]
+    # Convert to DataFrame
+    # NOTE(review): if every file failed, `results` is empty and the column
+    # lookups below would presumably raise KeyError - confirm whether that can happen.
+    aia_header = pd.DataFrame(results)
+    aia_header['DATE-OBS'] = pd.to_datetime(aia_header['DATE-OBS'])
+    # Add a column with the difference, in seconds, between FILENAME (parsed as
+    # a datetime) and DATE-OBS
+    aia_header['DATE_DIFF'] = (
+        pd.to_datetime(aia_header['FILENAME']) - pd.to_datetime(aia_header['DATE-OBS'])
+    ).dt.total_seconds()
+    # Select rows where DATE_DIFF is 60 seconds or more in either direction
+    files_to_remove = aia_header[(aia_header['DATE_DIFF'] <= -60) | (aia_header['DATE_DIFF'] >= 60)]
+    print(f"{len(files_to_remove)} bad files found")
+    # Every flagged timestamp is looked up under each wavelength sub-folder.
+    for wavelength in wavelengths:
+        print(f"\nProcessing wavelength: {wavelength}")
+        for names in files_to_remove['FILENAME'].to_numpy():
+            # Rebuild the expected file name from the flagged timestamp
+            filename = pd.to_datetime(names).strftime('%Y-%m-%dT%H:%M:%S') + ".fits"
+            file_path = os.path.join(base_input_folder, f"{wavelength}/{filename}")
+            # Bad files are collected in one sub-folder per wavelength
+            destination_folder = os.path.join(config['euv']['bad_files_dir'], str(wavelength))
+            os.makedirs(destination_folder, exist_ok=True)
+            if os.path.exists(file_path):
+                shutil.move(file_path, destination_folder)
+                print(f"Moved: {file_path}")
+            else:
+                print(f"Not found: {file_path}")

data/iti_data_processing.py CHANGED Viewed

@@ -10,38 +10,18 @@ from astropy.visualization import ImageNormalize, AsinhStretch
 from itipy.data.dataset import StackDataset, get_intersecting_files, AIADataset
 from itipy.data.editor import BrightestPixelPatchEditor, sdo_norms
 import os
 from multiprocessing import Pool
 from tqdm import tqdm
-# Configuration for all wavelengths to process
-# Load configuration from environment or use defaults
-import os
-import json
 def load_config():
     """Load configuration from environment or use defaults."""
-    if 'PIPELINE_CONFIG' in os.environ:
-        try:
-            config = json.loads(os.environ['PIPELINE_CONFIG'])
-            return config
-        except:
-            pass
-    # Default configuration
-    return {
-        'iti': {
-            'wavelengths': [94, 131, 171, 193, 211, 304],
-            'input_folder': '/mnt/data/PAPER/SDOData',
-            'output_folder': '/mnt/data/PAPER/SDOITI'
-        }
-    }
-config = load_config()
-wavelengths = config['iti']['wavelengths']
-base_input_folder = config['iti']['input_folder']
-output_folder = config['iti']['output_folder']
-os.makedirs(output_folder, exist_ok=True)
@@ -72,72 +52,72 @@ class SDODataset_flaring(StackDataset):
             self.addEditor(BrightestPixelPatchEditor(patch_shape))
-# Check if we need to process anything before loading the dataset
-def check_existing_files():
-    """Check how many files already exist without loading the full dataset"""
-    # Get file list from the base folder to estimate total samples
-    from itipy.data.dataset import get_intersecting_files
     files = get_intersecting_files(base_input_folder, wavelengths, ext='.fits')
     if not files or len(files) == 0:
         return 0, 0
-    # Count existing output files - need to check for each wavelength combination
     existing_count = 0
-    total_expected = len(files[0])  # All wavelength lists should have same length
-    # Check each time step (index across all wavelengths)
     for i in range(total_expected):
-        # Check if output file exists for this time step
-        # The output filename should be based on the first wavelength's filename
-        first_wl_file = files[0][i]  # Use first wavelength as reference
         base_name = os.path.splitext(os.path.basename(first_wl_file))[0]
-        # Remove wavelength suffix if present (e.g., "_171" from filename)
         if '_' in base_name:
             base_name = '_'.join(base_name.split('_')[:-1])
         output_path = os.path.join(output_folder, base_name) + '.npy'
         if os.path.exists(output_path):
             existing_count += 1
     return existing_count, total_expected
-# Check existing files first
-existing_files, total_expected = check_existing_files()
-print(f"Found {existing_files} existing files out of {total_expected} expected files")
-if existing_files >= total_expected:
-    print("All files already processed. Nothing to do.")
-else:
-    print(f"Need to process {total_expected - existing_files} remaining files")
-    # Only load the dataset if we need to process files
-    aia_dataset = SDODataset_flaring(data=base_input_folder, wavelengths=wavelengths, resolution=512, allow_errors=True)
-    # Filter out indices that already have processed files
-    def get_unprocessed_indices():
-        unprocessed = []
-        for i in range(len(aia_dataset)):
-            file_path = os.path.join(output_folder, aia_dataset.getId(i)) + '.npy'
-            if not os.path.exists(file_path):
-                unprocessed.append(i)
-        return unprocessed
-    def save_sample(i):
-        try:
-            data = aia_dataset[i]
-            file_path = os.path.join(output_folder, aia_dataset.getId(i)) + '.npy'
-            np.save(file_path, data)
-        except Exception as e:
-            print(f"Warning: Could not process sample {i} (ID: {aia_dataset.getId(i)}): {e}")
-            return  # Skip this sample and continue with the next one
-    # Get only unprocessed indices
-    unprocessed_indices = get_unprocessed_indices()
-    print(f"Processing {len(unprocessed_indices)} unprocessed samples")
-    if unprocessed_indices:
-        with Pool(processes=os.cpu_count()) as pool:
-            list(tqdm(pool.imap(save_sample, unprocessed_indices), total=len(unprocessed_indices)))
-            print("AIA data processing completed.")
     else:
-        print("All samples already processed. Nothing to do.")

 from itipy.data.dataset import StackDataset, get_intersecting_files, AIADataset
 from itipy.data.editor import BrightestPixelPatchEditor, sdo_norms
 import os
+import json
 from multiprocessing import Pool
 from tqdm import tqdm
 def load_config():
+    """Load the pipeline configuration from the PIPELINE_CONFIG environment variable.
+
+    PIPELINE_CONFIG is expected to hold the whole configuration as a JSON
+    string. There are no built-in defaults in this function, so a missing or
+    malformed variable is reported here instead of being swallowed and
+    surfacing later as a TypeError on ``config['iti']``.
+
+    Returns:
+        The decoded configuration object.
+
+    Raises:
+        RuntimeError: If PIPELINE_CONFIG is unset or is not valid JSON.
+    """
+    raw_config = os.environ.get('PIPELINE_CONFIG')
+    if raw_config is None:
+        raise RuntimeError("PIPELINE_CONFIG environment variable is not set")
+    try:
+        return json.loads(raw_config)
+    except json.JSONDecodeError as e:
+        raise RuntimeError(f"PIPELINE_CONFIG is not valid JSON: {e}") from e
             self.addEditor(BrightestPixelPatchEditor(patch_shape))
+_aia_dataset = None
+_output_folder = None
+def _init_worker(dataset, out_folder):
+    """Store the dataset and output folder in module-level globals.
+
+    Used as the ``initializer`` of the multiprocessing ``Pool`` so that
+    ``save_sample`` can read both values through ``_aia_dataset`` and
+    ``_output_folder``.
+
+    Args:
+        dataset: Dataset object that ``save_sample`` indexes and calls ``getId`` on.
+        out_folder: Folder in which ``save_sample`` writes the ``.npy`` files.
+    """
+    global _aia_dataset, _output_folder
+    _aia_dataset = dataset
+    _output_folder = out_folder
+def save_sample(i):
+    """Load sample ``i`` from the worker's dataset and save it as ``<id>.npy``.
+
+    Failures are reported with a warning and the sample is skipped, so one
+    bad sample does not stop the rest of the pool run.
+
+    Args:
+        i: Index of the sample in ``_aia_dataset``.
+    """
+    # Resolve the ID once, before loading: the warning below must not call
+    # getId() again, because a second failure inside the except block would
+    # escape the handler and abort the run instead of skipping the sample.
+    sample_id = None
+    try:
+        sample_id = _aia_dataset.getId(i)
+        data = _aia_dataset[i]
+        file_path = os.path.join(_output_folder, sample_id) + '.npy'
+        np.save(file_path, data)
+    except Exception as e:
+        print(f"Warning: Could not process sample {i} (ID: {sample_id}): {e}")
+def check_existing_files(base_input_folder, wavelengths, output_folder):
+    """Count output files that already exist, without loading the full dataset.
+
+    Args:
+        base_input_folder: Folder passed to ``get_intersecting_files`` to list
+            the input FITS files.
+        wavelengths: Wavelengths passed to ``get_intersecting_files``.
+        output_folder: Folder that is checked for existing ``.npy`` outputs.
+
+    Returns:
+        Tuple ``(existing_count, total_expected)``, or ``(0, 0)`` when no input
+        files are found. ``total_expected`` is the length of the first file list.
+    """
     files = get_intersecting_files(base_input_folder, wavelengths, ext='.fits')
     if not files or len(files) == 0:
         return 0, 0
     existing_count = 0
+    # The first file list is the reference for both the count and the names
+    total_expected = len(files[0])
     for i in range(total_expected):
+        first_wl_file = files[0][i]
         base_name = os.path.splitext(os.path.basename(first_wl_file))[0]
+        # Drop the last underscore-separated part of the name, if there is one
         if '_' in base_name:
             base_name = '_'.join(base_name.split('_')[:-1])
         output_path = os.path.join(output_folder, base_name) + '.npy'
         if os.path.exists(output_path):
             existing_count += 1
     return existing_count, total_expected
+if __name__ == '__main__':
+    # Script entry point: convert the AIA inputs into .npy samples, skipping
+    # samples whose output file already exists.
+    config = load_config()
+    wavelengths = config['iti']['wavelengths']
+    base_input_folder = config['iti']['input_folder']
+    output_folder = config['iti']['output_folder']
+    os.makedirs(output_folder, exist_ok=True)
+    # Check the existing outputs first, before the dataset is built
+    existing_files, total_expected = check_existing_files(base_input_folder, wavelengths, output_folder)
+    print(f"Found {existing_files} existing files out of {total_expected} expected files")
+    if existing_files >= total_expected:
+        print("All files already processed. Nothing to do.")
     else:
+        print(f"Need to process {total_expected - existing_files} remaining files")
+        aia_dataset = SDODataset_flaring(data=base_input_folder, wavelengths=wavelengths, resolution=512, allow_errors=True)
+        # Keep only the indices whose output file (named after the dataset ID) is missing
+        unprocessed_indices = [
+            i for i in range(len(aia_dataset))
+            if not os.path.exists(os.path.join(output_folder, aia_dataset.getId(i)) + '.npy')
+        ]
+        print(f"Processing {len(unprocessed_indices)} unprocessed samples")
+        if unprocessed_indices:
+            # The dataset and output folder reach the workers through _init_worker
+            with Pool(processes=os.cpu_count(), initializer=_init_worker, initargs=(aia_dataset, output_folder)) as pool:
+                list(tqdm(pool.imap(save_sample, unprocessed_indices), total=len(unprocessed_indices)))
+            print("AIA data processing completed.")
+        else:
+            print("All samples already processed. Nothing to do.")

data/pipeline_config.py ADDED Viewed

	@@ -0,0 +1,106 @@

+"""
+PipelineConfig: loads and validates the data processing pipeline configuration.
+Used by process_data_pipeline.py. Config is read from a YAML file and passed
+to each sub-script as a JSON string via the PIPELINE_CONFIG environment variable.
+"""
+import json
+from pathlib import Path
+import yaml
+TEMPLATE_PATH = Path(__file__).parent / "pipeline_config.yaml"
+# Paths that must exist before the pipeline runs
+REQUIRED_INPUT_PATHS = [
+    ("euv", "input_folder"),
+    ("iti", "input_folder"),
+]
+# Paths that should be created before the pipeline runs
+OUTPUT_PATHS = [
+    ("euv",       "bad_files_dir"),
+    ("iti",       "output_folder"),
+    ("alignment", "output_sxr_dir"),
+    ("alignment", "aia_missing_dir"),
+]
+class PipelineConfig:
+    """Holds the data processing pipeline configuration as a nested dict.
+
+    The configuration is read from a YAML file, or taken from the built-in
+    defaults when no file is given. It can be serialized to JSON with
+    ``to_json()``.
+    """
+    def __init__(self, config_path: str = None):
+        """Load the configuration.
+
+        Args:
+            config_path: Path to a YAML config file. When omitted or empty,
+                the values from ``_defaults()`` are used.
+        """
+        if config_path:
+            # YAML is read as UTF-8 explicitly: the platform default encoding
+            # may not be able to decode non-ASCII characters in the file.
+            with open(config_path, "r", encoding="utf-8") as f:
+                # safe_load returns None for an empty file; fall back to an
+                # empty dict so get_path() keeps working.
+                self.config = yaml.safe_load(f) or {}
+        else:
+            self.config = self._defaults()
+    # ------------------------------------------------------------------
+    def get_path(self, section: str, key: str) -> str:
+        """Return config[section][key], or config[section] itself if it is not a dict.
+
+        Args:
+            section: Top-level config key.
+            key: Key inside the section; ignored when the section value is a
+                scalar (e.g. base_data_dir).
+
+        Returns:
+            The configured value, or "" when the section or the key is missing.
+        """
+        section_data = self.config.get(section, {})
+        if isinstance(section_data, dict):
+            return section_data.get(key, "")
+        return section_data  # scalar value (e.g. base_data_dir)
+    def to_json(self) -> str:
+        """Serialize config to JSON string for passing via environment variable."""
+        return json.dumps(self.config)
+    # ------------------------------------------------------------------
+    def validate_paths(self) -> tuple[bool, list[str]]:
+        """Check that all required input paths exist.
+
+        Returns:
+            ``(valid, missing)``: ``missing`` lists one "section.key: path"
+            entry per configured path that does not exist; ``valid`` is True
+            when that list is empty.
+        """
+        missing = []
+        for section, key in REQUIRED_INPUT_PATHS:
+            p = self.get_path(section, key)
+            # NOTE(review): entries that are not set at all are skipped and
+            # therefore count as valid - confirm this is intended.
+            if p and not Path(p).exists():
+                missing.append(f"{section}.{key}: {p}")
+        return (len(missing) == 0, missing)
+    def create_directories(self):
+        """Create all configured output directories; unset entries are skipped."""
+        for section, key in OUTPUT_PATHS:
+            p = self.get_path(section, key)
+            if p:
+                Path(p).mkdir(parents=True, exist_ok=True)
+    def print_config(self):
+        """Print the current configuration as block-style YAML."""
+        print(yaml.dump(self.config, default_flow_style=False))
+    def save_config_template(self, path: str = None):
+        """Write the default configuration as a YAML template.
+
+        Args:
+            path: Destination file. Defaults to TEMPLATE_PATH, which is then
+                overwritten.
+        """
+        dest = Path(path) if path else TEMPLATE_PATH
+        with open(dest, "w", encoding="utf-8") as f:
+            yaml.dump(self._defaults(), f, default_flow_style=False)
+        print(f"Template saved to {dest}")
+    # ------------------------------------------------------------------
+    @staticmethod
+    def _defaults() -> dict:
+        """Return the built-in default configuration as a new dict."""
+        # NOTE(review): these defaults are absolute paths under
+        # /Volumes/T9/Data_FOXES - presumably one specific machine; confirm
+        # before relying on them elsewhere.
+        return {
+            "base_data_dir": "/Volumes/T9/Data_FOXES",
+            "euv": {
+                "input_folder":  "/Volumes/T9/Data_FOXES/AIA_raw",
+                "bad_files_dir": "/Volumes/T9/Data_FOXES/AIA_bad",
+                "wavelengths":   [94, 131, 171, 193, 211, 304, 335],
+            },
+            "iti": {
+                "input_folder":  "/Volumes/T9/Data_FOXES/AIA_raw",
+                "output_folder": "/Volumes/T9/Data_FOXES/AIA_processed",
+                "wavelengths":   [94, 131, 171, 193, 211, 304, 335],
+            },
+            "alignment": {
+                "goes_data_dir":    "/Volumes/T9/Data_FOXES/SXR_raw/combined",
+                "aia_processed_dir": "/Volumes/T9/Data_FOXES/AIA_processed",
+                "output_sxr_dir":   "/Volumes/T9/Data_FOXES/SXR_processed",
+                "aia_missing_dir":  "/Volumes/T9/Data_FOXES/AIA_missing",
+            },
+            "processing": {
+                "max_processes":        None,
+                "batch_size_multiplier": 4,
+                "min_batch_size":        1,
+            },
+        }

data/pipeline_config.yaml ADDED Viewed

	@@ -0,0 +1,47 @@

+# Data Processing Pipeline Configuration
+#
+# Usage: python process_data_pipeline.py --config pipeline_config.yaml
+#
+# Directory flow:
+#   AIA_raw        → euv_data_cleaning   → bad files moved to AIA_bad
+#   AIA_raw        → iti_data_processing → AIA_processed  (512x512 .npy)
+#   AIA_processed  ┐
+#                  ├─ align_data ──────→ SXR_processed  (xrsb_flux .npy per timestamp)
+#   SXR_raw/combined ┘                  AIA_missing     (AIA files with no SXR match)
+base_data_dir: /Volumes/T9/Data_FOXES
+euv:
+  input_folder:  /Volumes/T9/Data_FOXES/AIA_raw
+  bad_files_dir: /Volumes/T9/Data_FOXES/AIA_bad
+  wavelengths:
+    - 94
+    - 131
+    - 171
+    - 193
+    - 211
+    - 304
+    - 335
+iti:
+  input_folder:  /Volumes/T9/Data_FOXES/AIA_raw       # same as euv (bad files already moved out)
+  output_folder: /Volumes/T9/Data_FOXES/AIA_processed
+  wavelengths:
+    - 94
+    - 131
+    - 171
+    - 193
+    - 211
+    - 304
+    - 335
+alignment:
+  goes_data_dir:     /Volumes/T9/Data_FOXES/SXR_raw/combined  # output of sxr_downloader concat
+  aia_processed_dir: /Volumes/T9/Data_FOXES/AIA_processed     # must match iti.output_folder
+  output_sxr_dir:    /Volumes/T9/Data_FOXES/SXR_processed
+  aia_missing_dir:   /Volumes/T9/Data_FOXES/AIA_missing
+processing:
+  max_processes: null        # null = use all available cores
+  batch_size_multiplier: 4
+  min_batch_size: 1

data/process_data_pipeline.py CHANGED Viewed

@@ -105,11 +105,8 @@ class DataProcessingPipeline:
         """
         Check if data alignment is complete by looking for output directories.
         """
-        output_dirs = [
-            Path(self.config.get_path('alignment', 'output_sxr_a_dir')),
-            Path(self.config.get_path('alignment', 'output_sxr_b_dir'))
-        ]
-        return all(d.exists() and any(d.iterdir()) for d in output_dirs)
     def run_script(self, script_name, step_info):
         """
@@ -135,7 +132,7 @@ class DataProcessingPipeline:
         # Create environment variables for configuration
         env = os.environ.copy()
         env.update({
-            'PIPELINE_CONFIG': str(self.config.config),
             'BASE_DATA_DIR': self.config.get_path('base_data_dir', 'base_data_dir')
         })
@@ -145,23 +142,18 @@ class DataProcessingPipeline:
             # Run the script
             result = subprocess.run(
                 [sys.executable, str(script_path)],
-                capture_output=True,
-                text=True,
                 cwd=self.base_dir,
                 env=env
             )
             end_time = time.time()
             duration = end_time - start_time
             if result.returncode == 0:
                 logger.info(f"✓ {step_info['name']} completed successfully in {duration:.2f} seconds")
-                if result.stdout:
-                    logger.debug(f"Output: {result.stdout}")
                 return True
             else:
                 logger.error(f"✗ {step_info['name']} failed with return code {result.returncode}")
-                logger.error(f"Error output: {result.stderr}")
                 return False
         except Exception as e:

         """
         Check if data alignment is complete by looking for output directories.
         """
+        output_dir = Path(self.config.get_path('alignment', 'output_sxr_dir'))
+        return output_dir.exists() and any(output_dir.iterdir())
     def run_script(self, script_name, step_info):
         """
         # Create environment variables for configuration
         env = os.environ.copy()
         env.update({
+            'PIPELINE_CONFIG': self.config.to_json(),
             'BASE_DATA_DIR': self.config.get_path('base_data_dir', 'base_data_dir')
         })
             # Run the script
             result = subprocess.run(
                 [sys.executable, str(script_path)],
                 cwd=self.base_dir,
                 env=env
             )
             end_time = time.time()
             duration = end_time - start_time
             if result.returncode == 0:
                 logger.info(f"✓ {step_info['name']} completed successfully in {duration:.2f} seconds")
                 return True
             else:
                 logger.error(f"✗ {step_info['name']} failed with return code {result.returncode}")
                 return False
         except Exception as e:

data/sxr_data_processing.py CHANGED Viewed

@@ -50,10 +50,10 @@ class SXRDataProcessor:
         total_files = len(g13_files) + len(g14_files) + len(g15_files) + len(g16_files) + len(g17_files) + len(g18_files)
         logging.info(
             f"Found {len(g13_files)} GOES-13 files, {len(g14_files)} GOES-14 files, {len(g15_files)} GOES-15 files, {len(g16_files)} GOES-16 files, {len(g17_files)} GOES-17 files, and {len(g18_files)} GOES-18 files.")
-        print(f"📊 Total files found: {total_files}")
         if total_files == 0:
-            print("⚠️  No GOES data files found in the specified directory.")
             return
         def process_files(files, satellite_name, output_file, used_file_list):
@@ -84,37 +84,37 @@ class SXRDataProcessor:
                             ds.close()
             if not datasets:
-                print(f"❌ No valid datasets for {satellite_name}")
                 logging.warning(f"No valid datasets for {satellite_name}")
                 return
-            print(f"📊 Processing {len(datasets)} datasets for {satellite_name}...")
             try:
-                print(f"🔗 Concatenating datasets...")
                 combined_ds = xr.concat(datasets, dim='time').sortby('time')
                 # Scaling factors for GOES-13, GOES-14, and GOES-15
                 if satellite_name in ['GOES-13', 'GOES-14', 'GOES-15']:
-                    print(f"⚖️  Applying scaling factors for {satellite_name}...")
                     combined_ds['xrsa_flux'] = combined_ds['xrsa_flux'] / .85
                     combined_ds['xrsb_flux'] = combined_ds['xrsb_flux'] / .7
-                print(f"🔄 Converting to DataFrame...")
                 df = combined_ds.to_dataframe().reset_index()
                 if 'quad_diode' in df.columns:
-                    print(f"🔍 Filtering quad diode data...")
                     df = df[df['quad_diode'] == 0]  # Filter out quad diode data
                 #Filter out data where xrsb_flux has a quality flag of >0
-                print(f"🔍 Filtering out data where xrsb_flux has a quality flag of >0...")
-                df = df[df['xrsb_quality'] == 0]
                 df['time'] = pd.to_datetime(df['time'])
                 df.set_index('time', inplace=True)
-                print(f"📈 Applying log interpolation...")
                 df_log = np.log10(df[columns_to_interp].replace(0, np.nan))
                 # Step 3: Interpolate in log space
@@ -128,14 +128,14 @@ class SXRDataProcessor:
                 max_date = df.index.max().strftime('%Y%m%d')
                 filename = f"{str(output_file)}_{min_date}_{max_date}.csv"
-                print(f"💾 Saving to {filename}...")
                 df.to_csv(filename, index=True)
-                print(f"✅ Successfully processed {satellite_name}: {successful_files} files loaded, {failed_files} failed")
                 logging.info(f"Saved combined file: {output_file}")
             except Exception as e:
-                print(f"❌ Failed to process {satellite_name}: {e}")
                 logging.error(f"Failed to write {output_file}: {e}")
             finally:
                 for ds in datasets:
@@ -156,7 +156,7 @@ class SXRDataProcessor:
         if len(g18_files) != 0:
             satellites_to_process.append((g18_files, "GOES-18", self.output_dir / "combined_g18_avg1m", self.used_g18_files))
-        print(f"\n🚀 Starting processing of {len(satellites_to_process)} satellites...")
         # Process each satellite with overall progress tracking
         successful_satellites = 0
@@ -164,36 +164,36 @@ class SXRDataProcessor:
         for i, (files, satellite_name, output_file, used_file_list) in enumerate(satellites_to_process, 1):
             print(f"\n{'='*60}")
-            print(f"📡 Processing satellite {i}/{len(satellites_to_process)}: {satellite_name}")
             print(f"{'='*60}")
             try:
                 process_files(files, satellite_name, output_file, used_file_list)
                 successful_satellites += 1
             except Exception as e:
-                print(f"❌ Failed to process {satellite_name}: {e}")
                 failed_satellites += 1
                 logging.error(f"Failed to process {satellite_name}: {e}")
         # Print final summary
         print(f"\n{'='*60}")
-        print(f"📊 PROCESSING COMPLETE")
         print(f"{'='*60}")
-        print(f"✅ Successfully processed: {successful_satellites} satellites")
-        print(f"❌ Failed: {failed_satellites} satellites")
-        print(f"📁 Total files processed: {total_files}")
-        print(f"📂 Output directory: {self.output_dir}")
         # Print file usage statistics
         total_used_files = (len(self.used_g13_files) + len(self.used_g14_files) +
                            len(self.used_g15_files) + len(self.used_g16_files) +
                            len(self.used_g17_files) + len(self.used_g18_files))
-        print(f"📋 Files used in processing: {total_used_files}")
         if successful_satellites > 0:
-            print(f"\n🎉 SXR data processing completed successfully!")
         else:
-            print(f"\n⚠️  No satellites were processed successfully.")
 if __name__ == '__main__':
@@ -204,13 +204,13 @@ if __name__ == '__main__':
                         help='Directory where combined GOES data will be saved.')
     args = parser.parse_args()
-    print("🌞 GOES SXR Data Processing Tool")
     print("=" * 50)
-    print(f"📂 Data directory: {args.data_dir}")
-    print(f"📁 Output directory: {args.output_dir}")
     print("=" * 50)
     processor = SXRDataProcessor(data_dir=args.data_dir, output_dir=args.output_dir)
     processor.combine_goes_data()
-    print("\n🏁 All processing tasks completed.")

         total_files = len(g13_files) + len(g14_files) + len(g15_files) + len(g16_files) + len(g17_files) + len(g18_files)
         logging.info(
             f"Found {len(g13_files)} GOES-13 files, {len(g14_files)} GOES-14 files, {len(g15_files)} GOES-15 files, {len(g16_files)} GOES-16 files, {len(g17_files)} GOES-17 files, and {len(g18_files)} GOES-18 files.")
+        print(f"Total files found: {total_files}")
         if total_files == 0:
+            print("No GOES data files found in the specified directory.")
             return
         def process_files(files, satellite_name, output_file, used_file_list):
                             ds.close()
             if not datasets:
+                print(f"No valid datasets for {satellite_name}")
                 logging.warning(f"No valid datasets for {satellite_name}")
                 return
+            print(f"Processing {len(datasets)} datasets for {satellite_name}...")
             try:
+                print(f"Concatenating datasets...")
                 combined_ds = xr.concat(datasets, dim='time').sortby('time')
                 # Scaling factors for GOES-13, GOES-14, and GOES-15
                 if satellite_name in ['GOES-13', 'GOES-14', 'GOES-15']:
+                    print(f"Applying scaling factors for {satellite_name}...")
                     combined_ds['xrsa_flux'] = combined_ds['xrsa_flux'] / .85
                     combined_ds['xrsb_flux'] = combined_ds['xrsb_flux'] / .7
+                print(f"Converting to DataFrame...")
                 df = combined_ds.to_dataframe().reset_index()
                 if 'quad_diode' in df.columns:
+                    print(f"Filtering quad diode data...")
                     df = df[df['quad_diode'] == 0]  # Filter out quad diode data
                 #Filter out data where xrsb_flux has a quality flag of >0
+                print(f"Filtering out data where xrsb_flux has a quality flag of >0...")
+                df = df[df['xrsb_flag'] == 0]
                 df['time'] = pd.to_datetime(df['time'])
                 df.set_index('time', inplace=True)
+                print(f"Applying log interpolation...")
                 df_log = np.log10(df[columns_to_interp].replace(0, np.nan))
                 # Step 3: Interpolate in log space
                 max_date = df.index.max().strftime('%Y%m%d')
                 filename = f"{str(output_file)}_{min_date}_{max_date}.csv"
+                print(f"Saving to {filename}...")
                 df.to_csv(filename, index=True)
+                print(f"Successfully processed {satellite_name}: {successful_files} files loaded, {failed_files} failed")
                 logging.info(f"Saved combined file: {output_file}")
             except Exception as e:
+                print(f"Failed to process {satellite_name}: {e}")
                 logging.error(f"Failed to write {output_file}: {e}")
             finally:
                 for ds in datasets:
         if len(g18_files) != 0:
             satellites_to_process.append((g18_files, "GOES-18", self.output_dir / "combined_g18_avg1m", self.used_g18_files))
+        print(f"\nStarting processing of {len(satellites_to_process)} satellites...")
         # Process each satellite with overall progress tracking
         successful_satellites = 0
         for i, (files, satellite_name, output_file, used_file_list) in enumerate(satellites_to_process, 1):
             print(f"\n{'='*60}")
+            print(f"Processing satellite {i}/{len(satellites_to_process)}: {satellite_name}")
             print(f"{'='*60}")
             try:
                 process_files(files, satellite_name, output_file, used_file_list)
                 successful_satellites += 1
             except Exception as e:
+                print(f"Failed to process {satellite_name}: {e}")
                 failed_satellites += 1
                 logging.error(f"Failed to process {satellite_name}: {e}")
         # Print final summary
         print(f"\n{'='*60}")
+        print(f"PROCESSING COMPLETE")
         print(f"{'='*60}")
+        print(f"Successfully processed: {successful_satellites} satellites")
+        print(f"Failed: {failed_satellites} satellites")
+        print(f"Total files processed: {total_files}")
+        print(f"Output directory: {self.output_dir}")
         # Print file usage statistics
         total_used_files = (len(self.used_g13_files) + len(self.used_g14_files) +
                            len(self.used_g15_files) + len(self.used_g16_files) +
                            len(self.used_g17_files) + len(self.used_g18_files))
+        print(f"Files used in processing: {total_used_files}")
         if successful_satellites > 0:
+            print(f"\nSXR data processing completed successfully!")
         else:
+            print(f"\n⚠No satellites were processed successfully.")
 if __name__ == '__main__':
                         help='Directory where combined GOES data will be saved.')
     args = parser.parse_args()
+    print("GOES SXR Data Processing Tool")
     print("=" * 50)
+    print(f"Data directory: {args.data_dir}")
+    print(f"Output directory: {args.output_dir}")
     print("=" * 50)
     processor = SXRDataProcessor(data_dir=args.data_dir, output_dir=args.output_dir)
     processor.combine_goes_data()
+    print("\nAll processing tasks completed.")

download/download_sdo.py CHANGED Viewed

@@ -27,7 +27,7 @@ class SDODownloader:
         wavelengths (list): List of wavelengths to download.
         n_workers (int): Number of worker threads for parallel download.
     """
-    def __init__(self, base_path='/mnt/data/PAPER/SDOData', email=None, wavelengths=['94', '131', '171', '193', '211', '304'], n_workers=4, cadence=60):
         self.ds_path = base_path
         self.wavelengths = [str(wl) for wl in wavelengths]
         self.n_workers = n_workers
@@ -53,7 +53,11 @@ class SDODownloader:
             if os.path.exists(map_path):
                 return map_path
             # load map
             url = 'http://jsoc.stanford.edu' + segment
             # Retry download with exponential backoff
             max_retries = 3
@@ -111,27 +115,20 @@ class SDODownloader:
         id = date.isoformat()
         logging.info('Start download: %s' % id)
-        # query Magnetogram
-        #time_param = '%sZ' % date.isoformat('_', timespec='seconds')
-        #ds_hmi = 'hmi.M_720s[%s]{magnetogram}' % time_param
-        #keys_hmi = self.drms_client.keys(ds_hmi)
-        #header_hmi, segment_hmi = self.drms_client.query(ds_hmi, key=','.join(keys_hmi), seg='magnetogram')
-        #if len(header_hmi) != 1 or np.any(header_hmi.QUALITY != 0):
-        #    self.fetchDataFallback(date)
-        #    return
         # query EUV
         time_param = '%sZ' % date.isoformat('_', timespec='seconds')
         ds_euv = 'aia.lev1_euv_12s[%s][%s]{image}' % (time_param, ','.join(self.wavelengths))
         keys_euv = self.drms_client.keys(ds_euv)
         header_euv, segment_euv = self.drms_client.query(ds_euv, key=','.join(keys_euv), seg='image')
-        if len(header_euv) != len(self.wavelengths) or np.any(header_euv.QUALITY != 0):
             self.fetchDataFallback(date)
             return
         queue = []
-        #for (idx, h), s in zip(header_hmi.iterrows(), segment_hmi.magnetogram):
-        #    queue += [(h.to_dict(), s, date)]
         for (idx, h), s in zip(header_euv.iterrows(), segment_euv.image):
             queue += [(h.to_dict(), s, date)]
@@ -155,28 +152,6 @@ class SDODownloader:
         id = date.isoformat()
         logging.info('Fallback download: %s' % id)
-        # query Magnetogram
-        t = date - timedelta(hours=24)
-        ds_hmi = 'hmi.M_720s[%sZ/12h@720s]{magnetogram}' % t.replace(tzinfo=None).isoformat('_', timespec='seconds')
-        keys_hmi = self.drms_client.keys(ds_hmi)
-        header_tmp, segment_tmp = self.drms_client.query(ds_hmi, key=','.join(keys_hmi), seg='magnetogram')
-        assert len(header_tmp) != 0, 'No data found!'
-        date_str = header_tmp['DATE__OBS'].replace('MISSING', '').str.replace('60', '59')  # fix date format
-        date_diff = np.abs(pd.to_datetime(date_str).dt.tz_localize(None) - date)
-        # sort and filter
-        header_tmp['date_diff'] = date_diff
-        header_tmp.sort_values('date_diff')
-        segment_tmp['date_diff'] = date_diff
-        segment_tmp.sort_values('date_diff')
-        cond_tmp = header_tmp.QUALITY == 0
-        header_tmp = header_tmp[cond_tmp]
-        segment_tmp = segment_tmp[cond_tmp]
-        assert len(header_tmp) > 0, 'No valid quality flag found'
-        # replace invalid
-        header_hmi = header_tmp.iloc[0].drop('date_diff')
-        segment_hmi = segment_tmp.iloc[0].drop('date_diff')
-        ############################################################
-        # query EUV
         header_euv, segment_euv = [], []
         t = date - timedelta(hours=6)
         for wl in self.wavelengths:
@@ -184,21 +159,23 @@ class SDODownloader:
                 t.replace(tzinfo=None).isoformat('_', timespec='seconds'), wl)
             keys_euv = self.drms_client.keys(euv_ds)
             header_tmp, segment_tmp = self.drms_client.query(euv_ds, key=','.join(keys_euv), seg='image')
-            assert len(header_tmp) != 0, 'No data found!'
             date_str = header_tmp['DATE__OBS'].replace('MISSING', '').str.replace('60', '59')  # fix date format
             date_diff = (pd.to_datetime(date_str).dt.tz_localize(None) - date).abs()
             # sort and filter
             header_tmp['date_diff'] = date_diff
-            header_tmp.sort_values('date_diff')
             segment_tmp['date_diff'] = date_diff
-            segment_tmp.sort_values('date_diff')
-            cond_tmp = header_tmp.QUALITY == 0
-            header_tmp = header_tmp[cond_tmp]
-            segment_tmp = segment_tmp[cond_tmp]
-            assert len(header_tmp) > 0, 'No valid quality flag found'
-            # replace invalid
-            header_euv.append(header_tmp.iloc[0].drop('date_diff'))
-            segment_euv.append(segment_tmp.iloc[0].drop('date_diff'))
         queue = []
         #queue += [(header_hmi.to_dict(), segment_hmi.magnetogram, date)]
@@ -218,8 +195,8 @@ if __name__ == '__main__':
     parser.add_argument('--download_dir', type=str, help='path to the download directory.')
     parser.add_argument('--email', type=str, help='registered email address for JSOC.')
     parser.add_argument('--start_date', type=str, help='start date in format YYYY-MM-DD.')
-    parser.add_argument('--end_date', type=str, help='end date in format YYYY-MM-DD.', required=False,
-                        default=str(datetime.now()).split(' ')[0])
     parser.add_argument('--cadence', type=int, help='cadence in minutes.', required=False, default=60)
     args = parser.parse_args()
@@ -228,7 +205,7 @@ if __name__ == '__main__':
     end_date = args.end_date
     cadence = args.cadence
-    [os.makedirs(os.path.join(download_dir, str(c)), exist_ok=True) for c in [94, 131, 171, 193, 211, 304]]
     downloader = SDODownloader(base_path=download_dir, email=args.email)
     start_date_datetime = datetime.strptime(start_date, "%Y-%m-%d %H:%M:%S")
     #end_date = datetime.now()
@@ -236,10 +213,10 @@ if __name__ == '__main__':
     #Skip over dates that already exist in the download directory
-    for d in [start_date_datetime + i * timedelta(minutes=1) for i in
-              range((end_date_datetime - start_date_datetime) // timedelta(minutes=1))]:
         #make sure the file exists in all wavelengths directories
-        for wl in [94, 131, 171, 193, 211, 304]:
             if not os.path.exists(os.path.join(
                 download_dir,
                 str(wl),

         wavelengths (list): List of wavelengths to download.
         n_workers (int): Number of worker threads for parallel download.
     """
+    def __init__(self, base_path='/mnt/data/PAPER/SDOData', email=None, wavelengths=['94', '131', '171', '193', '211', '304', '335'], n_workers=4, cadence=60):
         self.ds_path = base_path
         self.wavelengths = [str(wl) for wl in wavelengths]
         self.n_workers = n_workers
             if os.path.exists(map_path):
                 return map_path
             # load map
+            if not segment or pd.isna(segment):
+                logging.error('Segment path is null for %s — data may not be in JSOC cache' % header.get('DATE__OBS'))
+                raise ValueError('Null segment path for %s' % header.get('DATE__OBS'))
             url = 'http://jsoc.stanford.edu' + segment
+            logging.info('Downloading: %s' % url)
             # Retry download with exponential backoff
             max_retries = 3
         id = date.isoformat()
         logging.info('Start download: %s' % id)
         # query EUV
         time_param = '%sZ' % date.isoformat('_', timespec='seconds')
         ds_euv = 'aia.lev1_euv_12s[%s][%s]{image}' % (time_param, ','.join(self.wavelengths))
         keys_euv = self.drms_client.keys(ds_euv)
         header_euv, segment_euv = self.drms_client.query(ds_euv, key=','.join(keys_euv), seg='image')
+        logging.info('Fast-path query returned %d rows (need %d), qualities: %s' % (
+            len(header_euv), len(self.wavelengths),
+            list(header_euv.QUALITY) if len(header_euv) > 0 else []))
+        if len(header_euv) != len(self.wavelengths) or np.any(header_euv.QUALITY.fillna(0) != 0):
             self.fetchDataFallback(date)
             return
         queue = []
         for (idx, h), s in zip(header_euv.iterrows(), segment_euv.image):
             queue += [(h.to_dict(), s, date)]
         id = date.isoformat()
         logging.info('Fallback download: %s' % id)
         header_euv, segment_euv = [], []
         t = date - timedelta(hours=6)
         for wl in self.wavelengths:
                 t.replace(tzinfo=None).isoformat('_', timespec='seconds'), wl)
             keys_euv = self.drms_client.keys(euv_ds)
             header_tmp, segment_tmp = self.drms_client.query(euv_ds, key=','.join(keys_euv), seg='image')
+            logging.info('Fallback query wl=%s returned %d rows' % (wl, len(header_tmp)))
+            assert len(header_tmp) != 0, 'No data found for wl=%s at %s' % (wl, id)
             date_str = header_tmp['DATE__OBS'].replace('MISSING', '').str.replace('60', '59')  # fix date format
             date_diff = (pd.to_datetime(date_str).dt.tz_localize(None) - date).abs()
             # sort and filter
             header_tmp['date_diff'] = date_diff
             segment_tmp['date_diff'] = date_diff
+            cond_tmp = (header_tmp.QUALITY == 0) | header_tmp.QUALITY.isna()
+            header_filtered = header_tmp[cond_tmp]
+            segment_filtered = segment_tmp[cond_tmp]
+            if len(header_filtered) > 0:
+                header_tmp = header_filtered
+                segment_tmp = segment_filtered
+            else:
+                logging.warning('No quality-0 EUV frames for wl=%s at %s — using closest available' % (wl, id))
+            header_euv.append(header_tmp.sort_values('date_diff').iloc[0].drop('date_diff'))
+            segment_euv.append(segment_tmp.sort_values('date_diff').iloc[0].drop('date_diff'))
         queue = []
         #queue += [(header_hmi.to_dict(), segment_hmi.magnetogram, date)]
     parser.add_argument('--download_dir', type=str, help='path to the download directory.')
     parser.add_argument('--email', type=str, help='registered email address for JSOC.')
     parser.add_argument('--start_date', type=str, help='start date in format YYYY-MM-DD.')
+    parser.add_argument('--end_date', type=str, help='end date in format YYYY-MM-DD HH:MM:SS.', required=False,
+                        default=datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
     parser.add_argument('--cadence', type=int, help='cadence in minutes.', required=False, default=60)
     args = parser.parse_args()
     end_date = args.end_date
     cadence = args.cadence
+    [os.makedirs(os.path.join(download_dir, str(c)), exist_ok=True) for c in [94, 131, 171, 193, 211, 304, 335]]
     downloader = SDODownloader(base_path=download_dir, email=args.email)
     start_date_datetime = datetime.strptime(start_date, "%Y-%m-%d %H:%M:%S")
     #end_date = datetime.now()
     #Skip over dates that already exist in the download directory
+    for d in [start_date_datetime + i * timedelta(minutes=cadence) for i in
+              range((end_date_datetime - start_date_datetime) // timedelta(minutes=cadence))]:
         #make sure the file exists in all wavelengths directories
+        for wl in [94, 131, 171, 193, 211, 304, 335]:
             if not os.path.exists(os.path.join(
                 download_dir,
                 str(wl),

download/sxr_downloader.py CHANGED Viewed

@@ -10,11 +10,9 @@ import pandas as pd
 class SXRDownloader:
     logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-    def __init__(self, save_dir: str = '/mnt/data/PAPER/GOES-timespan', concat_dir: str = '/mnt/data/PAPER/GOES-timespan/combined'):
         self.save_dir = Path(save_dir)
         self.save_dir.mkdir(exist_ok=True)
-        self.concat_dir = Path(concat_dir)
-        self.concat_dir.mkdir(exist_ok=True)
         self.used_g13_files = []
         self.used_g14_files = []
         self.used_g15_files = []

 class SXRDownloader:
     logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+    def __init__(self, save_dir: str = '/mnt/data/PAPER/GOES-timespan'):
         self.save_dir = Path(save_dir)
         self.save_dir.mkdir(exist_ok=True)
         self.used_g13_files = []
         self.used_g14_files = []
         self.used_g15_files = []

forecasting/data_loaders/SDOAIA_dataloader.py CHANGED Viewed

@@ -12,6 +12,16 @@ import glob
 import os
 class AIA_GOESDataset(torch.utils.data.Dataset):
     """
     PyTorch Dataset for loading paired AIA (EUV images) and GOES (SXR flux) data.
@@ -354,7 +364,7 @@ class AIA_GOESDataModule(LightningDataModule):
         self.train_ds = AIA_GOESDataset(
             aia_dir=self.aia_train_dir,
             sxr_dir=self.sxr_train_dir,
-            sxr_transform=T.Lambda(lambda x: (np.log10(x + 1e-8) - self.sxr_norm[0]) / self.sxr_norm[1]),
             target_size=(512, 512),
             wavelengths=self.wavelengths,
             cadence=1,
@@ -368,7 +378,7 @@ class AIA_GOESDataModule(LightningDataModule):
         self.val_ds = AIA_GOESDataset(
             aia_dir=self.aia_val_dir,
             sxr_dir=self.sxr_val_dir,
-            sxr_transform=T.Lambda(lambda x: (np.log10(x + 1e-8) - self.sxr_norm[0]) / self.sxr_norm[1]),
             target_size=(512, 512),
             wavelengths=self.wavelengths,
             cadence=1,
@@ -382,7 +392,7 @@ class AIA_GOESDataModule(LightningDataModule):
         self.test_ds = AIA_GOESDataset(
             aia_dir=self.aia_test_dir,
             sxr_dir=self.sxr_test_dir,
-            sxr_transform=T.Lambda(lambda x: (np.log10(x + 1e-8) - self.sxr_norm[0]) / self.sxr_norm[1]),
             target_size=(512, 512),
             wavelengths=self.wavelengths,
             cadence=1,

 import os
+class SXRLogNormTransform:
+    """Picklable SXR log-normalization transform (replaces T.Lambda for spawn compatibility)."""
+    def __init__(self, mean: float, std: float):
+        self.mean = mean
+        self.std = std
+    def __call__(self, x: float) -> float:
+        return (np.log10(x + 1e-8) - self.mean) / self.std
 class AIA_GOESDataset(torch.utils.data.Dataset):
     """
     PyTorch Dataset for loading paired AIA (EUV images) and GOES (SXR flux) data.
         self.train_ds = AIA_GOESDataset(
             aia_dir=self.aia_train_dir,
             sxr_dir=self.sxr_train_dir,
+            sxr_transform=SXRLogNormTransform(self.sxr_norm[0], self.sxr_norm[1]),
             target_size=(512, 512),
             wavelengths=self.wavelengths,
             cadence=1,
         self.val_ds = AIA_GOESDataset(
             aia_dir=self.aia_val_dir,
             sxr_dir=self.sxr_val_dir,
+            sxr_transform=SXRLogNormTransform(self.sxr_norm[0], self.sxr_norm[1]),
             target_size=(512, 512),
             wavelengths=self.wavelengths,
             cadence=1,
         self.test_ds = AIA_GOESDataset(
             aia_dir=self.aia_test_dir,
             sxr_dir=self.sxr_test_dir,
+            sxr_transform=SXRLogNormTransform(self.sxr_norm[0], self.sxr_norm[1]),
             target_size=(512, 512),
             wavelengths=self.wavelengths,
             cadence=1,

forecasting/training/train.py CHANGED Viewed

@@ -84,69 +84,6 @@ def resolve_config_variables(config_dict):
     return recursive_substitute(config_dict, variables)
-# Parser
-parser = argparse.ArgumentParser()
-parser.add_argument('-config', type=str, default='config.yaml', required=True, help='Path to config YAML.')
-args = parser.parse_args()
-# Load config with variable substitution
-with open(args.config, 'r') as stream:
-    config_data = yaml.load(stream, Loader=yaml.SafeLoader)
-# Resolve variables like ${base_data_dir}
-config_data = resolve_config_variables(config_data)
-# Debug: Print resolved paths
-print("Resolved paths:")
-print(f"AIA dir: {config_data['data']['aia_dir']}")
-print(f"SXR dir: {config_data['data']['sxr_dir']}")
-print(f"Checkpoints dir: {config_data['data']['checkpoints_dir']}")
-sxr_norm = np.load(config_data['data']['sxr_norm_path'])
-training_wavelengths = config_data['wavelengths']
-# DataModule
-data_loader = AIA_GOESDataModule(
-    aia_train_dir= config_data['data']['aia_dir']+"/train",
-    aia_val_dir=config_data['data']['aia_dir']+"/val",
-    aia_test_dir=config_data['data']['aia_dir']+"/test",
-    sxr_train_dir=config_data['data']['sxr_dir']+"/train",
-    sxr_val_dir=config_data['data']['sxr_dir']+"/val",
-    sxr_test_dir=config_data['data']['sxr_dir']+"/test",
-    batch_size=config_data['batch_size'],
-    num_workers=min(8, os.cpu_count()),  # Limit workers to prevent shm issues
-    sxr_norm=sxr_norm,
-    wavelengths=training_wavelengths,
-    oversample=config_data['oversample'],
-    balance_strategy=config_data['balance_strategy'],
-)
-data_loader.setup()
-# Logger
-#wb_name = f"{instrument}_{n}" if len(combined_parameters) > 1 else "aia_sxr_model"
-wandb_logger = WandbLogger(
-    entity=config_data['wandb']['entity'],
-    project=config_data['wandb']['project'],
-    job_type=config_data['wandb']['job_type'],
-    tags=config_data['wandb']['tags'],
-    name=config_data['wandb']['run_name'],
-    notes=config_data['wandb']['notes'],
-    config=config_data
-)
-# Logging callback
-total_n_valid = len(data_loader.val_ds)
-plot_data = [data_loader.val_ds[i] for i in range(0, total_n_valid, max(1, total_n_valid // 4))]
-plot_samples = plot_data  # Keep as list of ((aia, sxr), target)
-#sxr_callback = SXRPredictionLogger(plot_samples)
-sxr_plot_callback = ImagePredictionLogger_SXR(plot_samples, sxr_norm)
-# Attention map callback - get patch size from config
-patch_size = config_data.get('vit_architecture', {}).get('patch_size', 16)
-attention = AttentionMapCallback(patch_size=patch_size, use_local_attention=True)
 class PTHCheckpointCallback(Callback):
     """
     Custom PyTorch Lightning callback to save model checkpoints in `.pth` format.
@@ -209,65 +146,6 @@ class PTHCheckpointCallback(Callback):
-# Checkpoint callback
-checkpoint_callback = ModelCheckpoint(
-    dirpath=config_data['data']['checkpoints_dir'],
-    monitor='val_total_loss',
-    mode='min',
-    save_top_k=10,
-    filename=f"{config_data['wandb']['run_name']}-{{epoch:02d}}-{{val_total_loss:.4f}}"
-)
-pth_callback = PTHCheckpointCallback(
-    dirpath=config_data['data']['checkpoints_dir'],
-    monitor='val_total_loss',
-    mode='min',
-    save_top_k=1,
-    filename_prefix=config_data['wandb']['run_name']
-)
-def process_batch(batch_data, sxr_norm, c_threshold, m_threshold, x_threshold):
-    """
-    Process a batch of SXR data to count flare occurrences in different intensity classes.
-    Parameters
-    ----------
-    batch_data : tuple
-        Tuple containing (batch, batch_idx).
-    sxr_norm : np.ndarray
-        Normalization parameters for SXR values.
-    c_threshold, m_threshold, x_threshold : float
-        Thresholds defining flare intensity categories.
-    Returns
-    -------
-    dict
-        Dictionary containing counts for quiet, C, M, and X class flares.
-    """
-    from forecasting.models.vit_patch_model import unnormalize_sxr
-    batch, batch_idx = batch_data
-    _, sxr = batch
-    # Unnormalize the SXR values
-    sxr_un = unnormalize_sxr(sxr, sxr_norm)
-    sxr_un_flat = sxr_un.view(-1).cpu().numpy()
-    total = len(sxr_un_flat)
-    quiet_count = ((sxr_un_flat < c_threshold)).sum()
-    c_count = ((sxr_un_flat >= c_threshold) & (sxr_un_flat < m_threshold)).sum()
-    m_count = ((sxr_un_flat >= m_threshold) & (sxr_un_flat < x_threshold)).sum()
-    x_count = ((sxr_un_flat >= x_threshold)).sum()
-    return {
-        'total': total,
-        'quiet_count': quiet_count,
-        'c_count': c_count,
-        'm_count': m_count,
-        'x_count': x_count,
-        'batch_idx': batch_idx
-    }
 def get_base_weights(data_loader, sxr_norm):
     """
     Compute inverse-frequency weights for flare classes based on training data.
@@ -353,94 +231,126 @@ def get_base_weights(data_loader, sxr_norm):
-base_weights = get_base_weights(data_loader, sxr_norm) if config_data.get('calculate_base_weights', True) else None
-model = ViTLocal(model_kwargs=config_data['vit_architecture'], sxr_norm = sxr_norm, base_weights=base_weights)
-# Set device based on config
-# Support both old 'gpu_id' and new 'gpu_ids' config keys for backward compatibility
-gpu_config = config_data.get('gpu_ids', config_data.get('gpu_id', 0))
-if gpu_config == -1:
-    """
-    Use CPU for training if GPU config is set to -1.
-    """
-    # CPU only
-    accelerator = "cpu"
-    devices = 1
-    strategy = "auto"
-    print("Using CPU for training")
-elif gpu_config == "all":
-    """
-    Use all available GPUs if GPU config is set to 'all'.
-    """
-    # Use all available GPUs
-    if torch.cuda.is_available():
-        accelerator = "gpu"
-        devices = -1  # -1 means use all available GPUs
-        num_gpus = torch.cuda.device_count()
-        strategy = "auto"
-        print(f"Using all available GPUs ({num_gpus} GPUs)")
-        if num_gpus > 1:
-            print(f"Multi-GPU training with DDP: Effective batch size = {config_data['batch_size']} x {num_gpus} GPUs = {config_data['batch_size'] * num_gpus}")
-    else:
-        accelerator = "cpu"
-        devices = 1
-        strategy = "auto"
-        print("No GPUs available, falling back to CPU")
-elif isinstance(gpu_config, list):
-    """
-    Use specific GPU IDs if provided as a list.
-    """
-    # Multiple specific GPUs
-    if torch.cuda.is_available():
-        accelerator = "gpu"
-        devices = gpu_config
-        strategy = "auto"
-        print(f"Using GPUs: {gpu_config}")
-        if len(gpu_config) > 1:
-            print(f"Multi-GPU training with DDP: Effective batch size = {config_data['batch_size']} x {len(gpu_config)} GPUs = {config_data['batch_size'] * len(gpu_config)}")
-    else:
-        accelerator = "cpu"
-        devices = 1
-        strategy = "auto"
-        print("No GPUs available, falling back to CPU")
-else:
-    """
-    Use a single GPU or CPU based on availability.
-    """
-    # Single GPU (integer)
-    if torch.cuda.is_available():
-        accelerator = "gpu"
-        devices = [gpu_config]
-        strategy = "auto"
-        print(f"Using GPU {gpu_config}")
     else:
-        accelerator = "cpu"
-        devices = 1
-        strategy = "auto"
-        print(f"GPU {gpu_config} not available, falling back to CPU")
-# Trainer
-trainer = Trainer(
-    default_root_dir=config_data['data']['checkpoints_dir'],
-    accelerator=accelerator,
-    devices=devices,
-    strategy=strategy,
-    max_epochs=config_data['epochs'],
-    callbacks=[attention, checkpoint_callback],
-    logger=wandb_logger,
-    log_every_n_steps=10,
-)
-trainer.fit(model, data_loader)
-# Save final PyTorch checkpoint with model and state_dict
-timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-final_checkpoint_path = os.path.join(config_data['data']['checkpoints_dir'], f"{config_data['wandb']['run_name']}-final-{timestamp}.pth")
-torch.save({
-    'model': model,
-    'state_dict': model.state_dict()
-}, final_checkpoint_path)
-print(f"Saved final PyTorch checkpoint: {final_checkpoint_path}")
-# Finalize
-wandb.finish()

     return recursive_substitute(config_dict, variables)
 class PTHCheckpointCallback(Callback):
     """
     Custom PyTorch Lightning callback to save model checkpoints in `.pth` format.
 def get_base_weights(data_loader, sxr_norm):
     """
     Compute inverse-frequency weights for flare classes based on training data.
+if __name__ == '__main__':
+    # Entry point: load the YAML config, build the data module, logger, callbacks and
+    # model, train with the Lightning Trainer, then save a final .pth checkpoint.
+    # Parser
+    parser = argparse.ArgumentParser()
+    # `-config` is required, so no default is declared: argparse never falls back to a
+    # default for a required option.
+    parser.add_argument('-config', type=str, required=True, help='Path to config YAML.')
+    args = parser.parse_args()
+    # Load config with variable substitution
+    with open(args.config, 'r') as stream:
+        config_data = yaml.load(stream, Loader=yaml.SafeLoader)
+    config_data = resolve_config_variables(config_data)
+    print("Resolved paths:")
+    print(f"AIA dir: {config_data['data']['aia_dir']}")
+    print(f"SXR dir: {config_data['data']['sxr_dir']}")
+    print(f"Checkpoints dir: {config_data['data']['checkpoints_dir']}")
+    sxr_norm = np.load(config_data['data']['sxr_norm_path'])
+    training_wavelengths = config_data['wavelengths']
+    # DataModule
+    data_loader = AIA_GOESDataModule(
+        aia_train_dir=config_data['data']['aia_dir'] + "/train",
+        aia_val_dir=config_data['data']['aia_dir'] + "/val",
+        aia_test_dir=config_data['data']['aia_dir'] + "/test",
+        sxr_train_dir=config_data['data']['sxr_dir'] + "/train",
+        sxr_val_dir=config_data['data']['sxr_dir'] + "/val",
+        sxr_test_dir=config_data['data']['sxr_dir'] + "/test",
+        batch_size=config_data['batch_size'],
+        # os.cpu_count() may return None when the count cannot be determined.
+        num_workers=min(8, os.cpu_count() or 1),
+        sxr_norm=sxr_norm,
+        wavelengths=training_wavelengths,
+        oversample=config_data['oversample'],
+        balance_strategy=config_data['balance_strategy'],
+    )
+    data_loader.setup()
+    # Logger
+    wandb_logger = WandbLogger(
+        entity=config_data['wandb']['entity'],
+        project=config_data['wandb']['project'],
+        job_type=config_data['wandb']['job_type'],
+        tags=config_data['wandb']['tags'],
+        name=config_data['wandb']['run_name'],
+        notes=config_data['wandb']['notes'],
+        config=config_data
+    )
+    # Callbacks
+    # Pick up to ~4 evenly spaced validation samples for prediction plots.
+    total_n_valid = len(data_loader.val_ds)
+    plot_samples = [data_loader.val_ds[i] for i in range(0, total_n_valid, max(1, total_n_valid // 4))]
+    # NOTE(review): sxr_plot_callback is built but not passed to the Trainer below - confirm whether it should be.
+    sxr_plot_callback = ImagePredictionLogger_SXR(plot_samples, sxr_norm)
+    patch_size = config_data.get('vit_architecture', {}).get('patch_size', 16)
+    attention = AttentionMapCallback(patch_size=patch_size, use_local_attention=True)
+    # Class weights are computed unless 'calculate_base_weights' is explicitly disabled.
+    base_weights = get_base_weights(data_loader, sxr_norm) if config_data.get('calculate_base_weights', True) else None
+    model = ViTLocal(model_kwargs=config_data['vit_architecture'], sxr_norm=sxr_norm, base_weights=base_weights)
+    # Checkpoint callbacks
+    checkpoint_callback = ModelCheckpoint(
+        dirpath=config_data['data']['checkpoints_dir'],
+        monitor='val_total_loss',
+        mode='min',
+        save_top_k=10,
+        filename=f"{config_data['wandb']['run_name']}-{{epoch:02d}}-{{val_total_loss:.4f}}"
+    )
+    # NOTE(review): pth_callback is built but not passed to the Trainer below - confirm whether it should be.
+    pth_callback = PTHCheckpointCallback(
+        dirpath=config_data['data']['checkpoints_dir'],
+        monitor='val_total_loss',
+        mode='min',
+        save_top_k=1,
+        filename_prefix=config_data['wandb']['run_name']
+    )
+    # Set device based on config
+    # 'gpu_ids' takes precedence; the older 'gpu_id' key is still honoured. Accepted values:
+    # -1 (CPU), "all" (every visible GPU), a list of GPU ids, or a single GPU id.
+    gpu_config = config_data.get('gpu_ids', config_data.get('gpu_id', 0))
+    if gpu_config == -1:
+        accelerator, devices, strategy = "cpu", 1, "auto"
+        print("Using CPU for training")
+    elif gpu_config == "all":
+        if torch.cuda.is_available():
+            accelerator, devices, strategy = "gpu", -1, "auto"
+            num_gpus = torch.cuda.device_count()
+            print(f"Using all available GPUs ({num_gpus} GPUs)")
+        else:
+            accelerator, devices, strategy = "cpu", 1, "auto"
+            print("No GPUs available, falling back to CPU")
+    elif isinstance(gpu_config, list):
+        if torch.cuda.is_available():
+            accelerator, devices, strategy = "gpu", gpu_config, "auto"
+            print(f"Using GPUs: {gpu_config}")
+        else:
+            accelerator, devices, strategy = "cpu", 1, "auto"
+            print("No GPUs available, falling back to CPU")
     else:
+        if torch.cuda.is_available():
+            accelerator, devices, strategy = "gpu", [gpu_config], "auto"
+            print(f"Using GPU {gpu_config}")
+        else:
+            accelerator, devices, strategy = "cpu", 1, "auto"
+            print(f"GPU {gpu_config} not available, falling back to CPU")
+    # Trainer
+    trainer = Trainer(
+        default_root_dir=config_data['data']['checkpoints_dir'],
+        accelerator=accelerator,
+        devices=devices,
+        strategy=strategy,
+        max_epochs=config_data['epochs'],
+        callbacks=[attention, checkpoint_callback],
+        logger=wandb_logger,
+        log_every_n_steps=10,
+    )
+    trainer.fit(model, data_loader)
+    # Save final checkpoint (both the model object and its state_dict, timestamped)
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    final_checkpoint_path = os.path.join(
+        config_data['data']['checkpoints_dir'],
+        f"{config_data['wandb']['run_name']}-final-{timestamp}.pth"
+    )
+    torch.save({'model': model, 'state_dict': model.state_dict()}, final_checkpoint_path)
+    print(f"Saved final PyTorch checkpoint: {final_checkpoint_path}")
+    wandb.finish()

forecasting/training/train_config.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 #Base directories - change these to switch datasets
-base_data_dir: "/Volumes/T9/FOXES_Data"  # Change this line for different datasets
-base_checkpoint_dir: "/Volumes/T9/FOXES_Data"    # Change this line for different datasets
 wavelengths: [94, 131, 171, 193, 211, 304, 335]  # AIA wavelengths in Angstroms
 # GPU configuration
@@ -35,11 +35,11 @@ vit_architecture:
 # Data paths (automatically constructed from base directories)
 data:
   aia_dir:
-    "${base_data_dir}/AIA"
   sxr_dir:
-    "${base_data_dir}/SXR"
   sxr_norm_path:
-    "${base_data_dir}/SXR/normalized_sxr.npy"
   checkpoints_dir:
     "${base_checkpoint_dir}/new-checkpoint/"

 #Base directories - change these to switch datasets
+base_data_dir: "/Volumes/T9/Data_FOXES"  # Change this line for different datasets
+base_checkpoint_dir: "/Volumes/T9/Data_FOXES"    # Change this line for different datasets
 wavelengths: [94, 131, 171, 193, 211, 304, 335]  # AIA wavelengths in Angstroms
 # GPU configuration
 # Data paths (automatically constructed from base directories)
 data:
   aia_dir:
+    "${base_data_dir}/AIA_processed"
   sxr_dir:
+    "${base_data_dir}/SXR_processed"
   sxr_norm_path:
+    "${base_data_dir}/SXR_processed/normalized_sxr.npy"
   checkpoints_dir:
     "${base_checkpoint_dir}/new-checkpoint/"

pipeline_config.yaml ADDED Viewed

	@@ -0,0 +1,87 @@

+# =============================================================================
+# FOXES Pipeline Configuration
+# =============================================================================
+# Used by run_pipeline.py to run any combination of pipeline steps.
+#
+# Usage:
+#   python run_pipeline.py --config pipeline_config.yaml --steps all
+#   python run_pipeline.py --config pipeline_config.yaml --steps train,inference,flare_analysis
+#   python run_pipeline.py --list
+# -----------------------------------------------------------------------------
+# Shared date range (used by download_aia and download_sxr)
+# -----------------------------------------------------------------------------
+start_date: "2014-07-01 00:00:00"
+end_date:   "2014-07-08 00:00:00"
+# -----------------------------------------------------------------------------
+# AIA download  (step: download_aia)
+# -----------------------------------------------------------------------------
+aia:
+  download_dir: "/Volumes/T9/Data_FOXES/AIA_raw"
+  email: ""        # Must be registered at http://jsoc.stanford.edu
+  cadence: 1                    # Minutes between frames
+# -----------------------------------------------------------------------------
+# SXR download  (step: download_sxr)
+# -----------------------------------------------------------------------------
+sxr:
+  save_dir: "/Volumes/T9/Data_FOXES/SXR_raw"
+# -----------------------------------------------------------------------------
+# Preprocessing  (step: preprocess)
+# -----------------------------------------------------------------------------
+preprocess:
+  config: "data/pipeline_config.yaml"   # PipelineConfig for process_data_pipeline.py
+# -----------------------------------------------------------------------------
+# SXR normalization  (step: normalize)
+# -----------------------------------------------------------------------------
+normalize:
+  sxr_dir:     "/Volumes/T9/Data_FOXES/SXR_processed/train"
+  output_path: "/Volumes/T9/Data_FOXES/SXR_processed/normalized_sxr.npy"
+# -----------------------------------------------------------------------------
+# Train/val/test split  (step: split)
+# Runs split_data.py once for AIA and once for SXR.
+# -----------------------------------------------------------------------------
+split:
+  aia_input_dir: "/Volumes/T9/Data_FOXES/AIA_processed"  # splits into AIA_processed/train|val|test
+  sxr_input_dir: "/Volumes/T9/Data_FOXES/SXR_processed"  # splits into SXR_processed/train|val|test
+  train_start: "2014-07-01"
+  train_end:   "2014-07-05"
+  val_start:   "2014-07-06"
+  val_end:     "2014-07-07"
+  test_start:  "2014-07-08"
+  test_end:    "2025-12-31"
+# -----------------------------------------------------------------------------
+# Training  (step: train)
+# -----------------------------------------------------------------------------
+train:
+  config: "forecasting/training/train_config.yaml"
+  overrides:                          # Any key from train_config.yaml can go here
+    base_data_dir: "/Volumes/T9/Data_FOXES"
+    base_checkpoint_dir: "/Volumes/T9/Data_FOXES"
+    epochs: 150
+    batch_size: 6
+    wandb:
+      entity: jayantbiradar619-university-of-arizona # Use your exact W&B username
+      project: Paper
+      job_type: training
+      tags:
+        - aia
+        - sxr
+        - regression
+      # run_name must be defined only once per mapping: PyYAML silently keeps just the
+      # last value when a key is repeated.
+      run_name: paper-8-patch-4ch
+      notes: Regression from AIA images to SXR images using ViTLocal model with 8x8 patches
+# -----------------------------------------------------------------------------
+# Inference & flare analysis  (steps: inference, flare_analysis)
+# -----------------------------------------------------------------------------
+inference:
+  config: "forecasting/inference/local_config.yaml"
+  overrides:                          # Any key from local_config.yaml can go here
+    paths:
+      data_dir: "/Volumes/T9/Data_FOXES"

requirements.txt CHANGED Viewed

@@ -2,6 +2,7 @@
 sunpy[all]
 sunpy-soar
 astropy
 drms
 itipy

 sunpy[all]
 sunpy-soar
 astropy
+aiapy==0.6.4  # itipy 0.1.1 requires calibrate.util which was renamed in 0.7.0
 drms
 itipy

run_pipeline.py ADDED Viewed

	@@ -0,0 +1,364 @@

+#!/usr/bin/env python3
+"""
+FOXES End-to-End Pipeline Orchestrator
+Runs any combination of pipeline steps in order:
+  1. download_aia   - Download SDO/AIA EUV images from JSOC (download/download_sdo.py)
+  2. download_sxr   - Download GOES SXR flux data (download/sxr_downloader.py)
+  3. combine_sxr    - Combine raw GOES .nc files into per-satellite CSVs (data/sxr_data_processing.py)
+  4. preprocess     - EUV cleaning, ITI processing, data alignment (data/process_data_pipeline.py)
+  5. split          - Split AIA + SXR into train/val/test (data/split_data.py)
+  6. normalize      - Compute SXR normalization stats on train split (data/sxr_normalization.py)
+  7. train          - Train the ViTLocal forecasting model (forecasting/training/train.py)
+  8. inference      - Run batch inference on val/test data (forecasting/inference/inference.py)
+  9. flare_analysis - Detect, track, and match flares (forecasting/inference/flare_analysis.py)
+Usage:
+  python run_pipeline.py --list
+  python run_pipeline.py --config pipeline_config.yaml --steps all
+  python run_pipeline.py --config pipeline_config.yaml --steps train,inference,flare_analysis
+"""
+import argparse
+import logging
+import subprocess
+import sys
+import time
+from pathlib import Path
+import yaml
+# Directory containing this script; step subprocesses are launched with cwd=ROOT.
+ROOT = Path(__file__).parent
+# Send log records both to stdout and to ROOT/pipeline.log.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s  %(levelname)s  %(message)s",
+    handlers=[
+        logging.StreamHandler(sys.stdout),
+        logging.FileHandler(ROOT / "pipeline.log"),
+    ],
+)
+log = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Config helpers
+# ---------------------------------------------------------------------------
+def deep_merge(base: dict, overrides: dict) -> dict:
+    """
+    Fold ``overrides`` into ``base`` and return ``base`` (which is mutated in place).
+    Where both sides hold a dict under the same key the two are merged recursively;
+    in every other case the value from ``overrides`` replaces the one in ``base``.
+    """
+    for name in overrides:
+        incoming = overrides[name]
+        existing = base.get(name)
+        both_mappings = isinstance(incoming, dict) and isinstance(existing, dict)
+        if not both_mappings:
+            base[name] = incoming
+            continue
+        deep_merge(existing, incoming)
+    return base
+def write_merged_config(base_path: str, overrides: dict, out_name: str) -> Path:
+    """
+    Load a base YAML config, apply overrides, and write the merged result to
+    ROOT/.merged_{out_name}.yaml, leaving the base file untouched.
+    Parameters
+    ----------
+    base_path : str
+        Path of the base YAML file. A relative path that does not exist from the
+        current working directory is resolved against ROOT.
+    overrides : dict
+        Values merged over the base config with deep_merge (nested dicts merge key by key).
+    out_name : str
+        Name used to build the output filename.
+    Returns
+    -------
+    Path
+        Path of the merged file.
+    """
+    source = Path(base_path)
+    # Steps run with cwd=ROOT, so a config path used without overrides is resolved
+    # against ROOT by the subprocess; fall back to ROOT here so both cases agree.
+    if not source.is_absolute() and not source.exists():
+        source = ROOT / source
+    with open(source, encoding="utf-8") as f:
+        # An empty YAML file parses to None; start from an empty mapping instead.
+        base = yaml.safe_load(f) or {}
+    deep_merge(base, overrides)
+    out = ROOT / f".merged_{out_name}.yaml"
+    with open(out, "w", encoding="utf-8") as f:
+        yaml.dump(base, f, default_flow_style=False)
+    log.info(f"  Merged config written to {out}")
+    return out
+# ---------------------------------------------------------------------------
+# Step definitions
+# ---------------------------------------------------------------------------
+# Canonical execution order: main() runs the selected steps in this order,
+# regardless of the order in which they are listed on the command line.
+STEP_ORDER = [
+    "download_aia",
+    "download_sxr",
+    "combine_sxr",
+    "preprocess",
+    "split",
+    "normalize",
+    "train",
+    "inference",
+    "flare_analysis",
+]
+# Per-step metadata: a human-readable description and the script implementing the step.
+# The key order of this dict is not significant; STEP_ORDER defines the run order.
+STEP_INFO = {
+    "download_aia": {
+        "description": "Download SDO/AIA EUV images from JSOC",
+        "script": ROOT / "download" / "download_sdo.py",
+    },
+    "download_sxr": {
+        "description": "Download GOES SXR flux data via SXRDownloader",
+        "script": None,  # invoked inline via python -c
+    },
+    "combine_sxr": {
+        "description": "Combine raw GOES .nc files into per-satellite CSVs for alignment",
+        "script": ROOT / "data" / "sxr_data_processing.py",
+    },
+    "preprocess": {
+        "description": "EUV cleaning, ITI processing, and AIA/SXR data alignment",
+        "script": ROOT / "data" / "process_data_pipeline.py",
+    },
+    "normalize": {
+        "description": "Compute SXR log-normalization statistics (mean/std)",
+        "script": ROOT / "data" / "sxr_normalization.py",
+    },
+    "split": {
+        "description": "Split AIA and SXR data into train/val/test by date range",
+        "script": ROOT / "data" / "split_data.py",
+    },
+    "train": {
+        "description": "Train the ViTLocal solar flare forecasting model",
+        "script": ROOT / "forecasting" / "training" / "train.py",
+    },
+    "inference": {
+        "description": "Run batch inference and save predictions CSV",
+        "script": ROOT / "forecasting" / "inference" / "inference.py",
+    },
+    "flare_analysis": {
+        "description": "Detect, track, and match flares; generate plots/movies",
+        "script": ROOT / "forecasting" / "inference" / "flare_analysis.py",
+    },
+}
+# ---------------------------------------------------------------------------
+# Command builders
+# ---------------------------------------------------------------------------
+def build_commands(step: str, cfg: dict, force: bool) -> list[list[str]] | None:
+    """
+    Build the subprocess command(s) for one pipeline step.
+    Parameters
+    ----------
+    step : str
+        Step name; must be a key of STEP_INFO.
+    cfg : dict
+        Parsed pipeline configuration.
+    force : bool
+        When True, ``--force`` is appended to the preprocess command; other steps ignore it.
+    Returns
+    -------
+    list[list[str]] | None
+        Commands to run in order, or None if required config keys are missing
+        (the missing keys are logged). Most steps return a single command;
+        'split' returns two (AIA then SXR).
+    """
+    def section_of(name: str) -> dict:
+        # A section written as a bare `name:` in YAML parses to None; treat it as empty.
+        return cfg.get(name) or {}
+    def require(keys: list[str], section: str | None = None) -> bool:
+        # True when every key is present and non-empty; otherwise log what is missing.
+        src = section_of(section) if section else cfg
+        missing = [k for k in keys if not src.get(k)]
+        if missing:
+            prefix = f"{section}." if section else ""
+            log.error(f"pipeline_config.yaml missing required keys: {[prefix + k for k in missing]}")
+            return False
+        return True
+    def config_step(section: str, out_name: str, flag: str) -> list[list[str]] | None:
+        # Shared by train/inference/flare_analysis: pass the section's config file to the
+        # step script, or a merged copy of it when the section defines overrides.
+        if not require(["config"], section):
+            return None
+        sec = cfg[section]
+        config_path = sec["config"]
+        if sec.get("overrides"):
+            config_path = str(write_merged_config(config_path, sec["overrides"], out_name))
+        return [[sys.executable, str(STEP_INFO[step]["script"]), flag, str(config_path)]]
+    # str() is applied to config values below because unquoted YAML scalars (dates,
+    # numbers) parse to non-string objects, while argv entries must be strings.
+    if step == "download_aia":
+        if not require(["download_dir", "email"], "aia") or not require(["start_date"]):
+            return None
+        aia = cfg["aia"]
+        cmd = [sys.executable, str(STEP_INFO[step]["script"]),
+               "--download_dir", str(aia["download_dir"]),
+               "--email",        str(aia["email"]),
+               "--start_date",   str(cfg["start_date"])]
+        if cfg.get("end_date"):
+            cmd += ["--end_date", str(cfg["end_date"])]
+        if aia.get("cadence"):
+            cmd += ["--cadence", str(aia["cadence"])]
+        return [cmd]
+    if step == "download_sxr":
+        if not require(["save_dir"], "sxr") or not require(["start_date"]):
+            return None
+        start = cfg["start_date"]
+        # Fall back to start when end_date is absent or empty.
+        end = cfg.get("end_date") or start
+        # Values travel as argv instead of being interpolated into the source, so quotes
+        # or backslashes in paths cannot break (or inject code into) the inline program.
+        inline = (
+            "import sys; sys.path.insert(0, sys.argv[1]); "
+            "from download.sxr_downloader import SXRDownloader; "
+            "d = SXRDownloader(save_dir=sys.argv[2]); "
+            "d.download_and_save_goes_data(start=sys.argv[3], end=sys.argv[4])"
+        )
+        return [[sys.executable, "-c", inline,
+                 str(ROOT), str(cfg["sxr"]["save_dir"]), str(start), str(end)]]
+    if step == "combine_sxr":
+        if not require(["save_dir"], "sxr"):
+            return None
+        raw_dir = str(cfg["sxr"]["save_dir"])
+        combined_dir = str(Path(raw_dir) / "combined")
+        return [[sys.executable, str(STEP_INFO[step]["script"]),
+                 "--data_dir", raw_dir,
+                 "--output_dir", combined_dir]]
+    script = STEP_INFO[step]["script"]
+    base = [sys.executable, str(script)]
+    if step == "preprocess":
+        pre = section_of("preprocess")
+        cmd = base[:]
+        if pre.get("config"):
+            cmd += ["--config", str(pre["config"])]
+        if force:
+            cmd += ["--force"]
+        return [cmd]
+    if step == "normalize":
+        if not require(["sxr_dir", "output_path"], "normalize"):
+            return None
+        n = cfg["normalize"]
+        return [base + ["--sxr_dir", str(n["sxr_dir"]), "--output_path", str(n["output_path"])]]
+    if step == "split":
+        if not require(["aia_input_dir", "sxr_input_dir"], "split"):
+            return None
+        s = cfg["split"]
+        date_args = []
+        for key in ("train_start", "train_end", "val_start", "val_end", "test_start", "test_end"):
+            if s.get(key):
+                date_args += [f"--{key}", str(s[key])]
+        # Each data type splits into its own input directory (creates train/val/test subdirs there)
+        aia_dir, sxr_dir = str(s["aia_input_dir"]), str(s["sxr_input_dir"])
+        aia_cmd = base + ["--input_folder", aia_dir, "--output_dir", aia_dir,
+                          "--data_type", "aia"] + date_args
+        sxr_cmd = base + ["--input_folder", sxr_dir, "--output_dir", sxr_dir,
+                          "--data_type", "sxr"] + date_args
+        return [aia_cmd, sxr_cmd]
+    if step == "train":
+        return config_step("train", "train_config", "-config")
+    if step == "inference":
+        return config_step("inference", "inference_config", "-config")
+    if step == "flare_analysis":
+        # flare_analysis reuses the inference section and takes its config via --config.
+        return config_step("inference", "inference_config", "--config")
+    return [base]
+# ---------------------------------------------------------------------------
+# Runner
+# ---------------------------------------------------------------------------
+def run_step(step: str, cmds: list[list[str]]) -> bool:
+    """
+    Run every command of a step sequentially from ROOT, logging a banner before each.
+    Returns False as soon as a command exits with a non-zero code, and True (after
+    logging the total elapsed time) once all commands have succeeded.
+    """
+    info = STEP_INFO[step]
+    banner = "=" * 70
+    n_cmds = len(cmds)
+    started = time.time()
+    for position, cmd in enumerate(cmds, start=1):
+        # Only multi-command steps get an "(i/n)" suffix.
+        label = step.upper()
+        if n_cmds > 1:
+            label += f" ({position}/{n_cmds})"
+        header = (
+            "",
+            banner,
+            f"  STEP: {label}",
+            f"  {info['description']}",
+            f"  {' '.join(str(c) for c in cmd)}",
+            banner,
+        )
+        for line in header:
+            log.info(line)
+        completed = subprocess.run(cmd, cwd=ROOT)
+        if completed.returncode != 0:
+            log.error(f"  FAILED  {label} exited with code {completed.returncode}")
+            return False
+    duration = time.time() - started
+    log.info(f"  DONE  {step} completed in {duration:.1f}s")
+    return True
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+def list_steps():
+    """Print the numbered pipeline steps with their descriptions, followed by usage hints."""
+    lines = ["\nAvailable pipeline steps (in order):\n"]
+    lines.extend(
+        f"  {number}. {name:<16} {STEP_INFO[name]['description']}"
+        for number, name in enumerate(STEP_ORDER, start=1)
+    )
+    lines.append("")
+    lines.append("Use --steps all to run every step, or comma-separate specific steps.")
+    lines.append("Example: --steps train,inference,flare_analysis\n")
+    print("\n".join(lines))
+def main():
+    """
+    Parse CLI arguments, run the selected pipeline steps in STEP_ORDER order, and exit.
+    Execution stops at the first step that fails or whose required config is missing.
+    A summary of passed/failed/skipped steps is logged, and the process exits with
+    status 0 when nothing failed and 1 otherwise.
+    """
+    parser = argparse.ArgumentParser(
+        description="FOXES End-to-End Pipeline Orchestrator",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument("--config", type=str, default=None, help="Path to pipeline_config.yaml")
+    parser.add_argument("--steps",  type=str, default=None,
+                        help=f"Comma-separated steps to run, or 'all'. Available: {', '.join(STEP_ORDER)}")
+    parser.add_argument("--list",  action="store_true", help="List all available steps and exit")
+    parser.add_argument("--force", action="store_true", help="Force re-run (forwarded to preprocess step)")
+    args = parser.parse_args()
+    if args.list:
+        list_steps()
+        return
+    if not args.steps:
+        parser.print_help()
+        return
+    if not args.config:
+        log.error("--config is required. Point it at your pipeline_config.yaml.")
+        sys.exit(1)
+    try:
+        with open(args.config, "r") as f:
+            # An empty YAML file parses to None; use an empty mapping so build_commands
+            # can report the missing keys instead of crashing on None.
+            cfg = yaml.safe_load(f) or {}
+    except OSError as exc:
+        log.error(f"Could not read config '{args.config}': {exc}")
+        sys.exit(1)
+    if not isinstance(cfg, dict):
+        log.error(f"Config '{args.config}' must contain a YAML mapping at the top level.")
+        sys.exit(1)
+    # Resolve step list
+    if args.steps.strip().lower() == "all":
+        selected = list(STEP_ORDER)
+    else:
+        selected = [s.strip() for s in args.steps.split(",")]
+        unknown = [s for s in selected if s not in STEP_INFO]
+        if unknown:
+            log.error(f"Unknown steps: {', '.join(unknown)}")
+            list_steps()
+            sys.exit(1)
+        selected = [s for s in STEP_ORDER if s in selected]  # preserve order
+    log.info(f"Config: {args.config}")
+    log.info(f"Running {len(selected)} step(s): {' -> '.join(selected)}")
+    passed, failed = [], []
+    for step in selected:
+        cmds = build_commands(step, cfg, args.force)
+        if cmds is None:
+            # Required config missing (already logged by build_commands): stop here.
+            failed.append(step)
+            break
+        if run_step(step, cmds):
+            passed.append(step)
+        else:
+            failed.append(step)
+            log.error(f"Pipeline stopped at '{step}'.")
+            break
+    # Summary
+    log.info("")
+    log.info("=" * 70)
+    log.info("PIPELINE SUMMARY")
+    log.info("=" * 70)
+    for s in passed:
+        log.info(f"  PASSED   {s}")
+    for s in failed:
+        log.error(f"  FAILED   {s}")
+    # Steps after the first failure were never attempted.
+    for s in [s for s in selected if s not in passed and s not in failed]:
+        log.info(f"  SKIPPED  {s}")
+    log.info("=" * 70)
+    sys.exit(0 if not failed else 1)
+if __name__ == "__main__":
+    main()