FOXES / data /sxr_normalization.py
griffingoodwin04's picture
made changes to directory locations... removed some unnecessary files
b6d3638
import numpy as np
from pathlib import Path
import glob
import os
import argparse
def compute_sxr_norm(sxr_dir):
    """
    Compute the mean and standard deviation of log10-transformed SXR flux values.

    Scans ``sxr_dir`` for ``*.npy`` files, reads the first element of each
    file, discards values that cannot be log-transformed (non-finite, negative
    or zero), applies ``log10`` and returns the mean and standard deviation of
    the transformed values. These statistics are intended for normalizing
    Soft X-Ray (SXR) flux consistently during model training and inference.

    Parameters
    ----------
    sxr_dir : str or Path
        Path to the directory containing `.npy` SXR flux files.

    Returns
    -------
    tuple of (float, float)
        - mean : Mean of the log10-transformed SXR flux values.
        - std : Population standard deviation (``np.std`` default, ddof=0)
          of the log10-transformed SXR flux values.

    Raises
    ------
    FileNotFoundError
        If the specified SXR directory does not exist or is not a directory.
    ValueError
        If no `.npy` files are found, or if none of them yields a usable
        (finite, strictly positive) SXR value.

    Notes
    -----
    - Files are expected to contain scalar SXR flux values in W/m². If a file
      holds an array, only its first element (in flattened order) is used.
    - No epsilon offset is added before the logarithm. Zero-valued fluxes are
      skipped instead, because ``log10(0)`` is ``-inf`` and a single such
      value would turn the mean into ``-inf`` and the std into ``NaN``.
    - Files that fail to load are reported and skipped (best effort).
    - Filename matching follows the platform's rules (case-sensitive on
      POSIX), so e.g. ``*.NPY`` files are not picked up there.
    """
    sxr_dir = Path(sxr_dir).resolve()
    print(f"Checking SXR directory: {sxr_dir}")
    if not sxr_dir.is_dir():
        raise FileNotFoundError(f"SXR directory does not exist or is not a directory: {sxr_dir}")

    # Escape the directory part so glob metacharacters in the path itself
    # (e.g. "run[1]") are matched literally; sort for a deterministic order.
    pattern = os.path.join(glob.escape(str(sxr_dir)), "*.npy")
    sxr_files = sorted(glob.glob(pattern))
    print(f"Found {len(sxr_files)} SXR files in {sxr_dir}")
    if not sxr_files:
        print("No files matching '*.npy' found. Listing directory contents:")
        print(os.listdir(sxr_dir)[:10])  # Show first 10 entries as a debugging aid
        raise ValueError(f"No SXR files found in {sxr_dir}")

    log_values = []
    for f in sxr_files:
        # Broad catch is deliberate: one unreadable or malformed file must not
        # abort the scan of the whole directory.
        try:
            # Reduce scalar / 0-d / n-d content to its first element.
            sxr = np.atleast_1d(np.load(f)).flatten()[0]
            # Only strictly positive, finite values have a finite log10.
            if not np.isfinite(sxr) or sxr <= 0:
                print(f"Skipping invalid SXR value in {f}: {sxr}")
                continue
            log_values.append(np.log10(sxr))
        except Exception as e:
            print(f"Failed to load SXR file {f}: {e}")
            continue

    if not log_values:
        raise ValueError(f"No valid SXR values found in {sxr_dir}. All files failed to load or contained invalid data.")

    log_values = np.array(log_values)
    mean = np.mean(log_values)
    std = np.std(log_values)
    print(f"Computed SXR normalization: mean={mean}, std={std}")
    return mean, std
if __name__ == "__main__":
    # Command-line interface for calculating and saving SXR normalization parameters.
    #
    # When run as a script, this entry point:
    #   1. Computes the normalization statistics (mean and standard deviation of
    #      log10(SXR flux)) for all valid `.npy` SXR files in --sxr_dir.
    #   2. Saves the resulting (mean, std) pair as a NumPy `.npy` file at
    #      --output_path; np.save stores the tuple as a two-element array
    #      [mean, std].
    #
    # Usage:
    #   python sxr_normalization.py --sxr_dir /path/to/SXR/dir \
    #       --output_path /where/to/save/normalized_sxr.npy
    #
    # Both arguments default to the paper dataset example locations.
    parser = argparse.ArgumentParser(description='Compute and save SXR normalization statistics.')
    parser.add_argument('--sxr_dir', type=str, default='/mnt/data/PAPER/SXR/train', help='Path to the SXR data directory.')
    parser.add_argument('--output_path', type=str, default='/mnt/data/PAPER/SXR/normalized_sxr.npy', help='Path to save the SXR normalization statistics (mean, std).')
    args = parser.parse_args()

    # Make sure the destination directory exists before the (potentially long)
    # scan, so a missing or unwritable output location fails fast instead of
    # discarding the computed statistics at save time.
    Path(args.output_path).resolve().parent.mkdir(parents=True, exist_ok=True)

    sxr_norm = compute_sxr_norm(args.sxr_dir)
    np.save(args.output_path, sxr_norm)
    print(f"Saved SXR normalization to {args.output_path}")