import argparse
import logging
from pathlib import Path
from typing import List, Tuple
import numpy as np
import pandas as pd
import xarray as xr
from tqdm import tqdm
class SXRDataProcessor:
"""Class to process GOES X-ray data, including downloading, combining, and interpolating data from multiple satellites.
This class handles the downloading of GOES data, combining data from different satellites, and applying interpolation
in log space to the X-ray flux data.
It also tracks which files were used in the processing.
Parameters
----------
save_dir : str
Directory where downloaded GOES data will be saved.
concat_dir : str
Directory where combined GOES data will be saved.
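
    Examples
    --------
    A minimal usage sketch (the paths are illustrative)::

        processor = SXRDataProcessor(data_dir='/path/to/goes_nc',
                                     output_dir='/path/to/combined')
        processor.combine_goes_data()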
"""
def __init__(self, data_dir: str = '/mnt/data/PAPER/GOES-timespan', output_dir: str = '/mnt/data/PAPER/GOES-timespan/combined'):
self.data_dir = Path(data_dir)
self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
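        # Track which source files actually contributed to each satellite's combined output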
self.used_g13_files = []
self.used_g14_files = []
self.used_g15_files = []
self.used_g16_files = []
self.used_g17_files = []
self.used_g18_files = []
def combine_goes_data(self, columns_to_interp=["xrsb_flux", "xrsa_flux"]):
"""
Combine GOES-16 and GOES-18 files and track source files used.
Parameters
"""
print("🔍 Scanning for GOES data files...")
g13_files = sorted(self.data_dir.glob("*g13*.nc"))
g14_files = sorted(self.data_dir.glob("*g14*.nc"))
g15_files = sorted(self.data_dir.glob("*g15*.nc"))
g16_files = sorted(self.data_dir.glob("*g16*.nc"))
g17_files = sorted(self.data_dir.glob("*g17*.nc"))
g18_files = sorted(self.data_dir.glob("*g18*.nc"))
total_files = len(g13_files) + len(g14_files) + len(g15_files) + len(g16_files) + len(g17_files) + len(g18_files)
logging.info(
f"Found {len(g13_files)} GOES-13 files, {len(g14_files)} GOES-14 files, {len(g15_files)} GOES-15 files, {len(g16_files)} GOES-16 files, {len(g17_files)} GOES-17 files, and {len(g18_files)} GOES-18 files.")
print(f"Total files found: {total_files}")
if total_files == 0:
print("No GOES data files found in the specified directory.")
return
def process_files(files, satellite_name, output_file, used_file_list):
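            """Load one satellite's files, concatenate them, apply scaling/filtering, interpolate in log space, and save a CSV."""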
datasets = []
combined_meta = {}
successful_files = 0
failed_files = 0
print(f"🛰️ Processing {satellite_name} ({len(files)} files)...")
# Progress bar for file loading
with tqdm(files, desc=f"Loading {satellite_name}", unit="file",
bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]') as pbar:
for file_path in pbar:
pbar.set_postfix_str(f"Loading {file_path.name}")
                    # Keep successfully opened datasets open; they are closed after
                    # concatenation below (closing them here would break xr.concat,
                    # which reads the files lazily).
                    try:
                        ds = xr.open_dataset(str(file_path))
                        datasets.append(ds)
                        used_file_list.append(file_path)  # Track file used
                        successful_files += 1
                        logging.info(f"Loaded {file_path.name}")
                    except Exception as e:
                        failed_files += 1
                        logging.error(f"Could not load {file_path.name}: {e}")
                        continue
if not datasets:
print(f"No valid datasets for {satellite_name}")
logging.warning(f"No valid datasets for {satellite_name}")
return
print(f"Processing {len(datasets)} datasets for {satellite_name}...")
try:
print(f"Concatenating datasets...")
combined_ds = xr.concat(datasets, dim='time').sortby('time')
# Scaling factors for GOES-13, GOES-14, and GOES-15
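                    # The archived GOES-13/14/15 XRS fluxes include SWPC scaling factors
                    # (0.85 for XRS-A, 0.7 for XRS-B); dividing them out gives true fluxes
                    # comparable with the unscaled GOES-16/17/18 products.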
if satellite_name in ['GOES-13', 'GOES-14', 'GOES-15']:
print(f"Applying scaling factors for {satellite_name}...")
                        combined_ds['xrsa_flux'] = combined_ds['xrsa_flux'] / 0.85
                        combined_ds['xrsb_flux'] = combined_ds['xrsb_flux'] / 0.7
print(f"Converting to DataFrame...")
df = combined_ds.to_dataframe().reset_index()
if 'quad_diode' in df.columns:
print(f"Filtering quad diode data...")
df = df[df['quad_diode'] == 0] # Filter out quad diode data
                    # Filter out samples where xrsb_flux has a quality flag > 0
                    print("Filtering out data where xrsb_flux has a quality flag of > 0...")
                    df = df[df['xrsb_flag'] == 0]
df['time'] = pd.to_datetime(df['time'])
df.set_index('time', inplace=True)
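                    # SXR flux spans several orders of magnitude, so gaps are filled by
                    # interpolating in log10 space rather than linearly.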
print(f"Applying log interpolation...")
                    # Zeros cannot be log-transformed, so replace them with NaN before taking log10
                    df_log = np.log10(df[columns_to_interp].replace(0, np.nan))
                    # Interpolate gaps in log space (time-weighted), filling at both ends
                    df_log_interp = df_log.interpolate(method="time", limit_direction="both")
                    # Back-transform to linear flux units
                    df[columns_to_interp] = 10 ** df_log_interp
# Add min and max dates to filename
min_date = df.index.min().strftime('%Y%m%d')
max_date = df.index.max().strftime('%Y%m%d')
filename = f"{str(output_file)}_{min_date}_{max_date}.csv"
print(f"Saving to {filename}...")
df.to_csv(filename, index=True)
print(f"Successfully processed {satellite_name}: {successful_files} files loaded, {failed_files} failed")
logging.info(f"Saved combined file: {output_file}")
except Exception as e:
print(f"Failed to process {satellite_name}: {e}")
logging.error(f"Failed to write {output_file}: {e}")
finally:
for ds in datasets:
ds.close()
# Create list of satellites to process
satellites_to_process = []
        if g13_files:
            satellites_to_process.append((g13_files, "GOES-13", self.output_dir / "combined_g13_avg1m", self.used_g13_files))
        if g14_files:
            satellites_to_process.append((g14_files, "GOES-14", self.output_dir / "combined_g14_avg1m", self.used_g14_files))
        if g15_files:
            satellites_to_process.append((g15_files, "GOES-15", self.output_dir / "combined_g15_avg1m", self.used_g15_files))
        if g16_files:
            satellites_to_process.append((g16_files, "GOES-16", self.output_dir / "combined_g16_avg1m", self.used_g16_files))
        if g17_files:
            satellites_to_process.append((g17_files, "GOES-17", self.output_dir / "combined_g17_avg1m", self.used_g17_files))
        if g18_files:
            satellites_to_process.append((g18_files, "GOES-18", self.output_dir / "combined_g18_avg1m", self.used_g18_files))
print(f"\nStarting processing of {len(satellites_to_process)} satellites...")
# Process each satellite with overall progress tracking
successful_satellites = 0
failed_satellites = 0
for i, (files, satellite_name, output_file, used_file_list) in enumerate(satellites_to_process, 1):
print(f"\n{'='*60}")
print(f"Processing satellite {i}/{len(satellites_to_process)}: {satellite_name}")
print(f"{'='*60}")
try:
process_files(files, satellite_name, output_file, used_file_list)
successful_satellites += 1
except Exception as e:
print(f"Failed to process {satellite_name}: {e}")
failed_satellites += 1
logging.error(f"Failed to process {satellite_name}: {e}")
# Print final summary
print(f"\n{'='*60}")
print(f"PROCESSING COMPLETE")
print(f"{'='*60}")
print(f"Successfully processed: {successful_satellites} satellites")
print(f"Failed: {failed_satellites} satellites")
print(f"Total files processed: {total_files}")
print(f"Output directory: {self.output_dir}")
# Print file usage statistics
total_used_files = (len(self.used_g13_files) + len(self.used_g14_files) +
len(self.used_g15_files) + len(self.used_g16_files) +
len(self.used_g17_files) + len(self.used_g18_files))
print(f"Files used in processing: {total_used_files}")
        if successful_satellites > 0:
            print("\nSXR data processing completed successfully!")
        else:
            print("\n⚠ No satellites were processed successfully.")
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Preprocess GOES X-ray data.')
parser.add_argument('--data_dir', type=str, default='/mnt/data/PAPER/GOES-timespan',
help='Directory where downloaded GOES data is stored.')
parser.add_argument('--output_dir', type=str, default='/mnt/data/PAPER/GOES-timespan/combined',
help='Directory where combined GOES data will be saved.')
args = parser.parse_args()
print("GOES SXR Data Processing Tool")
print("=" * 50)
print(f"Data directory: {args.data_dir}")
print(f"Output directory: {args.output_dir}")
print("=" * 50)
processor = SXRDataProcessor(data_dir=args.data_dir, output_dir=args.output_dir)
processor.combine_goes_data()
print("\nAll processing tasks completed.")