File size: 6,755 Bytes
974cfa5
 
 
 
 
 
 
 
168bc75
f34a7ac
974cfa5
 
 
 
f34a7ac
974cfa5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f34a7ac
974cfa5
f34a7ac
 
 
 
 
 
 
974cfa5
 
f34a7ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168bc75
f34a7ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168bc75
f34a7ac
168bc75
f34a7ac
 
 
 
974cfa5
f34a7ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
974cfa5
168bc75
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
import requests
from bs4 import BeautifulSoup
import os
from urllib.request import urlretrieve
import tarfile
import glob
import pandas as pd
from pandas.errors import EmptyDataError, ParserError
import sys
import concurrent.futures

# URL of the archive directory
BASE_URL = 'https://www.ncei.noaa.gov/data/ghcnm/v4/precipitation/archive/'

# Global definitions for workers: kept at module level so that
# ProcessPoolExecutor workers inherit them without per-task pickling.

# Fixed-width spans (0-based start, end-exclusive) of the GHCN-M v4
# precipitation record layout; the trailing comments give the 1-based
# column numbers from the format specification.
colspecs = [
    (0, 11),    # 1. GHCN identifier (columns 1-11)
    (12, 52),   # 2. Station name (columns 13-52)
    (53, 62),   # 3. Latitude (columns 54-62)
    (63, 73),   # 4. Longitude (columns 64-73)
    (74, 82),   # 5. Elevation (meters) (columns 75-82)
    (83, 89),   # 6. Year and month (columns 84-89)
    (90, 96),   # 7. Precipitation value (columns 91-96)
    (97, 98),   # 8. Measurement flag (column 98)
    (99, 100),  # 9. Quality control flag (column 100)
    (101, 102), # 10. Source flag (column 102)
    (103, 109), # 11. Source index (columns 104-109)
]

column_names = [
    "ghcn_id",
    "station_name",
    "latitude",
    "longitude",
    "elevation_m",
    "year_month",
    "precip_tenths_mm",
    "measurement_flag",
    "quality_flag",
    "source_flag",
    "source_index"
]

# Read precip and year_month as strings first; both are converted to
# numeric with errors='coerce' inside process_file.
dtypes = {
    "ghcn_id": "string",
    "station_name": "string",
    "latitude": "float32",
    "longitude": "float32",
    "elevation_m": "float32",
    "year_month": "string",
    "precip_tenths_mm": "string",
    "measurement_flag": "string",
    "quality_flag": "string",
    "source_flag": "string",
    "source_index": "string"
}

def process_file(file):
    """Parse one GHCN-M fixed-width station file into a cleaned DataFrame.

    Splits ``year_month`` into nullable-integer ``year``/``month`` columns,
    converts precipitation from tenths of a millimeter into a float
    ``precip_mm`` column, maps trace values (-1) to 0, and drops rows whose
    quality-control flag is one of the rejection codes.

    Args:
        file: Path to a fixed-width station file matching ``colspecs``.

    Returns:
        The cleaned DataFrame, or None when the file should be skipped:
        a read/parse error, fewer than 120 usable rows (less than 10
        years of monthly data), or any monthly value above 2000 mm
        (treated as a data error).
    """
    try:
        # No per-file logging here — far too noisy under multiprocessing.
        df = pd.read_fwf(
            file,
            colspecs=colspecs,
            names=column_names,
            dtype=dtypes,  # Apply defined data types
            header=None    # Important: the data files carry no header row
        )

        # 1. Parse 'year_month' (e.g. "202001") into 'year' and 'month'
        df['year'] = pd.to_numeric(df['year_month'].str[:4], errors='coerce').astype('Int16')
        df['month'] = pd.to_numeric(df['year_month'].str[4:], errors='coerce').astype('Int8')
        df.drop('year_month', axis=1, inplace=True)  # Drop the original column

        # Convert precip_tenths_mm to numeric and drop rows that fail conversion
        df['precip_tenths_mm'] = pd.to_numeric(df['precip_tenths_mm'], errors='coerce')
        df = df.dropna(subset=['precip_tenths_mm'])
        df['precip_tenths_mm'] = df['precip_tenths_mm'].astype('Int32')

        # 2. Trace precipitation is coded as -1; treat it as 0
        df['precip_tenths_mm'] = df['precip_tenths_mm'].replace(-1, 0)

        # 3. Precipitation in millimeters (float, one decimal place)
        df['precip_mm'] = (df['precip_tenths_mm'].astype('Float32') / 10.0).round(1)

        # Drop rows whose QC flag is a rejection code. isin() yields
        # False for <NA>, so rows without a flag are kept.
        bad_flags = ['O', 'R', 'T', 'S', 'K']
        mask = df['quality_flag'].isin(bad_flags).fillna(False)
        df = df[~mask]

        # Skip stations with fewer than 120 monthly records (10 years)
        if len(df) < 120:
            return None
        # 4. Skip the file if any monthly value exceeds 2000 mm —
        # such extremes are assumed to be data errors
        if (df['precip_mm'] > 2000).any():
            return None

        return df
    except Exception as e:
        print(f"Error processing {file}: {e}")
        return None

def main():
    """Download, extract, and process the latest GHCN-M v4 precipitation archive.

    Steps:
      1. Scrape the NOAA archive index for the first .tar.gz bundle,
         download it into misc/data/ (skipped if already present), and
         extract it into misc/data/pp/ (skipped if already populated).
      2. Parse every extracted file in parallel via process_file().
      3. Save the combined table as www/data/tabs/prec_long.parquet plus a
         per-station first/last-year summary as prec_availability.csv.
    """
    # Create a session
    session = requests.Session()

    # Best-effort download/extract: any failure here falls through to
    # processing whatever data is already on disk.
    try:
        # Get the HTML content of the archive page.
        # Timeout so a stalled connection cannot hang the pipeline.
        response = session.get(BASE_URL, timeout=60)
        response.raise_for_status()  # Raise an error for bad status

        # Parse HTML with BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all .tar.gz files linked from the index page
        tar_links = [a['href'] for a in soup.find_all('a', href=True)
                     if a['href'].endswith('.tar.gz')]

        if tar_links:
            file_url = BASE_URL + tar_links[0]
            os.makedirs('misc/data', exist_ok=True)  # ensure download dir exists
            filename = os.path.join('misc/data', tar_links[0])

            if os.path.exists(filename):
                print(f"File {filename} already exists. Skipping download.")
            else:
                print(f"Downloading {filename}...")
                urlretrieve(file_url, filename)

            if not os.path.exists('misc/data/pp') or not os.listdir('misc/data/pp'):
                print("Extracting files...")
                # NOTE(review): extractall on a downloaded archive can write
                # outside the target dir via crafted paths; consider
                # filter='data' (Python 3.12+) if the source is not trusted.
                with tarfile.open(filename, 'r:gz') as tar:
                    tar.extractall(path='misc/data/pp')
            else:
                print("Files already extracted. Skipping extraction.")
    except Exception as e:
        print(f"Warning: Could not check/download updates: {e}")

    # Collect the extracted per-station files
    csv_path = 'misc/data/pp/*.csv'
    csv_files = glob.glob(csv_path)

    print(f"Found {len(csv_files)} files. Starting processing with multiprocessing...")

    # Process files in parallel
    dataframes = []
    # Use 75% of CPUs to avoid freezing system completely
    max_workers = max(1, int(os.cpu_count() * 0.75))

    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
        # map() yields results in input order
        for i, df in enumerate(executor.map(process_file, csv_files)):
            if i % 1000 == 0:
                print(f"Processed {i}/{len(csv_files)} files...")
            if df is not None:
                dataframes.append(df)

    if not dataframes:
        print("No valid dataframes found.")
        return

    print("Concatenating dataframes...")
    combined_df = pd.concat(dataframes, ignore_index=True)

    # Rename columns to match TAVG schema
    combined_df = combined_df.rename(columns={
        'ghcn_id': 'ID',
        'year': 'YEAR',
        'month': 'MONTH',
        'precip_mm': 'VALUE'
    })

    # Server.R uses: ID, YEAR, MONTH, VALUE. Ensure VALUE is float.
    combined_df['VALUE'] = combined_df['VALUE'].astype('float32')

    print("Saving to parquet...")
    os.makedirs('www/data/tabs', exist_ok=True)  # ensure output dir exists
    # Save the DataFrame as a Parquet file
    combined_df.to_parquet('www/data/tabs/prec_long.parquet', engine='pyarrow', index=False)

    print(combined_df.describe())

    # Per-station availability: first and last year with any data
    print("Calculating availability...")
    station_summary = combined_df.groupby('ID')['YEAR'].agg(
        first_year='min', last_year='max').reset_index()
    station_summary.to_csv("www/data/tabs/prec_availability.csv", index=False)
    print("Done!")

if __name__ == '__main__':
    main()