"""Download, clean, and combine GHCN-M v4 monthly precipitation station files.

Pipeline:
 1. Scrape the NOAA archive index page for the first .tar.gz archive.
 2. Download and extract it under misc/data/pp.
 3. Parse every fixed-width station file, clean/filter it, and
    concatenate the survivors into www/data/tabs/prec_long.parquet.

Format reference:
https://www.ncei.noaa.gov/data/ghcnm/v4/precipitation/doc/ghcn-m_v4_prcp_readme.txt
"""

import glob
import os
import shutil
import sys
import tarfile
from urllib.request import urlretrieve

import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandas.errors import EmptyDataError, ParserError

# URL of the archive directory
BASE_URL = 'https://www.ncei.noaa.gov/data/ghcnm/v4/precipitation/archive/'

DATA_DIR = 'misc/data'
EXTRACT_DIR = 'misc/data/pp'
OUTPUT_PARQUET = 'www/data/tabs/prec_long.parquet'

# FIX: downloads/extraction fail if the target directories do not exist.
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(EXTRACT_DIR, exist_ok=True)

# Fetch and parse the archive index page.
session = requests.Session()
response = session.get(BASE_URL)
response.raise_for_status()  # Raise an error for bad status

soup = BeautifulSoup(response.text, 'html.parser')

# Collect all .tar.gz links; we only download the first one.
tar_links = [
    a['href'] for a in soup.find_all('a', href=True)
    if a['href'].endswith('.tar.gz')
]
if not tar_links:
    # FIX: guard against an empty link list (previously an IndexError).
    sys.exit(f'No .tar.gz archives found at {BASE_URL}')

file_url = BASE_URL + tar_links[0]
filename = os.path.join(DATA_DIR, tar_links[0])
urlretrieve(file_url, filename)

with tarfile.open(filename, 'r:gz') as tar:
    # NOTE(review): consider extractall(..., filter='data') on Python >= 3.12
    # to block path-traversal members in untrusted archives.
    tar.extractall(path=EXTRACT_DIR)

# Fixed-width column layout per the readme (0-based half-open spans).
colspecs = [
    (0, 11),     # 1.  GHCN identifier            (columns 1-11)
    (12, 52),    # 2.  Station name               (columns 13-52)
    (53, 62),    # 3.  Latitude                   (columns 54-62)
    (63, 73),    # 4.  Longitude                  (columns 64-73)
    (74, 82),    # 5.  Elevation (meters)         (columns 75-82)
    (83, 89),    # 6.  Year and month             (columns 84-89)
    (90, 96),    # 7.  Precipitation value        (columns 91-96)
    (97, 98),    # 8.  Measurement flag           (column 98)
    (99, 100),   # 9.  Quality control flag       (column 100)
    (101, 102),  # 10. Source flag                (column 102)
    (103, 109),  # 11. Source index               (columns 104-109)
]

column_names = [
    "ghcn_id",
    "station_name",
    "latitude",
    "longitude",
    "elevation_m",
    "year_month",
    "precip_tenths_mm",
    "measurement_flag",
    "quality_flag",
    "source_flag",
    "source_index",
]

dtypes = {
    "ghcn_id": "string",
    "station_name": "string",
    "latitude": "float32",          # float32 often sufficient for lat/lon
    "longitude": "float32",
    "elevation_m": "float32",       # elevation may be missing; float handles NaN
    "year_month": "string",         # read as string first, split below
    "precip_tenths_mm": "Int32",    # nullable integer type
    "measurement_flag": "string",   # or 'category' after loading if desired
    "quality_flag": "string",       # or 'category'
    "source_flag": "string",        # or 'category'
    "source_index": "string",       # assuming it might not always be purely numeric
}

# Quality-control flags whose rows must be discarded.
BAD_QUALITY_FLAGS = ['O', 'R', 'T', 'S', 'K']
# Minimum number of monthly records a station must have to be kept.
MIN_ROWS = 120
# Skip a station file entirely if any monthly total exceeds this (mm).
MAX_PRECIP_MM = 2000

csv_files = glob.glob(os.path.join(EXTRACT_DIR, '*.csv'))

dataframes = []
for i, file in enumerate(csv_files):
    print(f"Processing file {i+1}/{len(csv_files)}: {file}")

    df = pd.read_fwf(
        file,
        colspecs=colspecs,
        names=column_names,
        dtype=dtypes,   # apply defined data types
        header=None,    # the files carry no header row
    )

    # 1. Parse 'year_month' (YYYYMM) into separate 'year' and 'month' columns.
    df['year'] = pd.to_numeric(df['year_month'].str[:4], errors='coerce').astype('Int16')
    df['month'] = pd.to_numeric(df['year_month'].str[4:], errors='coerce').astype('Int8')
    df.drop('year_month', axis=1, inplace=True)

    # 2. Trace precipitation is coded as -1; treat it as zero.
    df['precip_tenths_mm'] = df['precip_tenths_mm'].replace(-1, 0)

    # 3. Precipitation in millimeters (float), rounded to one decimal.
    df['precip_mm'] = (df['precip_tenths_mm'].astype('Float32') / 10.0).round(1)

    # Remove rows with a bad quality flag. FIX: the previous chained
    # `df['quality_flag'] != 'O'` comparisons evaluate to <NA> for rows
    # whose flag is blank (i.e. data that PASSED QC), and pandas treats
    # <NA> as False in a boolean mask — silently dropping the good rows.
    # `~isin(...)` keeps blank-flag rows as intended.
    df = df[~df['quality_flag'].isin(BAD_QUALITY_FLAGS)]

    # 4. Skip stations with too few monthly records.
    if len(df) < MIN_ROWS:
        continue

    # 5. Skip this file if any 'precip_mm' value exceeds MAX_PRECIP_MM.
    # FIX: was sys.exit(1), which aborted the whole run on the first
    # outlier; the adjacent comment and the commented-out `continue`
    # show the intent was to skip the file.
    if (df['precip_mm'] > MAX_PRECIP_MM).any():
        continue

    # 6. Append the cleaned and filtered dataframe.
    dataframes.append(df)

if not dataframes:
    # FIX: pd.concat raises on an empty list; fail with a clear message.
    sys.exit('No station files passed quality filtering.')

combined_df = pd.concat(dataframes, ignore_index=True)

# Save the combined table as Parquet, creating the output directory first.
os.makedirs(os.path.dirname(OUTPUT_PARQUET), exist_ok=True)
combined_df.to_parquet(OUTPUT_PARQUET, engine='pyarrow', index=False)

# FIX: `!rm -rf misc/data/*` is IPython shell magic — a SyntaxError in
# plain Python. Use shutil to remove the scratch data instead.
shutil.rmtree(DATA_DIR, ignore_errors=True)

print(combined_df.describe())