Spaces:
Running
Running
| import requests | |
| from bs4 import BeautifulSoup | |
| import os | |
| from urllib.request import urlretrieve | |
| import tarfile | |
| import glob | |
| import pandas as pd | |
| from pandas.errors import EmptyDataError, ParserError | |
| import sys | |
| # URL of the archive directory | |
| BASE_URL = 'https://www.ncei.noaa.gov/data/ghcnm/v4/precipitation/archive/' | |
| # Create a session | |
| session = requests.Session() | |
| # Get the HTML content of the archive page | |
| response = session.get(BASE_URL) | |
| response.raise_for_status() # Raise an error for bad status | |
| # Parse HTML with BeautifulSoup | |
| soup = BeautifulSoup(response.text, 'html.parser') | |
| # Find all .tar.gz files | |
| tar_links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.tar.gz')] | |
| file_url = BASE_URL + tar_links[0] | |
| filename = os.path.join('misc/data', tar_links[0]) | |
| urlretrieve(file_url, filename) | |
| with tarfile.open(filename, 'r:gz') as tar: | |
| tar.extractall(path='misc/data/pp') | |
| ### read csv file | |
| # https://www.ncei.noaa.gov/data/ghcnm/v4/precipitation/doc/ghcn-m_v4_prcp_readme.txt | |
| colspecs = [ | |
| (0, 11), # 1. GHCN identifier (columns 1-11) | |
| (12, 52), # 2. Station name (columns 13-52) | |
| (53, 62), # 3. Latitude (columns 54-62) | |
| (63, 73), # 4. Longitude (columns 64-73) | |
| (74, 82), # 5. Elevation (meters) (columns 75-82) | |
| (83, 89), # 6. Year and month (columns 84-89) | |
| (90, 96), # 7. Precipitation value (columns 91-96) | |
| (97, 98), # 8. Measurement flag (column 98) | |
| (99, 100), # 9. Quality control flag (column 100) | |
| (101, 102), # 10. Source flag (column 102) | |
| (103, 109), # 11. Source index (columns 104-109) | |
| ] | |
| column_names = [ | |
| "ghcn_id", | |
| "station_name", | |
| "latitude", | |
| "longitude", | |
| "elevation_m", | |
| "year_month", | |
| "precip_tenths_mm", | |
| "measurement_flag", | |
| "quality_flag", | |
| "source_flag", | |
| "source_index" | |
| ] | |
| dtypes = { | |
| "ghcn_id": "string", | |
| "station_name": "string", | |
| "latitude": "float32", # float32 often sufficient for lat/lon | |
| "longitude": "float32", | |
| "elevation_m": "float32", # Elevation might have missing values, float handles NaN | |
| "year_month": "string", # Read as string first | |
| "precip_tenths_mm": "Int32", # Use nullable integer type | |
| "measurement_flag": "string", # Or 'category' after loading if desired | |
| "quality_flag": "string", # Or 'category' | |
| "source_flag": "string", # Or 'category' | |
| "source_index": "string" # Assuming it might not always be purely numeric | |
| } | |
| # Path to CSV files | |
| csv_path = 'misc/data/pp/*.csv' | |
| # Get list of all CSV files | |
| csv_files = glob.glob(csv_path) | |
| # List to hold DataFrames | |
| dataframes = [] | |
| for i, file in enumerate(csv_files): | |
| print(f"Processing file {i+1}/{len(csv_files)}: {file}") | |
| df = pd.read_fwf( | |
| file, | |
| colspecs=colspecs, | |
| names=column_names, | |
| dtype=dtypes, # Apply defined data types | |
| header=None # Important: Ensure pandas doesn't look for a header row | |
| ) | |
| # 1. Parse 'year_month' into 'year' and 'month' columns | |
| df['year'] = pd.to_numeric(df['year_month'].str[:4], errors='coerce').astype('Int16') | |
| df['month'] = pd.to_numeric(df['year_month'].str[4:], errors='coerce').astype('Int8') | |
| df.drop('year_month', axis=1, inplace=True) # Drop the original column | |
| # 2. Handle Trace Precipitation (-1 becomes 0) | |
| df['precip_tenths_mm'] = df['precip_tenths_mm'].replace(-1, 0) | |
| # 3. Create precipitation column in millimeters (float) | |
| df['precip_mm'] = (df['precip_tenths_mm'].astype('Float32') / 10.0).round(1) | |
| # remove rows qith quality flags == O, or R, or T, or S, or K | |
| df = df[df['quality_flag'] != 'O'] | |
| df = df[df['quality_flag'] != 'R'] | |
| df = df[df['quality_flag'] != 'T'] | |
| df = df[df['quality_flag'] != 'S'] | |
| df = df[df['quality_flag'] != 'K'] | |
| # skip is number of rows is less than 120 | |
| if len(df) < 120: | |
| continue | |
| # 4. Skip this file if any 'precip_mm' value is greater than 5000 | |
| if (df['precip_mm'] > 2000).any(): | |
| #continue | |
| sys.exit(1) | |
| # 5. Append cleaned and filtered dataframe | |
| dataframes.append(df) | |
| combined_df = pd.concat(dataframes, ignore_index=True) | |
| # Save the DataFrame as a Parquet file | |
| combined_df.to_parquet('www/data/tabs/prec_long.parquet', engine='pyarrow', index=False) | |
| !rm -rf misc/data/* | |
| combined_df.describe() | |