DulieuKhihauToancau

Running

App Files Files Community

DulieuKhihauToancau / misc /Python /ghcnm_read_prec.py

alexdum

update pp filters

168bc75 10 months ago

raw

history blame contribute delete

4.33 kB

	import glob
	import os
	import shutil
	import sys
	import tarfile
	from urllib.request import urlretrieve

	import pandas as pd
	import requests
	from bs4 import BeautifulSoup
	from pandas.errors import EmptyDataError, ParserError




	# URL of the archive directory
	BASE_URL = 'https://www.ncei.noaa.gov/data/ghcnm/v4/precipitation/archive/'

	# Working directories: the tarball is saved into DATA_DIR and unpacked into
	# EXTRACT_DIR. Both are created up front so a fresh checkout does not fail
	# on the download or the extraction.
	DATA_DIR = 'misc/data'
	EXTRACT_DIR = 'misc/data/pp'
	os.makedirs(EXTRACT_DIR, exist_ok=True)  # also creates DATA_DIR

	# Create a session
	session = requests.Session()

	# Get the HTML content of the archive page; the timeout keeps the script
	# from hanging forever if the server stops responding.
	response = session.get(BASE_URL, timeout=60)
	response.raise_for_status()  # Raise an error for bad status

	# Parse HTML with BeautifulSoup
	soup = BeautifulSoup(response.text, 'html.parser')

	# Collect the href of every link that points at a .tar.gz file
	tar_links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.tar.gz')]
	if not tar_links:
	    raise RuntimeError(f'No .tar.gz archive is listed at {BASE_URL}')

	# Download the first archive in the listing
	file_url = BASE_URL + tar_links[0]
	filename = os.path.join(DATA_DIR, os.path.basename(tar_links[0]))
	urlretrieve(file_url, filename)

	# Unpack the archive. Where the interpreter provides extraction filters,
	# use the 'data' filter so members cannot be written outside EXTRACT_DIR.
	with tarfile.open(filename, 'r:gz') as tar:
	    if hasattr(tarfile, 'data_filter'):
	        tar.extractall(path=EXTRACT_DIR, filter='data')
	    else:
	        tar.extractall(path=EXTRACT_DIR)



	### Fixed-width record layout used to read the station files
	# https://www.ncei.noaa.gov/data/ghcnm/v4/precipitation/doc/ghcn-m_v4_prcp_readme.txt
	# One entry per field: (column name, start, stop, pandas dtype).
	# start/stop are the 0-based, end-exclusive offsets given to read_fwf; the
	# trailing comment on each entry lists the matching 1-based columns.
	_FIELD_LAYOUT = [
	    ("ghcn_id", 0, 11, "string"),             # 1. GHCN identifier (columns 1-11)
	    ("station_name", 12, 52, "string"),       # 2. Station name (columns 13-52)
	    ("latitude", 53, 62, "float32"),          # 3. Latitude (columns 54-62)
	    ("longitude", 63, 73, "float32"),         # 4. Longitude (columns 64-73)
	    ("elevation_m", 74, 82, "float32"),       # 5. Elevation in meters (columns 75-82); float allows NaN
	    ("year_month", 83, 89, "string"),         # 6. Year and month (columns 84-89); kept as text, split later
	    ("precip_tenths_mm", 90, 96, "Int32"),    # 7. Precipitation value (columns 91-96); nullable integer
	    ("measurement_flag", 97, 98, "string"),   # 8. Measurement flag (column 98)
	    ("quality_flag", 99, 100, "string"),      # 9. Quality control flag (column 100)
	    ("source_flag", 101, 102, "string"),      # 10. Source flag (column 102)
	    ("source_index", 103, 109, "string"),     # 11. Source index (columns 104-109); may not be purely numeric
	]

	# The three objects consumed by read_fwf, all derived from the table above
	colspecs = [(start, stop) for _, start, stop, _ in _FIELD_LAYOUT]
	column_names = [name for name, _, _, _ in _FIELD_LAYOUT]
	dtypes = {name: dtype for name, _, _, dtype in _FIELD_LAYOUT}
	# Path pattern of the extracted station files
	csv_path = 'misc/data/pp/*.csv'
	# Get list of all matching files
	csv_files = glob.glob(csv_path)

	# Rows carrying one of these quality-control flags are discarded
	REJECTED_QUALITY_FLAGS = ['O', 'R', 'T', 'S', 'K']
	# A file must keep at least this many rows after filtering
	MIN_ROWS = 120
	# A file containing any value above this many millimetres is left out
	MAX_PRECIP_MM = 2000

	# List to hold the cleaned per-file DataFrames
	dataframes = []

	for i, file in enumerate(csv_files):
	    print(f"Processing file {i+1}/{len(csv_files)}: {file}")

	    df = pd.read_fwf(
	        file,
	        colspecs=colspecs,
	        names=column_names,
	        dtype=dtypes,  # Apply defined data types
	        header=None,  # Important: ensure pandas doesn't look for a header row
	    )

	    # 1. Split 'year_month' into nullable integer 'year' and 'month' columns
	    df['year'] = pd.to_numeric(df['year_month'].str[:4], errors='coerce').astype('Int16')
	    df['month'] = pd.to_numeric(df['year_month'].str[4:], errors='coerce').astype('Int8')
	    df = df.drop(columns='year_month')  # the combined column is no longer needed

	    # 2. Handle trace precipitation (-1 becomes 0)
	    df['precip_tenths_mm'] = df['precip_tenths_mm'].replace(-1, 0)

	    # 3. Create precipitation column in millimeters (float)
	    df['precip_mm'] = (df['precip_tenths_mm'].astype('Float32') / 10.0).round(1)

	    # 4. Remove rows whose quality flag is O, R, T, S or K.
	    # isin() is used rather than chained `!=` tests: on a nullable string
	    # column `!=` evaluates to <NA> for a missing flag, and boolean indexing
	    # treats <NA> as False, so rows without any flag would be dropped too.
	    df = df[~df['quality_flag'].isin(REJECTED_QUALITY_FLAGS)]

	    # 5. Skip the file if fewer than MIN_ROWS rows are left
	    if len(df) < MIN_ROWS:
	        continue

	    # 6. Skip the file if any 'precip_mm' value exceeds MAX_PRECIP_MM.
	    # The file is reported and skipped so that one suspicious file does not
	    # silently terminate the whole run and discard everything read so far.
	    if (df['precip_mm'] > MAX_PRECIP_MM).any():
	        print(f"Skipping {file}: precipitation above {MAX_PRECIP_MM} mm")
	        continue

	    # 7. Keep the cleaned and filtered dataframe
	    dataframes.append(df)

	# pd.concat cannot combine an empty list; fail with a message that says why
	if not dataframes:
	    raise ValueError(f"No file matching {csv_path!r} passed the filters; nothing to combine")

	combined_df = pd.concat(dataframes, ignore_index=True)

	# Save the DataFrame as a Parquet file (create the target directory if needed)
	os.makedirs('www/data/tabs', exist_ok=True)
	combined_df.to_parquet('www/data/tabs/prec_long.parquet', engine='pyarrow', index=False)



	# Remove what the run left under misc/data. This replaces the IPython-only
	# `!rm -rf misc/data/*` line, which is not valid Python in a script. Like the
	# shell glob, 'misc/data/*' does not match dot-files, so those stay in place.
	for leftover in glob.glob('misc/data/*'):
	    if os.path.isdir(leftover) and not os.path.islink(leftover):
	        shutil.rmtree(leftover, ignore_errors=True)
	    else:
	        try:
	            os.remove(leftover)
	        except OSError as err:
	            # Best-effort cleanup: report the problem and carry on
	            print(f"Could not remove {leftover}: {err}")

	# Show summary statistics of the combined table (printed explicitly, because
	# a bare expression displays nothing when the file runs as a script)
	print(combined_df.describe())