Spaces:

alexdum
/

ghcnm

Running

App Files Files Community

ghcnm / misc /Python /ghcnm_read_tavg.py

alexdum

fix: Standardized tavg availability filename and updated precipitation data filtering logic.

4e54351 about 1 month ago

raw

history blame contribute delete

4.76 kB

	import pandas as pd
	import pyarrow

	def read_ghcnm_data(file_path):
	    """
	    Parse a GHCNM v4 fixed-width .dat file into a wide pandas DataFrame.

	    Every record holds an 11-character station ID, a 4-character year and a
	    4-character element code, followed by twelve monthly groups of 8
	    characters each: a 5-character value plus three 1-character flags
	    (DMFLAG, QCFLAG, DSFLAG).

	    Args:
	        file_path: Path to the .dat file.

	    Returns:
	        A pandas DataFrame with columns ID, YEAR, ELEMENT and, for each
	        month 1..12, VALUE<m>, DMFLAG<m>, QCFLAG<m>, DSFLAG<m>. All columns
	        are read as text; the VALUE<m> columns are then converted to floats
	        and divided by 100. A value that cannot be parsed as a number is
	        replaced by -9999 before scaling, so it ends up as -99.99.
	    """
	    months = range(1, 13)

	    # Leading identification fields.
	    field_bounds = [(0, 11), (11, 15), (15, 19)]
	    field_names = ["ID", "YEAR", "ELEMENT"]

	    # Monthly groups start at offset 19 and repeat every 8 characters.
	    for month in months:
	        start = 19 + 8 * (month - 1)
	        field_bounds.extend([
	            (start, start + 5),      # VALUE
	            (start + 5, start + 6),  # DMFLAG
	            (start + 6, start + 7),  # QCFLAG
	            (start + 7, start + 8),  # DSFLAG
	        ])
	        field_names.extend([
	            f"VALUE{month}",
	            f"DMFLAG{month}",
	            f"QCFLAG{month}",
	            f"DSFLAG{month}",
	        ])

	    frame = pd.read_fwf(
	        file_path,
	        colspecs=field_bounds,
	        names=field_names,
	        dtype=str,
	    )

	    # Scale the raw values by 1/100 (the original comment describes the
	    # result as Celsius); unparseable entries become -9999 first.
	    for month in months:
	        column = f"VALUE{month}"
	        parsed = pd.to_numeric(frame[column], errors='coerce')
	        frame[column] = parsed.fillna(-9999) / 100.0

	    return frame


	# download file

	import tarfile
	import os
	from urllib.request import urlretrieve


	# Download the latest quality-controlled GHCNm v4 TAVG archive and unpack
	# it into the scratch directory used by the rest of this script.
	data_dir = 'misc/data'

	# urlretrieve opens the target path directly and does not create missing
	# parent directories, so make sure the scratch directory exists first.
	os.makedirs(data_dir, exist_ok=True)

	url = "https://www.ncei.noaa.gov/pub/data/ghcn/v4/ghcnm.tavg.latest.qcf.tar.gz"
	filename = os.path.join(data_dir, "ghcnm.tavg.latest.qcf.tar.gz")

	urlretrieve(url, filename)

	with tarfile.open(filename, 'r:gz') as tar:
	    # The archive comes from the network: use the 'data' extraction filter
	    # (rejects absolute paths, path traversal and special files) whenever
	    # this Python version provides it; older versions fall back to the
	    # previous unfiltered behaviour.
	    if hasattr(tarfile, 'data_filter'):
	        tar.extractall(path=data_dir, filter='data')
	    else:
	        tar.extractall(path=data_dir)


	def find_inv_file(directory, extension):
	    """
	    Return the first file below `directory` whose name ends with `extension`.

	    Despite its name, the function works for any suffix, not only '.inv'.
	    The tree is traversed with os.walk and the first match encountered is
	    returned.

	    Args:
	        directory: Root directory to search recursively.
	        extension: Suffix the file name must end with (e.g. '.dat').

	    Returns:
	        The path of the first matching file (folder joined with file name),
	        or None when no file matches.
	    """
	    matches = (
	        os.path.join(folder, entry)
	        for folder, _subfolders, entries in os.walk(directory)
	        for entry in entries
	        if entry.endswith(extension)
	    )
	    # The generator is lazy, so the walk stops at the first hit.
	    return next(matches, None)

	# Locate the extracted .dat file (the versioned folder name changes with
	# every release, so it is searched for rather than hard-coded).
	dat_file = find_inv_file('misc/data', '.dat')

	# find_inv_file returns None when nothing matches; fail with a clear
	# message instead of passing None on to pandas.
	if dat_file is None:
	    raise FileNotFoundError("No .dat file found under 'misc/data' after extracting the archive.")

	# Load the wide-format table (one row per station/year/element) and
	# preview it.
	data = read_ghcnm_data(dat_file)
	print(data.head())

	# Reshape the wide table in `data` (VALUE1..VALUE12 plus the three flag
	# columns per month) into long format: one row per ID/YEAR/ELEMENT/MONTH.
	# The suffix pattern is a raw string so that `\d` is not treated as an
	# (invalid) string escape sequence.
	long_format_data = pd.wide_to_long(
	    data,
	    stubnames=['VALUE', 'DMFLAG', 'QCFLAG', 'DSFLAG'],
	    i=['ID', 'YEAR', 'ELEMENT'],
	    j='MONTH',
	    suffix=r'\d+',
	)

	# Turn the ID/YEAR/ELEMENT/MONTH index back into ordinary columns.
	long_format_data = long_format_data.reset_index()

	print(long_format_data)

	# Save the long-format table as a Parquet file.
	long_format_data.to_parquet('www/data/tabs/tavg_long.parquet', engine='pyarrow', index=False)


	# Per-station availability: the smallest and largest YEAR present for each
	# station ID in long_format_data.
	years_by_station = long_format_data.groupby('ID')['YEAR']
	station_summary = years_by_station.agg(first_year='min', last_year='max')

	# Bring ID back as a regular column next to first_year / last_year.
	station_summary = station_summary.reset_index()

	# Show the summary.
	print(station_summary)

	# Write the availability table (no index column).
	station_summary.to_csv("www/data/tabs/tavg_availability.csv", index=False)


	### Read the station inventory (.inv) text file
	# e.g. misc/data/ghcnm.v4.0.1.20250423/ghcnm.tavg.v4.0.1.20250423.qcf.inv

	meta_file = find_inv_file('misc/data', '.inv')

	# find_inv_file returns None when nothing matches; fail with a clear
	# message instead of passing None on to pandas.
	if meta_file is None:
	    raise FileNotFoundError("No .inv file found under 'misc/data' after extracting the archive.")

	# Column specifications (pandas uses 0-based, end-exclusive offsets; the
	# 1-based inclusive positions are given in the trailing comments).
	colspecs = [
	    (0, 11),   # ID (1-11)
	    (12, 20),  # LATITUDE (13-20)
	    (21, 30),  # LONGITUDE (22-30)
	    (31, 37),  # STNELEV (32-37)
	    (38, 68),  # NAME (39-68)
	]

	# Column names, in the same order as colspecs.
	column_names = ['ID', 'LATITUDE', 'LONGITUDE', 'STNELEV', 'NAME']

	# Read the fixed-width formatted file.
	meta = pd.read_fwf(meta_file, colspecs=colspecs, names=column_names)

	# Blank out the missing-elevation sentinel -999.0. Series.mask fills with
	# NaN, which keeps STNELEV a float column; replacing with pd.NA can turn it
	# into an object column.
	meta['STNELEV'] = meta['STNELEV'].mask(meta['STNELEV'] == -999.0)

	# Show the first few rows.
	print(meta.head())

	# Write the DataFrame to a CSV file (missing elevations are written as
	# empty fields).
	meta.to_csv('www/data/tabs/tavg_meta.csv', index=False)




	# remove all folders and files
	import glob
	import shutil

	def _delete_path(target):
	    """Remove a file or symlink with os.remove, or a directory tree with shutil.rmtree."""
	    if os.path.islink(target) or os.path.isfile(target):
	        os.remove(target)
	        return
	    if os.path.isdir(target):
	        shutil.rmtree(target)


	# Empty the scratch directory: every entry directly inside misc/data is
	# deleted. Cleanup is best-effort; a failure is reported and the loop
	# moves on to the next entry.
	items_to_remove = glob.glob('misc/data/*')
	for item_path in items_to_remove:
	    try:
	        _delete_path(item_path)
	    except Exception as e:
	        print(f'Failed to delete {item_path}. Reason: {e}')



	# https://climatecharts.net/