"""Download GHCN-M v4 monthly mean temperature data and export tabular files.

Run as a script, this module:

1. downloads and unpacks the ``ghcnm.tavg.latest.qcf`` archive into
   ``misc/data``,
2. parses the fixed-width ``.dat`` file and reshapes it to one row per
   station/year/month,
3. writes the long table (Parquet), a per-station first/last year summary
   (CSV) and the station inventory (CSV) into ``www/data/tabs``,
4. empties ``misc/data`` again.

Importing the module has no side effects; only the helpers are defined.
"""
import glob
import os
import shutil
import tarfile
from urllib.request import urlretrieve

import pandas as pd

GHCNM_URL = "https://www.ncei.noaa.gov/pub/data/ghcn/v4/ghcnm.tavg.latest.qcf.tar.gz"
DATA_DIR = "misc/data"
OUTPUT_DIR = "www/data/tabs"

# Inventory (.inv) layout; start offsets are 0-based, end offsets exclusive.
INV_COLSPECS = [
    (0, 11),   # ID (1-11)
    (12, 20),  # LATITUDE (13-20)
    (21, 30),  # LONGITUDE (22-30)
    (31, 37),  # STNELEV (32-37)
    (38, 68),  # NAME (39-68)
]
INV_COLUMNS = ["ID", "LATITUDE", "LONGITUDE", "STNELEV", "NAME"]


def _ghcnm_layout():
    """Return ``(colspecs, names)`` describing one ``.dat`` record.

    A record is ID (11 chars), YEAR (4), ELEMENT (4) followed by twelve
    8-character monthly groups: a 5-character value and three 1-character
    flags (DMFLAG, QCFLAG, DSFLAG).
    """
    colspecs = [(0, 11), (11, 15), (15, 19)]
    names = ["ID", "YEAR", "ELEMENT"]
    for month in range(1, 13):
        start = 19 + 8 * (month - 1)
        colspecs.extend(
            [
                (start, start + 5),
                (start + 5, start + 6),
                (start + 6, start + 7),
                (start + 7, start + 8),
            ]
        )
        names.extend(
            [
                f"VALUE{month}",
                f"DMFLAG{month}",
                f"QCFLAG{month}",
                f"DSFLAG{month}",
            ]
        )
    return colspecs, names


def read_ghcnm_data(file_path):
    """Read a GHCNM v4 ``.dat`` file into a pandas DataFrame.

    Args:
        file_path: Path to the fixed-width ``.dat`` file.

    Returns:
        A DataFrame with the columns ``ID``, ``YEAR``, ``ELEMENT`` and, for
        each month 1-12, ``VALUE<m>``, ``DMFLAG<m>``, ``QCFLAG<m>`` and
        ``DSFLAG<m>``. All columns are read as strings; the ``VALUE<m>``
        columns are then converted to floats and divided by 100 (degrees
        Celsius). Cells that cannot be parsed as numbers are treated as
        ``-9999`` before scaling, so they end up as ``-99.99`` just like an
        explicit ``-9999`` in the file.
    """
    colspecs, names = _ghcnm_layout()
    df = pd.read_fwf(file_path, colspecs=colspecs, names=names, dtype=str)

    # Convert temperature values to numeric (Celsius)
    for month in range(1, 13):
        column = f"VALUE{month}"
        numeric = pd.to_numeric(df[column], errors="coerce")
        df[column] = numeric.fillna(-9999) / 100.0

    return df


def find_inv_file(directory, extension):
    """Return the first file below *directory* whose name ends with *extension*.

    Despite its name, the function works for any suffix (it is used for both
    ``.dat`` and ``.inv`` files).

    Args:
        directory: Root directory to search recursively (``os.walk`` order).
        extension: File name suffix to match, e.g. ``".inv"``.

    Returns:
        The path of the first match, or ``None`` when nothing matches.
    """
    for root, _dirs, files in os.walk(directory):
        for file_name in files:
            if file_name.endswith(extension):
                return os.path.join(root, file_name)
    return None


def _require_file(directory, extension):
    """Like ``find_inv_file`` but raise ``FileNotFoundError`` on no match."""
    path = find_inv_file(directory, extension)
    if path is None:
        raise FileNotFoundError(
            f"No '{extension}' file found under '{directory}'"
        )
    return path


def _extract_archive(archive_path, destination):
    """Unpack the gzip-compressed tar *archive_path* into *destination*."""
    with tarfile.open(archive_path, "r:gz") as tar:
        # The archive is downloaded content: use the safe "data" extraction
        # filter where the running interpreter supports it.
        if hasattr(tarfile, "data_filter"):
            tar.extractall(path=destination, filter="data")
        else:
            tar.extractall(path=destination)


def _clear_directory(directory):
    """Best-effort removal of every file and folder directly in *directory*."""
    for item_path in glob.glob(f"{directory}/*"):
        try:
            if os.path.isfile(item_path) or os.path.islink(item_path):
                os.remove(item_path)
            elif os.path.isdir(item_path):
                shutil.rmtree(item_path)
        except OSError as e:
            # Keep going: a leftover file must not abort the whole cleanup.
            print(f'Failed to delete {item_path}. Reason: {e}')


def main():
    """Run the download -> parse -> export -> cleanup pipeline."""
    # pyarrow is only needed as the Parquet engine; importing it first makes
    # the script fail before any download when it is not installed.
    import pyarrow  # noqa: F401

    # Make sure the working and output folders exist on a fresh checkout.
    os.makedirs(DATA_DIR, exist_ok=True)
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # download file
    filename = os.path.join(DATA_DIR, "ghcnm.tavg.latest.qcf.tar.gz")
    urlretrieve(GHCNM_URL, filename)
    _extract_archive(filename, DATA_DIR)

    dat_file = _require_file(DATA_DIR, '.dat')
    data = read_ghcnm_data(dat_file)
    print(data.head())

    # Transforming from wide to long format: one row per ID/YEAR/ELEMENT and
    # MONTH, with the month number taken from the numeric column suffix.
    long_format_data = pd.wide_to_long(
        data,
        stubnames=['VALUE', 'DMFLAG', 'QCFLAG', 'DSFLAG'],
        i=['ID', 'YEAR', 'ELEMENT'],
        j='MONTH',
        suffix=r'\d+',
    )

    # Resetting the index for better readability
    long_format_data = long_format_data.reset_index()
    print(long_format_data)

    # Save the DataFrame as a Parquet file
    long_format_data.to_parquet(
        'www/data/tabs/tavg_long.parquet', engine='pyarrow', index=False
    )

    # Group by 'ID' and find the first and last year for each unique ID
    station_summary = (
        long_format_data.groupby('ID')['YEAR']
        .agg(first_year='min', last_year='max')
        .reset_index()
    )
    print(station_summary)
    station_summary.to_csv("www/data/tabs/tavg_availability.csv", index=False)

    # Read the station inventory, e.g.
    # misc/data/ghcnm.v4.0.1.20250423/ghcnm.tavg.v4.0.1.20250423.qcf.inv
    meta_file = _require_file(DATA_DIR, '.inv')
    meta = pd.read_fwf(meta_file, colspecs=INV_COLSPECS, names=INV_COLUMNS)

    # Replace missing elevation values -999.0 with NaN
    meta['STNELEV'] = meta['STNELEV'].replace(-999.0, pd.NA)
    print(meta.head())

    # Write the DataFrame to a CSV file
    meta.to_csv('www/data/tabs/tavg_meta.csv', index=False)

    # remove all folders and files
    _clear_directory(DATA_DIR)


if __name__ == "__main__":
    main()

# https://climatecharts.net/