| | import pandas as pd |
| | import pyarrow |
| |
|
def read_ghcnm_data(file_path):
    """
    Load a GHCNM v4 fixed-width ``.dat`` file into a pandas DataFrame.

    Each record is laid out as a station ID (11 characters), a year (4) and
    an element code (4), followed by twelve 8-character monthly groups made
    of a 5-character value and three 1-character flags.

    Args:
        file_path: Path to the .dat file.

    Returns:
        A pandas DataFrame with the columns ID, YEAR, ELEMENT and, for every
        month m in 1..12, VALUE<m>, DMFLAG<m>, QCFLAG<m>, DSFLAG<m>. All
        columns are read as text; the VALUE columns are then converted to
        floats and divided by 100. Entries that cannot be parsed as numbers
        are treated as -9999 before the division, so they (like a literal
        -9999 in the file) come out as -99.99.
    """
    months = range(1, 13)
    header_fields = [("ID", 11), ("YEAR", 4), ("ELEMENT", 4)]
    monthly_fields = [("VALUE", 5), ("DMFLAG", 1), ("QCFLAG", 1), ("DSFLAG", 1)]

    # Full record layout as (column name, width) pairs, in file order.
    layout = list(header_fields)
    for month in months:
        layout.extend((f"{stub}{month}", width) for stub, width in monthly_fields)

    # Turn the widths into half-open (start, end) character ranges.
    names = []
    colspecs = []
    offset = 0
    for column, width in layout:
        names.append(column)
        colspecs.append((offset, offset + width))
        offset += width

    df = pd.read_fwf(file_path, colspecs=colspecs, names=names, dtype=str)

    # Only the monthly values become numeric; IDs, years and flags stay text.
    for month in months:
        column = f"VALUE{month}"
        numeric = pd.to_numeric(df[column], errors="coerce")
        df[column] = numeric.fillna(-9999) / 100.0

    return df
| |
|
| |
|
| | |
| |
|
| | import tarfile |
| | import os |
| | from urllib.request import urlretrieve |
| |
|
| |
|
# Download the latest quality-controlled GHCN-M v4 TAVG archive into the
# scratch directory and unpack it there.
url = "https://www.ncei.noaa.gov/pub/data/ghcn/v4/ghcnm.tavg.latest.qcf.tar.gz"
filename = os.path.join('misc/data', "ghcnm.tavg.latest.qcf.tar.gz")

# urlretrieve does not create missing parent directories, so make sure the
# scratch directory exists before downloading into it.
os.makedirs('misc/data', exist_ok=True)

urlretrieve(url, filename)

with tarfile.open(filename, 'r:gz') as tar:
    # The archive comes from the network: where the interpreter supports
    # extraction filters, use the 'data' filter so members cannot escape the
    # target directory. Older interpreters fall back to plain extraction.
    if hasattr(tarfile, 'data_filter'):
        tar.extractall(path='misc/data', filter='data')
    else:
        tar.extractall(path='misc/data')
| | |
| | |
def find_inv_file(directory, extension):
    """
    Return the first file below ``directory`` whose name ends with ``extension``.

    The tree is traversed with ``os.walk``; the first matching file name
    encountered wins.

    Args:
        directory: Root directory to search (subdirectories included).
        extension: Suffix the file name must end with, e.g. '.inv'.

    Returns:
        The path of the first match (walk directory joined with the file
        name), or None when no file matches.
    """
    matches = (
        os.path.join(folder, entry)
        for folder, _subdirs, entries in os.walk(directory)
        for entry in entries
        if entry.endswith(extension)
    )
    return next(matches, None)
| |
|
# Locate the extracted monthly data file and parse it into a wide table
# (one row per station/year/element, one column group per month).
dat_file = find_inv_file('misc/data', '.dat')
if dat_file is None:
    # Fail with a clear message instead of passing None to the reader.
    raise FileNotFoundError("No .dat file found under 'misc/data'")

data = read_ghcnm_data(dat_file)
print(data.head())

# Reshape to long format: the numeric suffix of VALUE<m>/DMFLAG<m>/QCFLAG<m>/
# DSFLAG<m> becomes the MONTH column, giving one row per station/year/month.
long_format_data = pd.wide_to_long(
    data,
    stubnames=['VALUE', 'DMFLAG', 'QCFLAG', 'DSFLAG'],
    i=['ID', 'YEAR', 'ELEMENT'],
    j='MONTH',
    suffix=r'\d+',  # raw string: '\d' is not a valid escape in a plain literal
)

# Move ID, YEAR, ELEMENT and MONTH from the index back into regular columns.
long_format_data = long_format_data.reset_index()

print(long_format_data)

# Make sure the output directory exists before writing any results into it.
os.makedirs('www/data/tabs', exist_ok=True)

long_format_data.to_parquet('www/data/tabs/tavg_long.parquet', engine='pyarrow', index=False)

# Per-station availability: smallest and largest YEAR present for each ID.
station_summary = (
    long_format_data
    .groupby('ID')['YEAR']
    .agg(first_year='min', last_year='max')
    .reset_index()
)

print(station_summary)

station_summary.to_csv("www/data/tabs/tavg_availability.csv", index=False)
| |
|
| |
|
| | |
| | |
| |
|
# Locate the extracted station inventory file and export it as CSV.
meta_file = find_inv_file('misc/data', '.inv')
if meta_file is None:
    # Fail with a clear message instead of passing None to the reader.
    raise FileNotFoundError("No .inv file found under 'misc/data'")

# Character ranges (start, end) of the inventory fields that are kept; the
# single characters between consecutive ranges are skipped.
colspecs = [
    (0, 11),
    (12, 20),
    (21, 30),
    (31, 37),
    (38, 68),
]

column_names = ['ID', 'LATITUDE', 'LONGITUDE', 'STNELEV', 'NAME']

meta = pd.read_fwf(meta_file, colspecs=colspecs, names=column_names)

# -999.0 marks a missing elevation. Masking (rather than replacing with
# pd.NA) blanks those entries as NaN while keeping the column numeric; the
# CSV output still shows them as empty cells.
meta['STNELEV'] = meta['STNELEV'].mask(meta['STNELEV'] == -999.0)

print(meta.head())

# Make sure the output directory exists before writing the result into it.
os.makedirs('www/data/tabs', exist_ok=True)

meta.to_csv('www/data/tabs/tavg_meta.csv', index=False)
| |
|
| |
|
| |
|
| |
|
| | |
| | import glob |
| | import shutil |
| |
|
# Best-effort cleanup of the scratch directory: remove every top-level entry
# under misc/data, reporting (but not aborting on) anything that cannot be
# deleted.
items_to_remove = glob.glob('misc/data/*')
for item_path in items_to_remove:
    try:
        if os.path.isdir(item_path) and not os.path.islink(item_path):
            # Real directory (not a symlink to one): delete the whole tree.
            shutil.rmtree(item_path)
        elif os.path.isfile(item_path) or os.path.islink(item_path):
            # Regular files and symlinks of any kind are simply unlinked.
            os.remove(item_path)
    except Exception as e:
        print(f'Failed to delete {item_path}. Reason: {e}')
| | |
| |
|
| |
|
| | |
| |
|