import glob
import os

import pandas as pd

# Character ranges handed to pandas.read_fwf, one per entry in column_names.
# Each pair is (start, end): 0-based and half-open, so (0, 11) covers the
# first eleven characters of a line. A one-character gap is left between
# consecutive fields.
colspecs = [
    (0, 11),     # ghcn_id
    (12, 52),    # station_name
    (53, 62),    # latitude
    (63, 73),    # longitude
    (74, 82),    # elevation_m
    (83, 89),    # year_month
    (90, 96),    # precip_tenths_mm
    (97, 98),    # measurement_flag
    (99, 100),   # quality_flag
    (101, 102),  # source_flag
    (103, 109),  # source_index
]

# Column labels for the fixed-width fields, in the same order as colspecs.
# The input files are read with header=None, so these names are the only
# labels the resulting DataFrame gets.
column_names = [
    "ghcn_id",
    "station_name",
    "latitude",
    "longitude",
    "elevation_m",
    "year_month",
    "precip_tenths_mm",
    "measurement_flag",
    "quality_flag",
    "source_flag",
    "source_index",
]

# dtype requested from pandas.read_fwf for every column in column_names.
# Only the coordinates and elevation are parsed as numbers at read time.
# year_month and precip_tenths_mm are deliberately kept as text: the script
# converts them afterwards with pd.to_numeric(errors='coerce'), so a
# malformed value becomes missing instead of aborting the whole read.
dtypes = {
    "ghcn_id": "string",
    "station_name": "string",
    "latitude": "float32",
    "longitude": "float32",
    "elevation_m": "float32",
    "year_month": "string",
    "precip_tenths_mm": "string",
    "measurement_flag": "string",
    "quality_flag": "string",
    "source_flag": "string",
    "source_index": "string",
}

# Quality-flag codes whose rows are discarded before a station is screened.
bad_flags = ['O', 'R', 'T', 'S', 'K']


def clean_precip_frame(df, flags_to_drop=None):
    """Return a cleaned copy of one station's raw fixed-width frame.

    The input frame is not modified. Steps, in order:

    1. Split ``year_month`` into nullable integer ``year`` and ``month``
       columns (first four characters / remainder) and drop ``year_month``.
    2. Convert ``precip_tenths_mm`` to numbers, dropping rows that fail to
       parse, then store it as ``Int32`` with ``-1`` replaced by ``0``.
    3. Add ``precip_mm`` (tenths divided by ten, rounded to one decimal).
    4. Drop rows whose ``quality_flag`` is listed in *flags_to_drop*.

    Progress lines are printed along the way so a run shows how many rows
    each step removed.

    Args:
        df: Frame with at least the ``year_month``, ``precip_tenths_mm`` and
            ``quality_flag`` columns.
        flags_to_drop: Quality-flag codes to reject. Defaults to the
            module-level ``bad_flags``.

    Returns:
        A new DataFrame with the steps above applied.
    """
    if flags_to_drop is None:
        flags_to_drop = bad_flags

    year_month = df['year_month']
    # assign() builds a new frame, so the caller's frame is left untouched and
    # no chained-assignment warnings can be triggered further down.
    cleaned = df.drop(columns='year_month').assign(
        year=pd.to_numeric(year_month.str[:4], errors='coerce').astype('Int16'),
        month=pd.to_numeric(year_month.str[4:], errors='coerce').astype('Int8'),
        precip_tenths_mm=pd.to_numeric(df['precip_tenths_mm'], errors='coerce'),
    )

    rows_read = len(cleaned)
    cleaned = cleaned.dropna(subset=['precip_tenths_mm'])
    if len(cleaned) < rows_read:
        print(f" Dropped {rows_read - len(cleaned)} rows due to NaN precip")

    # NOTE(review): -1 is presumably a "trace amount" sentinel; it is counted
    # as zero precipitation here - confirm against the dataset documentation.
    tenths = cleaned['precip_tenths_mm'].astype('Int32').replace(-1, 0)
    cleaned = cleaned.assign(
        precip_tenths_mm=tenths,
        precip_mm=(tenths.astype('Float32') / 10.0).round(1),
    )

    print(f" Unique Quality Flags before filtering: {cleaned['quality_flag'].unique()}")

    rows_before_quality = len(cleaned)
    # isin() yields False for a missing flag, so unflagged rows are kept.
    cleaned = cleaned[~cleaned['quality_flag'].isin(flags_to_drop)]
    if len(cleaned) < rows_before_quality:
        print(f" Dropped {rows_before_quality - len(cleaned)} rows due to quality flags")

    return cleaned


def station_verdict(df, min_rows=120, max_precip_mm=2000):
    """Return the one-line [SKIP]/[KEEP] report for a cleaned station frame.

    Args:
        df: Frame with a ``precip_mm`` column.
        min_rows: Stations with fewer rows than this are skipped.
        max_precip_mm: A station is skipped if any ``precip_mm`` value
            exceeds this.

    Returns:
        The report line, ready to be printed.
    """
    row_count = len(df)
    if row_count < min_rows:
        return f" [SKIP] Too few rows: {row_count} < {min_rows}"

    max_precip = df['precip_mm'].max()
    if (df['precip_mm'] > max_precip_mm).any():
        return f" [SKIP] Extreme value found: {max_precip} > {max_precip_mm}mm"

    return f" [KEEP] File would be kept. Rows: {row_count}, Max Precip: {max_precip}"


csv_path = 'misc/data/pp/*.csv'
# glob returns matches in arbitrary order; sorting makes "the first 50" the
# same set of files on every run.
csv_files = sorted(glob.glob(csv_path))

print(f"Found {len(csv_files)} files. Testing the first 50...")

for file in csv_files[:50]:
    print(f"\nProcessing {file}...")

    try:
        df = pd.read_fwf(
            file,
            colspecs=colspecs,
            names=column_names,
            dtype=dtypes,
            header=None,
        )
    except Exception as e:
        # Best effort: one unreadable file must not stop the trial run.
        print(f"FAILED to read file: {e}")
        continue

    df = clean_precip_frame(df)
    print(station_verdict(df))