# File size: 3,734 Bytes
# Commit: f34a7ac
import glob
import pandas as pd
import os
# Fixed-width layout of the GHCN precipitation records: (start, end) offsets,
# 0-indexed with an exclusive end, as pd.read_fwf expects. The trailing
# comments give the 1-indexed column ranges from the original format spec.
colspecs = [
    (0, 11),    # 1. GHCN identifier (columns 1-11)
    (12, 52),   # 2. Station name (columns 13-52)
    (53, 62),   # 3. Latitude (columns 54-62)
    (63, 73),   # 4. Longitude (columns 64-73)
    (74, 82),   # 5. Elevation (meters) (columns 75-82)
    (83, 89),   # 6. Year and month (columns 84-89)
    (90, 96),   # 7. Precipitation value (columns 91-96)
    (97, 98),   # 8. Measurement flag (column 98)
    (99, 100),  # 9. Quality control flag (column 100)
    (101, 102), # 10. Source flag (column 102)
    (103, 109), # 11. Source index (columns 104-109)
]
# Output column names, positionally matched to `colspecs`.
column_names = [
    "ghcn_id",
    "station_name",
    "latitude",
    "longitude",
    "elevation_m",
    "year_month",
    "precip_tenths_mm",
    "measurement_flag",
    "quality_flag",
    "source_flag",
    "source_index"
]
# Read coordinates as floats; everything else (including year_month and the
# precip value) comes in as 'string' so bad values can later be coerced to
# NaN via pd.to_numeric(errors='coerce') instead of failing the read.
dtypes = {
    "ghcn_id": "string",
    "station_name": "string",
    "latitude": "float32",
    "longitude": "float32",
    "elevation_m": "float32",
    "year_month": "string",
    "precip_tenths_mm": "string",
    "measurement_flag": "string",
    "quality_flag": "string",
    "source_flag": "string",
    "source_index": "string"
}
# Screen the first 50 fixed-width precipitation files: parse, clean, filter
# by quality flag, and report whether each file would be kept.
# Path to CSV files
csv_path = 'misc/data/pp/*.csv'
csv_files = glob.glob(csv_path)
print(f"Found {len(csv_files)} files. Testing the first 50...")
for file in csv_files[:50]:
    print(f"\nProcessing {file}...")
    try:
        df = pd.read_fwf(
            file,
            colspecs=colspecs,
            names=column_names,
            dtype=dtypes,
            header=None
        )
    except Exception as e:
        print(f"FAILED to read file: {e}")
        continue
    # 1. Parse 'year_month' (YYYYMM string) into nullable integer columns.
    df['year'] = pd.to_numeric(df['year_month'].str[:4], errors='coerce').astype('Int16')
    df['month'] = pd.to_numeric(df['year_month'].str[4:], errors='coerce').astype('Int8')
    df.drop('year_month', axis=1, inplace=True)
    # Convert precip; unparseable values become NaN and are dropped.
    df['precip_tenths_mm'] = pd.to_numeric(df['precip_tenths_mm'], errors='coerce')
    original_len = len(df)
    df = df.dropna(subset=['precip_tenths_mm'])
    if len(df) < original_len:
        print(f" Dropped {original_len - len(df)} rows due to NaN precip")
    df['precip_tenths_mm'] = df['precip_tenths_mm'].astype('Int32')
    # 2. Handle Trace: -1 encodes a trace amount (per original comment —
    # TODO confirm against the dataset readme); map it to 0.
    df['precip_tenths_mm'] = df['precip_tenths_mm'].replace(-1, 0)
    # FIX: missing-value sentinels (e.g. -9999) parse as valid numbers, so
    # they survive the NaN drop above and evade the extreme-value check
    # below, which only tests the UPPER bound. Precipitation cannot be
    # negative, and the only intentional negative (-1 trace) was just
    # mapped to 0, so drop any remaining negative values as missing.
    rows_before_negative = len(df)
    df = df[df['precip_tenths_mm'] >= 0]
    if len(df) < rows_before_negative:
        print(f" Dropped {rows_before_negative - len(df)} rows with negative (missing-sentinel) precip")
    # 3. Create mm (values are stored in tenths of a millimetre).
    df['precip_mm'] = (df['precip_tenths_mm'].astype('Float32') / 10.0).round(1)
    # INSPECT Quality Flags
    print(f" Unique Quality Flags before filtering: {df['quality_flag'].unique()}")
    rows_before_quality = len(df)
    # Drop rows whose QC flag is in the bad list; rows with no flag (NA)
    # are kept. With the pandas 'string' dtype, isin() can yield NA for NA
    # inputs, so normalise NA to False before inverting the mask.
    bad_flags = ['O', 'R', 'T', 'S', 'K']
    mask = df['quality_flag'].isin(bad_flags).fillna(False)
    df = df[~mask]
    if len(df) < rows_before_quality:
        print(f" Dropped {rows_before_quality - len(df)} rows due to quality flags")
    # Check Length: require at least 120 monthly rows (10 years of record).
    if len(df) < 120:
        print(f" [SKIP] Too few rows: {len(df)} < 120")
        continue
    # Check Extreme Values — reuse max_precip instead of re-scanning the column.
    max_precip = df['precip_mm'].max()
    if max_precip > 2000:
        print(f" [SKIP] Extreme value found: {max_precip} > 2000mm")
        continue
    print(f" [KEEP] File would be kept. Rows: {len(df)}, Max Precip: {max_precip}")
|