File size: 3,734 Bytes
f34a7ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import glob
import pandas as pd
import os

# Fixed-width record layout (copied from original script), one row per field:
#   (column name, (start, stop) slice passed to read_fwf, pandas dtype)
# Each (start, stop) pair is 0-based and excludes `stop`; the trailing comment
# gives the same field as 1-based inclusive column numbers.
_FIELD_LAYOUT = (
    ("ghcn_id", (0, 11), "string"),              # 1. GHCN identifier (columns 1-11)
    ("station_name", (12, 52), "string"),        # 2. Station name (columns 13-52)
    ("latitude", (53, 62), "float32"),           # 3. Latitude (columns 54-62)
    ("longitude", (63, 73), "float32"),          # 4. Longitude (columns 64-73)
    ("elevation_m", (74, 82), "float32"),        # 5. Elevation (meters) (columns 75-82)
    ("year_month", (83, 89), "string"),          # 6. Year and month (columns 84-89)
    ("precip_tenths_mm", (90, 96), "string"),    # 7. Precipitation value (columns 91-96)
    ("measurement_flag", (97, 98), "string"),    # 8. Measurement flag (column 98)
    ("quality_flag", (99, 100), "string"),       # 9. Quality control flag (column 100)
    ("source_flag", (101, 102), "string"),       # 10. Source flag (column 102)
    ("source_index", (103, 109), "string"),      # 11. Source index (columns 104-109)
)

# The three parallel structures handed to pd.read_fwf, all in field order.
colspecs = [span for _, span, _ in _FIELD_LAYOUT]
column_names = [name for name, _, _ in _FIELD_LAYOUT]
# year_month and precip_tenths_mm are read as text here; the per-file
# processing converts them to numbers itself.
dtypes = {name: kind for name, _, kind in _FIELD_LAYOUT}

# Path to CSV files
csv_path = 'misc/data/pp/*.csv'
# glob returns matches in arbitrary (filesystem-dependent) order; sort them so
# that "the first 50" is the same set of files on every run.
csv_files = sorted(glob.glob(csv_path))

print(f"Found {len(csv_files)} files. Testing the first 50...")

# Dry run of the per-file cleaning and keep/skip decision on at most 50 files.
# Nothing is written; every step only prints what it did or would do.

# Quality-flag values whose rows are discarded (same list for every file, so
# it is built once outside the loop).
bad_flags = ['O', 'R', 'T', 'S', 'K']

for file in csv_files[:50]:
    print(f"\nProcessing {file}...")

    # A file that cannot be parsed is reported and skipped so the remaining
    # files are still checked.
    try:
        df = pd.read_fwf(
            file,
            colspecs=colspecs,
            names=column_names,
            dtype=dtypes,
            header=None,
        )
    except Exception as e:
        print(f"FAILED to read file: {e}")
        continue

    # 1. Parse 'year_month': the first four characters become 'year', the
    # rest 'month'; anything non-numeric becomes NA (nullable integer dtypes).
    year_month = df['year_month']
    df['year'] = pd.to_numeric(year_month.str[:4], errors='coerce').astype('Int16')
    df['month'] = pd.to_numeric(year_month.str[4:], errors='coerce').astype('Int8')
    df = df.drop(columns='year_month')

    # Convert precip: text that does not parse as a number becomes NA and the
    # row is dropped.
    df['precip_tenths_mm'] = pd.to_numeric(df['precip_tenths_mm'], errors='coerce')
    original_len = len(df)
    # .copy() makes the filtered frame independent of the unfiltered one, so
    # the column assignments below do not raise SettingWithCopyWarning.
    df = df.dropna(subset=['precip_tenths_mm']).copy()
    if len(df) < original_len:
        print(f"  Dropped {original_len - len(df)} rows due to NaN precip")

    # 2. Handle Trace: values of -1 are replaced by 0.
    df['precip_tenths_mm'] = df['precip_tenths_mm'].astype('Int32').replace(-1, 0)

    # 3. Create mm (tenths of mm divided by 10, rounded to one decimal).
    df['precip_mm'] = (df['precip_tenths_mm'].astype('Float32') / 10.0).round(1)

    # INSPECT Quality Flags
    print(f"  Unique Quality Flags before filtering: {df['quality_flag'].unique()}")

    rows_before_quality = len(df)
    # Rows with a missing quality flag must be kept: any NA in the membership
    # mask is treated as "not a bad flag" before the mask is inverted.
    mask = df['quality_flag'].isin(bad_flags).fillna(False)
    df = df[~mask]

    if len(df) < rows_before_quality:
        print(f"  Dropped {rows_before_quality - len(df)} rows due to quality flags")

    # Check Length
    if len(df) < 120:
        print(f"  [SKIP] Too few rows: {len(df)} < 120")
        continue

    # Check Extreme Values
    max_precip = df['precip_mm'].max()
    if (df['precip_mm'] > 2000).any():
        print(f"  [SKIP] Extreme value found: {max_precip} > 2000mm")
        continue

    print(f"  [KEEP] File would be kept. Rows: {len(df)}, Max Precip: {max_precip}")