File size: 5,673 Bytes
e4667e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
import pandas as pd
import numpy as np
import glob
import joblib
import os
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# --- CONFIG ---
DATA_DIR = "datasets"  # folder scanned for input *.csv sensor files
MODEL_DIR = "."  # output directory for the trained model pickles
# Pollutant columns expected in every input CSV (gap-filled before training).
REQUIRED_COLS = ["PM2.5", "NO2", "SO2", "CO"]
# Full feature set fed to both models; Wind/Temp may be synthetically generated.
FEATURES = ["PM2.5", "NO2", "SO2", "CO", "Wind", "Temp"]

def generate_synthetic_weather(df):
    """
    Fill missing 'Wind' and 'Temp' values with season-dependent synthetic data.

    Strategy (month taken from the 'Timestamp' column):
    - Winter (Nov, Dec, Jan, Feb): low wind (0.5 - 2.0 m/s), low temp (5 - 15 C)
    - Summer/other months:         higher wind (2.0 - 5.0 m/s), higher temp (25 - 40 C)

    Existing non-NaN Wind/Temp values are preserved; only gaps are filled.
    Mutates *df* in place ('Timestamp' is converted to datetime) and returns it.
    """
    # Timestamps in the source data look like 01-01-2020 -> DD-MM-YYYY.
    # Unparseable values become NaT and fall into the non-winter bucket below.
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d-%m-%Y', dayfirst=True, errors='coerce')

    mask_winter = df['Timestamp'].dt.month.isin([11, 12, 1, 2])

    def _fill(col, winter_range, other_range):
        # Ensure the column exists so the isna() masks below are well-defined.
        if col not in df.columns:
            df[col] = np.nan
        for season_mask, (lo, hi) in ((mask_winter, winter_range), (~mask_winter, other_range)):
            # BUG FIX: size the random draw by the rows actually being filled.
            # The old code drew `season_mask.sum()` samples but assigned them to
            # `season_mask & isna()` rows, raising a length-mismatch ValueError
            # whenever the column already had some real values.
            fill_mask = season_mask & df[col].isna()
            n_fill = fill_mask.sum()
            if n_fill > 0:
                df.loc[fill_mask, col] = np.random.uniform(lo, hi, n_fill)

    _fill('Wind', (0.5, 2.0), (2.0, 5.0))
    _fill('Temp', (5.0, 15.0), (25.0, 40.0))

    return df

def label_data(row):
    """
    Apply Physics Rules to label the primary source.
    """
    pm25 = row.get('PM2.5', 0)
    no2 = row.get('NO2', 0)
    so2 = row.get('SO2', 0)
    co = row.get('CO', 0)
    wind = row.get('Wind', 0)
    temp = row.get('Temp', 0)
    
    # Avoid division by zero
    so2_no2_ratio = so2 / no2 if no2 > 0 else 0
    
    # Rule 1: INDUSTRY
    if so2 > 15 and so2_no2_ratio > 0.3:
        return "INDUSTRY"
    
    # Rule 2: FARM_FIRE
    if co > 2.0 and pm25 > 200:
        return "FARM_FIRE"
    
    # Rule 3: TRAFFIC
    if no2 > 60 and so2 < 10:
        return "TRAFFIC"
    
    # Rule 4: BOWL_EFFECT
    if wind < 1.0 and temp < 10:
        return "BOWL_EFFECT"
    
    # Rule 5: CLEAN
    if pm25 < 60:
        return "CLEAN"
        
    return None

def main():
    """
    End-to-end training pipeline:
      1. Load and merge every CSV under DATA_DIR.
      2. Impute pollutant gaps and synthesize missing weather columns.
      3. Rule-label rows with a primary pollution source (label_data).
      4. Train/evaluate a RandomForest source classifier -> source_model.pkl.
      5. Train an IsolationForest anomaly detector       -> fraud_model.pkl.

    Prints progress to stdout throughout; returns None. Models are written
    into MODEL_DIR.
    """
    print("--- Starting Training Pipeline ---")

    # 1. Data Ingestion
    all_files = glob.glob(os.path.join(DATA_DIR, "*.csv"))
    print(f"Found {len(all_files)} CSV files.")

    df_list = []
    for f in all_files:
        try:
            temp_df = pd.read_csv(f)
            df_list.append(temp_df)
        except Exception as e:
            # Best-effort ingestion: skip unreadable files but report them.
            print(f"Error reading {f}: {e}")

    if not df_list:
        print("No data found. Exiting.")
        return

    master_df = pd.concat(df_list, ignore_index=True)
    print(f"Merged Data Shape (Raw): {master_df.shape}")

    # ROBUSTNESS FIX: fail with a clear message instead of a raw KeyError
    # when an expected pollutant column is absent from the merged data.
    missing = [c for c in REQUIRED_COLS if c not in master_df.columns]
    if missing:
        print(f"Missing required columns: {missing}. Exiting.")
        return

    # 2. Preprocessing & Imputation: forward- then back-fill pollutant gaps.
    master_df[REQUIRED_COLS] = master_df[REQUIRED_COLS].ffill().bfill()

    # Synthesize Wind/Temp (month-based) wherever they are missing.
    print("Generating Synthetic Weather...")
    master_df = generate_synthetic_weather(master_df)

    # Final sweep for NaNs in the model features; drop anything left over.
    master_df[FEATURES] = master_df[FEATURES].ffill().bfill()
    master_df.dropna(subset=FEATURES, inplace=True)

    print(f"Data Shape after Preprocessing: {master_df.shape}")

    # 3. Labeling via physics rules; rows matching no rule stay None.
    print("Labeling Data...")
    master_df['target'] = master_df.apply(label_data, axis=1)

    labeled_df = master_df[master_df['target'].notna()].copy()
    print(f"Labeled Data Shape: {labeled_df.shape}")
    print("Class Distribution:")
    print(labeled_df['target'].value_counts())

    if labeled_df.empty:
        print("No classes labeled! Adjust rules.")
        return

    # 4. Model A: RandomForest source classifier on the labeled subset.
    print("Training Source Classifier (RandomForest)...")
    X = labeled_df[FEATURES]
    y = labeled_df['target']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    print("Source Model Evaluation:")
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))

    # CONSISTENCY FIX: honour MODEL_DIR (it was defined but never used).
    joblib.dump(clf, os.path.join(MODEL_DIR, "source_model.pkl"))
    print("Saved source_model.pkl")

    # 5. Model B: IsolationForest anomaly detector, trained on ALL readings
    # (not just the labeled subset) so it flags globally unusual sensor values.
    print("Training Anomaly Detector (IsolationForest)...")
    iso = IsolationForest(contamination=0.05, random_state=42)
    iso.fit(master_df[FEATURES])

    joblib.dump(iso, os.path.join(MODEL_DIR, "fraud_model.pkl"))
    print("Saved fraud_model.pkl")

    print("--- Training Complete ---")

# Script entry point: run the full training pipeline when executed directly.
if __name__ == "__main__":
    main()