import pandas as pd
import numpy as np
import glob
import joblib
import os
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
# --- CONFIG ---
DATA_DIR = "datasets"  # folder scanned for input *.csv files
MODEL_DIR = "."  # NOTE(review): defined but never used below — models are written to the CWD
REQUIRED_COLS = ["PM2.5", "NO2", "SO2", "CO"]  # pollutant columns expected in every CSV
FEATURES = ["PM2.5", "NO2", "SO2", "CO", "Wind", "Temp"]  # model inputs (Wind/Temp may be synthetic)
def generate_synthetic_weather(df):
    """
    Generate Wind and Temp columns from the month when they are missing.

    Strategy:
        - Winter (Nov, Dec, Jan, Feb): Wind 0.5-2.0 m/s, Temp 5-15 C
        - Summer/Other months:         Wind 2.0-5.0 m/s, Temp 25-40 C

    Only NaN cells are filled; pre-existing Wind/Temp values are preserved.

    Fix: the random sample is now sized by the rows actually being filled
    (``season_mask & isna``). Previously it was sized by the whole season,
    which raised a length-mismatch ValueError whenever a seasonal row
    already carried a real value.
    """
    # Data uses DD-MM-YYYY (e.g. 01-01-2020). Unparseable stamps become NaT;
    # NaT has no month, so those rows fall into the non-winter branch.
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d-%m-%Y',
                                     dayfirst=True, errors='coerce')
    mask_winter = df['Timestamp'].dt.month.isin([11, 12, 1, 2])

    # (column, winter (lo, hi), non-winter (lo, hi))
    specs = (
        ('Wind', (0.5, 2.0), (2.0, 5.0)),
        ('Temp', (5.0, 15.0), (25.0, 40.0)),
    )
    for col, winter_range, other_range in specs:
        if col not in df.columns:
            df[col] = np.nan
        for season_mask, (lo, hi) in ((mask_winter, winter_range),
                                      (~mask_winter, other_range)):
            # Size the random draw by the exact cells being filled.
            fill = season_mask & df[col].isna()
            n = int(fill.sum())
            if n > 0:
                df.loc[fill, col] = np.random.uniform(lo, hi, n)
    return df
def label_data(row):
    """
    Label the primary pollution source of one reading via physics rules.

    Rules are evaluated in priority order; the first match wins.
    Returns one of "INDUSTRY", "FARM_FIRE", "TRAFFIC", "BOWL_EFFECT",
    "CLEAN", or None when no rule applies.
    """
    pm25 = row.get('PM2.5', 0)
    no2 = row.get('NO2', 0)
    so2 = row.get('SO2', 0)
    co = row.get('CO', 0)
    wind = row.get('Wind', 0)
    temp = row.get('Temp', 0)

    # Guard the ratio against a zero/negative NO2 reading.
    so2_no2_ratio = so2 / no2 if no2 > 0 else 0

    # Ordered (condition, label) rule table — priority matters.
    rules = (
        (so2 > 15 and so2_no2_ratio > 0.3, "INDUSTRY"),
        (co > 2.0 and pm25 > 200, "FARM_FIRE"),
        (no2 > 60 and so2 < 10, "TRAFFIC"),
        (wind < 1.0 and temp < 10, "BOWL_EFFECT"),
        (pm25 < 60, "CLEAN"),
    )
    for hit, label in rules:
        if hit:
            return label
    return None
def main():
    """Run the training pipeline: ingest CSVs, impute, label, train, save models."""
    print("--- Starting Training Pipeline ---")
    # 1. Data Ingestion: merge every CSV found under DATA_DIR.
    all_files = glob.glob(os.path.join(DATA_DIR, "*.csv"))
    print(f"Found {len(all_files)} CSV files.")
    df_list = []
    for f in all_files:
        try:
            temp_df = pd.read_csv(f)
            df_list.append(temp_df)
        except Exception as e:
            # Best-effort ingestion: one corrupt file should not abort the run.
            print(f"Error reading {f}: {e}")
    if not df_list:
        print("No data found. Exiting.")
        return
    master_df = pd.concat(df_list, ignore_index=True)
    print(f"Merged Data Shape (Raw): {master_df.shape}")
    # Standardize columns (The datasets viewed already have PM2.5, NO2, SO2, CO)
    # We just ensure they exist and map if necessary.
    # Based on view, they map directly.
    # 2. Preprocessing & Imputation
    # Forward fill NaNs, then back-fill so leading gaps are covered too.
    master_df[REQUIRED_COLS] = master_df[REQUIRED_COLS].ffill().bfill()
    # Synthetic Weather: fabricate Wind/Temp from the month where missing.
    print("Generating Synthetic Weather...")
    master_df = generate_synthetic_weather(master_df)
    # Final check for NaNs in features
    master_df[FEATURES] = master_df[FEATURES].ffill().bfill()
    master_df.dropna(subset=FEATURES, inplace=True)  # Drop if any still remain
    print(f"Data Shape after Preprocessing: {master_df.shape}")
    # 3. Labeling: apply the physics rules row-wise; unmatched rows get None.
    print("Labeling Data...")
    master_df['target'] = master_df.apply(label_data, axis=1)
    # Filter labeled rows
    labeled_df = master_df[master_df['target'].notna()].copy()
    print(f"Labeled Data Shape: {labeled_df.shape}")
    print("Class Distribution:")
    print(labeled_df['target'].value_counts())
    if labeled_df.empty:
        print("No classes labeled! Adjust rules.")
        return
    # 4. Model A: RandomForest (Source Classification)
    print("Training Source Classifier (RandomForest)...")
    X = labeled_df[FEATURES]
    y = labeled_df['target']
    # NOTE(review): labels come from rules over these same features, so the
    # classifier is learning to reproduce the rules — evaluation is optimistic.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    print("Source Model Evaluation:")
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))
    joblib.dump(clf, "source_model.pkl")
    print("Saved source_model.pkl")
    # 5. Model B: IsolationForest (Anomaly Detection)
    # Train on all data (or just labeled) to find outliers? usually trained on 'normal' data to find anomalies.
    # But here we just want a general anomaly detector for readings.
    # Let's train on the merged dataset features.
    print("Training Anomaly Detector (IsolationForest)...")
    iso = IsolationForest(contamination=0.05, random_state=42)
    iso.fit(master_df[FEATURES])
    joblib.dump(iso, "fraud_model.pkl")
    print("Saved fraud_model.pkl")
    print("--- Training Complete ---")


if __name__ == "__main__":
    main()