import pandas as pd
import numpy as np
import glob
import joblib
import os
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# --- CONFIG ---
DATA_DIR = "datasets"
MODEL_DIR = "."
REQUIRED_COLS = ["PM2.5", "NO2", "SO2", "CO"]
FEATURES = ["PM2.5", "NO2", "SO2", "CO", "Wind", "Temp"]


def generate_synthetic_weather(df):
    """
    Generates Wind and Temp based on Month if missing.
    Strategy:
    - Winter (Nov, Dec, Jan, Feb): Low Wind (0.5 - 2.0 m/s), Low Temp (5 - 15 C)
    - Summer/Other: Higher Wind (2.0 - 5.0 m/s), Higher Temp (25 - 40 C)
    """
    # Ensure Timestamp is datetime. The source data uses DD-MM-YYYY (e.g. 01-01-2020).
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d-%m-%Y', errors='coerce')

    # Month drives the seasonal logic. Timestamps look consistent in the source data;
    # any row whose Timestamp fails to parse simply falls into the non-winter branch.
    mask_winter = df['Timestamp'].dt.month.isin([11, 12, 1, 2])

    # Wind
    if 'Wind' not in df.columns:
        df['Wind'] = np.nan
    winter_wind = mask_winter & df['Wind'].isna()
    if winter_wind.any():
        df.loc[winter_wind, 'Wind'] = np.random.uniform(0.5, 2.0, winter_wind.sum())
    other_wind = (~mask_winter) & df['Wind'].isna()
    if other_wind.any():
        df.loc[other_wind, 'Wind'] = np.random.uniform(2.0, 5.0, other_wind.sum())

    # Temp
    if 'Temp' not in df.columns:
        df['Temp'] = np.nan
    winter_temp = mask_winter & df['Temp'].isna()
    if winter_temp.any():
        df.loc[winter_temp, 'Temp'] = np.random.uniform(5.0, 15.0, winter_temp.sum())
    other_temp = (~mask_winter) & df['Temp'].isna()
    if other_temp.any():
        df.loc[other_temp, 'Temp'] = np.random.uniform(25.0, 40.0, other_temp.sum())

    return df


def label_data(row):
    """
    Apply physics-based rules to label the primary pollution source.
    """
    pm25 = row.get('PM2.5', 0)
    no2 = row.get('NO2', 0)
    so2 = row.get('SO2', 0)
    co = row.get('CO', 0)
    wind = row.get('Wind', 0)
    temp = row.get('Temp', 0)

    # Avoid division by zero
    so2_no2_ratio = so2 / no2 if no2 > 0 else 0

    # Rule 1: INDUSTRY
    if so2 > 15 and so2_no2_ratio > 0.3:
        return "INDUSTRY"
    # Rule 2: FARM_FIRE
    if co > 2.0 and pm25 > 200:
        return "FARM_FIRE"
    # Rule 3: TRAFFIC
    if no2 > 60 and so2 < 10:
        return "TRAFFIC"
    # Rule 4: BOWL_EFFECT
    if wind < 1.0 and temp < 10:
        return "BOWL_EFFECT"
    # Rule 5: CLEAN
    if pm25 < 60:
        return "CLEAN"
    return None


def main():
    print("--- Starting Training Pipeline ---")

    # 1. Data Ingestion
    all_files = glob.glob(os.path.join(DATA_DIR, "*.csv"))
    print(f"Found {len(all_files)} CSV files.")

    df_list = []
    for f in all_files:
        try:
            temp_df = pd.read_csv(f)
            df_list.append(temp_df)
        except Exception as e:
            print(f"Error reading {f}: {e}")

    if not df_list:
        print("No data found. Exiting.")
        return

    master_df = pd.concat(df_list, ignore_index=True)
    print(f"Merged Data Shape (Raw): {master_df.shape}")

    # Column standardization: the source datasets already expose PM2.5, NO2, SO2
    # and CO under those exact names, so no renaming is needed here.

    # 2. Preprocessing & Imputation
    # Forward/backward fill NaNs in the pollutant columns
    master_df[REQUIRED_COLS] = master_df[REQUIRED_COLS].ffill().bfill()

    # Synthetic Weather
    print("Generating Synthetic Weather...")
    master_df = generate_synthetic_weather(master_df)

    # Final check for NaNs in features
    master_df[FEATURES] = master_df[FEATURES].ffill().bfill()
    master_df.dropna(subset=FEATURES, inplace=True)  # Drop if any still remain
    print(f"Data Shape after Preprocessing: {master_df.shape}")
    # 3. Labeling
    print("Labeling Data...")
    master_df['target'] = master_df.apply(label_data, axis=1)

    # Keep only the rows the rules could label
    labeled_df = master_df[master_df['target'].notna()].copy()
    print(f"Labeled Data Shape: {labeled_df.shape}")
    print("Class Distribution:")
    print(labeled_df['target'].value_counts())

    if labeled_df.empty:
        print("No classes labeled! Adjust rules.")
        return

    # 4. Model A: RandomForest (Source Classification)
    print("Training Source Classifier (RandomForest)...")
    X = labeled_df[FEATURES]
    y = labeled_df['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    print("Source Model Evaluation:")
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))

    joblib.dump(clf, os.path.join(MODEL_DIR, "source_model.pkl"))
    print("Saved source_model.pkl")

    # 5. Model B: IsolationForest (Anomaly Detection)
    # IsolationForest is typically fit on "normal" data to flag outliers, but the
    # goal here is a general anomaly detector for sensor readings, so it is fit
    # on the full merged feature set with an assumed 5% contamination rate.
    print("Training Anomaly Detector (IsolationForest)...")
    iso = IsolationForest(contamination=0.05, random_state=42)
    iso.fit(master_df[FEATURES])

    joblib.dump(iso, os.path.join(MODEL_DIR, "fraud_model.pkl"))
    print("Saved fraud_model.pkl")

    print("--- Training Complete ---")


if __name__ == "__main__":
    main()
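
# --- Illustrative usage (sketch only, not part of the training pipeline) ---
# Assuming this script has been run and has written source_model.pkl and
# fraud_model.pkl, a separate inference script could load them roughly as
# below. The sample values are made up purely for illustration.
#
#   import joblib
#   import pandas as pd
#
#   clf = joblib.load("source_model.pkl")   # RandomForest source classifier
#   iso = joblib.load("fraud_model.pkl")    # IsolationForest anomaly detector
#
#   # Columns must match FEATURES (same names, same order) used at fit time.
#   sample = pd.DataFrame([{
#       "PM2.5": 250, "NO2": 40, "SO2": 8, "CO": 2.5, "Wind": 1.2, "Temp": 9.0,
#   }])
#   print(clf.predict(sample))   # predicted source label, e.g. "FARM_FIRE"
#   print(iso.predict(sample))   # 1 = normal reading, -1 = anomaly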