# Air-quality training pipeline: labels pollution sources via physics rules,
# then trains a RandomForest source classifier and an IsolationForest
# anomaly detector on the merged sensor datasets.
import glob
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# --- CONFIG ---
DATA_DIR = "datasets"   # folder scanned for input *.csv files
MODEL_DIR = "."         # destination for the trained model pickles
# Pollutant columns that must exist in the merged data (gaps are imputed).
REQUIRED_COLS = ["PM2.5", "NO2", "SO2", "CO"]
# Full feature set fed to both models: pollutants + (synthetic) weather.
FEATURES = ["PM2.5", "NO2", "SO2", "CO", "Wind", "Temp"]
def generate_synthetic_weather(df):
    """
    Fill missing Wind / Temp columns with season-plausible synthetic values.

    Strategy (month parsed from 'Timestamp', DD-MM-YYYY):
      - Winter (Nov, Dec, Jan, Feb): low wind (0.5-2.0 m/s), low temp (5-15 C)
      - Other months:                higher wind (2.0-5.0 m/s), higher temp (25-40 C)

    Existing (non-NaN) Wind/Temp values are preserved; only gaps are filled.
    Rows whose Timestamp fails to parse become NaT and fall into the
    non-winter bucket (NaN month is not in the winter list).

    Args:
        df: DataFrame with a 'Timestamp' column of DD-MM-YYYY strings.

    Returns:
        The same DataFrame (mutated in place) with 'Wind' and 'Temp' populated.
    """
    # Source data looks like 01-01-2020 -> DD-MM-YYYY; coerce bad rows to NaT.
    # (dayfirst is implied by the explicit format and omitted to avoid the
    # pandas >= 2.0 warning about passing both.)
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d-%m-%Y', errors='coerce')
    mask_winter = df['Timestamp'].dt.month.isin([11, 12, 1, 2])

    for col, winter_range, other_range in (
        ('Wind', (0.5, 2.0), (2.0, 5.0)),
        ('Temp', (5.0, 15.0), (25.0, 40.0)),
    ):
        if col not in df.columns:
            df[col] = np.nan
        for season_mask, (lo, hi) in ((mask_winter, winter_range),
                                      (~mask_winter, other_range)):
            # BUGFIX: size the random draw by the *combined* mask
            # (season AND missing), not the season count alone. Sizing by
            # the season count raised a length-mismatch ValueError whenever
            # the column already contained partial data.
            fill_mask = season_mask & df[col].isna()
            n_fill = int(fill_mask.sum())
            if n_fill > 0:
                df.loc[fill_mask, col] = np.random.uniform(lo, hi, n_fill)
    return df
def label_data(row):
    """
    Classify the dominant pollution source of one reading via physics rules.

    Rules are checked in priority order and the first match wins. Returns
    "INDUSTRY", "FARM_FIRE", "TRAFFIC", "BOWL_EFFECT", "CLEAN", or None
    when no rule fires (ambiguous reading). Missing fields default to 0.
    """
    pm25 = row.get('PM2.5', 0)
    no2 = row.get('NO2', 0)
    so2 = row.get('SO2', 0)
    co = row.get('CO', 0)
    wind = row.get('Wind', 0)
    temp = row.get('Temp', 0)

    # Guard the SO2/NO2 ratio against a zero (or missing) NO2 reading.
    so2_no2_ratio = (so2 / no2) if no2 > 0 else 0

    rules = (
        (so2 > 15 and so2_no2_ratio > 0.3, "INDUSTRY"),     # sulfur-rich plumes
        (co > 2.0 and pm25 > 200,          "FARM_FIRE"),    # combustion + heavy PM
        (no2 > 60 and so2 < 10,            "TRAFFIC"),      # NOx-dominated exhaust
        (wind < 1.0 and temp < 10,         "BOWL_EFFECT"),  # stagnant cold air
        (pm25 < 60,                        "CLEAN"),
    )
    for fired, label in rules:
        if fired:
            return label
    return None
def main():
    """Run the pipeline: ingest CSVs, impute, label, train and save models."""
    print("--- Starting Training Pipeline ---")

    # 1. Data Ingestion: merge every CSV under DATA_DIR.
    all_files = glob.glob(os.path.join(DATA_DIR, "*.csv"))
    print(f"Found {len(all_files)} CSV files.")
    df_list = []
    for f in all_files:
        try:
            df_list.append(pd.read_csv(f))
        except Exception as e:
            # Best-effort ingestion: one bad file must not abort training.
            print(f"Error reading {f}: {e}")
    if not df_list:
        print("No data found. Exiting.")
        return
    master_df = pd.concat(df_list, ignore_index=True)
    print(f"Merged Data Shape (Raw): {master_df.shape}")

    # Guard: a CSV lacking a required pollutant column previously surfaced
    # as an opaque KeyError at the imputation line below.
    missing = [c for c in REQUIRED_COLS if c not in master_df.columns]
    if missing:
        print(f"Missing required columns: {missing}. Exiting.")
        return

    # 2. Preprocessing & Imputation: forward- then back-fill pollutant gaps.
    master_df[REQUIRED_COLS] = master_df[REQUIRED_COLS].ffill().bfill()

    # Synthesize Wind/Temp where the sensors did not provide them.
    print("Generating Synthetic Weather...")
    master_df = generate_synthetic_weather(master_df)

    # Final sweep over all features; drop any rows still incomplete.
    master_df[FEATURES] = master_df[FEATURES].ffill().bfill()
    master_df.dropna(subset=FEATURES, inplace=True)
    print(f"Data Shape after Preprocessing: {master_df.shape}")

    # 3. Labeling via physics rules; unlabeled rows are excluded from training.
    print("Labeling Data...")
    master_df['target'] = master_df.apply(label_data, axis=1)
    labeled_df = master_df[master_df['target'].notna()].copy()
    print(f"Labeled Data Shape: {labeled_df.shape}")
    print("Class Distribution:")
    print(labeled_df['target'].value_counts())
    if labeled_df.empty:
        print("No classes labeled! Adjust rules.")
        return

    # 4. Model A: RandomForest source classifier.
    print("Training Source Classifier (RandomForest)...")
    X = labeled_df[FEATURES]
    y = labeled_df['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    print("Source Model Evaluation:")
    print(classification_report(y_test, clf.predict(X_test)))
    # FIX: honor MODEL_DIR instead of the previously hard-coded cwd path.
    source_path = os.path.join(MODEL_DIR, "source_model.pkl")
    joblib.dump(clf, source_path)
    print(f"Saved {source_path}")

    # 5. Model B: IsolationForest anomaly detector, trained on ALL readings;
    # flags the most unusual ~5% of feature vectors (contamination=0.05).
    print("Training Anomaly Detector (IsolationForest)...")
    iso = IsolationForest(contamination=0.05, random_state=42)
    iso.fit(master_df[FEATURES])
    fraud_path = os.path.join(MODEL_DIR, "fraud_model.pkl")
    joblib.dump(iso, fraud_path)
    print(f"Saved {fraud_path}")
    print("--- Training Complete ---")


if __name__ == "__main__":
    main()