# Envirolytics_backend / train_model.py
# Author: Ayush Modi
# Commit: "Clean deploy with LFS" (e4667e2)
import pandas as pd
import numpy as np
import glob
import joblib
import os
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
# --- CONFIG ---
# Directory scanned (non-recursively) for input *.csv files.
DATA_DIR = "datasets"
# Output directory for trained model artifacts.
# NOTE(review): declared here but the save calls below hard-code filenames.
MODEL_DIR = "."
# Pollutant columns that must be present/imputed before feature building.
REQUIRED_COLS = ["PM2.5", "NO2", "SO2", "CO"]
# Full feature set used by both models (weather may be synthetically generated).
FEATURES = ["PM2.5", "NO2", "SO2", "CO", "Wind", "Temp"]
def generate_synthetic_weather(df):
    """
    Fill missing 'Wind' and 'Temp' values with season-dependent synthetic data.

    Strategy (month taken from the 'Timestamp' column, DD-MM-YYYY):
      - Winter (Nov, Dec, Jan, Feb): Wind ~ U(0.5, 2.0) m/s, Temp ~ U(5, 15) C
      - Summer/Other months:         Wind ~ U(2.0, 5.0) m/s, Temp ~ U(25, 40) C

    Existing (non-NaN) Wind/Temp values are preserved; only gaps are filled.

    Args:
        df: DataFrame with a 'Timestamp' column of DD-MM-YYYY strings.

    Returns:
        The same DataFrame (mutated in place), with 'Timestamp' parsed to
        datetime and 'Wind'/'Temp' columns fully populated.
    """
    # Parse DD-MM-YYYY timestamps; unparseable entries become NaT, whose
    # month is NaN and therefore falls into the non-winter bucket.
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d-%m-%Y',
                                     dayfirst=True, errors='coerce')
    mask_winter = df['Timestamp'].dt.month.isin([11, 12, 1, 2])

    # (column, winter range, other-season range)
    specs = [
        ('Wind', (0.5, 2.0), (2.0, 5.0)),
        ('Temp', (5.0, 15.0), (25.0, 40.0)),
    ]
    for col, (w_lo, w_hi), (o_lo, o_hi) in specs:
        if col not in df.columns:
            df[col] = np.nan
        # BUG FIX: size each random draw by the combined (season AND missing)
        # mask, not by the season mask alone. The original drew n_winter
        # values but assigned through `mask_winter & isna()`, which raises a
        # length-mismatch ValueError whenever the column is partially filled.
        fill_winter = mask_winter & df[col].isna()
        if fill_winter.any():
            df.loc[fill_winter, col] = np.random.uniform(
                w_lo, w_hi, int(fill_winter.sum()))
        fill_other = (~mask_winter) & df[col].isna()
        if fill_other.any():
            df.loc[fill_other, col] = np.random.uniform(
                o_lo, o_hi, int(fill_other.sum()))
    return df
def label_data(row):
    """
    Classify a reading's dominant pollution source via physics-based rules.

    Rules are evaluated in priority order (INDUSTRY > FARM_FIRE > TRAFFIC >
    BOWL_EFFECT > CLEAN); the first match wins.

    Args:
        row: mapping-like object (Series/dict) exposing .get() for pollutant
             and weather fields; missing fields default to 0.

    Returns:
        Source label string, or None when no rule matches.
    """
    pm25, no2, so2, co = (row.get(k, 0) for k in ('PM2.5', 'NO2', 'SO2', 'CO'))
    wind = row.get('Wind', 0)
    temp = row.get('Temp', 0)

    # Guard against division by zero when NO2 is absent or non-positive.
    so2_no2_ratio = so2 / no2 if no2 > 0 else 0

    # (label, condition) pairs in priority order.
    rules = (
        ("INDUSTRY", so2 > 15 and so2_no2_ratio > 0.3),
        ("FARM_FIRE", co > 2.0 and pm25 > 200),
        ("TRAFFIC", no2 > 60 and so2 < 10),
        ("BOWL_EFFECT", wind < 1.0 and temp < 10),
        ("CLEAN", pm25 < 60),
    )
    for label, matched in rules:
        if matched:
            return label
    return None
def main():
    """
    Run the full training pipeline: ingest CSVs, impute, label, train, save.

    Side effects: reads every *.csv under DATA_DIR, writes
    'source_model.pkl' and 'fraud_model.pkl' into MODEL_DIR, and prints
    progress/diagnostics to stdout.
    """
    print("--- Starting Training Pipeline ---")

    # 1. Data ingestion: merge every CSV found in DATA_DIR.
    all_files = glob.glob(os.path.join(DATA_DIR, "*.csv"))
    print(f"Found {len(all_files)} CSV files.")
    df_list = []
    for f in all_files:
        try:
            df_list.append(pd.read_csv(f))
        except Exception as e:
            # Best-effort ingestion: report unreadable files and keep going.
            print(f"Error reading {f}: {e}")
    if not df_list:
        print("No data found. Exiting.")
        return
    master_df = pd.concat(df_list, ignore_index=True)
    print(f"Merged Data Shape (Raw): {master_df.shape}")

    # 2. Preprocessing & imputation: forward- then backward-fill pollutant
    # gaps, synthesize weather, and drop any rows that are still incomplete.
    master_df[REQUIRED_COLS] = master_df[REQUIRED_COLS].ffill().bfill()
    print("Generating Synthetic Weather...")
    master_df = generate_synthetic_weather(master_df)
    master_df[FEATURES] = master_df[FEATURES].ffill().bfill()
    master_df.dropna(subset=FEATURES, inplace=True)  # drop if any still remain
    print(f"Data Shape after Preprocessing: {master_df.shape}")

    # 3. Rule-based labeling (see label_data); unlabeled rows are excluded
    # from classifier training but still feed the anomaly detector below.
    print("Labeling Data...")
    master_df['target'] = master_df.apply(label_data, axis=1)
    labeled_df = master_df[master_df['target'].notna()].copy()
    print(f"Labeled Data Shape: {labeled_df.shape}")
    print("Class Distribution:")
    print(labeled_df['target'].value_counts())
    if labeled_df.empty:
        print("No classes labeled! Adjust rules.")
        return

    # 4. Model A: RandomForest source classifier on the labeled subset.
    print("Training Source Classifier (RandomForest)...")
    X = labeled_df[FEATURES]
    y = labeled_df['target']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    print("Source Model Evaluation:")
    print(classification_report(y_test, clf.predict(X_test)))
    # FIX: honour the MODEL_DIR config constant instead of hard-coding the
    # working directory (MODEL_DIR defaults to ".", so output is unchanged).
    joblib.dump(clf, os.path.join(MODEL_DIR, "source_model.pkl"))
    print("Saved source_model.pkl")

    # 5. Model B: IsolationForest anomaly detector, trained on ALL readings
    # (labeled or not) to flag generally abnormal sensor values.
    print("Training Anomaly Detector (IsolationForest)...")
    iso = IsolationForest(contamination=0.05, random_state=42)
    iso.fit(master_df[FEATURES])
    joblib.dump(iso, os.path.join(MODEL_DIR, "fraud_model.pkl"))
    print("Saved fraud_model.pkl")
    print("--- Training Complete ---")
# Script entry point: only run the pipeline when executed directly.
if __name__ == "__main__":
main()