File size: 5,673 Bytes
e4667e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
import pandas as pd
import numpy as np
import glob
import joblib
import os
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# --- CONFIG ---
DATA_DIR = "datasets"  # folder scanned for input *.csv sensor files
MODEL_DIR = "."  # output directory for the trained model pickles
# Pollutant columns expected in every input CSV (gap-filled before training).
REQUIRED_COLS = ["PM2.5", "NO2", "SO2", "CO"]
# Full feature set fed to both models; Wind/Temp may be synthetically generated.
FEATURES = ["PM2.5", "NO2", "SO2", "CO", "Wind", "Temp"]

def generate_synthetic_weather(df):
    """
    Fill missing 'Wind' and 'Temp' values with season-dependent synthetic data.

    Strategy (month taken from the 'Timestamp' column):
    - Winter (Nov, Dec, Jan, Feb): low wind (0.5 - 2.0 m/s), low temp (5 - 15 C)
    - Summer/other months:         higher wind (2.0 - 5.0 m/s), higher temp (25 - 40 C)

    Existing non-NaN Wind/Temp values are preserved; only gaps are filled.
    Mutates *df* in place ('Timestamp' is converted to datetime) and returns it.
    """
    # Timestamps in the source data look like 01-01-2020 -> DD-MM-YYYY.
    # Unparseable values become NaT and fall into the non-winter bucket below.
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d-%m-%Y', dayfirst=True, errors='coerce')

    mask_winter = df['Timestamp'].dt.month.isin([11, 12, 1, 2])

    def _fill(col, winter_range, other_range):
        # Ensure the column exists so the isna() masks below are well-defined.
        if col not in df.columns:
            df[col] = np.nan
        for season_mask, (lo, hi) in ((mask_winter, winter_range), (~mask_winter, other_range)):
            # BUG FIX: size the random draw by the rows actually being filled.
            # The old code drew `season_mask.sum()` samples but assigned them to
            # `season_mask & isna()` rows, raising a length-mismatch ValueError
            # whenever the column already had some real values.
            fill_mask = season_mask & df[col].isna()
            n_fill = fill_mask.sum()
            if n_fill > 0:
                df.loc[fill_mask, col] = np.random.uniform(lo, hi, n_fill)

    _fill('Wind', (0.5, 2.0), (2.0, 5.0))
    _fill('Temp', (5.0, 15.0), (25.0, 40.0))

    return df

def label_data(row):
    """
    Apply Physics Rules to label the primary source.
    """
    pm25 = row.get('PM2.5', 0)
    no2 = row.get('NO2', 0)
    so2 = row.get('SO2', 0)
    co = row.get('CO', 0)
    wind = row.get('Wind', 0)
    temp = row.get('Temp', 0)
    
    # Avoid division by zero
    so2_no2_ratio = so2 / no2 if no2 > 0 else 0
    
    # Rule 1: INDUSTRY
    if so2 > 15 and so2_no2_ratio > 0.3:
        return "INDUSTRY"
    
    # Rule 2: FARM_FIRE
    if co > 2.0 and pm25 > 200:
        return "FARM_FIRE"
    
    # Rule 3: TRAFFIC
    if no2 > 60 and so2 < 10:
        return "TRAFFIC"
    
    # Rule 4: BOWL_EFFECT
    if wind < 1.0 and temp < 10:
        return "BOWL_EFFECT"
    
    # Rule 5: CLEAN
    if pm25 < 60:
        return "CLEAN"
        
    return None

def main():
    """
    End-to-end training pipeline:
      1. Load and merge every CSV under DATA_DIR.
      2. Impute pollutant gaps and synthesize missing weather columns.
      3. Rule-label rows with a primary pollution source (label_data).
      4. Train/evaluate a RandomForest source classifier -> source_model.pkl.
      5. Train an IsolationForest anomaly detector       -> fraud_model.pkl.

    Prints progress to stdout throughout; returns None. Models are written
    into MODEL_DIR.
    """
    print("--- Starting Training Pipeline ---")

    # 1. Data Ingestion
    all_files = glob.glob(os.path.join(DATA_DIR, "*.csv"))
    print(f"Found {len(all_files)} CSV files.")

    df_list = []
    for f in all_files:
        try:
            temp_df = pd.read_csv(f)
            df_list.append(temp_df)
        except Exception as e:
            # Best-effort ingestion: skip unreadable files but report them.
            print(f"Error reading {f}: {e}")

    if not df_list:
        print("No data found. Exiting.")
        return

    master_df = pd.concat(df_list, ignore_index=True)
    print(f"Merged Data Shape (Raw): {master_df.shape}")

    # ROBUSTNESS FIX: fail with a clear message instead of a raw KeyError
    # when an expected pollutant column is absent from the merged data.
    missing = [c for c in REQUIRED_COLS if c not in master_df.columns]
    if missing:
        print(f"Missing required columns: {missing}. Exiting.")
        return

    # 2. Preprocessing & Imputation: forward- then back-fill pollutant gaps.
    master_df[REQUIRED_COLS] = master_df[REQUIRED_COLS].ffill().bfill()

    # Synthesize Wind/Temp (month-based) wherever they are missing.
    print("Generating Synthetic Weather...")
    master_df = generate_synthetic_weather(master_df)

    # Final sweep for NaNs in the model features; drop anything left over.
    master_df[FEATURES] = master_df[FEATURES].ffill().bfill()
    master_df.dropna(subset=FEATURES, inplace=True)

    print(f"Data Shape after Preprocessing: {master_df.shape}")

    # 3. Labeling via physics rules; rows matching no rule stay None.
    print("Labeling Data...")
    master_df['target'] = master_df.apply(label_data, axis=1)

    labeled_df = master_df[master_df['target'].notna()].copy()
    print(f"Labeled Data Shape: {labeled_df.shape}")
    print("Class Distribution:")
    print(labeled_df['target'].value_counts())

    if labeled_df.empty:
        print("No classes labeled! Adjust rules.")
        return

    # 4. Model A: RandomForest source classifier on the labeled subset.
    print("Training Source Classifier (RandomForest)...")
    X = labeled_df[FEATURES]
    y = labeled_df['target']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    print("Source Model Evaluation:")
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))

    # CONSISTENCY FIX: honour MODEL_DIR (it was defined but never used).
    joblib.dump(clf, os.path.join(MODEL_DIR, "source_model.pkl"))
    print("Saved source_model.pkl")

    # 5. Model B: IsolationForest anomaly detector, trained on ALL readings
    # (not just the labeled subset) so it flags globally unusual sensor values.
    print("Training Anomaly Detector (IsolationForest)...")
    iso = IsolationForest(contamination=0.05, random_state=42)
    iso.fit(master_df[FEATURES])

    joblib.dump(iso, os.path.join(MODEL_DIR, "fraud_model.pkl"))
    print("Saved fraud_model.pkl")

    print("--- Training Complete ---")

# Script entry point: run the full training pipeline when executed directly.
if __name__ == "__main__":
    main()