# Envirolytics_backend / train_model.py
# Author: Ayush Modi
# Commit: "Clean deploy with LFS" (e4667e2)
import pandas as pd
import numpy as np
import glob
import joblib
import os
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
# --- CONFIG ---
# Directory scanned (non-recursively) for input *.csv files.
DATA_DIR = "datasets"
# Output directory for trained model artifacts.
# NOTE(review): declared here but the save calls below hard-code filenames.
MODEL_DIR = "."
# Pollutant columns that must be present/imputed before feature building.
REQUIRED_COLS = ["PM2.5", "NO2", "SO2", "CO"]
# Full feature set used by both models (weather may be synthetically generated).
FEATURES = ["PM2.5", "NO2", "SO2", "CO", "Wind", "Temp"]
def generate_synthetic_weather(df):
    """
    Fill missing 'Wind' and 'Temp' values with season-dependent synthetic data.

    Strategy (month taken from the 'Timestamp' column, DD-MM-YYYY):
      - Winter (Nov, Dec, Jan, Feb): Wind ~ U(0.5, 2.0) m/s, Temp ~ U(5, 15) C
      - Summer/Other months:         Wind ~ U(2.0, 5.0) m/s, Temp ~ U(25, 40) C

    Existing (non-NaN) Wind/Temp values are preserved; only gaps are filled.

    Args:
        df: DataFrame with a 'Timestamp' column of DD-MM-YYYY strings.

    Returns:
        The same DataFrame (mutated in place), with 'Timestamp' parsed to
        datetime and 'Wind'/'Temp' columns fully populated.
    """
    # Parse DD-MM-YYYY timestamps; unparseable entries become NaT, whose
    # month is NaN and therefore falls into the non-winter bucket.
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d-%m-%Y',
                                     dayfirst=True, errors='coerce')
    mask_winter = df['Timestamp'].dt.month.isin([11, 12, 1, 2])

    # (column, winter range, other-season range)
    specs = [
        ('Wind', (0.5, 2.0), (2.0, 5.0)),
        ('Temp', (5.0, 15.0), (25.0, 40.0)),
    ]
    for col, (w_lo, w_hi), (o_lo, o_hi) in specs:
        if col not in df.columns:
            df[col] = np.nan
        # BUG FIX: size each random draw by the combined (season AND missing)
        # mask, not by the season mask alone. The original drew n_winter
        # values but assigned through `mask_winter & isna()`, which raises a
        # length-mismatch ValueError whenever the column is partially filled.
        fill_winter = mask_winter & df[col].isna()
        if fill_winter.any():
            df.loc[fill_winter, col] = np.random.uniform(
                w_lo, w_hi, int(fill_winter.sum()))
        fill_other = (~mask_winter) & df[col].isna()
        if fill_other.any():
            df.loc[fill_other, col] = np.random.uniform(
                o_lo, o_hi, int(fill_other.sum()))
    return df
def label_data(row):
    """
    Classify a reading's dominant pollution source via physics-based rules.

    Rules are evaluated in priority order (INDUSTRY > FARM_FIRE > TRAFFIC >
    BOWL_EFFECT > CLEAN); the first match wins.

    Args:
        row: mapping-like object (Series/dict) exposing .get() for pollutant
             and weather fields; missing fields default to 0.

    Returns:
        Source label string, or None when no rule matches.
    """
    pm25, no2, so2, co = (row.get(k, 0) for k in ('PM2.5', 'NO2', 'SO2', 'CO'))
    wind = row.get('Wind', 0)
    temp = row.get('Temp', 0)

    # Guard against division by zero when NO2 is absent or non-positive.
    so2_no2_ratio = so2 / no2 if no2 > 0 else 0

    # (label, condition) pairs in priority order.
    rules = (
        ("INDUSTRY", so2 > 15 and so2_no2_ratio > 0.3),
        ("FARM_FIRE", co > 2.0 and pm25 > 200),
        ("TRAFFIC", no2 > 60 and so2 < 10),
        ("BOWL_EFFECT", wind < 1.0 and temp < 10),
        ("CLEAN", pm25 < 60),
    )
    for label, matched in rules:
        if matched:
            return label
    return None
def main():
    """
    Run the full training pipeline: ingest CSVs, impute, label, train, save.

    Side effects: reads every *.csv under DATA_DIR, writes
    'source_model.pkl' and 'fraud_model.pkl' into MODEL_DIR, and prints
    progress/diagnostics to stdout.
    """
    print("--- Starting Training Pipeline ---")

    # 1. Data ingestion: merge every CSV found in DATA_DIR.
    all_files = glob.glob(os.path.join(DATA_DIR, "*.csv"))
    print(f"Found {len(all_files)} CSV files.")
    df_list = []
    for f in all_files:
        try:
            df_list.append(pd.read_csv(f))
        except Exception as e:
            # Best-effort ingestion: report unreadable files and keep going.
            print(f"Error reading {f}: {e}")
    if not df_list:
        print("No data found. Exiting.")
        return
    master_df = pd.concat(df_list, ignore_index=True)
    print(f"Merged Data Shape (Raw): {master_df.shape}")

    # 2. Preprocessing & imputation: forward- then backward-fill pollutant
    # gaps, synthesize weather, and drop any rows that are still incomplete.
    master_df[REQUIRED_COLS] = master_df[REQUIRED_COLS].ffill().bfill()
    print("Generating Synthetic Weather...")
    master_df = generate_synthetic_weather(master_df)
    master_df[FEATURES] = master_df[FEATURES].ffill().bfill()
    master_df.dropna(subset=FEATURES, inplace=True)  # drop if any still remain
    print(f"Data Shape after Preprocessing: {master_df.shape}")

    # 3. Rule-based labeling (see label_data); unlabeled rows are excluded
    # from classifier training but still feed the anomaly detector below.
    print("Labeling Data...")
    master_df['target'] = master_df.apply(label_data, axis=1)
    labeled_df = master_df[master_df['target'].notna()].copy()
    print(f"Labeled Data Shape: {labeled_df.shape}")
    print("Class Distribution:")
    print(labeled_df['target'].value_counts())
    if labeled_df.empty:
        print("No classes labeled! Adjust rules.")
        return

    # 4. Model A: RandomForest source classifier on the labeled subset.
    print("Training Source Classifier (RandomForest)...")
    X = labeled_df[FEATURES]
    y = labeled_df['target']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    print("Source Model Evaluation:")
    print(classification_report(y_test, clf.predict(X_test)))
    # FIX: honour the MODEL_DIR config constant instead of hard-coding the
    # working directory (MODEL_DIR defaults to ".", so output is unchanged).
    joblib.dump(clf, os.path.join(MODEL_DIR, "source_model.pkl"))
    print("Saved source_model.pkl")

    # 5. Model B: IsolationForest anomaly detector, trained on ALL readings
    # (labeled or not) to flag generally abnormal sensor values.
    print("Training Anomaly Detector (IsolationForest)...")
    iso = IsolationForest(contamination=0.05, random_state=42)
    iso.fit(master_df[FEATURES])
    joblib.dump(iso, os.path.join(MODEL_DIR, "fraud_model.pkl"))
    print("Saved fraud_model.pkl")
    print("--- Training Complete ---")
# Script entry point: only run the pipeline when executed directly.
if __name__ == "__main__":
main()