"""
Phase 3 – Data Preprocessing & Feature Engineering

Streamlit page that runs a configurable preprocessing pipeline on either the
Leads or the Bank Marketing dataset and stores the resulting train/test
matrices in ``st.session_state`` for the following phase.

Pipeline order (matches the expander titles shown to the user):
drop sparse columns -> remove duplicates -> impute -> feature engineering
(optionally polynomial interactions) -> categorical encoding -> feature matrix
-> stratified split -> scaling -> SMOTE -> RFE.
"""
import os
import sys
import warnings

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, PolynomialFeatures, StandardScaler

warnings.filterwarnings("ignore")

# Put the parent of this file's directory on sys.path so `utils` is importable.
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
from utils.helpers import load_leads, load_bank

# Identifier columns of the Leads dataset; they carry no predictive signal.
ID_COLS = ["Prospect ID", "Lead Number"]
# Selectbox option that additionally drops sparse rows before imputing.
ROW_DROP_STRATEGY = "Drop rows with >40% missing cols"
ROW_DROP_FRACTION = 0.40

st.set_page_config(page_title="Phase 3 – Preprocessing", page_icon="⚙️", layout="wide")

# NOTE(review): this literal is blank in the source; it looks like a stripped
# CSS/HTML block — confirm and restore the intended styles.
st.markdown(""" """, unsafe_allow_html=True)

# NOTE(review): the header markup appears to be missing from the source; only
# the text survives. It is kept verbatim (as escaped newlines, since a
# single-quoted literal cannot span physical lines) — restore the tags if known.
st.markdown(
    "\n\n⚙️ Phase 3 – Preprocessing & Feature Engineering\n\n"
    "\n\nImputation · Encoding · Feature creation · Scaling · SMOTE · Train-test split\n\n",
    unsafe_allow_html=True,
)

leads = load_leads()
bank = load_bank()

dataset_choice = st.radio(
    "Select Dataset", ["Leads (EdTech)", "Bank Marketing (UCI)"], horizontal=True
)
is_leads = dataset_choice.startswith("Leads")
df_raw = leads.copy() if is_leads else bank.copy()
target = "Converted" if is_leads else "y"
name = "Leads" if is_leads else "Bank"

# ── Step Configuration Panel ──────────────────────────────────────────────────
st.markdown("### ⚙️ Pipeline Configuration")
col1, col2, col3 = st.columns(3)
with col1:
    miss_strategy = st.selectbox(
        "Missing value strategy",
        ["Median/Mode", "Mean/Mode", "Drop rows with >40% missing cols"],
    )
    drop_threshold = st.slider("Drop column if missing % >", 10, 100, 40)
with col2:
    encoding_method = st.selectbox("Categorical encoding", ["Label Encoding", "One-Hot Encoding"])
    scaling_method = st.selectbox("Feature scaling", ["StandardScaler", "MinMaxScaler", "None"])
with col3:
    apply_smote = st.checkbox("Apply SMOTE (class balance)", value=True)
    apply_poly = st.checkbox("Add Polynomial Features", value=False)
    apply_rfe = st.checkbox("Apply RFE (Feature Selection)", value=False)

test_size = st.slider("Test set size (%)", 10, 40, 20)
random_state = 42

run = st.button("▶ Run Preprocessing Pipeline", type="primary", use_container_width=True)

if run:
    steps_log = []
    df = df_raw.copy()

    # ── Step 1: Drop high-missingness columns ─────────────────────────────────
    with st.expander("**Step 1 – Drop high-missing columns**", expanded=True):
        miss_pct = df.isnull().mean() * 100
        drop_cols = miss_pct[miss_pct > drop_threshold].index.tolist()
        # The target must survive even if it is sparse.
        if target in drop_cols:
            drop_cols.remove(target)
        df.drop(columns=drop_cols, inplace=True)
        msg = f"Dropped {len(drop_cols)} columns with >{drop_threshold}% missing: {drop_cols if drop_cols else 'None'}"
        st.info(msg)
        steps_log.append(("Drop high-missing cols", f"{len(drop_cols)} dropped"))

    # ── Step 2: Drop duplicates ───────────────────────────────────────────────
    with st.expander("**Step 2 – Remove duplicates**", expanded=True):
        before = len(df)
        df.drop_duplicates(inplace=True)
        removed = before - len(df)
        st.info(f"Removed {removed} duplicate rows. Remaining: {len(df):,}")
        steps_log.append(("Remove duplicates", f"{removed} rows removed"))

    # ── Step 3: Impute missing values ─────────────────────────────────────────
    with st.expander("**Step 3 – Missing value imputation**", expanded=True):
        # A label must never be imputed: rows without a target are removed
        # instead of being filled with the median/mean of the label column.
        n_missing_target = int(df[target].isnull().sum())
        if n_missing_target:
            df = df.dropna(subset=[target]).copy()
            st.warning(f"⚠️ Dropped {n_missing_target} rows with a missing target ('{target}').")

        # The row-drop strategy removes sparse rows first; whatever is still
        # missing afterwards is filled with the mean (numeric) / mode (categorical).
        if miss_strategy == ROW_DROP_STRATEGY:
            row_missing_frac = df.drop(columns=[target]).isnull().mean(axis=1)
            sparse_rows = row_missing_frac > ROW_DROP_FRACTION
            n_sparse = int(sparse_rows.sum())
            df = df.loc[~sparse_rows].copy()
            st.info(f"Dropped {n_sparse} rows with more than 40% of their values missing.")

        use_median = "Median" in miss_strategy
        numeric_label = "Median" if use_median else "Mean"

        num_cols = [c for c in df.select_dtypes(include=np.number).columns if c != target]
        cat_cols = [c for c in df.select_dtypes(include="object").columns if c != target]

        imputed_num = 0
        for col in num_cols:
            if df[col].isnull().any():
                fill_val = df[col].median() if use_median else df[col].mean()
                # Plain assignment instead of chained `inplace=True`, which does
                # not update `df` under pandas Copy-on-Write.
                df[col] = df[col].fillna(fill_val)
                imputed_num += 1

        imputed_cat = 0
        for col in cat_cols:
            if df[col].isnull().any():
                modes = df[col].mode()
                df[col] = df[col].fillna(modes.iloc[0] if not modes.empty else "Unknown")
                imputed_cat += 1

        col1, col2 = st.columns(2)
        col1.success(f"✅ Imputed {imputed_num} numeric columns ({numeric_label})")
        col2.success(f"✅ Imputed {imputed_cat} categorical columns (Mode)")
        steps_log.append(("Imputation", f"{imputed_num} numeric, {imputed_cat} categorical"))

        # Remaining missing
        remaining = df.isnull().sum().sum()
        if remaining == 0:
            st.success("✅ No missing values remaining.")
        else:
            st.warning(f"⚠️ {remaining} missing values still present after imputation.")

    # Drop non-predictive ID columns *before* feature engineering and encoding:
    # otherwise they take part in polynomial interactions and, with one-hot
    # encoding, expand into one dummy column per row that is never removed.
    if is_leads:
        df = df.drop(columns=[c for c in ID_COLS if c in df.columns])

    # ── Step 4: Feature Engineering ──────────────────────────────────────────
    with st.expander("**Step 4 – Feature Engineering**", expanded=True):
        new_feats = []
        if is_leads:
            if "TotalVisits" in df.columns and "Total Time Spent on Website" in df.columns:
                df["Time_Per_Visit"] = (
                    df["Total Time Spent on Website"] / (df["TotalVisits"] + 1)
                ).round(2)
                new_feats.append("Time_Per_Visit")
            # All three inputs are required; any of them may have been dropped in Step 1.
            engagement_inputs = ["TotalVisits", "Total Time Spent on Website", "Page Views Per Visit"]
            if all(c in df.columns for c in engagement_inputs):
                df["Engagement_Score"] = (
                    df["TotalVisits"] * 0.3
                    + df["Total Time Spent on Website"].fillna(0) * 0.5
                    + df["Page Views Per Visit"].fillna(0) * 0.2
                ).round(2)
                new_feats.append("Engagement_Score")
        else:
            if "duration" in df.columns and "campaign" in df.columns:
                df["Avg_Duration_Per_Contact"] = (df["duration"] / (df["campaign"] + 1)).round(2)
                new_feats.append("Avg_Duration_Per_Contact")
            if "pdays" in df.columns:
                df["Previously_Contacted"] = (df["pdays"] != -1).astype(int)
                new_feats.append("Previously_Contacted")
            if "age" in df.columns:
                df["Age_Group"] = pd.cut(
                    df["age"],
                    bins=[0, 25, 35, 50, 65, 100],
                    labels=["<25", "25-35", "35-50", "50-65", "65+"],
                )
                new_feats.append("Age_Group")

        if new_feats:
            st.success(f"✅ Created {len(new_feats)} new features: {new_feats}")
        else:
            st.info("No additional feature engineering applied.")

        # ── Optional: Polynomial Features ─────────────────────────────────────
        if apply_poly:
            # The target is still in `df` under its original name at this point
            # ("_target" is only created in Step 5), so it must be excluded here
            # or its interaction terms leak the label into the features.
            excluded = {target, "_target"}
            num_cols_poly = [
                c for c in df.select_dtypes(include=np.number).columns if c not in excluded
            ]
            if len(num_cols_poly) >= 2:
                poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
                poly_data = poly.fit_transform(df[num_cols_poly].fillna(0))
                poly_cols = poly.get_feature_names_out(num_cols_poly)
                df_poly = pd.DataFrame(poly_data, columns=poly_cols, index=df.index)
                # Update df with poly features (preserving non-numeric and target)
                df = pd.concat([df.drop(columns=num_cols_poly), df_poly], axis=1)
                st.success(f"✅ Added polynomial features. Total columns: {df.shape[1]}")
                new_feats.append("Polynomial Features")
            else:
                st.info("Polynomial features skipped: fewer than two numeric columns.")

        steps_log.append(("Feature Engineering", f"{len(new_feats)} new features"))

    # ── Step 5: Encode categoricals ───────────────────────────────────────────
    with st.expander("**Step 5 – Categorical Encoding**", expanded=True):
        # Refresh cat_cols after feature engineering
        cat_cols = [
            c for c in df.select_dtypes(include=["object", "category"]).columns if c != target
        ]

        # Encode target
        if target == "y":
            df["_target"] = (df["y"] == "yes").astype(int)
        else:
            df["_target"] = df["Converted"].astype(int)

        # Encode features
        if encoding_method == "Label Encoding":
            for col in cat_cols:
                df[col] = LabelEncoder().fit_transform(df[col].astype(str))
            st.success(f"✅ Label-encoded {len(cat_cols)} categorical columns.")
        else:
            # One-hot
            before_cols = df.shape[1]
            df = pd.get_dummies(df, columns=cat_cols, drop_first=True, dtype=int)
            after_cols = df.shape[1]
            st.success(
                f"✅ One-hot encoded {len(cat_cols)} columns → {after_cols - before_cols} new binary columns."
            )
        steps_log.append(("Encoding", f"{encoding_method} on {len(cat_cols)} cols"))

    # Remove the original target column(s); only "_target" is kept as the label.
    original_targets = [c for c in dict.fromkeys([target, "Converted", "y"]) if c in df.columns]
    df = df.drop(columns=original_targets)

    # ── Step 6: Final feature matrix ──────────────────────────────────────────
    y_col = df["_target"]
    # Ensure all numeric
    X = df.drop(columns=["_target"]).select_dtypes(include=np.number)

    with st.expander("**Step 6 – Feature Matrix Shape**", expanded=True):
        st.info(
            f"Feature matrix X: {X.shape[0]:,} rows × {X.shape[1]} features | "
            f"Target y: {y_col.value_counts().to_dict()}"
        )
        st.dataframe(X.head(3), use_container_width=True)

    # ── Step 7: Train-Test Split ──────────────────────────────────────────────
    with st.expander("**Step 7 – Train/Test Split**", expanded=True):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y_col, test_size=test_size / 100, random_state=random_state, stratify=y_col
        )
        col1, col2 = st.columns(2)
        col1.metric("Training Samples", f"{len(X_train):,}")
        col2.metric("Test Samples", f"{len(X_test):,}")

        fig = px.bar(
            x=["Train", "Test"],
            y=[len(X_train), len(X_test)],
            color=["Train", "Test"],
            title="Train/Test Split",
            color_discrete_sequence=["#2d6a9f", "#e74c3c"],
        )
        fig.update_layout(showlegend=False, height=300)
        st.plotly_chart(fig, use_container_width=True)
        steps_log.append(("Train/Test Split", f"{100-test_size}/{test_size}"))

    # ── Step 8: Feature Scaling ───────────────────────────────────────────────
    with st.expander("**Step 8 – Feature Scaling**", expanded=True):
        if scaling_method != "None":
            scaler = StandardScaler() if scaling_method == "StandardScaler" else MinMaxScaler()
            # Fit on the training split only; the test split is transformed with
            # the training statistics.
            X_train_sc = pd.DataFrame(
                scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
            )
            X_test_sc = pd.DataFrame(
                scaler.transform(X_test), columns=X_test.columns, index=X_test.index
            )
            st.success(f"✅ Applied {scaling_method} to {X_train.shape[1]} features.")
        else:
            X_train_sc, X_test_sc, scaler = X_train, X_test, None
            st.info("No scaling applied.")
        steps_log.append(("Scaling", scaling_method))

    # ── Step 9: SMOTE ─────────────────────────────────────────────────────────
    with st.expander("**Step 9 – SMOTE Class Balancing**", expanded=True):
        if apply_smote:
            # Fill any NaNs that survived imputation / were introduced by scaling.
            # X_train_sc is a DataFrame on both scaling branches.
            X_train_sc = X_train_sc.fillna(X_train_sc.median())

            before_dist = pd.Series(y_train).value_counts()
            sm = SMOTE(random_state=random_state)
            X_res, y_res = sm.fit_resample(X_train_sc, y_train)
            after_dist = pd.Series(y_res).value_counts()

            col1, col2 = st.columns(2)
            with col1:
                st.markdown("**Before SMOTE**")
                fig_b = px.pie(
                    values=before_dist.values,
                    names=before_dist.index.astype(str),
                    hole=0.4,
                    color_discrete_sequence=["#3498db", "#e74c3c"],
                )
                st.plotly_chart(fig_b, use_container_width=True)
            with col2:
                st.markdown("**After SMOTE**")
                fig_a = px.pie(
                    values=after_dist.values,
                    names=after_dist.index.astype(str),
                    hole=0.4,
                    color_discrete_sequence=["#3498db", "#e74c3c"],
                )
                st.plotly_chart(fig_a, use_container_width=True)

            st.success(f"✅ SMOTE applied: {len(X_res):,} training samples (was {len(X_train):,})")
            X_train_final, y_train_final = X_res, y_res
            steps_log.append(("SMOTE", f"{len(X_res):,} resampled"))
        else:
            X_train_final, y_train_final = X_train_sc, y_train
            st.info("SMOTE not applied.")

    # ── Step 10: RFE (Feature Selection) ──────────────────────────────────────
    with st.expander("**Step 10 – RFE Feature Selection**", expanded=True):
        if apply_rfe:
            n_available = X_train_final.shape[1]
            # Keep at least 10 features, but never ask for more than exist.
            n_features_to_select = min(n_available, max(10, int(n_available * 0.5)))
            st.info(f"Selecting top {n_features_to_select} features using RF selector...")
            selector = RFE(
                estimator=RandomForestClassifier(n_estimators=50, random_state=random_state),
                n_features_to_select=n_features_to_select,
                step=5,
            )
            selector.fit(X_train_final, y_train_final)
            selected_cols = X_train_final.columns[selector.support_]
            # Slice the DataFrames rather than rebuilding from an ndarray so the
            # row index stays aligned with y_train_final.
            X_train_final = X_train_final[selected_cols]
            X_test_sc = X_test_sc[selected_cols]
            st.success(f"✅ RFE complete. Selected {X_train_final.shape[1]} features.")
            steps_log.append(("RFE", f"Selected {X_train_final.shape[1]} features"))
        else:
            st.info("RFE not applied.")

    # ── Save to session state ─────────────────────────────────────────────────
    # NOTE(review): when RFE is applied the stored scaler was fitted on the full
    # pre-RFE feature set, while `feature_names_*` lists only the selected
    # columns — confirm how the next phase uses the scaler.
    key = "leads" if is_leads else "bank"
    st.session_state[f"X_train_{key}"] = X_train_final
    st.session_state[f"X_test_{key}"] = X_test_sc
    st.session_state[f"y_train_{key}"] = y_train_final
    st.session_state[f"y_test_{key}"] = y_test
    st.session_state[f"feature_names_{key}"] = X_train_final.columns.tolist()
    st.session_state[f"scaler_{key}"] = scaler
    st.session_state[f"preprocessed_{key}"] = True

    # ── Pipeline Summary ──────────────────────────────────────────────────────
    st.markdown("---")
    st.markdown("### ✅ Preprocessing Pipeline Summary")
    summary_df = pd.DataFrame(steps_log, columns=["Step", "Result"])
    summary_df.index = range(1, len(summary_df) + 1)
    st.dataframe(summary_df, use_container_width=True)

    st.success(
        f"✅ {name} dataset preprocessed and saved to session state. "
        "Proceed to **Phase 4 – Lead Scoring Models** →"
    )

else:
    # Landing view shown until the pipeline is run.
    st.markdown(""" ### Pipeline Steps """)
    steps = [
        ("1", "Drop columns with >threshold% missing values"),
        ("2", "Remove duplicate records"),
        ("3", "Impute missing values (Median/Mean for numeric, Mode for categorical)"),
        ("4", "Feature engineering (engagement score, temporal features, age groups)"),
        ("5", "Categorical encoding (Label Encoding or One-Hot Encoding)"),
        ("6", "Prepare feature matrix X and target vector y"),
        ("7", "Stratified train/test split"),
        ("8", "Feature scaling (StandardScaler / MinMaxScaler)"),
        ("9", "SMOTE oversampling for class imbalance"),
    ]
    for num, desc in steps:
        # NOTE(review): the wrapping markup for each step appears to be missing
        # from the source; only the text survives and is kept verbatim.
        st.markdown(f"\n\nStep {num} – {desc}\n\n", unsafe_allow_html=True)
    st.info("Configure the pipeline above and click **Run Preprocessing Pipeline**.")