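# Lead-score/pages/3_Phase_3_Preprocessing.py
# Repository-page header captured with the file (not part of the program):
# uploader "prologlover91", commit "Upload 39 files", revision f382db8 (verified).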
"""
Phase 3 – Data Preprocessing & Feature Engineering
"""
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
import warnings, sys, os
warnings.filterwarnings("ignore")
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
from utils.helpers import load_leads, load_bank
# Page chrome: browser-tab configuration, shared CSS classes, and the phase
# banner. The icon and separator glyphs had been mangled by a wrong-codec
# decode of UTF-8 text; they are restored to the intended characters here.
st.set_page_config(page_title="Phase 3 – Preprocessing", page_icon="⚙️", layout="wide")

# CSS classes: .phase-header styles the banner below; .step-card / .step-num
# style the step list rendered while the pipeline has not been run.
st.markdown("""
<style>
.phase-header { background:linear-gradient(135deg,#1e3a5f,#2d6a9f);
color:white;padding:1.5rem 2rem;border-radius:10px;margin-bottom:1rem;}
.step-card { background:#f8faff;border:1px solid #d1e0f5;border-radius:8px;
padding:1rem 1.2rem;margin:.5rem 0; }
.step-num { font-weight:800;color:#2d6a9f;font-size:1.1rem; }
.badge-green { background:#d4edda;color:#155724;padding:.2rem .6rem;border-radius:12px;font-size:.8rem; }
.badge-blue { background:#cce5ff;color:#004085;padding:.2rem .6rem;border-radius:12px;font-size:.8rem; }
</style>
""", unsafe_allow_html=True)

# Banner: title plus a one-line outline of what the pipeline covers.
st.markdown(
    '<div class="phase-header"><h2>⚙️ Phase 3 – Preprocessing & Feature Engineering</h2>'
    '<p>Imputation · Encoding · Feature creation · Scaling · SMOTE · Train-test split</p></div>',
    unsafe_allow_html=True,
)
# Load both datasets, then let the user pick which one the pipeline works on.
leads = load_leads()
bank = load_bank()
dataset_choice = st.radio(
    "Select Dataset",
    ["Leads (EdTech)", "Bank Marketing (UCI)"],
    horizontal=True,
)
is_leads = dataset_choice.startswith("Leads")

# Working copy, label column and display name all follow from the choice.
if is_leads:
    df_raw, target, name = leads.copy(), "Converted", "Leads"
else:
    df_raw, target, name = bank.copy(), "y", "Bank"
# ── Step Configuration Panel ──────────────────────────────────────────────────
# Collects every pipeline option in three columns. The values are only acted
# on once the run button below is pressed. Heading and button glyphs were
# mojibake and are restored to the intended characters.
st.markdown("### ⚙️ Pipeline Configuration")
col1, col2, col3 = st.columns(3)
with col1:
    # How missing cells are handled, and when a whole column is discarded.
    miss_strategy = st.selectbox(
        "Missing value strategy",
        ["Median/Mode", "Mean/Mode", "Drop rows with >40% missing cols"],
    )
    drop_threshold = st.slider("Drop column if missing % >", 10, 100, 40)
with col2:
    encoding_method = st.selectbox("Categorical encoding", ["Label Encoding", "One-Hot Encoding"])
    scaling_method = st.selectbox("Feature scaling", ["StandardScaler", "MinMaxScaler", "None"])
with col3:
    apply_smote = st.checkbox("Apply SMOTE (class balance)", value=True)
    apply_poly = st.checkbox("Add Polynomial Features", value=False)
    apply_rfe = st.checkbox("Apply RFE (Feature Selection)", value=False)
    # Percentage of rows held out for testing; converted to a fraction at split time.
    test_size = st.slider("Test set size (%)", 10, 40, 20)

# Fixed seed shared by the split and the resampler so reruns are repeatable.
random_state = 42
run = st.button("▶ Run Preprocessing Pipeline", type="primary", use_container_width=True)
if run:
# Pipeline state: a (step, result) log for the final summary table and a
# private working copy of the selected dataset.
steps_log = []
df = df_raw.copy()

# ── Step 1: Drop high-missingness columns ─────────────────────────────────
with st.expander("**Step 1 – Drop high-missing columns**", expanded=True):
    # Share of missing cells per column, in percent.
    missing_share = df.isnull().mean() * 100
    # The label column is never dropped, however sparse it is.
    drop_cols = [
        column
        for column, pct in missing_share.items()
        if pct > drop_threshold and column != target
    ]
    df.drop(columns=drop_cols, inplace=True)

    dropped_display = drop_cols if drop_cols else 'None'
    msg = f"Dropped {len(drop_cols)} columns with >{drop_threshold}% missing: {dropped_display}"
    st.info(msg)
    steps_log.append(("Drop high-missing cols", f"{len(drop_cols)} dropped"))
# ── Step 2: Drop duplicates ───────────────────────────────────────────────
with st.expander("**Step 2 – Remove duplicates**", expanded=True):
    # Row counts on either side of the de-duplication give the number removed.
    rows_before = len(df)
    df.drop_duplicates(inplace=True)
    rows_after = len(df)
    removed = rows_before - rows_after

    st.info(f"Removed {removed} duplicate rows. Remaining: {rows_after:,}")
    steps_log.append(("Remove duplicates", f"{removed} rows removed"))
# ── Step 3: Impute missing values ─────────────────────────────────────────
with st.expander("**Step 3 – Missing value imputation**", expanded=True):
    # A row without a label cannot be used for supervised training, so it is
    # dropped instead of having a label value invented by imputation.
    df = df.dropna(subset=[target]).copy()

    # "Drop rows ..." strategy: discard rows in which more than 40% of the
    # cells are missing, then impute what is left with the median/mode so the
    # later scaling and resampling steps never see NaN.
    drop_sparse_rows = miss_strategy.startswith("Drop rows")
    rows_dropped = 0
    if drop_sparse_rows:
        sparse_mask = df.isnull().mean(axis=1) > 0.40
        rows_dropped = int(sparse_mask.sum())
        df = df.loc[~sparse_mask].copy()

    use_mean = miss_strategy.startswith("Mean")
    num_label = "Mean" if use_mean else "Median"

    # The label column is excluded from both lists: it is never imputed.
    num_cols = [c for c in df.select_dtypes(include=np.number).columns if c != target]
    cat_cols = [c for c in df.select_dtypes(include="object").columns if c != target]

    imputed_num = 0
    for col in num_cols:
        if df[col].isnull().any():
            fill_val = df[col].mean() if use_mean else df[col].median()
            # Assign back instead of `fillna(inplace=True)` on a column
            # selection: the in-place form is chained assignment and does not
            # update `df` when pandas Copy-on-Write is active.
            df[col] = df[col].fillna(fill_val)
            imputed_num += 1

    imputed_cat = 0
    for col in cat_cols:
        if df[col].isnull().any():
            modes = df[col].mode()
            # An all-missing column has no mode; fall back to a sentinel.
            fill_val = modes.iloc[0] if not modes.empty else "Unknown"
            df[col] = df[col].fillna(fill_val)
            imputed_cat += 1

    col1, col2 = st.columns(2)
    col1.success(f"✅ Imputed {imputed_num} numeric columns ({num_label})")
    col2.success(f"✅ Imputed {imputed_cat} categorical columns (Mode)")
    if drop_sparse_rows:
        st.info(f"Dropped {rows_dropped} rows with >40% missing values before imputation.")
    steps_log.append(("Imputation", f"{imputed_num} numeric, {imputed_cat} categorical"))

    # Remaining missing
    remaining = int(df.isnull().sum().sum())
    if remaining == 0:
        st.success("✅ No missing values remaining.")
    else:
        st.warning(f"⚠️ {remaining} missing values still present after imputation.")
# ── Step 4: Feature Engineering ──────────────────────────────────────────
with st.expander("**Step 4 – Feature Engineering**", expanded=True):
    new_feats = []
    if is_leads:
        has_visits = "TotalVisits" in df.columns
        has_time = "Total Time Spent on Website" in df.columns
        has_views = "Page Views Per Visit" in df.columns

        if has_visits and has_time:
            # +1 in the denominator avoids division by zero for zero visits.
            df["Time_Per_Visit"] = (df["Total Time Spent on Website"] /
                                    (df["TotalVisits"] + 1)).round(2)
            new_feats.append("Time_Per_Visit")

        # All three inputs must exist: any of them may have been removed by
        # the high-missingness column drop, and reading a missing column
        # would raise KeyError.
        if has_visits and has_time and has_views:
            df["Engagement_Score"] = (
                df["TotalVisits"].fillna(0) * 0.3 +
                df["Total Time Spent on Website"].fillna(0) * 0.5 +
                df["Page Views Per Visit"].fillna(0) * 0.2
            ).round(2)
            new_feats.append("Engagement_Score")
    else:
        if "duration" in df.columns and "campaign" in df.columns:
            # +1 in the denominator avoids division by zero.
            df["Avg_Duration_Per_Contact"] = (df["duration"] / (df["campaign"] + 1)).round(2)
            new_feats.append("Avg_Duration_Per_Contact")
        if "pdays" in df.columns:
            # NOTE(review): -1 is treated as "never contacted before";
            # confirm this sentinel against the loaded bank data.
            df["Previously_Contacted"] = (df["pdays"] != -1).astype(int)
            new_feats.append("Previously_Contacted")
        if "age" in df.columns:
            # Categorical age bands; they are encoded in the next step.
            df["Age_Group"] = pd.cut(df["age"], bins=[0, 25, 35, 50, 65, 100],
                                     labels=["<25", "25-35", "35-50", "50-65", "65+"])
            new_feats.append("Age_Group")

    if new_feats:
        st.success(f"✅ Created {len(new_feats)} new features: {new_feats}")
    else:
        st.info("No additional feature engineering applied.")

    # ── Optional: Polynomial Features ─────────────────────────────────────
    if apply_poly:
        # Keep the label and row identifiers out of the interaction terms:
        # crossing the label with a feature leaks the answer into X, and
        # crossing an identifier produces noise columns that no later step
        # removes. `_target` does not exist yet at this point, so the raw
        # label name must be excluded explicitly.
        excluded = {target, "_target", "Prospect ID", "Lead Number"}
        num_cols_poly = [c for c in df.select_dtypes(include=np.number).columns
                         if c not in excluded]
        if num_cols_poly:
            poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
            poly_data = poly.fit_transform(df[num_cols_poly].fillna(0))
            poly_cols = poly.get_feature_names_out(num_cols_poly)
            df_poly = pd.DataFrame(poly_data, columns=poly_cols, index=df.index)
            # Swap the plain numeric columns for the expanded set; the
            # non-numeric columns, label and identifiers are kept as they are.
            df = pd.concat([df.drop(columns=num_cols_poly), df_poly], axis=1)
            st.success(f"✅ Added polynomial features. Total columns: {df.shape[1]}")
            new_feats.append("Polynomial Features")
        else:
            st.info("No numeric columns available for polynomial features.")

    steps_log.append(("Feature Engineering", f"{len(new_feats)} new features"))
# ── Step 5: Encode categoricals ───────────────────────────────────────────
with st.expander("**Step 5 – Categorical Encoding**", expanded=True):
    # Row identifiers carry no predictive signal. They are removed before
    # encoding: one-hot encoding a per-row identifier creates one dummy
    # column per row, and dropping the identifier by name afterwards no
    # longer removes those dummies.
    if is_leads:
        df.drop(columns=["Prospect ID", "Lead Number"], errors="ignore", inplace=True)

    # Refresh cat_cols after feature engineering (Age_Group is categorical).
    cat_cols = [c for c in df.select_dtypes(include=["object", "category"]).columns
                if c != target]

    # Encode target into a 0/1 column named `_target`.
    if target == "y":
        df["_target"] = (df["y"] == "yes").astype(int)
    else:
        df["_target"] = df["Converted"].astype(int)

    # Encode features
    if encoding_method == "Label Encoding":
        le_map = {}
        for col in cat_cols:
            le = LabelEncoder()
            # Cast to str so mixed-type or missing values encode consistently.
            df[col] = le.fit_transform(df[col].astype(str))
            le_map[col] = le
        st.success(f"✅ Label-encoded {len(cat_cols)} categorical columns.")
    else:  # One-hot
        before_cols = df.shape[1]
        df = pd.get_dummies(df, columns=cat_cols, drop_first=True, dtype=int)
        # get_dummies removes each source column, so add them back to get the
        # number of dummy columns actually created rather than the net change.
        new_binary_cols = df.shape[1] - before_cols + len(cat_cols)
        st.success(f"✅ One-hot encoded {len(cat_cols)} columns → {new_binary_cols} new binary columns.")

    steps_log.append(("Encoding", f"{encoding_method} on {len(cat_cols)} cols"))
# ── Remove raw label and identifier columns ───────────────────────────────
# `_target` carries the encoded label, so the raw label column(s) must not
# remain among the features.
df.drop(columns=list({target, "Converted", "y"}), errors="ignore", inplace=True)

# Drop non-predictive ID cols for Leads
if is_leads:
    df.drop(columns=["Prospect ID", "Lead Number"], errors="ignore", inplace=True)

# ── Final feature matrix ──────────────────────────────────────────────────
# Only numeric columns are kept in X; anything still non-numeric is excluded.
X = df.drop(columns=["_target"]).select_dtypes(include=np.number)
y_col = df["_target"]

with st.expander("**Step 6 – Feature Matrix Shape**", expanded=True):
    st.info(f"Feature matrix X: {X.shape[0]:,} rows × {X.shape[1]} features | "
            f"Target y: {y_col.value_counts().to_dict()}")
    st.dataframe(X.head(3), use_container_width=True)
# ── Train-Test Split (shown to the user as Step 7) ────────────────────────
with st.expander("**Step 7 – Train/Test Split**", expanded=True):
    # The slider holds a percentage; the splitter expects a fraction.
    # Stratifying on the label keeps the class ratio equal in both parts.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y_col,
        test_size=test_size/100,
        random_state=random_state,
        stratify=y_col,
    )
    n_train = len(X_train)
    n_test = len(X_test)

    col1, col2 = st.columns(2)
    col1.metric("Training Samples", f"{n_train:,}")
    col2.metric("Test Samples", f"{n_test:,}")

    # Two-bar chart of the partition sizes.
    fig = px.bar(
        x=["Train", "Test"],
        y=[n_train, n_test],
        color=["Train", "Test"],
        title="Train/Test Split",
        color_discrete_sequence=["#2d6a9f", "#e74c3c"],
    )
    fig.update_layout(showlegend=False, height=300)
    st.plotly_chart(fig, use_container_width=True)

    steps_log.append(("Train/Test Split", f"{100-test_size}/{test_size}"))
# ── Feature Scaling (shown to the user as Step 8) ─────────────────────────
with st.expander("**Step 8 – Feature Scaling**", expanded=True):
    if scaling_method == "None":
        # No scaler: pass the split through unchanged.
        X_train_sc, X_test_sc, scaler = X_train, X_test, None
        st.info("No scaling applied.")
    else:
        scaler = StandardScaler() if scaling_method == "StandardScaler" else MinMaxScaler()
        # The scaler is fitted on the training part only; the test part is
        # transformed with those same fitted parameters. Results are wrapped
        # back into DataFrames so column names and row index are preserved.
        X_train_sc = pd.DataFrame(scaler.fit_transform(X_train),
                                  columns=X_train.columns, index=X_train.index)
        X_test_sc = pd.DataFrame(scaler.transform(X_test),
                                 columns=X_test.columns, index=X_test.index)
        st.success(f"✅ Applied {scaling_method} to {X_train.shape[1]} features.")
    steps_log.append(("Scaling", scaling_method))
# ── SMOTE (shown to the user as Step 9) ───────────────────────────────────
with st.expander("**Step 9 – SMOTE Class Balancing**", expanded=True):
    # Fill any NaNs that survived imputation / were introduced by scaling.
    # Both parts are filled with the TRAINING medians so the test part is
    # treated consistently and no test statistics are used. A column that is
    # entirely NaN has no median, so 0 is the final fallback.
    train_medians = X_train_sc.median()
    X_train_sc = X_train_sc.fillna(train_medians).fillna(0)
    X_test_sc = X_test_sc.fillna(train_medians).fillna(0)

    if apply_smote:
        before_dist = pd.Series(y_train).value_counts()
        minority_n = int(before_dist.min()) if len(before_dist) else 0

        if len(before_dist) < 2 or minority_n < 2:
            # SMOTE interpolates between minority neighbours, so it needs at
            # least two classes and two minority samples.
            st.warning("⚠️ SMOTE skipped: not enough minority-class samples to resample.")
            X_train_final, y_train_final = X_train_sc, y_train
            steps_log.append(("SMOTE", "skipped (too few minority samples)"))
        else:
            # The default of 5 neighbours fails when the minority class has
            # 5 or fewer samples; shrink the neighbourhood to what exists.
            k_neighbors = min(5, minority_n - 1)
            sm = SMOTE(random_state=random_state, k_neighbors=k_neighbors)
            X_res, y_res = sm.fit_resample(X_train_sc, y_train)
            after_dist = pd.Series(y_res).value_counts()

            # Side-by-side class distribution before and after resampling.
            col1, col2 = st.columns(2)
            panels = (
                (col1, "**Before SMOTE**", before_dist),
                (col2, "**After SMOTE**", after_dist),
            )
            for panel, title, dist in panels:
                with panel:
                    st.markdown(title)
                    fig_pie = px.pie(values=dist.values, names=dist.index.astype(str),
                                     hole=0.4, color_discrete_sequence=["#3498db", "#e74c3c"])
                    st.plotly_chart(fig_pie, use_container_width=True)

            st.success(f"✅ SMOTE applied: {len(X_res):,} training samples (was {len(X_train):,})")
            X_train_final, y_train_final = X_res, y_res
            steps_log.append(("SMOTE", f"{len(X_res):,} resampled"))
    else:
        X_train_final, y_train_final = X_train_sc, y_train
        st.info("SMOTE not applied.")
# ── RFE Feature Selection (shown to the user as Step 10) ──────────────────
with st.expander("**Step 10 – RFE Feature Selection**", expanded=True):
    if apply_rfe:
        n_available = X_train_final.shape[1]
        # Keep half of the features but at least 10, and never ask for more
        # features than actually exist.
        n_features_to_select = min(n_available, max(10, int(n_available * 0.5)))
        st.info(f"Selecting top {n_features_to_select} features using RF selector...")

        selector = RFE(
            estimator=RandomForestClassifier(n_estimators=50, random_state=random_state),
            n_features_to_select=n_features_to_select,
            step=5,
        )
        selector.fit(X_train_final, y_train_final)
        selected_cols = X_train_final.columns[selector.support_]

        # Select by column name instead of rebuilding a DataFrame from the
        # transformed array: this keeps the row index aligned with
        # y_train_final (it differs from 0..n-1 when SMOTE is off).
        X_train_final = X_train_final[selected_cols]
        X_test_sc = X_test_sc[selected_cols]

        st.success(f"✅ RFE complete. Selected {X_train_final.shape[1]} features.")
        steps_log.append(("RFE", f"Selected {X_train_final.shape[1]} features"))
    else:
        st.info("RFE not applied.")
# ── Save to session state ─────────────────────────────────────────────────
# Every artifact is stored under "<artifact>_<dataset>" so the two datasets
# do not overwrite each other's results.
key = "leads" if is_leads else "bank"
artifacts = {
    "X_train": X_train_final,
    "X_test": X_test_sc,
    "y_train": y_train_final,
    "y_test": y_test,
    "feature_names": X_train_final.columns.tolist(),
    "scaler": scaler,
    "preprocessed": True,
}
for artifact_name, artifact_value in artifacts.items():
    st.session_state[f"{artifact_name}_{key}"] = artifact_value
# ── Pipeline Summary ──────────────────────────────────────────────────────
# Render the accumulated (step, result) log as a 1-indexed table and point
# the user to the next phase.
st.markdown("---")
st.markdown("### ✅ Preprocessing Pipeline Summary")
summary_df = pd.DataFrame(steps_log, columns=["Step", "Result"])
summary_df.index = range(1, len(summary_df) + 1)
st.dataframe(summary_df, use_container_width=True)
st.success(f"✅ {name} dataset preprocessed and saved to session state. "
           "Proceed to **Phase 4 – Lead Scoring Models** →")
else:
# Idle state: the pipeline has not been run, so list what it will do.
st.markdown("""
### Pipeline Steps
""")
# (step number, description) pairs, in execution order. Step 10 is listed so
# the overview matches the optional RFE stage the pipeline actually runs.
steps = [
    ("1", "Drop columns with >threshold% missing values"),
    ("2", "Remove duplicate records"),
    ("3", "Impute missing values (Median/Mean for numeric, Mode for categorical)"),
    ("4", "Feature engineering (engagement score, temporal features, age groups)"),
    ("5", "Categorical encoding (Label Encoding or One-Hot Encoding)"),
    ("6", "Prepare feature matrix X and target vector y"),
    ("7", "Stratified train/test split"),
    ("8", "Feature scaling (StandardScaler / MinMaxScaler)"),
    ("9", "SMOTE oversampling for class imbalance"),
    ("10", "Optional RFE feature selection (Random Forest estimator)"),
]
for num, desc in steps:
    st.markdown(f'<div class="step-card"><span class="step-num">Step {num}</span> – {desc}</div>',
                unsafe_allow_html=True)
st.info("Configure the pipeline above and click **Run Preprocessing Pipeline**.")