Spaces:
Sleeping
Sleeping
| """ | |
| Phase 3 — Data Preprocessing & Feature Engineering | |
| """ | |
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, PolynomialFeatures | |
| from sklearn.feature_selection import RFE | |
| from sklearn.ensemble import RandomForestClassifier | |
| from imblearn.over_sampling import SMOTE | |
| import warnings, sys, os | |
| warnings.filterwarnings("ignore") | |
| sys.path.insert(0, os.path.dirname(os.path.dirname(__file__))) | |
| from utils.helpers import load_leads, load_bank | |
# Page chrome: stylesheet for the banner, step cards and badges used on this
# page, plus the HTML for the banner itself.
_PAGE_CSS = """
<style>
.phase-header { background:linear-gradient(135deg,#1e3a5f,#2d6a9f);
color:white;padding:1.5rem 2rem;border-radius:10px;margin-bottom:1rem;}
.step-card { background:#f8faff;border:1px solid #d1e0f5;border-radius:8px;
padding:1rem 1.2rem;margin:.5rem 0; }
.step-num { font-weight:800;color:#2d6a9f;font-size:1.1rem; }
.badge-green { background:#d4edda;color:#155724;padding:.2rem .6rem;border-radius:12px;font-size:.8rem; }
.badge-blue { background:#cce5ff;color:#004085;padding:.2rem .6rem;border-radius:12px;font-size:.8rem; }
</style>
"""
_HEADER_HTML = (
    '<div class="phase-header"><h2>βοΈ Phase 3 β Preprocessing & Feature Engineering</h2>'
    '<p>Imputation Β· Encoding Β· Feature creation Β· Scaling Β· SMOTE Β· Train-test split</p></div>'
)

# Configure the page first, then inject the stylesheet and render the banner.
st.set_page_config(
    page_title="Phase 3 β Preprocessing",
    page_icon="βοΈ",
    layout="wide",
)
st.markdown(_PAGE_CSS, unsafe_allow_html=True)
st.markdown(_HEADER_HTML, unsafe_allow_html=True)
# Load both datasets up front, then let the user pick which one the pipeline
# below operates on.
leads = load_leads()
bank = load_bank()

dataset_choice = st.radio(
    "Select Dataset",
    ["Leads (EdTech)", "Bank Marketing (UCI)"],
    horizontal=True,
)
is_leads = dataset_choice.startswith("Leads")

# Working copy of the raw frame, name of its target column, and a short
# display name used in status messages.
if is_leads:
    df_raw, target, name = leads.copy(), "Converted", "Leads"
else:
    df_raw, target, name = bank.copy(), "y", "Bank"
# -- Pipeline configuration panel ---------------------------------------------
# Three columns of widgets whose values parameterise the pipeline run below.
st.markdown("### βοΈ Pipeline Configuration")
col1, col2, col3 = st.columns(3)

# Column 1: handling of missing data.
miss_strategy = col1.selectbox(
    "Missing value strategy",
    ["Median/Mode", "Mean/Mode", "Drop rows with >40% missing cols"],
)
drop_threshold = col1.slider("Drop column if missing % >", 10, 100, 40)

# Column 2: categorical encoding and numeric scaling.
encoding_method = col2.selectbox("Categorical encoding", ["Label Encoding", "One-Hot Encoding"])
scaling_method = col2.selectbox("Feature scaling", ["StandardScaler", "MinMaxScaler", "None"])

# Column 3: optional stages and the hold-out size.
apply_smote = col3.checkbox("Apply SMOTE (class balance)", value=True)
apply_poly = col3.checkbox("Add Polynomial Features", value=False)
apply_rfe = col3.checkbox("Apply RFE (Feature Selection)", value=False)
# NOTE(review): the original indentation was lost; the test-size slider is
# assumed to sit in the third column with the other options - confirm.
test_size = col3.slider("Test set size (%)", 10, 40, 20)

# Fixed seed shared by the split, SMOTE and the RFE estimator.
random_state = 42

run = st.button("βΆ Run Preprocessing Pipeline", type="primary", use_container_width=True)
# Run the whole preprocessing pipeline when the button was pressed; otherwise
# show a static overview of the steps. Results are written to
# st.session_state under keys suffixed with "leads" or "bank".
if run:
    # (step name, result) pairs rendered in the summary table at the end.
    steps_log = []
    df = df_raw.copy()
    # Non-predictive identifier columns of the Leads dataset. They must stay
    # out of polynomial expansion and be removed BEFORE categorical encoding.
    id_cols = ["Prospect ID", "Lead Number"] if is_leads else []

    # -- Step 1: Drop high-missing columns ------------------------------------
    with st.expander("**Step 1 β Drop high-missing columns**", expanded=True):
        miss_pct = df.isnull().mean() * 100
        # The target column is never dropped, however sparse it is.
        drop_cols = [c for c in miss_pct[miss_pct > drop_threshold].index if c != target]
        df = df.drop(columns=drop_cols)
        msg = f"Dropped {len(drop_cols)} columns with >{drop_threshold}% missing: {drop_cols if drop_cols else 'None'}"
        st.info(msg)
        steps_log.append(("Drop high-missing cols", f"{len(drop_cols)} dropped"))

    # -- Step 2: Remove duplicates --------------------------------------------
    with st.expander("**Step 2 β Remove duplicates**", expanded=True):
        before = len(df)
        df = df.drop_duplicates()
        removed = before - len(df)
        st.info(f"Removed {removed} duplicate rows. Remaining: {len(df):,}")
        steps_log.append(("Remove duplicates", f"{removed} rows removed"))

    # -- Step 3: Missing value imputation -------------------------------------
    with st.expander("**Step 3 β Missing value imputation**", expanded=True):
        # A missing label cannot be imputed meaningfully, so such rows are
        # discarded and the target is excluded from the imputation loops.
        df = df.dropna(subset=[target])

        # The "Drop rows ..." strategy removes rows whose share of missing
        # feature values exceeds 40%; whatever is still missing afterwards is
        # imputed with the median (numeric) / mode (categorical).
        if miss_strategy.startswith("Drop rows"):
            row_missing = df.drop(columns=[target]).isnull().mean(axis=1)
            rows_before = len(df)
            df = df[~(row_missing > 0.40)].copy()
            dropped_rows = rows_before - len(df)
            st.info(f"Dropped {dropped_rows} rows with >40% missing columns.")
            steps_log.append(("Drop sparse rows", f"{dropped_rows} rows removed"))

        use_mean = miss_strategy.startswith("Mean")
        num_label = "Mean" if use_mean else "Median"

        num_cols = [c for c in df.select_dtypes(include=np.number).columns if c != target]
        cat_cols = [c for c in df.select_dtypes(include="object").columns if c != target]

        imputed_num = 0
        for col in num_cols:
            if df[col].isnull().any():
                fill_val = df[col].mean() if use_mean else df[col].median()
                # Assign back instead of `inplace=True` on the column: the
                # chained in-place form does not reliably update `df`.
                df[col] = df[col].fillna(fill_val)
                imputed_num += 1

        imputed_cat = 0
        for col in cat_cols:
            if df[col].isnull().any():
                modes = df[col].mode()
                # An all-missing column has no mode; fall back to a placeholder.
                df[col] = df[col].fillna(modes.iloc[0] if not modes.empty else "Unknown")
                imputed_cat += 1

        col1, col2 = st.columns(2)
        col1.success(f"β Imputed {imputed_num} numeric columns ({num_label})")
        col2.success(f"β Imputed {imputed_cat} categorical columns (Mode)")
        steps_log.append(("Imputation", f"{imputed_num} numeric, {imputed_cat} categorical"))

        # Report anything that survived imputation (e.g. all-missing numeric columns).
        remaining = df.isnull().sum().sum()
        if remaining == 0:
            st.success("β No missing values remaining.")
        else:
            st.warning(f"β οΈ {remaining} missing values still present after imputation.")

    # -- Step 4: Feature engineering ------------------------------------------
    with st.expander("**Step 4 β Feature Engineering**", expanded=True):
        new_feats = []
        if is_leads:
            has_visits = "TotalVisits" in df.columns
            has_time = "Total Time Spent on Website" in df.columns
            has_views = "Page Views Per Visit" in df.columns
            if has_visits and has_time:
                # +1 in the denominator avoids division by zero for zero visits.
                df["Time_Per_Visit"] = (df["Total Time Spent on Website"] /
                                        (df["TotalVisits"] + 1)).round(2)
                new_feats.append("Time_Per_Visit")
            # All three inputs are required; any of them may have been dropped
            # in Step 1.
            if has_visits and has_time and has_views:
                df["Engagement_Score"] = (
                    df["TotalVisits"] * 0.3 +
                    df["Total Time Spent on Website"].fillna(0) * 0.5 +
                    df["Page Views Per Visit"].fillna(0) * 0.2
                ).round(2)
                new_feats.append("Engagement_Score")
        else:
            if "duration" in df.columns and "campaign" in df.columns:
                df["Avg_Duration_Per_Contact"] = (df["duration"] / (df["campaign"] + 1)).round(2)
                new_feats.append("Avg_Duration_Per_Contact")
            if "pdays" in df.columns:
                # pdays == -1 is treated as "never contacted before".
                df["Previously_Contacted"] = (df["pdays"] != -1).astype(int)
                new_feats.append("Previously_Contacted")
            if "age" in df.columns:
                # Open-ended last bin so ages above 100 still land in "65+".
                df["Age_Group"] = pd.cut(df["age"], bins=[0, 25, 35, 50, 65, np.inf],
                                         labels=["<25", "25-35", "35-50", "50-65", "65+"])
                new_feats.append("Age_Group")

        if new_feats:
            st.success(f"β Created {len(new_feats)} new features: {new_feats}")
        else:
            st.info("No additional feature engineering applied.")

        # -- Optional: pairwise interaction (polynomial) features ------------
        if apply_poly:
            # Keep the target and the ID columns out of the expansion:
            # interaction terms built from the label would leak it into X.
            excluded = {target, "_target", *id_cols}
            num_cols_poly = [c for c in df.select_dtypes(include=np.number).columns
                             if c not in excluded]
            if num_cols_poly:
                poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
                poly_data = poly.fit_transform(df[num_cols_poly].fillna(0))
                poly_cols = poly.get_feature_names_out(num_cols_poly)
                df_poly = pd.DataFrame(poly_data, columns=poly_cols, index=df.index)
                # Replace the numeric inputs by their expansion (which contains
                # the original columns as degree-1 terms); everything else,
                # including the target, is preserved.
                df = pd.concat([df.drop(columns=num_cols_poly), df_poly], axis=1)
                st.success(f"β Added polynomial features. Total columns: {df.shape[1]}")
                new_feats.append("Polynomial Features")
            else:
                st.info("No numeric feature columns available for polynomial expansion.")
        steps_log.append(("Feature Engineering", f"{len(new_feats)} new features"))

    # -- Step 5: Categorical encoding -----------------------------------------
    with st.expander("**Step 5 β Categorical Encoding**", expanded=True):
        # Drop ID columns before encoding; one-hot encoding a per-row
        # identifier would otherwise create one dummy column per row.
        df = df.drop(columns=[c for c in id_cols if c in df.columns])

        # Refresh the categorical list: feature engineering may have added
        # category-typed columns (e.g. Age_Group).
        cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
        if target in cat_cols:
            cat_cols.remove(target)

        # Encode the target as 0/1 in a dedicated "_target" column.
        if target == "y":
            df["_target"] = (df["y"] == "yes").astype(int)
        else:
            df["_target"] = df["Converted"].astype(int)

        # Encode the features.
        if encoding_method == "Label Encoding":
            le_map = {}
            for col in cat_cols:
                le = LabelEncoder()
                df[col] = le.fit_transform(df[col].astype(str))
                le_map[col] = le
            st.success(f"β Label-encoded {len(cat_cols)} categorical columns.")
        else:  # One-hot
            before_cols = df.shape[1]
            df = pd.get_dummies(df, columns=cat_cols, drop_first=True, dtype=int)
            after_cols = df.shape[1]
            st.success(f"β One-hot encoded {len(cat_cols)} columns β {after_cols - before_cols} new binary columns.")
        steps_log.append(("Encoding", f"{encoding_method} on {len(cat_cols)} cols"))

    # -- Remove the raw target column(s); only "_target" is kept --------------
    df = df.drop(columns=[c for c in ("Converted", "y") if c in df.columns])

    # -- Step 6: Final feature matrix -----------------------------------------
    X = df.drop(columns=["_target"])
    y_col = df["_target"]
    # Keep numeric columns only.
    X = X.select_dtypes(include=np.number)
    with st.expander("**Step 6 β Feature Matrix Shape**", expanded=True):
        st.info(f"Feature matrix X: {X.shape[0]:,} rows Γ {X.shape[1]} features | "
                f"Target y: {y_col.value_counts().to_dict()}")
        st.dataframe(X.head(3), use_container_width=True)

    # -- Step 7: Train/test split ---------------------------------------------
    with st.expander("**Step 7 β Train/Test Split**", expanded=True):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y_col, test_size=test_size / 100, random_state=random_state, stratify=y_col)
        col1, col2 = st.columns(2)
        col1.metric("Training Samples", f"{len(X_train):,}")
        col2.metric("Test Samples", f"{len(X_test):,}")
        fig = px.bar(x=["Train", "Test"], y=[len(X_train), len(X_test)],
                     color=["Train", "Test"],
                     title="Train/Test Split",
                     color_discrete_sequence=["#2d6a9f", "#e74c3c"])
        fig.update_layout(showlegend=False, height=300)
        st.plotly_chart(fig, use_container_width=True)
        steps_log.append(("Train/Test Split", f"{100-test_size}/{test_size}"))

    # -- Step 8: Feature scaling ----------------------------------------------
    with st.expander("**Step 8 β Feature Scaling**", expanded=True):
        if scaling_method != "None":
            scaler = StandardScaler() if scaling_method == "StandardScaler" else MinMaxScaler()
            # Fit on the training split only, then apply the same transform to test.
            X_train_sc = pd.DataFrame(scaler.fit_transform(X_train),
                                      columns=X_train.columns, index=X_train.index)
            X_test_sc = pd.DataFrame(scaler.transform(X_test),
                                     columns=X_test.columns, index=X_test.index)
            st.success(f"β Applied {scaling_method} to {X_train.shape[1]} features.")
        else:
            X_train_sc, X_test_sc, scaler = X_train, X_test, None
            st.info("No scaling applied.")
        steps_log.append(("Scaling", scaling_method))

    # -- Step 9: SMOTE class balancing ----------------------------------------
    with st.expander("**Step 9 β SMOTE Class Balancing**", expanded=True):
        # Fill any NaNs that survived imputation. Both splits use the TRAIN
        # medians so train and test stay consistent; 0 is the last resort for
        # columns that are entirely missing in the training split.
        train_medians = X_train_sc.median()
        X_train_sc = X_train_sc.fillna(train_medians).fillna(0)
        X_test_sc = X_test_sc.fillna(train_medians).fillna(0)

        if apply_smote:
            before_dist = pd.Series(y_train).value_counts()
            sm = SMOTE(random_state=random_state)
            # Only the training split is resampled; the test split is untouched.
            X_res, y_res = sm.fit_resample(X_train_sc, y_train)
            after_dist = pd.Series(y_res).value_counts()
            col1, col2 = st.columns(2)
            with col1:
                st.markdown("**Before SMOTE**")
                fig_b = px.pie(values=before_dist.values, names=before_dist.index.astype(str),
                               hole=0.4, color_discrete_sequence=["#3498db", "#e74c3c"])
                st.plotly_chart(fig_b, use_container_width=True)
            with col2:
                st.markdown("**After SMOTE**")
                fig_a = px.pie(values=after_dist.values, names=after_dist.index.astype(str),
                               hole=0.4, color_discrete_sequence=["#3498db", "#e74c3c"])
                st.plotly_chart(fig_a, use_container_width=True)
            st.success(f"β SMOTE applied: {len(X_res):,} training samples (was {len(X_train):,})")
            X_train_final, y_train_final = X_res, y_res
            steps_log.append(("SMOTE", f"{len(X_res):,} resampled"))
        else:
            X_train_final, y_train_final = X_train_sc, y_train
            st.info("SMOTE not applied.")

    # -- Step 10: RFE feature selection ---------------------------------------
    with st.expander("**Step 10 β RFE Feature Selection**", expanded=True):
        if apply_rfe:
            n_available = X_train_final.shape[1]
            # Keep half of the features but at least 10, never more than exist.
            n_features_to_select = min(n_available, max(10, int(n_available * 0.5)))
            st.info(f"Selecting top {n_features_to_select} features using RF selector...")
            selector = RFE(estimator=RandomForestClassifier(n_estimators=50, random_state=random_state),
                           n_features_to_select=n_features_to_select, step=5)
            selector.fit(X_train_final, y_train_final)
            selected_cols = X_train_final.columns[selector.support_]
            # Column selection (rather than rebuilding a frame from the
            # transformed array) keeps the row index aligned with y_train_final.
            X_train_final = X_train_final[selected_cols]
            X_test_sc = X_test_sc[selected_cols]
            st.success(f"β RFE complete. Selected {X_train_final.shape[1]} features.")
            steps_log.append(("RFE", f"Selected {X_train_final.shape[1]} features"))
        else:
            st.info("RFE not applied.")

    # -- Save results to session state ----------------------------------------
    key = "leads" if is_leads else "bank"
    st.session_state[f"X_train_{key}"] = X_train_final
    st.session_state[f"X_test_{key}"] = X_test_sc
    st.session_state[f"y_train_{key}"] = y_train_final
    st.session_state[f"y_test_{key}"] = y_test
    st.session_state[f"feature_names_{key}"] = X_train_final.columns.tolist()
    st.session_state[f"scaler_{key}"] = scaler
    st.session_state[f"preprocessed_{key}"] = True

    # -- Pipeline summary -----------------------------------------------------
    st.markdown("---")
    st.markdown("### β Preprocessing Pipeline Summary")
    summary_df = pd.DataFrame(steps_log, columns=["Step", "Result"])
    summary_df.index = range(1, len(summary_df) + 1)  # 1-based row numbers
    st.dataframe(summary_df, use_container_width=True)
    st.success(f"β {name} dataset preprocessed and saved to session state. "
               "Proceed to **Phase 4 β Lead Scoring Models** β")
else:
    # Nothing has been run yet: show a static overview of the pipeline.
    st.markdown("\n### Pipeline Steps\n")
    steps = [
        ("1", "Drop columns with >threshold% missing values"),
        ("2", "Remove duplicate records"),
        ("3", "Impute missing values (Median/Mean for numeric, Mode for categorical)"),
        ("4", "Feature engineering (engagement score, temporal features, age groups)"),
        ("5", "Categorical encoding (Label Encoding or One-Hot Encoding)"),
        ("6", "Prepare feature matrix X and target vector y"),
        ("7", "Stratified train/test split"),
        ("8", "Feature scaling (StandardScaler / MinMaxScaler)"),
        ("9", "SMOTE oversampling for class imbalance"),
        ("10", "RFE feature selection (optional)"),
    ]
    for num, desc in steps:
        st.markdown(f'<div class="step-card"><span class="step-num">Step {num}</span> β {desc}</div>',
                    unsafe_allow_html=True)
    st.info("Configure the pipeline above and click **Run Preprocessing Pipeline**.")