"""Streamlit dashboard for exploring and predicting e-commerce customer churn.

The app downloads a customer dataset, trains an XGBoost classifier on a fixed
set of features and exposes five views: data overview, exploratory charts,
single-customer prediction with SHAP factors, what-if analysis and model
insights.
"""

import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import shap
import streamlit as st
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

warnings.filterwarnings('ignore')

# Set page configuration
st.set_page_config(
    page_title="E-commerce Churn Prediction",
    page_icon="🛒",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS
# NOTE(review): this literal is effectively empty - the stylesheet that
# presumably lived here looks to have been lost; confirm against the original.
st.markdown(""" """, unsafe_allow_html=True)

# Location of the raw customer dataset.
DATA_URL = "https://raw.githubusercontent.com/Ricendfish/M1-Assignment/main/data_ecommerce(in).csv"

# st.session_state keys used to keep expensive objects alive across reruns.
_PREDICTOR_KEY = "churn_predictor"
_DATA_KEY = "churn_data"

# Display labels for the model's (post-encoding) feature names.
READABLE_FEATURE_NAMES = {
    'Tenure': 'Customer Tenure',
    'SatisfactionScore': 'Satisfaction Score',
    'CashbackAmount': 'Cashback Amount',
    'Complain': 'Complaint Status',
    'WarehouseToHome': 'Distance to Warehouse',
    'NumberOfDeviceRegistered': 'Number of Devices',
    'NumberOfAddress': 'Number of Addresses',
    'RecentOrder': 'Recent Order Activity',
    'PreferedOrderCat_Mobile': 'Preferred Category: Mobile',
    'PreferedOrderCat_Laptop & Accessory': 'Preferred Category: Electronics',
    'PreferedOrderCat_Fashion': 'Preferred Category: Fashion',
    'PreferedOrderCat_Grocery': 'Preferred Category: Grocery',
    'MaritalStatus_Married': 'Marital Status: Married',
    'MaritalStatus_Single': 'Marital Status: Single',
}


class ChurnPredictor:
    """Load the churn dataset, fit a preprocessing + XGBoost model and score customers.

    Attributes are populated progressively: ``preprocessor`` and
    ``feature_names`` by :meth:`preprocess_data`, ``model`` and ``metrics``
    by :meth:`train_model`.
    """

    # Columns fed to the model, grouped by how they are preprocessed.
    NUMERICAL_FEATURES = ['Tenure', 'WarehouseToHome', 'NumberOfDeviceRegistered',
                          'SatisfactionScore', 'NumberOfAddress', 'CashbackAmount']
    CATEGORICAL_FEATURES = ['PreferedOrderCat', 'MaritalStatus']
    BINARY_FEATURES = ['Complain', 'RecentOrder']

    def __init__(self):
        self.model = None
        self.preprocessor = None
        self.feature_names = None
        self.target_name = 'Churn'
        # Hold-out accuracy / AUC from the latest train_model() call, so UI
        # code can show them without retraining.
        self.metrics = None

    def load_data(self):
        """Download the dataset, drop duplicate rows and add ``RecentOrder``.

        Returns:
            DataFrame with an extra ``RecentOrder`` column that is 1 when
            ``DaySinceLastOrder`` is at most 30 and 0 otherwise.
        """
        df = pd.read_csv(DATA_URL)

        # Remove duplicates
        df = df.drop_duplicates()

        # Create RecentOrder feature
        df['RecentOrder'] = np.where(df['DaySinceLastOrder'] <= 30, 1, 0)

        return df

    def preprocess_data(self, df):
        """Fit the preprocessing pipeline on *df* and transform it.

        Numerical columns are median-imputed and standardised, categorical
        columns are mode-imputed and one-hot encoded (first level dropped),
        binary columns are mode-imputed only.

        Args:
            df: DataFrame containing the feature columns and the target column.

        Returns:
            Tuple ``(X_processed, y, feature_names)`` where ``feature_names``
            lists the transformed columns in output order.
        """
        # Separate features and target
        X = df.drop(self.target_name, axis=1)
        y = df[self.target_name]

        numerical_features = list(self.NUMERICAL_FEATURES)
        categorical_features = list(self.CATEGORICAL_FEATURES)
        binary_features = list(self.BINARY_FEATURES)

        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(drop='first', sparse_output=False))
        ])
        binary_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent'))
        ])

        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, numerical_features),
                ('cat', categorical_transformer, categorical_features),
                ('bin', binary_transformer, binary_features)
            ])

        # Fit and transform the data
        X_processed = self.preprocessor.fit_transform(X)

        # Feature names follow the transformer order: num, cat (expanded), bin.
        onehot = self.preprocessor.named_transformers_['cat'].named_steps['onehot']
        cat_features = onehot.get_feature_names_out(categorical_features)
        feature_names = [*numerical_features, *cat_features, *binary_features]

        self.feature_names = feature_names

        return X_processed, y, feature_names

    def train_model(self, X, y):
        """Fit the XGBoost classifier on a stratified 80/20 split.

        Args:
            X: Preprocessed feature matrix.
            y: Target labels.

        Returns:
            Tuple ``(X_test, y_test, y_pred, y_pred_proba, accuracy, auc_score)``
            computed on the hold-out split. Accuracy and AUC are also stored
            in ``self.metrics``.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        self.model = xgb.XGBClassifier(
            n_estimators=100,
            max_depth=6,
            learning_rate=0.1,
            random_state=42
        )
        self.model.fit(X_train, y_train)

        # Calculate performance metrics
        y_pred = self.model.predict(X_test)
        y_pred_proba = self.model.predict_proba(X_test)[:, 1]

        accuracy = self.model.score(X_test, y_test)
        auc_score = roc_auc_score(y_test, y_pred_proba)

        self.metrics = {'accuracy': accuracy, 'auc_score': auc_score}

        return X_test, y_test, y_pred, y_pred_proba, accuracy, auc_score

    def predict_churn(self, input_data):
        """Predict churn for the first row of *input_data*.

        Args:
            input_data: DataFrame with the raw feature columns.

        Returns:
            Tuple ``(prediction, probability)`` for the first row, where
            ``probability`` is the predicted probability of class 1.

        Raises:
            ValueError: If the model or preprocessor has not been fitted.
        """
        if self.model is None or self.preprocessor is None:
            raise ValueError("Model not trained yet!")

        input_processed = self.preprocessor.transform(input_data)

        prediction = self.model.predict(input_processed)
        probability = self.model.predict_proba(input_processed)[:, 1]

        return prediction[0], probability[0]

    def what_if_analysis(self, base_data, feature_to_change, values_range):
        """Score *base_data* once per candidate value of a single feature.

        Args:
            base_data: Single-row DataFrame describing the baseline customer.
            feature_to_change: Column to overwrite.
            values_range: Iterable of values to substitute into that column.

        Returns:
            List of churn probabilities, one per value, in input order.
        """
        probabilities = []

        for value in values_range:
            # Work on a copy so the caller's baseline stays untouched.
            modified_data = base_data.copy()
            modified_data[feature_to_change] = value
            _, probability = self.predict_churn(modified_data)
            probabilities.append(probability)

        return probabilities

    def explain_prediction(self, input_data):
        """Compute SHAP values for the first row of *input_data*.

        Args:
            input_data: DataFrame with the raw feature columns.

        Returns:
            Tuple ``(shap_values, expected_value, feature_names)`` where
            ``shap_values`` holds one contribution per transformed feature.

        Raises:
            ValueError: If the model or preprocessor has not been fitted.
        """
        if self.model is None or self.preprocessor is None:
            raise ValueError("Model not trained yet!")

        input_processed = self.preprocessor.transform(input_data)

        explainer = shap.TreeExplainer(self.model)
        shap_values = explainer.shap_values(input_processed)

        # For binary classification, shap_values might be a list with two arrays
        if isinstance(shap_values, list):
            shap_values = shap_values[1]  # Use the positive class (churn)

        # The base value may come back as a scalar, a list or an array;
        # normalise it and pick the positive-class entry when there are two.
        base_values = np.atleast_1d(explainer.expected_value)
        expected_value = base_values[1] if base_values.size > 1 else base_values[0]

        return shap_values[0], expected_value, self.feature_names


def _get_predictor():
    """Return this session's predictor, creating it on first use.

    Streamlit re-executes the script on every interaction; keeping the
    instance in session state avoids retraining the model each time.
    """
    if _PREDICTOR_KEY not in st.session_state:
        st.session_state[_PREDICTOR_KEY] = ChurnPredictor()
    return st.session_state[_PREDICTOR_KEY]


def _get_data(predictor):
    """Return this session's dataset, downloading it on first use only."""
    if _DATA_KEY not in st.session_state:
        st.session_state[_DATA_KEY] = predictor.load_data()
    return st.session_state[_DATA_KEY]


def _ensure_trained(predictor, df, spinner_text='Training model...'):
    """Train *predictor* on *df* if needed and return its hold-out metrics.

    Returns:
        Dict with ``'accuracy'`` and ``'auc_score'`` from the latest training.
    """
    if predictor.model is None or getattr(predictor, 'metrics', None) is None:
        with st.spinner(spinner_text):
            X_processed, y, _ = predictor.preprocess_data(df)
            predictor.train_model(X_processed, y)
    return predictor.metrics


def main():
    """Render the page header and sidebar, then dispatch to the chosen view."""
    # Header
    # NOTE(review): the heading markup appears to have been stripped from this
    # literal; only the visible text remains - confirm the intended styling.
    st.markdown("""

🛒 E-commerce Customer Churn Prediction

""", unsafe_allow_html=True)

    # Initialize predictor (reused across reruns of this session)
    predictor = _get_predictor()

    # Sidebar
    st.sidebar.title("Navigation")
    app_mode = st.sidebar.selectbox(
        "Choose App Mode",
        ["Data Overview", "EDA", "Churn Prediction", "What-If Analysis", "Model Insights"])

    # Load data
    with st.spinner('Loading data...'):
        df = _get_data(predictor)

    if app_mode == "Data Overview":
        show_data_overview(df)
    elif app_mode == "EDA":
        show_eda(df)
    elif app_mode == "Churn Prediction":
        show_churn_prediction(predictor, df)
    elif app_mode == "What-If Analysis":
        show_what_if_analysis(predictor, df)
    elif app_mode == "Model Insights":
        show_model_insights(predictor, df)


def show_data_overview(df):
    """Show a preview, basic shape info, churn counts and summary statistics."""
    st.header("📊 Data Overview")

    col1, col2 = st.columns([2, 1])

    with col1:
        st.subheader("Dataset Preview")
        st.dataframe(df.head(10), use_container_width=True)

    with col2:
        st.subheader("Dataset Info")
        st.write(f"**Shape:** {df.shape}")
        st.write(f"**Columns:** {len(df.columns)}")
        st.write(f"**Missing Values:** {df.isnull().sum().sum()}")

        # Churn distribution; .get() keeps this working when a class is absent.
        churn_count = df['Churn'].value_counts()
        churned_customers = churn_count.get(1, 0)
        retained_customers = churn_count.get(0, 0)
        st.metric("Customers Likely to Churn", f"{churned_customers}")
        st.metric("Customers Not Likely to Churn", f"{retained_customers}")

    st.subheader("Data Description")
    st.dataframe(df.describe(), use_container_width=True)


def show_eda(df):
    """Render the exploratory charts: churn split and churn rate by key features."""
    st.header("📈 Exploratory Data Analysis")

    # Churn distribution
    col1, col2 = st.columns(2)

    with col1:
        # Convert churn to meaningful labels
        df_churn_display = df.copy()
        df_churn_display['Churn_Label'] = df_churn_display['Churn'].map(
            {0: 'Not Likely to Churn', 1: 'Likely to Churn'})

        fig = px.pie(df_churn_display, names='Churn_Label', title='Churn Distribution',
                     color='Churn_Label',
                     color_discrete_map={'Not Likely to Churn': 'lightblue',
                                         'Likely to Churn': 'lightcoral'})
        st.plotly_chart(fig, use_container_width=True)

    with col2:
        churn_by_marital = df.groupby('MaritalStatus')['Churn'].mean().reset_index()
        churn_by_marital['Churn_Rate'] = churn_by_marital['Churn']
        fig = px.bar(churn_by_marital, x='MaritalStatus', y='Churn_Rate',
                     title='Churn Rate by Marital Status', color='MaritalStatus')
        st.plotly_chart(fig, use_container_width=True)

    # Line graphs instead of box plots
    st.subheader("Trend Analysis")

    # Line graph 1: Churn rate vs Satisfaction Score
    satisfaction_churn = df.groupby('SatisfactionScore')['Churn'].mean().reset_index()
    fig1 = px.line(satisfaction_churn, x='SatisfactionScore', y='Churn',
                   title='Churn Rate vs Satisfaction Score',
                   markers=True)
    fig1.update_layout(xaxis_title="Satisfaction Score", yaxis_title="Churn Rate")
    st.plotly_chart(fig1, use_container_width=True)

    col1, col2 = st.columns(2)

    with col1:
        # Line graph 2: Churn rate vs Tenure
        tenure_churn = df.groupby('Tenure')['Churn'].mean().reset_index()
        fig2 = px.line(tenure_churn, x='Tenure', y='Churn',
                       title='Churn Rate vs Customer Tenure',
                       markers=True)
        fig2.update_layout(xaxis_title="Tenure (months)", yaxis_title="Churn Rate")
        st.plotly_chart(fig2, use_container_width=True)

    with col2:
        # Line graph 3: Churn rate vs Cashback Amount (binned)
        df_cashback_binned = df.copy()
        df_cashback_binned['Cashback_Bin'] = pd.cut(df_cashback_binned['CashbackAmount'], bins=10)
        cashback_churn = df_cashback_binned.groupby('Cashback_Bin')['Churn'].mean().reset_index()
        # Plot each bin at its midpoint so the x axis stays numeric.
        cashback_churn['Cashback_Mid'] = cashback_churn['Cashback_Bin'].apply(lambda x: x.mid)

        fig3 = px.line(cashback_churn, x='Cashback_Mid', y='Churn',
                       title='Churn Rate vs Cashback Amount',
                       markers=True)
        fig3.update_layout(xaxis_title="Cashback Amount", yaxis_title="Churn Rate")
        st.plotly_chart(fig3, use_container_width=True)


def show_churn_prediction(predictor, df):
    """Collect one customer's details, predict churn and explain the result."""
    st.header("🔮 Churn Prediction")

    # Train model if not already trained
    _ensure_trained(predictor, df)

    # Input form
    st.subheader("Enter Customer Details")

    col1, col2, col3 = st.columns(3)

    with col1:
        tenure = st.slider("Tenure (months)", 0, 60, 12)
        warehouse_to_home = st.slider("Distance to Warehouse (km)", 5, 50, 15)
        num_devices = st.slider("Number of Devices Registered", 1, 6, 3)
        satisfaction = st.slider("Satisfaction Score", 1, 5, 3)

    with col2:
        num_addresses = st.slider("Number of Addresses", 1, 20, 4)
        cashback = st.slider("Cashback Amount", 0.0, 300.0, 150.0)
        days_since_order = st.slider("Days Since Last Order", 0, 60, 7)
        # Changed complaint to meaningful labels
        complain_option = st.selectbox("Complaint Status", ["No Complaint", "Complaint Filed"])
        complain = 1 if complain_option == "Complaint Filed" else 0

    with col3:
        # Simplified category selection - you can remove this if not needed
        preferred_category = st.selectbox(
            "Preferred Category",
            ['Electronics', 'Fashion', 'Grocery', 'Home & Kitchen', 'Others'])
        marital_status = st.selectbox("Marital Status", ['Single', 'Married', 'Divorced'])

    # Map simplified categories to original format if needed
    category_mapping = {
        'Electronics': 'Laptop & Accessory',
        'Fashion': 'Fashion',
        'Grocery': 'Grocery',
        'Home & Kitchen': 'Others',
        'Others': 'Others'
    }

    # Create input dataframe
    input_data = pd.DataFrame({
        'Tenure': [tenure],
        'WarehouseToHome': [warehouse_to_home],
        'NumberOfDeviceRegistered': [num_devices],
        'PreferedOrderCat': [category_mapping[preferred_category]],
        'SatisfactionScore': [satisfaction],
        'MaritalStatus': [marital_status],
        'NumberOfAddress': [num_addresses],
        'Complain': [complain],
        'DaySinceLastOrder': [days_since_order],
        'CashbackAmount': [cashback],
        'RecentOrder': [1 if days_since_order <= 30 else 0]
    })

    if st.button("Predict Churn", type="primary"):
        # UI boundary: surface any failure to the user instead of a traceback.
        try:
            prediction, probability = predictor.predict_churn(input_data)

            # Display results
            st.subheader("Prediction Results")

            if prediction == 1:
                risk_text = "LIKELY TO CHURN"
                emoji = "🔴"
                prediction_label = "Likely to Churn"
            else:
                risk_text = "NOT LIKELY TO CHURN"
                emoji = "🟢"
                prediction_label = "Not Likely to Churn"

            st.markdown(f"""

{emoji} Churn Prediction: {risk_text}

Prediction: {prediction_label}

Probability: {probability:.2f}

""", unsafe_allow_html=True)

            # Generate SHAP explanation
            with st.spinner('Analyzing factors...'):
                shap_values, expected_value, feature_names = predictor.explain_prediction(input_data)

                # Create a DataFrame for SHAP values
                shap_df = pd.DataFrame({
                    'Feature': feature_names,
                    'SHAP Value': shap_values
                })

                # Keep the ten features with the largest absolute contribution.
                shap_df['Abs_SHAP'] = np.abs(shap_df['SHAP Value'])
                shap_df = shap_df.sort_values('Abs_SHAP', ascending=False).head(10)

                # Symmetric colour range so zero maps to the scale's midpoint.
                colour_bound = shap_df['Abs_SHAP'].max()

                # Create horizontal bar chart
                st.subheader("📊 Factors Influencing Prediction")
                fig = px.bar(shap_df, x='SHAP Value', y='Feature', orientation='h',
                             title='Top Factors Influencing Prediction',
                             color='SHAP Value',
                             color_continuous_scale='RdBu_r',
                             range_color=[-colour_bound, colour_bound])
                fig.update_layout(yaxis={'categoryorder': 'total ascending'})
                st.plotly_chart(fig, use_container_width=True)

                # Display key factors in a more user-friendly way
                st.subheader("🔑 Key Factors")

                # Get top 5 factors
                top_factors = shap_df.head(5)

                for _, row in top_factors.iterrows():
                    factor_name = row['Feature']
                    impact = row['SHAP Value']

                    # Fall back to the raw feature name when no label exists.
                    display_name = READABLE_FEATURE_NAMES.get(factor_name, factor_name)

                    if impact > 0:
                        st.write(f"🔴 **{display_name}** increased churn risk")
                    else:
                        st.write(f"🟢 **{display_name}** decreased churn risk")

            # Recommendations
            st.subheader("📋 Recommendations")

            if prediction == 1:
                st.markdown("""

🛑 Customer Retention Actions Recommended:

Improve the service: Identify the causes of recent complaints
Collect feedback: Carry out surveys in order to identify service issues
Cashback: Increase cashback for loyal customers
Loyalty programs: Special benefits and discounts for longterm customers

""", unsafe_allow_html=True)
            else:
                st.markdown("""

✅ Customer Retention Actions:

Maintening current customers: Use loyalty programs, coupons
Constant checkins: Send short surveys to prevent complaints
Keep engagement: through special offers, bundles, time-limited offers

""", unsafe_allow_html=True)

        except Exception as e:
            st.error(f"Error making prediction: {str(e)}")


def show_what_if_analysis(predictor, df):
    """Plot how churn probability moves as one feature of a base profile varies."""
    st.header("🔍 What-If Analysis")

    st.markdown("Explore how changing different factors affects churn likelihood")

    # Train model if not already trained
    _ensure_trained(predictor, df)

    # Simplified Base Customer Profile
    st.subheader("Base Customer Profile")

    # Use columns for better layout
    col1, col2 = st.columns(2)

    with col1:
        base_tenure = st.slider("Base Tenure (months)", 0, 60, 12, key="base_tenure")
        base_satisfaction = st.slider("Base Satisfaction Score", 1, 5, 3, key="base_satisfaction")

    with col2:
        base_cashback = st.slider("Base Cashback Amount", 0.0, 300.0, 150.0, key="base_cashback")
        base_warehouse_dist = st.slider("Base Warehouse Distance", 5, 50, 15, key="base_dist")

    base_complain = st.selectbox("Base Complaint Status", ["No Complaint", "Complaint Filed"],
                                 key="base_complain")
    base_complain_val = 1 if base_complain == "Complaint Filed" else 0

    # Create base data; fields without a widget use fixed defaults.
    base_data = pd.DataFrame({
        'Tenure': [base_tenure],
        'WarehouseToHome': [base_warehouse_dist],
        'NumberOfDeviceRegistered': [3],
        'PreferedOrderCat': ['Laptop & Accessory'],
        'SatisfactionScore': [base_satisfaction],
        'MaritalStatus': ['Single'],
        'NumberOfAddress': [4],
        'Complain': [base_complain_val],
        'DaySinceLastOrder': [7],
        'CashbackAmount': [base_cashback],
        'RecentOrder': [1]
    })

    # What-if scenario
    st.subheader("What-If Scenario")

    col1, col2 = st.columns(2)

    with col1:
        feature_to_test = st.selectbox(
            "Feature to Analyze",
            ['SatisfactionScore', 'CashbackAmount', 'Tenure', 'WarehouseToHome']
        )

    with col2:
        # Each feature gets its own slider bounds and sampling step.
        if feature_to_test == 'SatisfactionScore':
            test_range = st.slider("Test Range", 1, 5, (1, 5))
            values_range = list(range(test_range[0], test_range[1] + 1))
        elif feature_to_test == 'CashbackAmount':
            test_range = st.slider("Test Range", 0, 300, (0, 300))
            values_range = list(range(test_range[0], test_range[1] + 1, 30))
        elif feature_to_test == 'Tenure':
            test_range = st.slider("Test Range", 0, 60, (0, 60))
            values_range = list(range(test_range[0], test_range[1] + 1, 6))
        else:  # WarehouseToHome
            test_range = st.slider("Test Range", 5, 50, (5, 50))
            values_range = list(range(test_range[0], test_range[1] + 1, 5))

    if st.button("Run What-If Analysis"):
        with st.spinner('Analyzing scenarios...'):
            probabilities = predictor.what_if_analysis(base_data, feature_to_test, values_range)

            # Create what-if analysis chart
            fig = go.Figure()

            fig.add_trace(go.Scatter(
                x=values_range,
                y=probabilities,
                mode='lines+markers',
                name='Churn Probability',
                line=dict(color='red', width=3),
                marker=dict(size=8)
            ))

            # Add threshold line
            fig.add_hline(y=0.5, line_dash="dash", line_color="orange",
                          annotation_text="Decision Threshold",
                          annotation_position="bottom right")

            fig.update_layout(
                title=f'What-If Analysis: Churn Probability vs {feature_to_test}',
                xaxis_title=feature_to_test,
                yaxis_title='Churn Probability',
                hovermode='x unified',
                height=500
            )

            st.plotly_chart(fig, use_container_width=True)

            # Insights
            st.subheader("📊 Analysis Insights")

            current_prob = predictor.predict_churn(base_data)[1]
            min_prob = min(probabilities)
            max_prob = max(probabilities)

            col1, col2, col3 = st.columns(3)

            with col1:
                st.metric("Current Probability", f"{current_prob:.2f}")
            with col2:
                st.metric("Minimum Probability", f"{min_prob:.2f}")
            with col3:
                st.metric("Maximum Probability", f"{max_prob:.2f}")

            # Business recommendations based on analysis
            if feature_to_test == 'SatisfactionScore':
                # NOTE(review): the wording assumes the full 1-5 range and a
                # decreasing trend, but the figure is max - min over whatever
                # range was selected - confirm the message is still accurate.
                st.info("**💡 Insight:** Improving satisfaction score from 1 to 5 can reduce churn probability by "
                        f"{(max_prob - min_prob):.2%}")
            elif feature_to_test == 'CashbackAmount':
                # NOTE(review): this text is static and not derived from the
                # probabilities computed above - confirm the stated range.
                st.info("**💡 Insight:** Higher cashback amounts show diminishing returns on churn reduction. "
                        "Optimal range appears to be between 150-200 units.")


def show_model_insights(predictor, df):
    """Show hold-out metrics, the confusion-matrix image and feature importances."""
    st.header("🤖 Model Insights")

    # Train model if not already trained; metrics come from the stored
    # results so they are available even when training happened earlier.
    metrics = _ensure_trained(predictor, df, 'Training model and generating insights...')
    accuracy = metrics['accuracy']
    auc_score = metrics['auc_score']

    col1, col2 = st.columns(2)

    with col1:
        st.metric("Model Accuracy", f"{accuracy:.1%}")
        st.metric("AUC Score", f"{auc_score:.3f}")

    with col2:
        # Display the provided confusion matrix image
        st.subheader("Confusion Matrix")
        IMAGE_URL = "https://raw.githubusercontent.com/Ricendfish/M1-Assignment/main/image.png"
        st.image(IMAGE_URL, caption="Final Confusion Matrix (Threshold = 0.4150)")

    # Feature Importance
    st.subheader("Feature Importance")

    if hasattr(predictor.model, 'feature_importances_'):
        # Ascending sort + tail(10) puts the most important feature at the top
        # of the horizontal bar chart.
        feature_importance = pd.DataFrame({
            'feature': predictor.feature_names,
            'importance': predictor.model.feature_importances_
        }).sort_values('importance', ascending=True)

        fig = px.bar(feature_importance.tail(10), x='importance', y='feature',
                     title='Top 10 Most Important Features',
                     orientation='h',
                     color='importance',
                     color_continuous_scale='Viridis')
        st.plotly_chart(fig, use_container_width=True)


if __name__ == "__main__":
    main()