import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve
import xgboost as xgb
import shap
import joblib
import warnings
# Suppress every warning category for the lifetime of the process.
warnings.filterwarnings('ignore')

# Set page configuration
_PAGE_SETTINGS = {
    "page_title": "E-commerce Churn Prediction",
    "page_icon": "🛒",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)

# Custom CSS (the injected markup is currently just a newline)
_CUSTOM_CSS = "\n"
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
class ChurnPredictor:
    """Train, query and explain an XGBoost churn classifier.

    State is filled in lazily: ``preprocessor`` and ``feature_names`` by
    :meth:`preprocess_data`, ``model`` by :meth:`train_model`. The scoring
    and explanation methods raise ``ValueError`` until both exist.
    """

    # Location of the raw customer CSV read by ``load_data``.
    DATA_URL = "https://raw.githubusercontent.com/Ricendfish/M1-Assignment/main/data_ecommerce(in).csv"
    # A last order at most this many days old yields RecentOrder = 1.
    RECENT_ORDER_WINDOW_DAYS = 30
    # Raw input columns, grouped by the transformer that handles them.
    NUMERICAL_FEATURES = (
        'Tenure', 'WarehouseToHome', 'NumberOfDeviceRegistered',
        'SatisfactionScore', 'NumberOfAddress', 'CashbackAmount',
    )
    CATEGORICAL_FEATURES = ('PreferedOrderCat', 'MaritalStatus')
    BINARY_FEATURES = ('Complain', 'RecentOrder')

    def __init__(self):
        self.model = None
        self.preprocessor = None
        self.feature_names = None
        self.target_name = 'Churn'

    def load_data(self) -> pd.DataFrame:
        """Download the dataset, drop duplicate rows and add ``RecentOrder``.

        Returns:
            The de-duplicated frame with an extra 0/1 ``RecentOrder`` column
            derived from ``DaySinceLastOrder``.
        """
        df = pd.read_csv(self.DATA_URL).drop_duplicates()
        # Missing DaySinceLastOrder compares False and therefore maps to 0.
        df['RecentOrder'] = np.where(
            df['DaySinceLastOrder'] <= self.RECENT_ORDER_WINDOW_DAYS, 1, 0
        )
        return df

    def preprocess_data(self, df: pd.DataFrame):
        """Fit the preprocessing pipeline on *df* and transform it.

        Args:
            df: Frame containing the feature columns and the target column
                named by ``self.target_name``.

        Returns:
            ``(X_processed, y, feature_names)`` where ``feature_names`` lists
            the transformed columns in output order (numerical, one-hot
            categorical, binary).
        """
        # Use the configured target instead of repeating the literal name.
        X = df.drop(self.target_name, axis=1)
        y = df[self.target_name]

        numerical = list(self.NUMERICAL_FEATURES)
        categorical = list(self.CATEGORICAL_FEATURES)
        binary = list(self.BINARY_FEATURES)

        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
        ])
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(drop='first', sparse_output=False)),
        ])
        binary_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
        ])

        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, numerical),
                ('cat', categorical_transformer, categorical),
                ('bin', binary_transformer, binary),
            ]
        )
        X_processed = self.preprocessor.fit_transform(X)

        # Rebuild un-prefixed column names in the transformer's output order.
        onehot = self.preprocessor.named_transformers_['cat'].named_steps['onehot']
        encoded_names = [str(name) for name in onehot.get_feature_names_out(categorical)]
        feature_names = [*numerical, *encoded_names, *binary]
        self.feature_names = feature_names
        return X_processed, y, feature_names

    def train_model(self, X, y):
        """Fit the XGBoost classifier on a stratified 80/20 split.

        Returns:
            ``(X_test, y_test, y_pred, y_pred_proba, accuracy, auc_score)``
            computed on the held-out 20 %.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        self.model = xgb.XGBClassifier(
            n_estimators=100,
            max_depth=6,
            learning_rate=0.1,
            random_state=42,
        )
        self.model.fit(X_train, y_train)

        y_pred = self.model.predict(X_test)
        y_pred_proba = self.model.predict_proba(X_test)[:, 1]
        accuracy = self.model.score(X_test, y_test)
        auc_score = roc_auc_score(y_test, y_pred_proba)
        return X_test, y_test, y_pred, y_pred_proba, accuracy, auc_score

    def _require_trained(self):
        """Raise ``ValueError`` unless both model and preprocessor exist."""
        if self.model is None or self.preprocessor is None:
            raise ValueError("Model not trained yet!")

    def predict_churn(self, input_data):
        """Score raw customer rows.

        Args:
            input_data: Frame with the raw feature columns.

        Returns:
            ``(label, probability)`` for the first row, where probability is
            the model's score for class 1.

        Raises:
            ValueError: If the model or preprocessor has not been fitted.
        """
        self._require_trained()
        input_processed = self.preprocessor.transform(input_data)
        prediction = self.model.predict(input_processed)
        probability = self.model.predict_proba(input_processed)[:, 1]
        return prediction[0], probability[0]

    def what_if_analysis(self, base_data, feature_to_change, values_range):
        """Sweep one feature of *base_data* and collect churn probabilities.

        Args:
            base_data: Raw customer row(s); left unmodified.
            feature_to_change: Column overwritten with each candidate value.
            values_range: Iterable of values to try.

        Returns:
            List of class-1 probabilities, one per value, in input order.
        """
        probabilities = []
        for value in values_range:
            scenario = base_data.copy()
            scenario[feature_to_change] = value
            # The model consumes RecentOrder, not DaySinceLastOrder, so keep
            # the derived flag in step when the raw column is being swept.
            if feature_to_change == 'DaySinceLastOrder' and 'RecentOrder' in scenario:
                scenario['RecentOrder'] = int(value <= self.RECENT_ORDER_WINDOW_DAYS)
            _, probability = self.predict_churn(scenario)
            probabilities.append(probability)
        return probabilities

    def explain_prediction(self, input_data):
        """Compute SHAP contributions for the first row of *input_data*.

        Returns:
            ``(shap_values, expected_value, feature_names)``: per-feature
            contributions for the first row, the explainer's base value as a
            float, and the transformed feature names.

        Raises:
            ValueError: If the model or preprocessor has not been fitted.
        """
        self._require_trained()
        input_processed = self.preprocessor.transform(input_data)

        explainer = shap.TreeExplainer(self.model)
        shap_values = explainer.shap_values(input_processed)
        # A list holds one array per class; keep the positive (churn) class.
        if isinstance(shap_values, list):
            shap_values = shap_values[1]
        shap_values = np.asarray(shap_values)
        # NOTE(review): some SHAP releases appear to return a single
        # (rows, features, outputs) array instead of a list — confirm against
        # the installed version. Index 1 mirrors the list branch above.
        if shap_values.ndim == 3:
            shap_values = shap_values[..., 1]

        # expected_value may be a scalar, list or ndarray; flatten so array
        # inputs are unwrapped the same way list inputs already were.
        base_values = np.ravel(explainer.expected_value)
        expected_value = float(base_values[1] if base_values.size > 1 else base_values[0])

        return shap_values[0], expected_value, self.feature_names
def main():
    """Render the app shell and dispatch to the view chosen in the sidebar."""
    # Header. The literal used to span several lines inside single quotes,
    # which does not parse.
    # NOTE(review): the original header markup looks lost; a bare <h1> is
    # used here — restore any intended CSS class if there was one.
    st.markdown(
        "<h1>🛒 E-commerce Customer Churn Prediction</h1>",
        unsafe_allow_html=True,
    )

    # Initialize predictor
    predictor = ChurnPredictor()

    # View name -> renderer; every renderer is called with (predictor, df).
    views = {
        "Data Overview": lambda _predictor, data: show_data_overview(data),
        "EDA": lambda _predictor, data: show_eda(data),
        "Churn Prediction": show_churn_prediction,
        "What-If Analysis": show_what_if_analysis,
        "Model Insights": show_model_insights,
    }

    # Sidebar
    st.sidebar.title("Navigation")
    app_mode = st.sidebar.selectbox("Choose App Mode", list(views))

    # Load data once per browser session: Streamlit re-executes this script
    # on every widget interaction, which would otherwise re-download the CSV.
    cache_key = "_churn_raw_df"
    if cache_key not in st.session_state:
        with st.spinner('Loading data...'):
            st.session_state[cache_key] = predictor.load_data()
    # Hand out a copy so no view can mutate the cached frame.
    df = st.session_state[cache_key].copy()

    views[app_mode](predictor, df)
def show_data_overview(df):
    """Render a preview, basic shape info, class counts and summary stats.

    Args:
        df: Customer frame containing a ``Churn`` column.
    """
    st.header("📊 Data Overview")

    preview_col, info_col = st.columns([2, 1])
    with preview_col:
        st.subheader("Dataset Preview")
        st.dataframe(df.head(10), use_container_width=True)

    with info_col:
        st.subheader("Dataset Info")
        st.write(f"**Shape:** {df.shape}")
        st.write(f"**Columns:** {len(df.columns)}")
        st.write(f"**Missing Values:** {df.isnull().sum().sum()}")

        # Churn distribution. These are customer counts (not rates); .get
        # keeps the page working when one of the two classes is absent.
        churn_counts = df['Churn'].value_counts()
        churned_count = int(churn_counts.get(1, 0))
        retained_count = int(churn_counts.get(0, 0))
        st.metric("Customers Likely to Churn", f"{churned_count}")
        st.metric("Customers Not Likely to Churn", f"{retained_count}")

    st.subheader("Data Description")
    st.dataframe(df.describe(), use_container_width=True)
def _churn_rate_line(rates, x, title, x_label):
    """Build a marker line chart of the ``Churn`` column of *rates* against *x*."""
    fig = px.line(rates, x=x, y='Churn', title=title, markers=True)
    fig.update_layout(xaxis_title=x_label, yaxis_title="Churn Rate")
    return fig


def show_eda(df):
    """Render the exploratory charts: class split and churn-rate trends.

    Args:
        df: Customer frame with ``Churn``, ``MaritalStatus``,
            ``SatisfactionScore``, ``Tenure`` and ``CashbackAmount`` columns.
            The frame is not modified.
    """
    st.header("📈 Exploratory Data Analysis")

    # Churn distribution
    col1, col2 = st.columns(2)
    with col1:
        # Only the label column is needed for the pie, so build just that.
        churn_labels = pd.DataFrame({
            'Churn_Label': df['Churn'].map({0: 'Not Likely to Churn', 1: 'Likely to Churn'})
        })
        fig = px.pie(churn_labels, names='Churn_Label', title='Churn Distribution',
                     color='Churn_Label',
                     color_discrete_map={'Not Likely to Churn': 'lightblue',
                                         'Likely to Churn': 'lightcoral'})
        st.plotly_chart(fig, use_container_width=True)

    with col2:
        churn_by_marital = (
            df.groupby('MaritalStatus')['Churn'].mean().rename('Churn_Rate').reset_index()
        )
        fig = px.bar(churn_by_marital, x='MaritalStatus', y='Churn_Rate',
                     title='Churn Rate by Marital Status', color='MaritalStatus')
        st.plotly_chart(fig, use_container_width=True)

    # Line graphs instead of box plots
    st.subheader("Trend Analysis")

    # Line graph 1: Churn rate vs Satisfaction Score
    satisfaction_churn = df.groupby('SatisfactionScore')['Churn'].mean().reset_index()
    st.plotly_chart(
        _churn_rate_line(satisfaction_churn, 'SatisfactionScore',
                         'Churn Rate vs Satisfaction Score', "Satisfaction Score"),
        use_container_width=True,
    )

    col1, col2 = st.columns(2)
    with col1:
        # Line graph 2: Churn rate vs Tenure
        tenure_churn = df.groupby('Tenure')['Churn'].mean().reset_index()
        st.plotly_chart(
            _churn_rate_line(tenure_churn, 'Tenure',
                             'Churn Rate vs Customer Tenure', "Tenure (months)"),
            use_container_width=True,
        )

    with col2:
        # Line graph 3: Churn rate vs Cashback Amount (binned)
        binned = df[['CashbackAmount', 'Churn']].copy()
        binned['Cashback_Bin'] = pd.cut(binned['CashbackAmount'], bins=10)
        # Pin observed=False: grouping on a categorical without it relies on
        # a deprecated default, and warnings are silenced app-wide, so a
        # change in that default would otherwise go unnoticed.
        cashback_churn = (
            binned.groupby('Cashback_Bin', observed=False)['Churn'].mean().reset_index()
        )
        # Plain float midpoints so the x axis is numeric, not categorical.
        cashback_churn['Cashback_Mid'] = [
            float(interval.mid) for interval in cashback_churn['Cashback_Bin']
        ]
        st.plotly_chart(
            _churn_rate_line(cashback_churn, 'Cashback_Mid',
                             'Churn Rate vs Cashback Amount', "Cashback Amount"),
            use_container_width=True,
        )
# Display labels for the model's post-encoding feature names.
_READABLE_FEATURE_NAMES = {
    'Tenure': 'Customer Tenure',
    'SatisfactionScore': 'Satisfaction Score',
    'CashbackAmount': 'Cashback Amount',
    'Complain': 'Complaint Status',
    'WarehouseToHome': 'Distance to Warehouse',
    'NumberOfDeviceRegistered': 'Number of Devices',
    'NumberOfAddress': 'Number of Addresses',
    'RecentOrder': 'Recent Order Activity',
    'PreferedOrderCat_Mobile': 'Preferred Category: Mobile',
    'PreferedOrderCat_Laptop & Accessory': 'Preferred Category: Electronics',
    'PreferedOrderCat_Fashion': 'Preferred Category: Fashion',
    'PreferedOrderCat_Grocery': 'Preferred Category: Grocery',
    'MaritalStatus_Married': 'Marital Status: Married',
    'MaritalStatus_Single': 'Marital Status: Single',
}

# Simplified UI categories mapped onto the category labels sent to the model.
_UI_CATEGORY_TO_MODEL_CATEGORY = {
    'Electronics': 'Laptop & Accessory',
    'Fashion': 'Fashion',
    'Grocery': 'Grocery',
    'Home & Kitchen': 'Others',
    'Others': 'Others',
}


def _collect_customer_input():
    """Render the customer input widgets and return them as a one-row frame."""
    col1, col2, col3 = st.columns(3)
    with col1:
        tenure = st.slider("Tenure (months)", 0, 60, 12)
        warehouse_to_home = st.slider("Distance to Warehouse (km)", 5, 50, 15)
        num_devices = st.slider("Number of Devices Registered", 1, 6, 3)
        satisfaction = st.slider("Satisfaction Score", 1, 5, 3)
    with col2:
        num_addresses = st.slider("Number of Addresses", 1, 20, 4)
        cashback = st.slider("Cashback Amount", 0.0, 300.0, 150.0)
        days_since_order = st.slider("Days Since Last Order", 0, 60, 7)
        complain_option = st.selectbox("Complaint Status", ["No Complaint", "Complaint Filed"])
        complain = 1 if complain_option == "Complaint Filed" else 0
    with col3:
        preferred_category = st.selectbox(
            "Preferred Category", list(_UI_CATEGORY_TO_MODEL_CATEGORY)
        )
        marital_status = st.selectbox("Marital Status", ['Single', 'Married', 'Divorced'])

    return pd.DataFrame({
        'Tenure': [tenure],
        'WarehouseToHome': [warehouse_to_home],
        'NumberOfDeviceRegistered': [num_devices],
        'PreferedOrderCat': [_UI_CATEGORY_TO_MODEL_CATEGORY[preferred_category]],
        'SatisfactionScore': [satisfaction],
        'MaritalStatus': [marital_status],
        'NumberOfAddress': [num_addresses],
        'Complain': [complain],
        'DaySinceLastOrder': [days_since_order],
        'CashbackAmount': [cashback],
        # Derived flag: an order within the last 30 days counts as recent.
        'RecentOrder': [1 if days_since_order <= 30 else 0],
    })


def _render_prediction_factors(predictor, input_data):
    """Chart the ten largest SHAP contributions and list the top five."""
    with st.spinner('Analyzing factors...'):
        shap_values, _, feature_names = predictor.explain_prediction(input_data)

    shap_df = pd.DataFrame({'Feature': feature_names, 'SHAP Value': shap_values})
    shap_df['Abs_SHAP'] = np.abs(shap_df['SHAP Value'])
    shap_df = shap_df.sort_values('Abs_SHAP', ascending=False).head(10)

    st.subheader("📊 Factors Influencing Prediction")
    # Symmetric colour range so zero contribution sits mid-scale.
    color_bound = shap_df['Abs_SHAP'].max()
    fig = px.bar(shap_df,
                 x='SHAP Value',
                 y='Feature',
                 orientation='h',
                 title='Top Factors Influencing Prediction',
                 color='SHAP Value',
                 color_continuous_scale='RdBu_r',
                 range_color=[-color_bound, color_bound])
    fig.update_layout(yaxis={'categoryorder': 'total ascending'})
    st.plotly_chart(fig, use_container_width=True)

    st.subheader("🔑 Key Factors")
    top_factors = shap_df.head(5)
    for factor_name, impact in zip(top_factors['Feature'], top_factors['SHAP Value']):
        display_name = _READABLE_FEATURE_NAMES.get(factor_name, factor_name)
        if impact > 0:
            st.write(f"🔴 **{display_name}** increased churn risk")
        else:
            st.write(f"🟢 **{display_name}** decreased churn risk")


def _render_recommendations(prediction):
    """Show the retention advice matching the predicted class."""
    st.subheader("📋 Recommendations")
    if prediction == 1:
        advice = (
            "\n🛑 Customer Retention Actions Recommended:\n"
            "- Improve the service: Identify the causes of recent complaints\n"
            "- Collect feedback: Carry out surveys in order to identify service issues\n"
            "- Cashback: Increase cashback for loyal customers\n"
            "- Loyalty programs: Special benefits and discounts for longterm customers\n"
        )
    else:
        advice = (
            "\n✅ Customer Retention Actions:\n"
            "- Maintening current customers: Use loyalty programs, coupons\n"
            "- Constant checkins: Send short surveys to prevent complaints\n"
            "- Keep engagement: through special offers, bundles, time-limited offers\n"
        )
    st.markdown(advice, unsafe_allow_html=True)


def show_churn_prediction(predictor, df):
    """Render the single-customer prediction page.

    Trains the model on *df* when *predictor* has none yet, collects a
    customer profile from the widgets and, when the button is pressed, shows
    the predicted class, its probability, the SHAP factors and advice.

    Args:
        predictor: ``ChurnPredictor`` used for training, scoring and SHAP.
        df: Training frame passed to ``predictor.preprocess_data``.
    """
    st.header("🔮 Churn Prediction")

    # Train model if not already trained
    if predictor.model is None:
        with st.spinner('Training model...'):
            X_processed, y, _ = predictor.preprocess_data(df)
            predictor.train_model(X_processed, y)

    st.subheader("Enter Customer Details")
    input_data = _collect_customer_input()

    if not st.button("Predict Churn", type="primary"):
        return

    # UI boundary: surface any failure as an on-page error, not a traceback.
    try:
        prediction, probability = predictor.predict_churn(input_data)

        st.subheader("Prediction Results")
        if prediction == 1:
            risk_text = "LIKELY TO CHURN"
            emoji = "🔴"
            prediction_label = "Likely to Churn"
        else:
            risk_text = "NOT LIKELY TO CHURN"
            emoji = "🟢"
            prediction_label = "Not Likely to Churn"

        st.markdown(
            f"\n{emoji} Churn Prediction: {risk_text}\n"
            f"Prediction: {prediction_label}\n"
            f"Probability: {probability:.2f}\n",
            unsafe_allow_html=True,
        )

        _render_prediction_factors(predictor, input_data)
        _render_recommendations(prediction)
    except Exception as e:
        st.error(f"Error making prediction: {str(e)}")
# Sweepable feature -> (slider minimum, slider maximum, step between tested values).
_WHAT_IF_SWEEPS = {
    'SatisfactionScore': (1, 5, 1),
    'CashbackAmount': (0, 300, 30),
    'Tenure': (0, 60, 6),
    'WarehouseToHome': (5, 50, 5),
}


def show_what_if_analysis(predictor, df):
    """Render the what-if page: sweep one feature and plot churn probability.

    Trains the model on *df* when *predictor* has none yet. The base profile
    comes from the widgets; the remaining raw columns use fixed values.

    Args:
        predictor: ``ChurnPredictor`` used for training and scoring.
        df: Training frame passed to ``predictor.preprocess_data``.
    """
    st.header("🔍 What-If Analysis")
    st.markdown("Explore how changing different factors affects churn likelihood")

    # Train model if not already trained
    if predictor.model is None:
        with st.spinner('Training model...'):
            X_processed, y, _ = predictor.preprocess_data(df)
            predictor.train_model(X_processed, y)

    st.subheader("Base Customer Profile")
    col1, col2 = st.columns(2)
    with col1:
        base_tenure = st.slider("Base Tenure (months)", 0, 60, 12, key="base_tenure")
        base_satisfaction = st.slider("Base Satisfaction Score", 1, 5, 3, key="base_satisfaction")
    with col2:
        base_cashback = st.slider("Base Cashback Amount", 0.0, 300.0, 150.0, key="base_cashback")
        base_warehouse_dist = st.slider("Base Warehouse Distance", 5, 50, 15, key="base_dist")
        base_complain = st.selectbox("Base Complaint Status", ["No Complaint", "Complaint Filed"],
                                     key="base_complain")
        base_complain_val = 1 if base_complain == "Complaint Filed" else 0

    base_data = pd.DataFrame({
        'Tenure': [base_tenure],
        'WarehouseToHome': [base_warehouse_dist],
        'NumberOfDeviceRegistered': [3],
        'PreferedOrderCat': ['Laptop & Accessory'],
        'SatisfactionScore': [base_satisfaction],
        'MaritalStatus': ['Single'],
        'NumberOfAddress': [4],
        'Complain': [base_complain_val],
        'DaySinceLastOrder': [7],
        'CashbackAmount': [base_cashback],
        'RecentOrder': [1],
    })

    st.subheader("What-If Scenario")
    col1, col2 = st.columns(2)
    with col1:
        feature_to_test = st.selectbox("Feature to Analyze", list(_WHAT_IF_SWEEPS))
    with col2:
        lower, upper, step = _WHAT_IF_SWEEPS[feature_to_test]
        test_range = st.slider("Test Range", lower, upper, (lower, upper))
        # Sampled from the chosen lower bound; the upper bound is included
        # only when it falls on the step grid.
        values_range = list(range(test_range[0], test_range[1] + 1, step))

    if not st.button("Run What-If Analysis"):
        return

    with st.spinner('Analyzing scenarios...'):
        probabilities = predictor.what_if_analysis(base_data, feature_to_test, values_range)

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=values_range,
        y=probabilities,
        mode='lines+markers',
        name='Churn Probability',
        line=dict(color='red', width=3),
        marker=dict(size=8),
    ))
    # Add threshold line
    fig.add_hline(y=0.5, line_dash="dash", line_color="orange",
                  annotation_text="Decision Threshold",
                  annotation_position="bottom right")
    fig.update_layout(
        title=f'What-If Analysis: Churn Probability vs {feature_to_test}',
        xaxis_title=feature_to_test,
        yaxis_title='Churn Probability',
        hovermode='x unified',
        height=500,
    )
    st.plotly_chart(fig, use_container_width=True)

    st.subheader("📊 Analysis Insights")
    current_prob = predictor.predict_churn(base_data)[1]
    min_prob = min(probabilities)
    max_prob = max(probabilities)

    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Current Probability", f"{current_prob:.2f}")
    with col2:
        st.metric("Minimum Probability", f"{min_prob:.2f}")
    with col3:
        st.metric("Maximum Probability", f"{max_prob:.2f}")

    # Insights are derived from the sweep that was actually run, so they stay
    # true for any selected sub-range and for either direction of change.
    if feature_to_test == 'SatisfactionScore':
        change = float(probabilities[-1]) - float(probabilities[0])
        st.info(f"**💡 Insight:** Moving satisfaction score from {values_range[0]} "
                f"to {values_range[-1]} changes churn probability by {change:+.2%}")
    elif feature_to_test == 'CashbackAmount':
        best_cashback = values_range[int(np.argmin(probabilities))]
        st.info(f"**💡 Insight:** Within the tested range, churn probability is lowest "
                f"({float(min_prob):.2%}) at a cashback amount of {best_cashback}.")
def show_model_insights(predictor, df):
    """Render hold-out metrics, the confusion-matrix image and importances.

    Args:
        predictor: ``ChurnPredictor`` that is (re)trained here.
        df: Training frame passed to ``predictor.preprocess_data``.
    """
    st.header("🤖 Model Insights")

    # Always train here: accuracy and AUC exist only as return values of
    # train_model, so skipping training for an already-fitted predictor
    # would leave both names unbound when the metrics are rendered.
    with st.spinner('Training model and generating insights...'):
        X_processed, y, _ = predictor.preprocess_data(df)
        *_, accuracy, auc_score = predictor.train_model(X_processed, y)

    col1, col2 = st.columns(2)
    with col1:
        st.metric("Model Accuracy", f"{accuracy:.1%}")
        st.metric("AUC Score", f"{auc_score:.3f}")
    with col2:
        # NOTE(review): this is a static image with a hard-coded threshold in
        # the caption; it is not computed from the model trained above.
        st.subheader("Confusion Matrix")
        IMAGE_URL = "https://raw.githubusercontent.com/Ricendfish/M1-Assignment/main/image.png"
        st.image(IMAGE_URL, caption="Final Confusion Matrix (Threshold = 0.4150)")

    # Feature Importance
    st.subheader("Feature Importance")
    if hasattr(predictor.model, 'feature_importances_'):
        feature_importance = pd.DataFrame({
            'feature': predictor.feature_names,
            'importance': predictor.model.feature_importances_,
        }).sort_values('importance', ascending=True)
        # Ascending sort + tail(10) keeps the ten largest importances.
        fig = px.bar(feature_importance.tail(10),
                     x='importance', y='feature',
                     title='Top 10 Most Important Features',
                     orientation='h',
                     color='importance',
                     color_continuous_scale='Viridis')
        st.plotly_chart(fig, use_container_width=True)
# Script entry point: build the UI only when this file is run directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()