# Source: Hugging Face Space "Asalun/E25Assignment" (status when captured: Sleeping)
# File size: 26,916 Bytes

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve
import xgboost as xgb
import shap
import joblib
import warnings
warnings.filterwarnings('ignore')

# Page-level configuration: browser tab title/icon, wide layout, and an
# initially expanded sidebar.
# NOTE(review): Streamlit documents set_page_config as needing to run before
# other st.* rendering calls — keep this as the first Streamlit call.
st.set_page_config(
    page_title="E-commerce Churn Prediction",
    page_icon="🛒",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Inject the app's custom CSS. The classes are consumed by raw-HTML
# st.markdown(..., unsafe_allow_html=True) calls elsewhere in this file:
#   .main-header                          -> page title
#   .prediction-box + .churn-risk-high/low -> prediction result card
#   .recommendation-box                   -> retention recommendations
# .feature-importance and .shap-explanation are defined here but are not
# referenced by any markup visible in this file.
st.markdown("""
<style>
    .main-header {
        font-size: 3rem;
        color: #1f77b4;
        text-align: center;
        margin-bottom: 2rem;
    }
    .prediction-box {
        padding: 20px;
        border-radius: 10px;
        margin: 10px 0px;
        background-color: #f0f2f6;
    }
    .churn-risk-high {
        background-color: #ffe6e6;
        border-left: 5px solid #ff4d4d;
        color: #cc0000;
    }
    .churn-risk-low {
        background-color: #e6ffe6;
        border-left: 5px solid #00cc66;
        color: #006600;
    }
    .feature-importance {
        background-color: #ffffff;
        padding: 15px;
        border-radius: 10px;
        border: 1px solid #ddd;
    }
    .recommendation-box {
        padding: 15px;
        border-radius: 8px;
        margin: 10px 0px;
        background-color: #f0f8ff;
        border-left: 5px solid #4682b4;
        color: #2c3e50;
    }
    .shap-explanation {
        background-color: #f8f9fa;
        padding: 15px;
        border-radius: 8px;
        border: 1px solid #dee2e6;
        margin: 10px 0px;
    }
</style>
""", unsafe_allow_html=True)

class ChurnPredictor:
    """Train and query an XGBoost churn classifier for the e-commerce dataset.

    State is filled in lazily: ``preprocessor`` and ``feature_names`` are set
    by :meth:`preprocess_data`, ``model`` by :meth:`train_model`. The
    prediction and explanation methods raise ``ValueError`` until both exist.
    """

    # A customer counts as a "recent" orderer when the last order is at most
    # this many days old; used wherever the RecentOrder feature is derived.
    RECENT_ORDER_WINDOW_DAYS = 30

    def __init__(self):
        self.model = None
        self.preprocessor = None
        self.feature_names = None
        self.target_name = 'Churn'

    def load_data(self):
        """Download the dataset, drop duplicate rows and add ``RecentOrder``.

        Returns:
            DataFrame with the raw columns plus the binary ``RecentOrder``
            column (1 when ``DaySinceLastOrder`` is within the recent-order
            window, else 0).
        """
        url = "https://raw.githubusercontent.com/Ricendfish/M1-Assignment/main/data_ecommerce(in).csv"
        df = pd.read_csv(url)

        # Remove duplicates
        df = df.drop_duplicates()

        # Rows with a missing DaySinceLastOrder compare False and so get 0.
        df['RecentOrder'] = np.where(
            df['DaySinceLastOrder'] <= self.RECENT_ORDER_WINDOW_DAYS, 1, 0
        )

        return df

    def preprocess_data(self, df):
        """Fit the preprocessing pipeline on ``df`` and transform it.

        Args:
            df: DataFrame containing the target column and the model features.

        Returns:
            Tuple ``(X_processed, y, feature_names)`` where ``feature_names``
            lists the transformed columns in output order.

        NOTE(review): the transformer is fitted on the full dataset here and
        the train/test split happens later in ``train_model``, so imputation
        and scaling statistics also see the test rows.
        """
        # Separate features and target
        X = df.drop(self.target_name, axis=1)
        y = df[self.target_name]

        numerical_features = ['Tenure', 'WarehouseToHome', 'NumberOfDeviceRegistered',
                              'SatisfactionScore', 'NumberOfAddress', 'CashbackAmount']
        categorical_features = ['PreferedOrderCat', 'MaritalStatus']
        binary_features = ['Complain', 'RecentOrder']

        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        # handle_unknown='ignore' keeps prediction working when the UI (or any
        # caller) supplies a category that was absent from the training data;
        # such a value is encoded as all zeros instead of raising.
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(drop='first', sparse_output=False,
                                     handle_unknown='ignore'))
        ])

        # Binary flags are only imputed; they are already 0/1.
        binary_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent'))
        ])

        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, numerical_features),
                ('cat', categorical_transformer, categorical_features),
                ('bin', binary_transformer, binary_features)
            ])

        X_processed = self.preprocessor.fit_transform(X)

        # Rebuild column names in the same order the ColumnTransformer emits
        # them: numeric, one-hot encoded categorical, then binary.
        onehot = self.preprocessor.named_transformers_['cat'].named_steps['onehot']
        feature_names = numerical_features.copy()
        feature_names.extend(onehot.get_feature_names_out(categorical_features))
        feature_names.extend(binary_features)

        self.feature_names = feature_names

        return X_processed, y, feature_names

    def train_model(self, X, y):
        """Fit the XGBoost classifier on a stratified 80/20 split.

        Args:
            X: Preprocessed feature matrix.
            y: Binary target aligned with ``X``.

        Returns:
            Tuple ``(X_test, y_test, y_pred, y_pred_proba, accuracy,
            auc_score)`` computed on the held-out 20 %.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        self.model = xgb.XGBClassifier(
            n_estimators=100,
            max_depth=6,
            learning_rate=0.1,
            random_state=42
        )

        self.model.fit(X_train, y_train)

        y_pred = self.model.predict(X_test)
        y_pred_proba = self.model.predict_proba(X_test)[:, 1]

        accuracy = self.model.score(X_test, y_test)
        auc_score = roc_auc_score(y_test, y_pred_proba)

        return X_test, y_test, y_pred, y_pred_proba, accuracy, auc_score

    def predict_churn(self, input_data):
        """Predict churn for the first row of ``input_data``.

        Args:
            input_data: DataFrame with the raw (unprocessed) feature columns.

        Returns:
            Tuple ``(prediction, probability)`` for the first row, where
            ``probability`` is the predicted probability of class 1.

        Raises:
            ValueError: If the model or preprocessor has not been fitted.
        """
        if self.model is None or self.preprocessor is None:
            raise ValueError("Model not trained yet!")

        input_processed = self.preprocessor.transform(input_data)

        prediction = self.model.predict(input_processed)
        probability = self.model.predict_proba(input_processed)[:, 1]

        return prediction[0], probability[0]

    def what_if_analysis(self, base_data, feature_to_change, values_range):
        """Sweep one feature over ``values_range`` and collect churn probabilities.

        Args:
            base_data: Single-row DataFrame describing the baseline customer;
                it is copied per scenario and never modified.
            feature_to_change: Name of the column to overwrite.
            values_range: Iterable of values to try for that column.

        Returns:
            List of churn probabilities, one per value, in input order.

        Raises:
            ValueError: If the model or preprocessor has not been fitted.
        """
        probabilities = []

        for value in values_range:
            scenario = base_data.copy()
            scenario[feature_to_change] = value

            # The model never sees DaySinceLastOrder directly, only the
            # derived RecentOrder flag, so keep the flag in sync; otherwise
            # sweeping DaySinceLastOrder would have no effect at all.
            if feature_to_change == 'DaySinceLastOrder':
                scenario['RecentOrder'] = np.where(
                    scenario['DaySinceLastOrder'] <= self.RECENT_ORDER_WINDOW_DAYS, 1, 0
                )

            _, probability = self.predict_churn(scenario)
            probabilities.append(probability)

        return probabilities

    def explain_prediction(self, input_data):
        """Compute SHAP contributions for the first row of ``input_data``.

        Args:
            input_data: DataFrame with the raw (unprocessed) feature columns.

        Returns:
            Tuple ``(shap_values, expected_value, feature_names)``:
            per-feature SHAP values for the first row (positive class), the
            explainer's base value as a float, and the matching feature names.

        Raises:
            ValueError: If the model or preprocessor has not been fitted.
        """
        if self.model is None or self.preprocessor is None:
            raise ValueError("Model not trained yet!")

        input_processed = self.preprocessor.transform(input_data)

        explainer = shap.TreeExplainer(self.model)
        shap_values = explainer.shap_values(input_processed)

        # Depending on the SHAP version/model, per-class output may arrive as
        # a list of arrays or as one (samples, features, classes) array; in
        # both cases keep the positive (churn) class.
        if isinstance(shap_values, list):
            shap_values = shap_values[1]
        shap_values = np.asarray(shap_values)
        if shap_values.ndim == 3:
            shap_values = shap_values[:, :, -1]

        # expected_value may be a scalar, a list or an ndarray; the last
        # entry is the positive class when several are present.
        expected_value = float(np.ravel(explainer.expected_value)[-1])

        return shap_values[0], expected_value, self.feature_names

@st.cache_data(show_spinner=False)
def _load_dataset():
    """Download and prepare the dataset once; reruns reuse the cached copy."""
    return ChurnPredictor().load_data()


def main():
    """Render the page header, the sidebar navigation and the selected page.

    Streamlit re-executes the script on every widget interaction, so the
    dataset is fetched through a cached helper rather than downloaded again
    on each rerun. The predictor itself is still created fresh per run; each
    page trains it on demand.
    """
    # Header
    st.markdown('<h1 class="main-header">🛒 E-commerce Customer Churn Prediction</h1>',
                unsafe_allow_html=True)

    # Initialize predictor
    predictor = ChurnPredictor()

    # Sidebar
    st.sidebar.title("Navigation")
    app_mode = st.sidebar.selectbox("Choose App Mode",
                                   ["Data Overview", "EDA", "Churn Prediction", "What-If Analysis", "Model Insights"])

    # Load data
    with st.spinner('Loading data...'):
        try:
            df = _load_dataset()
        except Exception as exc:  # UI boundary: show the failure instead of a traceback
            st.error(f"Could not load the dataset: {exc}")
            st.stop()

    # Page name -> renderer. The first two pages only need the data; the
    # others also need the predictor.
    pages = {
        "Data Overview": lambda: show_data_overview(df),
        "EDA": lambda: show_eda(df),
        "Churn Prediction": lambda: show_churn_prediction(predictor, df),
        "What-If Analysis": lambda: show_what_if_analysis(predictor, df),
        "Model Insights": lambda: show_model_insights(predictor, df),
    }

    render_page = pages.get(app_mode)
    if render_page is not None:
        render_page()

def show_data_overview(df):
    """Render the "Data Overview" page: preview, basic facts and describe().

    Args:
        df: Dataset with a binary ``Churn`` column (1 = churned, 0 = not).
    """
    st.header("📊 Data Overview")

    preview_col, info_col = st.columns([2, 1])

    with preview_col:
        st.subheader("Dataset Preview")
        st.dataframe(df.head(10), use_container_width=True)

    with info_col:
        st.subheader("Dataset Info")
        st.write(f"**Shape:** {df.shape}")
        st.write(f"**Columns:** {len(df.columns)}")
        st.write(f"**Missing Values:** {df.isnull().sum().sum()}")

        # These are customer counts, not rates. .get() avoids a KeyError
        # when one of the two classes is absent from the data.
        churn_counts = df['Churn'].value_counts()
        churned_count = int(churn_counts.get(1, 0))
        retained_count = int(churn_counts.get(0, 0))

        st.metric("Customers Likely to Churn", f"{churned_count}")
        st.metric("Customers Not Likely to Churn", f"{retained_count}")

    st.subheader("Data Description")
    st.dataframe(df.describe(), use_container_width=True)

def show_eda(df):
    """Render the "EDA" page.

    Shows the churn split as a pie chart, the mean churn per marital status
    as a bar chart, and three line charts of mean churn against satisfaction
    score, tenure and (binned) cashback amount. ``df`` is not modified.
    """
    st.header("📈 Exploratory Data Analysis")

    # --- Row 1: overall churn split and churn by marital status -----------
    pie_col, bar_col = st.columns(2)

    with pie_col:
        # Work on a derived frame so the caller's data keeps its columns.
        label_by_flag = {0: 'Not Likely to Churn', 1: 'Likely to Churn'}
        labelled = df.copy()
        labelled['Churn_Label'] = labelled['Churn'].map(label_by_flag)

        pie = px.pie(
            labelled,
            names='Churn_Label',
            title='Churn Distribution',
            color='Churn_Label',
            color_discrete_map={'Not Likely to Churn':'lightblue', 'Likely to Churn':'lightcoral'},
        )
        st.plotly_chart(pie, use_container_width=True)

    with bar_col:
        by_marital = df.groupby('MaritalStatus')['Churn'].mean().reset_index()
        by_marital['Churn_Rate'] = by_marital['Churn']
        bars = px.bar(
            by_marital,
            x='MaritalStatus',
            y='Churn_Rate',
            title='Churn Rate by Marital Status',
            color='MaritalStatus',
        )
        st.plotly_chart(bars, use_container_width=True)

    # --- Trend lines -------------------------------------------------------
    st.subheader("Trend Analysis")

    # Mean churn for each satisfaction score.
    by_satisfaction = df.groupby('SatisfactionScore')['Churn'].mean().reset_index()
    satisfaction_fig = px.line(
        by_satisfaction,
        x='SatisfactionScore',
        y='Churn',
        title='Churn Rate vs Satisfaction Score',
        markers=True,
    )
    satisfaction_fig.update_layout(xaxis_title="Satisfaction Score", yaxis_title="Churn Rate")
    st.plotly_chart(satisfaction_fig, use_container_width=True)

    tenure_col, cashback_col = st.columns(2)

    with tenure_col:
        # Mean churn for each tenure value.
        by_tenure = df.groupby('Tenure')['Churn'].mean().reset_index()
        tenure_fig = px.line(
            by_tenure,
            x='Tenure',
            y='Churn',
            title='Churn Rate vs Customer Tenure',
            markers=True,
        )
        tenure_fig.update_layout(xaxis_title="Tenure (months)", yaxis_title="Churn Rate")
        st.plotly_chart(tenure_fig, use_container_width=True)

    with cashback_col:
        # Cashback is continuous: cut it into 10 equal-width bins and plot
        # the mean churn of each bin at the bin's midpoint.
        binned = df.copy()
        binned['Cashback_Bin'] = pd.cut(binned['CashbackAmount'], bins=10)
        by_cashback = binned.groupby('Cashback_Bin')['Churn'].mean().reset_index()
        by_cashback['Cashback_Mid'] = by_cashback['Cashback_Bin'].apply(lambda interval: interval.mid)

        cashback_fig = px.line(
            by_cashback,
            x='Cashback_Mid',
            y='Churn',
            title='Churn Rate vs Cashback Amount',
            markers=True,
        )
        cashback_fig.update_layout(xaxis_title="Cashback Amount", yaxis_title="Churn Rate")
        st.plotly_chart(cashback_fig, use_container_width=True)

# Display labels for the model's post-encoding feature names, used in the
# "Key Factors" list. Built once here rather than once per listed factor;
# names without an entry are shown as-is.
_READABLE_FEATURE_NAMES = {
    'Tenure': 'Customer Tenure',
    'SatisfactionScore': 'Satisfaction Score',
    'CashbackAmount': 'Cashback Amount',
    'Complain': 'Complaint Status',
    'WarehouseToHome': 'Distance to Warehouse',
    'NumberOfDeviceRegistered': 'Number of Devices',
    'NumberOfAddress': 'Number of Addresses',
    'RecentOrder': 'Recent Order Activity',
    'PreferedOrderCat_Mobile': 'Preferred Category: Mobile',
    'PreferedOrderCat_Laptop & Accessory': 'Preferred Category: Electronics',
    'PreferedOrderCat_Fashion': 'Preferred Category: Fashion',
    'PreferedOrderCat_Grocery': 'Preferred Category: Grocery',
    'MaritalStatus_Married': 'Marital Status: Married',
    'MaritalStatus_Single': 'Marital Status: Single'
}


def _render_shap_factors(predictor, input_data):
    """Chart the ten largest SHAP contributions and list the top five in words."""
    shap_values, _, feature_names = predictor.explain_prediction(input_data)

    shap_df = pd.DataFrame({
        'Feature': feature_names,
        'SHAP Value': shap_values
    })

    # Rank by magnitude so strong negative drivers are kept too.
    shap_df['Abs_SHAP'] = np.abs(shap_df['SHAP Value'])
    shap_df = shap_df.sort_values('Abs_SHAP', ascending=False).head(10)

    # Symmetric colour range so zero sits at the midpoint of the scale.
    color_bound = shap_df['Abs_SHAP'].max()

    st.subheader("📊 Factors Influencing Prediction")
    fig = px.bar(shap_df,
                 x='SHAP Value',
                 y='Feature',
                 orientation='h',
                 title='Top Factors Influencing Prediction',
                 color='SHAP Value',
                 color_continuous_scale='RdBu_r',
                 range_color=[-color_bound, color_bound])

    fig.update_layout(yaxis={'categoryorder':'total ascending'})
    st.plotly_chart(fig, use_container_width=True)

    st.subheader("🔑 Key Factors")

    for _, row in shap_df.head(5).iterrows():
        factor_name = row['Feature']
        display_name = _READABLE_FEATURE_NAMES.get(factor_name, factor_name)

        if row['SHAP Value'] > 0:
            st.write(f"🔴 **{display_name}** increased churn risk")
        else:
            st.write(f"🟢 **{display_name}** decreased churn risk")


def show_churn_prediction(predictor, df):
    """Render the "Churn Prediction" page.

    Trains the predictor on ``df`` if it has no model yet, collects one
    customer's details from the form, and on button press shows the
    prediction, its SHAP-based drivers and retention recommendations.

    Args:
        predictor: ChurnPredictor instance; trained in place when needed.
        df: Dataset used for training.
    """
    st.header("🔮 Churn Prediction")

    # Train model if not already trained
    if predictor.model is None:
        with st.spinner('Training model...'):
            X_processed, y, _ = predictor.preprocess_data(df)
            predictor.train_model(X_processed, y)

    # Input form
    st.subheader("Enter Customer Details")

    col1, col2, col3 = st.columns(3)

    with col1:
        tenure = st.slider("Tenure (months)", 0, 60, 12)
        warehouse_to_home = st.slider("Distance to Warehouse (km)", 5, 50, 15)
        num_devices = st.slider("Number of Devices Registered", 1, 6, 3)
        satisfaction = st.slider("Satisfaction Score", 1, 5, 3)

    with col2:
        num_addresses = st.slider("Number of Addresses", 1, 20, 4)
        cashback = st.slider("Cashback Amount", 0.0, 300.0, 150.0)
        days_since_order = st.slider("Days Since Last Order", 0, 60, 7)
        complain_option = st.selectbox("Complaint Status", ["No Complaint", "Complaint Filed"])
        complain = 1 if complain_option == "Complaint Filed" else 0

    with col3:
        preferred_category = st.selectbox("Preferred Category",
                                         ['Electronics', 'Fashion', 'Grocery', 'Home & Kitchen', 'Others'])
        marital_status = st.selectbox("Marital Status", ['Single', 'Married', 'Divorced'])

    # UI label -> category value passed to the model.
    category_mapping = {
        'Electronics': 'Laptop & Accessory',
        'Fashion': 'Fashion',
        'Grocery': 'Grocery',
        'Home & Kitchen': 'Others',
        'Others': 'Others'
    }

    # Single-row frame with the raw columns the preprocessor expects.
    # RecentOrder is derived the same way as in ChurnPredictor.load_data.
    input_data = pd.DataFrame({
        'Tenure': [tenure],
        'WarehouseToHome': [warehouse_to_home],
        'NumberOfDeviceRegistered': [num_devices],
        'PreferedOrderCat': [category_mapping[preferred_category]],
        'SatisfactionScore': [satisfaction],
        'MaritalStatus': [marital_status],
        'NumberOfAddress': [num_addresses],
        'Complain': [complain],
        'DaySinceLastOrder': [days_since_order],
        'CashbackAmount': [cashback],
        'RecentOrder': [1 if days_since_order <= 30 else 0]
    })

    if st.button("Predict Churn", type="primary"):
        try:
            prediction, probability = predictor.predict_churn(input_data)

            # Display results
            st.subheader("Prediction Results")

            if prediction == 1:
                risk_class = "churn-risk-high"
                risk_text = "LIKELY TO CHURN"
                emoji = "🔴"
                prediction_label = "Likely to Churn"
            else:
                risk_class = "churn-risk-low"
                risk_text = "NOT LIKELY TO CHURN"
                emoji = "🟢"
                prediction_label = "Not Likely to Churn"

            st.markdown(f"""
            <div class="prediction-box {risk_class}">
                <h3>{emoji} Churn Prediction: {risk_text}</h3>
                <p><strong>Prediction:</strong> {prediction_label}</p>
                <p><strong>Probability:</strong> {probability:.2f}</p>
            </div>
            """, unsafe_allow_html=True)

            # Generate SHAP explanation
            with st.spinner('Analyzing factors...'):
                _render_shap_factors(predictor, input_data)

            # Recommendations
            st.subheader("📋 Recommendations")

            if prediction == 1:
                st.markdown("""
                <div class="recommendation-box">
                <h4>🛑 Customer Retention Actions Recommended:</h4>
                <ul>
                    <li><strong>Improve the service:</strong> Identify the causes of recent complaints</li>
                    <li><strong>Collect feedback:</strong> Carry out surveys in order to identify service issues</li>
                    <li><strong>Cashback:</strong> Increase cashback for loyal customers</li>
                    <li><strong>Loyalty programs:</strong> Special benefits and discounts for longterm customers</li>
                </ul>
                </div>
                """, unsafe_allow_html=True)
            else:
                st.markdown("""
                <div class="recommendation-box">
                <h4>✅ Customer Retention Actions:</h4>
                <ul>
                    <li><strong>Maintening current customers:</strong> Use loyalty programs, coupons</li>
                    <li><strong>Constant checkins:</strong> Send short surveys to prevent complaints</li>
                    <li><strong>Keep engagement:</strong> through special offers, bundles, time-limited offers</li>
                </ul>
                </div>
                """, unsafe_allow_html=True)

        except Exception as e:
            # UI boundary: surface any prediction/explanation failure in the page.
            st.error(f"Error making prediction: {str(e)}")

# Sweep settings per analysable feature: (slider minimum, slider maximum, step).
_WHAT_IF_SWEEPS = {
    'SatisfactionScore': (1, 5, 1),
    'CashbackAmount': (0, 300, 30),
    'Tenure': (0, 60, 6),
    'WarehouseToHome': (5, 50, 5),
}


def _sweep_values(low, high, step):
    """Return low, low+step, ... up to high, always ending exactly on high."""
    values = list(range(low, high + 1, step))
    # A stepped range can stop short of the selected upper bound; include it
    # so the chart covers the whole range the user picked.
    if values[-1] != high:
        values.append(high)
    return values


def show_what_if_analysis(predictor, df):
    """Render the "What-If Analysis" page.

    Trains the predictor on ``df`` if it has no model yet, lets the user
    define a baseline customer, then sweeps one feature over a chosen range
    and plots the resulting churn probabilities.

    Args:
        predictor: ChurnPredictor instance; trained in place when needed.
        df: Dataset used for training.
    """
    st.header("🔍 What-If Analysis")
    st.markdown("Explore how changing different factors affects churn likelihood")

    # Train model if not already trained
    if predictor.model is None:
        with st.spinner('Training model...'):
            X_processed, y, _ = predictor.preprocess_data(df)
            predictor.train_model(X_processed, y)

    st.subheader("Base Customer Profile")

    col1, col2 = st.columns(2)

    with col1:
        base_tenure = st.slider("Base Tenure (months)", 0, 60, 12, key="base_tenure")
        base_satisfaction = st.slider("Base Satisfaction Score", 1, 5, 3, key="base_satisfaction")

    with col2:
        base_cashback = st.slider("Base Cashback Amount", 0.0, 300.0, 150.0, key="base_cashback")
        base_warehouse_dist = st.slider("Base Warehouse Distance", 5, 50, 15, key="base_dist")

    base_complain = st.selectbox("Base Complaint Status", ["No Complaint", "Complaint Filed"], key="base_complain")
    base_complain_val = 1 if base_complain == "Complaint Filed" else 0

    # Baseline row; features without a widget above are held at fixed values.
    base_data = pd.DataFrame({
        'Tenure': [base_tenure],
        'WarehouseToHome': [base_warehouse_dist],
        'NumberOfDeviceRegistered': [3],
        'PreferedOrderCat': ['Laptop & Accessory'],
        'SatisfactionScore': [base_satisfaction],
        'MaritalStatus': ['Single'],
        'NumberOfAddress': [4],
        'Complain': [base_complain_val],
        'DaySinceLastOrder': [7],
        'CashbackAmount': [base_cashback],
        'RecentOrder': [1]
    })

    st.subheader("What-If Scenario")

    col1, col2 = st.columns(2)

    with col1:
        feature_to_test = st.selectbox(
            "Feature to Analyze",
            list(_WHAT_IF_SWEEPS)
        )

    with col2:
        lower, upper, step = _WHAT_IF_SWEEPS[feature_to_test]
        test_range = st.slider("Test Range", lower, upper, (lower, upper))
        values_range = _sweep_values(test_range[0], test_range[1], step)

    if st.button("Run What-If Analysis"):
        with st.spinner('Analyzing scenarios...'):
            probabilities = predictor.what_if_analysis(base_data, feature_to_test, values_range)

            fig = go.Figure()

            fig.add_trace(go.Scatter(
                x=values_range,
                y=probabilities,
                mode='lines+markers',
                name='Churn Probability',
                line=dict(color='red', width=3),
                marker=dict(size=8)
            ))

            # 0.5 is the cut-off at which the classifier's label flips.
            fig.add_hline(y=0.5, line_dash="dash", line_color="orange",
                         annotation_text="Decision Threshold",
                         annotation_position="bottom right")

            fig.update_layout(
                title=f'What-If Analysis: Churn Probability vs {feature_to_test}',
                xaxis_title=feature_to_test,
                yaxis_title='Churn Probability',
                hovermode='x unified',
                height=500
            )

            st.plotly_chart(fig, use_container_width=True)

            st.subheader("📊 Analysis Insights")

            current_prob = predictor.predict_churn(base_data)[1]
            min_prob = min(probabilities)
            max_prob = max(probabilities)

            col1, col2, col3 = st.columns(3)

            with col1:
                st.metric("Current Probability", f"{current_prob:.2f}")

            with col2:
                st.metric("Minimum Probability", f"{min_prob:.2f}")

            with col3:
                st.metric("Maximum Probability", f"{max_prob:.2f}")

            # Insights are derived from the computed sweep, not fixed text,
            # so they stay true for any selected range and any model.
            if feature_to_test == 'SatisfactionScore':
                start_value, end_value = values_range[0], values_range[-1]
                change = probabilities[-1] - probabilities[0]
                if change < 0:
                    effect = f"reduces churn probability by {abs(change):.2%}"
                elif change > 0:
                    effect = f"increases churn probability by {abs(change):.2%}"
                else:
                    effect = "does not change churn probability"
                st.info(f"**💡 Insight:** Moving satisfaction score from {start_value} "
                        f"to {end_value} {effect}")

            elif feature_to_test == 'CashbackAmount':
                best_value = values_range[int(np.argmin(probabilities))]
                st.info("**💡 Insight:** Within the tested range, the lowest churn probability "
                        f"({min_prob:.2%}) occurs at a cashback amount of {best_value}.")

def show_model_insights(predictor, df):
    """Render the "Model Insights" page: test metrics and feature importance.

    Args:
        predictor: ChurnPredictor instance; trained in place when needed.
        df: Dataset used for training.
    """
    st.header("🤖 Model Insights")

    # accuracy/AUC are only produced by train_model. Remember them on the
    # predictor so a predictor that was already trained (by this page or
    # another one) does not leave the metrics undefined; if a trained model
    # arrives without remembered metrics, train again to obtain them.
    metrics = getattr(predictor, 'eval_metrics', None)
    if predictor.model is None or metrics is None:
        with st.spinner('Training model and generating insights...'):
            X_processed, y, _ = predictor.preprocess_data(df)
            *_, accuracy, auc_score = predictor.train_model(X_processed, y)
        predictor.eval_metrics = (accuracy, auc_score)
    else:
        accuracy, auc_score = metrics

    col1, col2 = st.columns(2)

    with col1:
        st.metric("Model Accuracy", f"{accuracy:.1%}")
        st.metric("AUC Score", f"{auc_score:.3f}")

    with col2:
        # NOTE: this is a static image fetched from the repository, not a
        # matrix computed from the model trained above.
        st.subheader("Confusion Matrix")
        IMAGE_URL = "https://raw.githubusercontent.com/Ricendfish/M1-Assignment/main/image.png"
        st.image(IMAGE_URL, caption="Final Confusion Matrix (Threshold = 0.4150)")

    # Feature Importance
    st.subheader("Feature Importance")

    if hasattr(predictor.model, 'feature_importances_'):
        # Sorted ascending so that tail(10) holds the ten largest values and
        # the horizontal bar chart shows the most important feature on top.
        feature_importance = pd.DataFrame({
            'feature': predictor.feature_names,
            'importance': predictor.model.feature_importances_
        }).sort_values('importance', ascending=True)

        fig = px.bar(feature_importance.tail(10),
                    x='importance', y='feature',
                    title='Top 10 Most Important Features',
                    orientation='h',
                    color='importance',
                    color_continuous_scale='Viridis')
        st.plotly_chart(fig, use_container_width=True)

# Build the app only when this file is executed as the main script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()