# chrun / pages /3_EDA_and_Feature_Engineering.py
# Gowthamvemula's picture
# Update pages/3_EDA_and_Feature_Engineering.py
# 661a992 verified
import streamlit as st
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from io import StringIO
import sys
# Centered, white page heading rendered as raw HTML.
st.markdown(
    "<h1 style='text-align:center; color:white;'>EDA and Feature Engineering</h1>",
    unsafe_allow_html=True,
)
# URL of the image tiled across the app background.
background_image_url = "https://cdn-uploads.huggingface.co/production/uploads/67445925102349e867c92342/P23Y2Ok5clUQcgPX9DRgu.png"
# Inject the page-wide CSS: tiled background, dark overlay, and markdown text styling.
# The braces are doubled because this is an f-string; only the image URL is interpolated.
# The disabled `text-align` rule uses a real CSS comment (`/* */`); `#` is not CSS
# comment syntax and produced an invalid declaration.
# NOTE(review): `.stMarkdown` sets font-size to 100px while its inline comment talks
# about readability - confirm the intended value.
st.markdown(
    f"""
<style>
.stApp {{
background-image: url("{background_image_url}");
background-size: auto; /* Ensures the image retains its original size */
background-repeat: repeat; /* Makes the image repeat to cover the entire background */
background-position: top left; /* Starts repeating from the top-left corner */
background-attachment: fixed; /* Keeps the background fixed as you scroll */
}}
/* Semi-transparent overlay */
.stApp::before {{
content: "";
position: absolute;
top: 0;
left: 0;
width: 100%;
height: 100%;
background: rgba(0, 0, 0, 0.4); /* Adjust transparency here (0.4 for 40% transparency) */
z-index: -1;
}}
/* Container to center elements and limit width */
.content-container {{
max-width: 70%; /* Limit content width to 70% */
margin: 0 auto; /* Center the container */
padding: 50px; /* Add some padding for spacing */
}}
/* Styling the markdown content */
.stMarkdown {{
color: white; /* White text to ensure visibility */
font-size: 100px; /* Adjust font size for readability */
/* text-align: center; (disabled: would center-align text) */
}}
</style>
""",
    unsafe_allow_html=True,
)
# Page title. Every chart below works on bank-customer churn columns
# (creditscore, balance, tenure, exiting_category, ...), so the title names that
# dataset instead of the unrelated "Electronics Sales" wording.
st.title("Exploratory Data Analysis (EDA) on Bank Customer Churn Dataset")
# Short introduction followed by a horizontal rule. The blank line before `---`
# matters: directly under a text line, Markdown treats `---` as a heading underline.
st.markdown("""
This page provides advanced Exploratory Data Analysis (EDA) and Feature Engineering using the dataset loaded in memory.

---
""")
# Access dataset from session state
if 'cleaned_data' in st.session_state:
# Fetch the cleaned DataFrame kept under the 'cleaned_data' session-state key.
df = st.session_state["cleaned_data"]
# Show the first rows and report the table dimensions.
st.write("### Dataset Overview")
st.dataframe(df.head())
row_count, column_count = df.shape
st.write(f"The dataset has {row_count} rows and {column_count} columns.")
st.write("## Univariate Analysis")
# Seaborn styles are global, so this setting also applies to the plots drawn later.
sns.set_style("dark")  # Options: "white", "dark", "whitegrid", "darkgrid", "ticks"
st.write("### Age Distribution")
# Filled kernel-density estimate of the `age` column on its own figure.
fig, ax = plt.subplots(figsize=(15*0.7, 8*0.7))
sns.kdeplot(df['age'], fill=True, color='skyblue', alpha=0.8, ax=ax)
ax.set_title("Age Distribution", fontsize=16, fontweight='bold')
ax.set_xlabel("Age", fontsize=14)
ax.set_ylabel("Density", fontsize=14)
ax.grid(color='gray', linestyle='--', linewidth=0.5, alpha=0.7)
st.pyplot(fig)
# pyplot keeps every figure it creates until it is closed; close it once rendered
# so figures do not pile up each time the script runs again.
plt.close(fig)
st.markdown('''
**Insights For Distribution of Ages:**
- The age distribution is positively skewed, with most participants concentrated between 30 and 50 years old, and a smaller representation from older age groups.
''')
# Section heading for the credit-score histogram.
st.title('CreditScore Distribution')
# Let the user pick the histogram resolution (5 to 50 bins, 20 by default).
bins = st.slider('Select Number of Bins', min_value=5, max_value=50, value=20)
# Draw on an explicit figure/axes pair instead of pyplot's implicit global figure,
# and hand that figure (not the `plt` module) to Streamlit.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['creditscore'], bins=bins, color='green', edgecolor='black', kde=True, ax=ax)
ax.set_title('Distribution of CreditScore')
ax.set_xlabel('CreditScore')
ax.set_ylabel('No Of People')
fig.tight_layout()
st.pyplot(fig)
# Every slider move re-runs the script and builds a new figure; close this one so
# pyplot does not keep it alive.
plt.close(fig)
st.markdown('''
**Insights for CreditScore:**
- The distribution of credit scores is approximately symmetric, centered around 600-700, with most individuals falling within this range.
- There are fewer individuals with very low (below 400) or very high (above 800) credit scores.
''')
st.write("### Balance Distribution")
# Horizontal box plot of the `balance` column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=df['balance'], color='orange', ax=ax)
ax.set_title('Balance Distribution')
ax.set_xlabel('Balance')
fig.tight_layout()
st.pyplot(fig)
# Close the rendered figure so pyplot does not retain it between script runs.
plt.close(fig)
st.markdown('''
**Insights for balance distribution:**
- **Right-skewed distribution:** Most clients have low balances, while a few have very high balances.
- **Segmentation potential:** Clients can be segmented based on balance levels (low, medium, high).
- **Marketing and product implications:** Tailor marketing and products to each segment.
- **Risk management:** High-balance clients may pose higher risk.
''')
st.write("### Gender Distribution")
# Pie chart of how often each `gender` value occurs, labelled with percentages.
fig, ax = plt.subplots(figsize=(8, 8))
df['gender'].value_counts().plot(kind='pie', autopct='%1.1f%%', startangle=90,
                                 colors=['skyblue', 'yellow'], ax=ax)
ax.set_title('Gender Distribution')
fig.tight_layout()
st.pyplot(fig)
# Close the rendered figure so pyplot does not retain it between script runs.
plt.close(fig)
st.markdown('''
### **Insights from Gender Distribution**
- **Gender Proportion** – Shows the male-to-female ratio; imbalance may impact analysis.
- **Market Implications** – Helps tailor financial services or marketing strategies.
- **Bias Consideration** – Unequal distribution may introduce bias in models.
''')
st.write("### Estimated Salary Distribution")
# Horizontal box plot of the `estimatedsalary` column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=df['estimatedsalary'], color='gold', ax=ax)
ax.set_title('Estimated Salary Distribution')
ax.set_xlabel('Estimated Salary')
fig.tight_layout()
st.pyplot(fig)
# Close the rendered figure so pyplot does not retain it between script runs.
plt.close(fig)
st.markdown('''
### **Insights from Estimated Salary Distribution**
- **Median Salary:** Around 100,000.
- **Range:** Salaries range from 0 to 200,000.
- **IQR:** Narrow, indicating a cluster of salaries around the median.
- **High-Income Individuals:** One individual with a salary exceeding 200,000.
''')
st.write("### Customer Tenure Distribution")
# Count of customers per `tenure` value, drawn explicitly on the axes created here
# rather than on whatever pyplot considers the current axes.
fig, ax = plt.subplots(figsize=(14, 6))
sns.countplot(data=df, x="tenure", palette="Set2", ax=ax)
# Print the count on top of every bar.
for container in ax.containers:
    ax.bar_label(container, fontweight="black", size=15)
ax.set_title("Customer Tenure Distribution", fontweight="black", size=20, pad=20)
fig.tight_layout()
st.pyplot(fig)
# Close the rendered figure so pyplot does not retain it between script runs.
plt.close(fig)
# The heading previously said "Estimated Salary"; this text describes the tenure plot.
# NOTE(review): the last bullet mentions an overlaid curve, but the count plot above
# draws none - confirm the wording.
st.markdown('''
**Insights from Customer Tenure Distribution**
- **Distribution Shape:** The plot shows a histogram with a distinct peak around the 2-year tenure mark. This suggests that a significant portion of the individuals have been with the organization for approximately 2 years.
- **Tenure Range:** The tenure range appears to span from 0 to 10 years.
- **Trend:** A slight downward trend is visible from 2 years onwards, indicating that the number of individuals with longer tenures decreases gradually.
- **Curve:** The overlaid curve suggests a potential underlying distribution for the tenure data. It might be helpful to explore this curve further to understand the underlying probability distribution.
''')
st.write("### Customer Geography Distribution")
# Number of customers per `geography` value (feeds the pie chart).
count = df["geography"].value_counts()
# Two panels side by side: a labelled bar chart and a pie chart.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
ax1 = sns.countplot(data=df, x="geography", palette="Set2", ax=axes[0])
for container in ax1.containers:
    ax1.bar_label(container, fontweight="black", size=15)
ax1.set_title("Customer Geography Distribution", fontweight="black", size=20, pad=20)
# Pull out only the last slice. The offsets are sized from the data because
# matplotlib requires `explode` to have one entry per slice; a fixed three-item
# list fails as soon as the column holds a different number of categories.
explode = [0.0] * len(count)
if explode:
    explode[-1] = 0.1
axes[1].pie(count.values, labels=count.index, autopct="%1.1f%%", colors=sns.color_palette("Set2"),
            textprops={"fontweight": "black"}, explode=explode)
axes[1].set_title("Customer Geography Distribution", fontweight="black", size=20, pad=20)
fig.tight_layout()
st.pyplot(fig)
# Close the rendered figure so pyplot does not retain it between script runs.
plt.close(fig)
st.markdown('''
**Insights from Customer Geography Distribution**
- France has the largest customer base (50.1%), followed by Spain (25.1%) and Germany (24.8%).
- The bar chart clearly shows the difference in customer numbers across countries.
- The pie chart visually represents the percentage distribution of customers.
''')
st.write("### Distribution of Exiting Category")
# Count of rows per `exiting_category` value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='exiting_category', color='#A52A2A', ax=ax)
ax.set_title('Distribution Of Exiting Category')
ax.set_xlabel('Exiting Category')
ax.set_ylabel('Count')
fig.tight_layout()
st.pyplot(fig)
# Close the rendered figure so pyplot does not retain it between script runs.
plt.close(fig)
# Three sentences below ended in truncated or pasted-in fragments
# ("also ex", "count.cation.", "indiviour.image_file)") and were repaired.
st.markdown('''
**Insights Distribution Of Exiting Category**
1. **Category Breakdown**:
- The **Active - Exited** group has the highest count, indicating that most individuals who exited were active.
- The **Inactive - Exited** group is the second largest, showing a significant number of inactive individuals also exited.
- The **Active - Not Exited** group has the smallest representation.
- The **Inactive - Not Exited** group has a moderate count.
2. **Retention Insights**:
- A significant proportion of both **Active** and **Inactive** individuals have exited.
- The **Active - Not Exited** group is relatively small, suggesting challenges in retaining active individuals.
''')
st.write("## Bivariate Analysis")
st.write("### Exiting Category vs Various Features")
# One box plot per numeric feature, each split by `exiting_category`.
# Every entry is (column name, axis/title label, seaborn palette).
boxplot_specs = [
    ('creditscore', 'Creditscore', 'Set1'),
    ('age', 'Age', 'Set3'),
    ('estimatedsalary', 'EstimatedSalary', 'Set2'),
]
fig, axes = plt.subplots(1, 3, figsize=(26, 8))
for panel, (column, label, palette) in zip(axes, boxplot_specs):
    sns.boxplot(x='exiting_category', y=column, data=df, palette=palette, ax=panel)
    panel.set_title(f'Exited vs {label}', fontsize=17)
    panel.set_xlabel('Exited')
    panel.set_ylabel(label)
fig.tight_layout()
st.pyplot(fig)
# Close the rendered figure so pyplot does not retain it between script runs.
plt.close(fig)
st.markdown('''
**1. Exited vs Credit Score:**
- The distribution of credit scores is very similar across all categories of "Exited."
**Insight:**
- Credit score is not a strong indicator of whether a customer will exit or stay.
**2. Exited vs Age:**
Customers who exited (especially "Inactive - Exited") tend to have a lower age range compared to those who did not exit.
**Insight:** Younger customers may be more likely to exit, indicating potential retention strategies should focus on this age group.
**3. Exited vs Estimated Salary:**
- The estimated salary distribution is consistent across all categories, with no visible differences between exited and non-exited groups.
**Insight:** Salary level does not appear to influence customer exit behavior.
**Summary:**
- Focus retention efforts on younger customers as age appears to have some influence, while credit score and estimated salary show minimal impact on exit behavior
''')
st.write("### Pairplot of Numeric Variables by Exiting Category")
# Pairwise relationships between the numeric columns, coloured by `exiting_category`.
pairplot_fig = sns.pairplot(df[['creditscore', 'age', 'balance', 'estimatedsalary', 'exiting_category']],
                            hue='exiting_category')
# `sns.pairplot` returns a grid object, not a Matplotlib figure; pass Streamlit the
# underlying figure and close it afterwards so pyplot does not keep it alive.
st.pyplot(pairplot_fig.figure)
plt.close(pairplot_fig.figure)
st.markdown('''
**1. Credit Score Distribution (First Row):**
- The credit score is generally concentrated between 400 and 800.
- The distribution of customers appears similar for all four categories of "exiting_category."
- No distinct group seems to have significantly higher or lower credit scores, meaning credit score alone does not differentiate customers who exit or remain.
**2. Age (Second Row):**
- The "Age" variable shows more spread, especially with some customers in the 60-80 age range.
- Older customers (around 50+) show more variation between the categories (e.g., those in "Exited" groups seem to have higher representation in some --- cases). This may suggest age could be slightly influential in predicting customer behavior.
**3. Balance (Third Row):**
- A significant cluster of customers has a balance near zero, likely indicating inactive accounts.
- Beyond zero, customers with higher balances are evenly spread across "Exited" and "Not Exited" categories, showing balance alone may not strongly - determine customer behavior.
**4. Estimated Salary (Fourth Row):**
- The "Estimated Salary" distribution is uniform across all customers, with no obvious difference between categories. This indicates salary is unlikely - to be a strong predictor of customer retention or exit behavior.
**Key Insight 1:**
- Variables like "Balance" and "Estimated Salary" have minimal differentiation across categories and are unlikely to be useful alone for predicting customer exit or retention.
**Key Insight 2:**
- "Age" shows some patterns, with older customers possibly being more likely to exit, suggesting a need to examine this variable further for potential targeting strategies.
**Key Insight 3:**
- The overall relationships between variables are weak, meaning you may need to look at interactions between variables or consider additional data (e.g., customer behavior patterns or satisfaction scores).
''')
st.write("### Contingency Tables for Gender & Credit Card Ownership vs Exiting")
# Both tables count rows per `exiting_category`, so fetch that column once.
exit_labels = df['exiting_category']
# Counts of each gender within each exiting category.
gender_exit_ct = pd.crosstab(df['gender'], exit_labels)
st.write("#### Gender vs Exiting Contingency Table")
st.dataframe(gender_exit_ct)
# Counts of each `hascrcard` value within each exiting category.
creditcard_exit_ct = pd.crosstab(df['hascrcard'], exit_labels)
st.write("#### HasCrCard vs Exiting Contingency Table")
st.dataframe(creditcard_exit_ct)
st.markdown('''
**Gender vs Exiting:**
**Active - Exited:**
- Males (2,546) exit more frequently than females (1,870) while being active, indicating higher churn among male customers.
Active - Not Exited:
- Females (414) stay active without exiting more than males (321), showing higher loyalty or satisfaction when engaged.
**Inactive - Exited:**
- Both genders show significant exits when inactive, but males (2,013) outnumber females (1,534), reinforcing their higher overall churn.
**Inactive - Not Exited:**
- Females (725) are more likely to remain in the system while inactive compared to males (577), suggesting a stronger tendency to stay connected even when disengaged.
**Has Credit Card vs Exiting:**
**Active - Exited:**
- Customers with credit cards (3,125) exit significantly more than those without (1,291), despite being active.
**Active - Not Exited:**
- Retention is slightly higher for credit card holders (482) compared to non-holders (253), though both numbers are relatively low.
**Inactive - Exited:**
- Inactive credit card holders (2,506) have much higher exits than non-holders (1,041), reflecting greater dissatisfaction or disengagement.
**Inactive - Not Exited:**
- Credit card holders (942) are more likely to remain inactive without exiting than non-holders (360), indicating potential for reactivation campaigns.
''')
st.write("### Grouped Bar Charts: Gender & Credit Card Ownership vs Exiting")


def _draw_grouped_bars(ax, table, title, xlabel):
    """Draw `table` as grouped bars: one group per row, one bar per column.

    Args:
        ax: Matplotlib axes to draw on.
        table: Contingency table; the index gives the x groups and the columns
            give the bars (and legend entries) inside each group.
        title: Axes title.
        xlabel: Label for the x axis.
    """
    series_labels = table.columns
    # Share 80% of each group slot among the bars so they never overlap,
    # whatever the number of columns (four columns gives the former 0.2 width).
    bar_width = 0.8 / max(len(series_labels), 1)
    positions = np.arange(len(table.index))
    for offset, label in enumerate(series_labels):
        ax.bar(positions + offset * bar_width, table[label].values, bar_width, label=label)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    # Centre each tick under its group of bars.
    ax.set_xticks(positions + bar_width * (len(series_labels) - 1) / 2)
    ax.set_xticklabels(table.index)
    ax.legend(title='Exiting Category')


# Two panels sharing the y axis; only the left one carries the 'Count' label.
fig, axes = plt.subplots(1, 2, figsize=(14, 6), sharey=True)
_draw_grouped_bars(axes[0], gender_exit_ct, 'Gender vs Exiting', 'Gender')
axes[0].set_ylabel('Count')
_draw_grouped_bars(axes[1], creditcard_exit_ct, 'Has Credit Card vs Exiting', 'Has Credit Card')
fig.tight_layout()
st.pyplot(fig)
# Close the rendered figure so pyplot does not retain it between script runs.
plt.close(fig)
st.markdown('''
**Insights:**
**Gender vs Exiting:**
**Females:**
- A higher proportion of females are "Active - Exited" compared to "Active - Not Exited."
- A significant number of females fall into the "Inactive - Exited" category.
**Males:**
- More males are "Active - Not Exited" compared to those who exited.
- There are fewer males in the "Inactive" categories compared to females.
**Credit Card vs Exiting:**
- Individuals without credit cards (0) are more likely to exit the system across all categories.
- Individuals with credit cards (1) show a stronger trend of being "Active - Not Exited."
''')
st.write("### Gender vs Exiting Proportions (%)")
# Convert each row of the gender contingency table into percentages of that row's total.
gender_exit_prop = gender_exit_ct.div(gender_exit_ct.sum(axis=1), axis=0) * 100
st.write("#### Gender vs Exiting Proportions Table")
st.dataframe(gender_exit_prop)
# Stacked bars: one bar per gender, one segment per exiting category.
fig, ax = plt.subplots(figsize=(8, 5))
gender_exit_prop.plot(kind='bar', stacked=True, colormap='Set2', ax=ax)
ax.set_title('Gender vs Exiting (Proportions)')
ax.set_xlabel('Gender')
ax.set_ylabel('Percentage')
st.pyplot(fig)
# Close the rendered figure so pyplot does not retain it between script runs.
plt.close(fig)
st.markdown('''
**Insights from Gender vs Exiting Proportions:**
**Active - Exited:**
- Males (46.66%) have a higher proportion of exiting while being active compared to females (41.16%).
- This indicates that males, despite being active, are more likely to exit the system.
**Active - Not Exited:**
- Females (9.11%) have a significantly higher proportion of staying active without exiting compared to males (5.88%).
- Females appear to be more loyal or engaged while active compared to males.
**Inactive - Exited:**
- A similar proportion of females (33.77%) and males (36.89%) exit while being inactive.
- This shows that inactivity is a strong indicator of exiting, regardless of gender.
**Inactive - Not Exited:**
- Females (15.96%) have a higher proportion of staying inactive without exiting compared to males (10.57%).
- This could suggest that females are more likely to remain in the system, even in an inactive state, potentially reflecting patience or a tendency to give the service more time.
**Summary of Findings:**
- Males are more likely to exit overall, whether they are active or inactive.
- Females show higher retention, both when active and inactive, indicating a stronger customer loyalty tendency.
''')
st.write("### Scatter Plot & Correlation Heatmap")
# Left panel: scatter of credit score against balance.
# Right panel: pairwise correlations of age, tenure and estimated salary.
fig, axes = plt.subplots(1, 2, figsize=(18, 8))
sns.scatterplot(x='balance', y='creditscore', data=df, ax=axes[0])
axes[0].set_title('Scatter Plot: Creditscore vs Balance', fontsize=14)
axes[0].set_xlabel('Balance')
axes[0].set_ylabel('Creditscore')
correlation_matrix = df[['age','tenure','estimatedsalary']].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', annot_kws={'size': 12}, ax=axes[1])
axes[1].set_title('Correlation Heatmap', fontsize=14)
fig.tight_layout()
st.pyplot(fig)
# Close the rendered figure so pyplot does not retain it between script runs.
plt.close(fig)
st.markdown('''
### Insights from Scatter Plot and Correlation Heatmap
### Scatter Plot: Creditscore vs. Balance
1. **General Observation**:
- The scatter plot shows the distribution of **credit scores** (y-axis) against **balance** (x-axis).
- There is no strong visible trend or pattern between the two variables, suggesting a weak or no correlation.
2. **Key Observations**:
- A significant number of points have a **balance of 0**, indicating a portion of customers with no account balance.
- Credit scores range widely from ~400 to 850 across all balance levels, showing no specific clustering based on credit score.#
---
## Correlation Heatmap
1. **Key Observations**:
- The heatmap shows correlations between numerical variables (age, tenure, estimated salary, etc.).
- All correlations are very close to 0 (ranging from -0.01 to 0.01), indicating no significant linear relationships among the variables.
2. **Interpretation**:
- **Age vs. Tenure**: Correlation is essentially zero (-0.01), indicating no relationship between these two variables.
- **Tenure vs. Estimated Salary**: Correlation of 0.01, which is negligible.
- **Estimated Salary vs. Age**: Correlation of -0.01, indicating no meaningful relationship.
''')
st.write("### Box Plot: Geography vs Balance")
# Distribution of `balance` for each `geography` value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x='geography', y='balance', data=df, ax=ax)
ax.set_title('Box Plot: Geography vs Balance')
ax.set_xlabel('Geography')
ax.set_ylabel('Balance')
st.pyplot(fig)
# Close the rendered figure so pyplot does not retain it between script runs.
plt.close(fig)
st.markdown('''
**Insights from Box Plot: Geography vs. Balance**
1. **Median Balance**:
- Customers from **Germany** have the highest median balance, indicating greater financial stability.
- **France** and **Spain** show similar median balances, which are moderately lower than Germany.
2. **Balance Spread**:
- **France** and **Spain** exhibit a wider interquartile range (IQR), reflecting higher variability in balances.
- **Germany** shows a narrower IQR, suggesting more consistent balances among its customers.
3. **Outliers**:
- **Germany** has a notable number of low-balance outliers, which could represent a specific subset of customers.
- **France** and **Spain** have fewer extreme outliers, showing more balanced customer distributions.
''')
st.write('# Multivariate Analysis')
st.write("### Correlation Heatmap with Multiple Features")
# Pairwise correlations across six columns, shown as an annotated heatmap.
correlation_matrix = df[['age', 'tenure', 'estimatedsalary', 'creditscore', 'balance', 'hascrcard']].corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', annot_kws={'size': 12}, ax=ax)
ax.set_title('Correlation Heatmap', fontsize=14)
st.pyplot(fig)
# Close the rendered figure so pyplot does not retain it between script runs.
plt.close(fig)
# The text now spells the column as `hascrcard`, matching the name used in the matrix above.
st.markdown('''
### Insights from heatmap:
- **Independence of Variables:** Variables like age, balance, creditscore, and tenure show minimal correlation, suggesting they don't significantly influence each other.
- **Weak Relationships:** For example, hascrcard has almost no relationship with other variables, meaning it may not directly impact features like balance or estimatedsalary.
''')
st.write("### Grouped Data by Geography and Exiting Category")
# Average of each listed column within every (geography, exiting_category) pair,
# flattened back into ordinary columns for display.
mean_columns = ['creditscore', 'balance', 'estimatedsalary', 'age']
grouped_data = (
    df.groupby(['geography', 'exiting_category'])
    .agg(dict.fromkeys(mean_columns, 'mean'))
    .reset_index()
)
st.dataframe(grouped_data)
st.write('''
### Insights for Grouped Data by Geography and Exiting Category
1. **Geography Influence on Customer Behavior**:
- The **creditscore**, **balance**, **estimated salary**, and **age** of customers may vary significantly across different geographies. For example, if one geography has a higher average **balance** and **estimated salary**, it could indicate wealthier customers in that region.
2. **Exiting Category Patterns**:
- The **exiting_category** can show patterns of customer churn. For instance, if the average **creditscore** is significantly lower for customers who exited, it could indicate that customers with lower scores are more likely to leave.
- Similar insights can be made for **balance** and **estimated salary**. Customers with lower balances or salaries might be more prone to churn.
3. **Age Distribution**:
- By analyzing the **age** column, you might find that younger or older customers are more likely to exit. If younger customers have a lower average age in the exited category, businesses may need to focus on improving retention strategies for that demographic.
4. **Geography and Exiting Correlation**:
- If you notice certain geographies have higher **exiting_category** proportions (e.g., more exits in one region), it could indicate regional issues or dissatisfaction that need to be addressed.
5. **Targeted Strategies Based on Grouping**:
- Businesses can develop **targeted marketing or retention strategies** based on this grouping. For example, focusing on high-creditscore customers in a specific region to reduce churn, or providing more incentives for customers in low-income areas.
### Actionable Strategies Based on Insights:
1. **Customer Segmentation**: Use the insights to segment customers based on their geography and behavior (e.g., exited vs. retained), and tailor marketing or retention efforts for each group.
2. **Improvement of Customer Retention Programs**: If exiting customers tend to have lower balances or salaries, businesses could offer more targeted retention strategies (e.g., loyalty rewards, personalized offers).
3. **Localized Marketing Campaigns**: If certain geographies show higher churn, localized campaigns addressing the concerns specific to those areas can be launched.
''')
st.write("### Average Balance by Geography and Exiting Category")
# Mean `balance` for each geography (rows) and exiting category (columns).
pivot_table = df.pivot_table(
    index='geography', columns='exiting_category',
    values='balance', aggfunc='mean'
)
# Annotated heatmap of those means.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(pivot_table, annot=True, fmt=".2f", cmap="YlGnBu", ax=ax)
ax.set_title("Average Balance by Geography and Exiting Category")
st.pyplot(fig)
# Close the rendered figure so pyplot does not retain it between script runs.
plt.close(fig)
st.markdown('''
**1. Geography Insights:**
**Germany:**
- Customers in Germany have the highest average balances across all categories (~119,000–120,000).
- This suggests German customers may hold larger accounts or higher financial engagement with the bank.
**France:**
- French customers have lower average balances (~59,000–72,000) compared to Germany, with some variation between Active/Exited and Exited/Not Exited categories.
**Spain:**
- Spanish customers fall in the mid-range, with average balances between ~56,000–77,000, slightly higher than France but significantly lower than Germany.
**2. Customer Status Insights**
**Exited Customers:**
- Exited customers in all geographies (France, Germany, Spain) tend to have slightly lower balances than their active counterparts.
Exception: In France, exited customers have balances slightly higher than active ones in the "Exited" categories, which could indicate retention issues with higher-value customers.
**Not Exited Customers:**
- Customers who have not exited generally hold higher balances, particularly in Germany and Spain.
Suggests that customers with higher balances are more likely to remain loyal.
**3. Geography-Specific Observations:**
**France**:
- Active and non-exited customers have higher balances (72,622) compared to their exited counterparts (59,780–61,009).
**Germany**:
- **Insight**: Little variation in average balances across all categories (~119,000–120,000). This indicates a consistently high-value customer base regardless of status.
**Spain**:
- **Insight**: The largest variation is seen in Spain. Non-exited customers (77,529) have much higher balances than exited customers (56,480).
- This suggests a significant difference in engagement between customer segments in Spain, indicating that non-exited customers may be more engaged or satisfied with the service compared to exited customers.
**Key Takeaways**:
- **Germany** holds the most high-value customers with consistently high balances.
- **France** shows retention issues, as exited customers can still hold moderate balances.
- **Spain** shows the largest difference in balances between exited and non-exited customers, indicating potential for targeted retention efforts.
**Retention Opportunity**:
- Focus on retaining high-value customers in **Spain** and **France** to improve loyalty.
''')
else:
st.error("No dataset found. Please upload a dataset on the main page first.")
# Navigation: render one button per neighbouring page, in order, and switch to the
# page whose button is pressed.
for button_label, target_page in (
    ("Previous ⏮️", "pages/2_Data_CLeaning_and_Preprocessing.py"),
    ("Next ⏭️", "pages/4_Model_Creation_and_Evaluation.py"),
):
    if st.button(button_label):
        st.switch_page(target_page)