market-analysis / app.py
Markndrei's picture
Update app.py
abb8bd6 verified
import streamlit as st
import pandas as pd
import joblib
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
# Page-level configuration; Streamlit requires this to be the first st.* call.
st.set_page_config(
    layout="wide",
    page_icon="πŸ›’",
    page_title="Market Basket Analysis & Customer Clustering",
)

# Markdown shown inside the "About" expander at the top of the page.
ABOUT_MARKDOWN = """
### Welcome to the Market Basket Analysis & Customer Segmentation Dashboard πŸŽ‰
This application is designed to help businesses gain valuable insights from their sales data through two powerful analytical techniques:
**1. Market Basket Analysis 🧺**
This technique identifies relationships between products that customers tend to purchase together. By understanding these patterns, businesses can optimize product placement, create effective bundle offers, and improve recommendation systems.
**2. Customer Segmentation πŸ‘₯**
This approach groups customers based on their purchasing behavior using the RFM model (Recency, Frequency, Monetary value). This helps in developing targeted marketing strategies for different customer segments.
### How to Use This Application πŸ› οΈ
1. Navigate through the two tabs to explore different aspects of your data:
- **Dataset Preview πŸ“Š:** Examine your data structure and basic statistics.
- **Live Application πŸš€:** Perform live customer segmentation with customizable parameters.
"""

# Headline, status banner, and the (initially open) introduction panel.
st.title("πŸ›’ Market Basket Analysis & Customer Segmentation Dashboard")
st.write("πŸ”¬NOTE: STILL FIXING THE LIVE CLUSTERING GRAPH PROBLEM AS IT IS NOT UPDATING REAL-TIMEπŸ”¬")
with st.expander("πŸ“‹ About This Application", expanded=True):
    st.markdown(ABOUT_MARKDOWN)
# Cached loader for the RFM cluster table stored as a joblib pickle.
@st.cache_data
def load_rfm():
    """Load ``rfm_cluster.pkl`` with joblib and return its contents.

    The rest of this file treats the result as a pandas DataFrame
    (it reads ``.empty`` and ``.columns``). If loading fails for any
    reason, the error is shown in the UI and an empty DataFrame is
    returned so callers can test ``.empty`` instead of handling
    exceptions.
    """
    try:
        return joblib.load("rfm_cluster.pkl")
    except Exception as exc:
        st.error(f"❌ Error loading rfm cluster data: {exc}")
    # Only reached on failure: hand back an empty placeholder.
    return pd.DataFrame()


rfm_cluster = load_rfm()
# st.cache_data (not st.cache_resource) is the right cache for DataFrames:
# it returns a fresh copy on every call, whereas cache_resource would share
# one mutable object across all sessions and reruns. This also matches the
# decorator used by load_rfm.
@st.cache_data
def load_sample_data():
    """Read the sample transactions from ``market_basket_data.csv``.

    The file is parsed as a semicolon-separated CSV.

    Returns:
        The parsed DataFrame, or an empty DataFrame if the file cannot be
        read or parsed (the error is reported in the UI via ``st.error``).
    """
    try:
        return pd.read_csv("market_basket_data.csv", sep=";")
    except Exception as e:
        # Deliberately broad: any load failure should degrade to an empty
        # table plus a visible message rather than crash the whole page.
        st.error(f"❌ Error loading sample data: {e}")
        return pd.DataFrame()  # Return an empty DataFrame if there's an error


dataset = load_sample_data()
# Create tabs
tab1, tab2 = st.tabs(["πŸ“Š Dataset Preview", "πŸš€ Live Application"])
# Tab 1: Dataset Preview -- sample rows, summary statistics and per-column
# metadata for the sample dataset loaded from the CSV file.
with tab1:
    st.header("πŸ“Š Dataset Preview")
    st.markdown("""
### Understanding Your Data 🧐
This section provides an overview of your dataset structure, allowing you to examine its contents before diving into the analysis. A good understanding of your data is essential for interpreting the results in the subsequent tabs.
""")
    # Gate on (and report the shape of) the table this tab actually displays.
    # Checking rfm_cluster here would show the wrong shape and would let an
    # empty `dataset` reach describe(), which raises on a column-less frame.
    if not dataset.empty:
        st.write(f"πŸ“ Dataset shape: {dataset.shape[0]} rows, {dataset.shape[1]} columns")
        # Data summary
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("πŸ“„ Data Sample")
            st.markdown("This table shows the first 10 rows of your data, allowing you to see the actual values and structure.")
            st.dataframe(dataset.head(10))
        with col2:
            st.subheader("πŸ“ˆ Data Statistics")
            st.markdown("This summary provides statistical measures for numerical columns in your dataset.")
            st.dataframe(dataset.describe())
        # Data columns info
        st.subheader("πŸ“‘ Column Information")
        st.markdown("This table provides details about each column in your dataset.")
        col_info = pd.DataFrame({
            'Column': dataset.columns,
            # Render dtypes as plain strings: a column of raw numpy dtype
            # objects is not reliably serializable for st.dataframe.
            'Type': dataset.dtypes.astype(str).values,
            'Non-Null Count': dataset.count().values,
            'Null Count': dataset.isnull().sum().values,
            'Unique Values': [dataset[col].nunique() for col in dataset.columns]
        })
        st.dataframe(col_info)
    else:
        st.error("❌ No data available in the sample dataset.")
# Tab 2: Live Application -- re-clusters the RFM table on every rerun using
# the weights and cluster count chosen in the sidebar.
with tab2:
    st.header("πŸš€ Live Customer Clustering")
    st.markdown("""
### RFM Analysis & Customer Segmentation πŸ‘₯
This tab uses the RFM (Recency, Frequency, Monetary) model to segment customers based on their purchasing behavior. Understanding these segments helps in developing targeted marketing strategies.
**Key Concepts:**
- **Recency:** How recently a customer made a purchase (fewer days = better) ⏳
- **Frequency:** How often a customer makes purchases (more purchases = better) πŸ”„
- **Monetary Value:** How much money a customer spends (higher amount = better) πŸ’°
- **Clustering:** Grouping similar customers together based on their RFM values 🎯
Use the controls in the sidebar to customize your analysis and see how different parameters affect customer segmentation.
""")
    # Columns the clustering below reads from the RFM table.
    required_cols = ['recency', 'frequency', 'monetary']
    if rfm_cluster.empty:
        st.error("❌ No data available in the cluster profile.")
    elif not all(col in rfm_cluster.columns for col in required_cols):
        st.error(f"❌ Required columns for clustering not found. Ensure your dataset contains: {required_cols}")
    else:
        # Sidebar for clustering parameters
        st.sidebar.markdown("---")
        st.sidebar.subheader("βš™οΈ Clustering Parameters")
        # RFM weight adjustment sliders (min 0.1, max 2.0, default 1.0, step 0.1)
        recency_weight = st.sidebar.slider(
            "⏳ Recency Importance",
            0.1, 2.0, 1.0, 0.1,
            help="Increase this value to give more importance to how recently customers purchased."
        )
        frequency_weight = st.sidebar.slider(
            "πŸ”„ Frequency Importance",
            0.1, 2.0, 1.0, 0.1,
            help="Increase this value to give more importance to how often customers purchase."
        )
        monetary_weight = st.sidebar.slider(
            "πŸ’° Monetary Importance",
            0.1, 2.0, 1.0, 0.1,
            help="Increase this value to give more importance to how much customers spend."
        )
        # Number of clusters
        num_clusters = st.sidebar.slider(
            "🎯 Number of Clusters",
            1, 3, 3,
            help="This determines how many customer segments to create. More segments means more granular groups, but they may be harder to interpret."
        )

        # Standardize first, then weight. StandardScaler rescales every
        # column to unit variance, so multiplying a column by a constant
        # *before* scaling is cancelled out and the sliders would have no
        # effect on the clustering. Weighting the standardized features
        # changes their relative contribution to the K-Means distances.
        weights = {
            'recency': recency_weight,
            'frequency': frequency_weight,
            'monetary': monetary_weight,
        }
        X_scaled = StandardScaler().fit_transform(rfm_cluster[required_cols])
        X_weighted = X_scaled * [weights[col] for col in required_cols]

        # n_init is pinned explicitly because its default differs between
        # scikit-learn releases; random_state keeps reruns reproducible.
        kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
        clusters = kmeans.fit_predict(X_weighted)
        # Add cluster labels to the data
        rfm_cluster['Live_Cluster'] = clusters

        # Display the clustering results
        st.subheader("πŸ“Š Recency vs Frequency by Cluster")
        # Cluster ids are categories, not magnitudes: plot them as strings so
        # Plotly draws a discrete legend with the qualitative palette instead
        # of a continuous colour bar.
        cluster_order = [str(label) for label in range(num_clusters)]
        plot_df = rfm_cluster.assign(
            Live_Cluster=rfm_cluster['Live_Cluster'].astype(str)
        )
        fig_rfm = px.scatter(
            plot_df,
            x="recency",
            y="frequency",
            color="Live_Cluster",
            size="monetary",
            hover_data=["recency", "frequency", "monetary"],
            title="Live RFM Clustering: Recency vs Frequency",
            height=600,
            color_discrete_sequence=px.colors.qualitative.G10,
            category_orders={"Live_Cluster": cluster_order}
        )
        fig_rfm.update_layout(
            xaxis_title="Recency (days since last purchase)",
            yaxis_title="Frequency (number of purchases)"
        )
        st.plotly_chart(fig_rfm, use_container_width=True)

        # Cluster Analysis: mean/min/max of each RFM column plus the member
        # count, one row per live cluster.
        st.subheader("πŸ“ˆ Live Cluster Analysis")
        cluster_stats = rfm_cluster.groupby('Live_Cluster').agg({
            'recency': ['mean', 'min', 'max'],
            'frequency': ['mean', 'min', 'max'],
            'monetary': ['mean', 'min', 'max'],
            'Live_Cluster': 'count'
        }).reset_index()
        # Flatten the two-level column index; order matches the agg spec above.
        cluster_stats.columns = ['Cluster', 'Avg Recency', 'Min Recency', 'Max Recency',
                                 'Avg Frequency', 'Min Frequency', 'Max Frequency',
                                 'Avg Monetary', 'Min Monetary', 'Max Monetary', 'Count']
        st.dataframe(cluster_stats)