Spaces:
Build error
Build error
| import streamlit as st | |
| import pandas as pd | |
| import joblib | |
| import plotly.express as px | |
| from sklearn.preprocessing import StandardScaler | |
| from sklearn.cluster import KMeans | |
# Introductory help text rendered inside the "About" expander below.
_ABOUT_MARKDOWN = """
### Welcome to the Market Basket Analysis & Customer Segmentation Dashboard π
This application is designed to help businesses gain valuable insights from their sales data through two powerful analytical techniques:
**1. Market Basket Analysis π§Ί**
This technique identifies relationships between products that customers tend to purchase together. By understanding these patterns, businesses can optimize product placement, create effective bundle offers, and improve recommendation systems.
**2. Customer Segmentation π₯**
This approach groups customers based on their purchasing behavior using the RFM model (Recency, Frequency, Monetary value). This helps in developing targeted marketing strategies for different customer segments.
### How to Use This Application π οΈ
1. Navigate through the two tabs to explore different aspects of your data:
- **Dataset Preview π:** Examine your data structure and basic statistics.
- **Live Application π:** Perform live customer segmentation with customizable parameters.
"""

# Page-level configuration (browser tab title, icon, wide layout).
st.set_page_config(
    layout="wide",
    page_icon="π",
    page_title="Market Basket Analysis & Customer Clustering",
)

# Headline plus the work-in-progress notice shown above the tabs.
st.title("π Market Basket Analysis & Customer Segmentation Dashboard")
st.write("π¬NOTE: STILL FIXING THE LIVE CLUSTERING GRAPH PROBLEM AS IT IS NOT UPDATING REAL-TIMEπ¬")

# The expander starts open so the description is visible on first load.
with st.expander("π About This Application", expanded=True):
    st.markdown(_ABOUT_MARKDOWN)
# Load cluster profile data from pickle file
def load_rfm():
    """Load the pickled RFM cluster table from ``rfm_cluster.pkl``.

    Returns:
        Whatever ``joblib.load`` yields for that file. If loading raises,
        the error is reported with ``st.error`` and an empty DataFrame is
        returned instead.
    """
    try:
        loaded = joblib.load("rfm_cluster.pkl")
    except Exception as e:
        st.error(f"β Error loading rfm cluster data: {e}")
        # An empty frame lets the tabs below test `.empty` rather than None.
        return pd.DataFrame()
    return loaded


rfm_cluster = load_rfm()
def load_sample_data():
    """Read ``market_basket_data.csv`` (semicolon-separated) into a DataFrame.

    Returns:
        The parsed DataFrame. If reading raises, the error is reported with
        ``st.error`` and an empty DataFrame is returned instead.
    """
    try:
        frame = pd.read_csv("market_basket_data.csv", sep=";")
    except Exception as e:
        st.error(f"β Error loading sample data: {e}")
        # Same fallback shape as the success path so callers need no None check.
        return pd.DataFrame()
    return frame


dataset = load_sample_data()
# Create tabs
tab1, tab2 = st.tabs(["π Dataset Preview", "π Live Application"])

# Tab 1: Dataset Preview
# Everything rendered in this tab comes from `dataset`, so the guard and the
# reported shape use `dataset` as well. Guarding on `rfm_cluster` (as before)
# let `dataset.describe()` run on an empty frame whenever the CSV failed to
# load but the pickle did not, and reported the shape of the wrong table.
with tab1:
    st.header("π Dataset Preview")
    st.markdown("""
### Understanding Your Data π§
This section provides an overview of your dataset structure, allowing you to examine its contents before diving into the analysis. A good understanding of your data is essential for interpreting the results in the subsequent tabs.
""")
    if dataset.empty:
        st.error("β No data available in the sample dataset.")
    else:
        n_rows, n_cols = dataset.shape
        st.write(f"π Dataset shape: {n_rows} rows, {n_cols} columns")

        # Data summary: raw sample on the left, descriptive statistics on the right.
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("π Data Sample")
            st.markdown("This table shows the first 10 rows of your data, allowing you to see the actual values and structure.")
            st.dataframe(dataset.head(10))
        with col2:
            st.subheader("π Data Statistics")
            st.markdown("This summary provides statistical measures for numerical columns in your dataset.")
            st.dataframe(dataset.describe())

        # Data columns info
        st.subheader("π Column Information")
        st.markdown("This table provides details about each column in your dataset.")
        col_info = pd.DataFrame({
            'Column': dataset.columns,
            # Dtype objects are shown by name; plain strings are safer to
            # serialise for display than a column of numpy dtype objects.
            'Type': dataset.dtypes.astype(str).values,
            'Non-Null Count': dataset.count().values,
            'Null Count': dataset.isnull().sum().values,
            'Unique Values': dataset.nunique().values,
        })
        st.dataframe(col_info)
# Tab 2: Live Application
# Re-clusters the RFM table on every rerun using the sidebar weights and
# cluster count, then shows a scatter plot and per-cluster summary statistics.
with tab2:
    st.header("π Live Customer Clustering")
    st.markdown("""
### RFM Analysis & Customer Segmentation π₯
This tab uses the RFM (Recency, Frequency, Monetary) model to segment customers based on their purchasing behavior. Understanding these segments helps in developing targeted marketing strategies.
**Key Concepts:**
- **Recency:** How recently a customer made a purchase (fewer days = better) β³
- **Frequency:** How often a customer makes purchases (more purchases = better) π
- **Monetary Value:** How much money a customer spends (higher amount = better) π°
- **Clustering:** Grouping similar customers together based on their RFM values π―
Use the controls in the sidebar to customize your analysis and see how different parameters affect customer segmentation.
""")
    # Columns the clustering needs; verified before any sidebar widget is drawn.
    required_cols = ['recency', 'frequency', 'monetary']

    if rfm_cluster.empty:
        st.error("β No data available in the cluster profile.")
    elif not all(col in rfm_cluster.columns for col in required_cols):
        st.error(f"β Required columns for clustering not found. Ensure your dataset contains: {required_cols}")
    else:
        # Sidebar for clustering parameters
        st.sidebar.markdown("---")
        st.sidebar.subheader("βοΈ Clustering Parameters")

        # RFM weight adjustment sliders
        recency_weight = st.sidebar.slider(
            "β³ Recency Importance",
            0.1, 2.0, 1.0, 0.1,
            help="Increase this value to give more importance to how recently customers purchased."
        )
        frequency_weight = st.sidebar.slider(
            "π Frequency Importance",
            0.1, 2.0, 1.0, 0.1,
            help="Increase this value to give more importance to how often customers purchase."
        )
        monetary_weight = st.sidebar.slider(
            "π° Monetary Importance",
            0.1, 2.0, 1.0, 0.1,
            help="Increase this value to give more importance to how much customers spend."
        )

        # Number of clusters
        num_clusters = st.sidebar.slider(
            "π― Number of Clusters",
            1, 3, 3,
            help="This determines how many customer segments to create. More segments means more granular groups, but they may be harder to interpret."
        )

        # Standardize FIRST, then apply the weights. Standardization divides
        # each column by its own standard deviation, so a constant factor
        # applied beforehand cancels out and the sliders would have no effect
        # on the clustering. Weighting the standardized columns changes their
        # relative contribution to the k-means distance.
        X_scaled = StandardScaler().fit_transform(rfm_cluster[required_cols])
        X_weighted = X_scaled * [recency_weight, frequency_weight, monetary_weight]

        # KMeans rejects n_clusters > n_samples, so clamp for very small tables.
        # n_init is pinned so results do not depend on the installed
        # scikit-learn version's default.
        kmeans = KMeans(
            n_clusters=min(num_clusters, len(rfm_cluster)),
            n_init=10,
            random_state=42,
        )

        # Add cluster labels to the data
        rfm_cluster['Live_Cluster'] = kmeans.fit_predict(X_weighted)

        # Display the clustering results
        st.subheader("π Recency vs Frequency by Cluster")
        # Cluster ids are categories, not magnitudes: plot them as strings so
        # Plotly draws a discrete legend instead of a continuous colour bar.
        plot_df = rfm_cluster.assign(
            Live_Cluster=rfm_cluster['Live_Cluster'].astype(str)
        )
        fig_rfm = px.scatter(
            plot_df,
            x="recency",
            y="frequency",
            color="Live_Cluster",
            size="monetary",
            hover_data=["recency", "frequency", "monetary"],
            title="Live RFM Clustering: Recency vs Frequency",
            height=600,
            category_orders={"Live_Cluster": sorted(plot_df['Live_Cluster'].unique())},
            color_discrete_sequence=px.colors.qualitative.G10
        )
        fig_rfm.update_layout(
            xaxis_title="Recency (days since last purchase)",
            yaxis_title="Frequency (number of purchases)"
        )
        st.plotly_chart(fig_rfm, use_container_width=True)

        # Cluster Analysis
        st.subheader("π Live Cluster Analysis")
        # Named aggregation ties each output label to its source column and
        # statistic, rather than relying on positional column renaming.
        cluster_stats = (
            rfm_cluster.groupby('Live_Cluster')
            .agg(**{
                'Avg Recency': ('recency', 'mean'),
                'Min Recency': ('recency', 'min'),
                'Max Recency': ('recency', 'max'),
                'Avg Frequency': ('frequency', 'mean'),
                'Min Frequency': ('frequency', 'min'),
                'Max Frequency': ('frequency', 'max'),
                'Avg Monetary': ('monetary', 'mean'),
                'Min Monetary': ('monetary', 'min'),
                'Max Monetary': ('monetary', 'max'),
                # Rows per cluster, regardless of missing values in any column.
                'Count': ('recency', 'size'),
            })
            .rename_axis('Cluster')
            .reset_index()
        )
        st.dataframe(cluster_stats)