Spaces:
Build error
Build error
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import datetime as dt | |
| from sklearn.cluster import KMeans | |
| from sklearn.decomposition import PCA | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import plotly.express as px | |
| from gensim.models import Word2Vec | |
# --- Page setup -------------------------------------------------------------
# First Streamlit call in the script: browser-tab title and wide layout.
APP_PAGE_TITLE = "Customer Segmentation and Product Recommendation"
st.set_page_config(layout="wide", page_title=APP_PAGE_TITLE)

# Main heading, followed by a short Markdown introduction to the app.
# NOTE(review): the leading "π" looks like a mis-encoded emoji; kept as-is.
st.title("πCustomer Segmentation & Product Recommendation App")
APP_INTRO = """
This application performs **Customer Segmentation** using RFM analysis and clustering,
and provides **Product Recommendations** based on purchase patterns.
Upload your dataset, analyze customer behavior, and visualize results interactively.
"""
st.markdown(APP_INTRO)
# --- Sidebar: dataset upload ------------------------------------------------
st.sidebar.header("Upload Dataset")
uploaded_file = st.sidebar.file_uploader("Choose a CSV file", type=["csv"])

# Guard clause: every later section needs `df`, so stop this script run until
# the user has supplied a file.
if uploaded_file is None:
    st.sidebar.warning("Please upload a CSV file to start!")
    st.stop()

# Read the identifier columns as text so they are not coerced to numbers.
# The invoice column used by the rest of the script is "InvoiceNo"; the
# previous mapping only named "InvoiceID", so invoice numbers were left to
# type inference. "InvoiceID" is kept in the mapping for files that use it.
df = pd.read_csv(
    uploaded_file,
    encoding="ISO-8859-1",
    dtype={"CustomerID": str, "InvoiceNo": str, "InvoiceID": str},
)
st.sidebar.success("Dataset uploaded successfully!")
# --- Data cleaning and preprocessing ----------------------------------------
st.header("🧹 Data Cleaning and Preprocessing")

# Columns this script reads from the uploaded file. Checking them up front
# turns a raw KeyError traceback into a readable message.
required_columns = [
    "InvoiceNo",
    "StockCode",
    "Description",
    "Quantity",
    "InvoiceDate",
    "UnitPrice",
    "CustomerID",
    "Country",
]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    st.error(
        "The uploaded file is missing required columns: "
        + ", ".join(missing_columns)
    )
    st.stop()

# Line-item value; added before the preview so it shows up there too.
df["Amount"] = df["Quantity"] * df["UnitPrice"]
st.markdown("### Initial Data Preview")
st.write(df.head())

# Keep United Kingdom rows with a positive quantity and a known customer.
# A single mask plus an explicit copy avoids mutating a slice of the raw
# frame (the chained filter + in-place dropna could trigger pandas'
# SettingWithCopyWarning).
keep_rows = (
    (df["Country"] == "United Kingdom")
    & (df["Quantity"] > 0)
    & df["CustomerID"].notna()
)
df = df.loc[keep_rows].copy()
if df.empty:
    st.warning("No rows are left after cleaning; nothing to analyze.")
    st.stop()

# Parse timestamps and keep a calendar-date column for the recency step.
df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"])
df["date"] = df["InvoiceDate"].dt.date

# Descriptive statistics of the cleaned data.
st.markdown("### Cleaned Data Overview")
st.write(df.describe())
# --- Summary statistics -----------------------------------------------------
# NOTE(review): the leading "π" looks like a mis-encoded emoji; kept as-is
# because the original character cannot be recovered from the text.
st.subheader("π Summary Statistics")

per_customer = df.groupby("CustomerID")
metrics = {
    "Number of Invoices": df["InvoiceNo"].nunique(),
    "Number of Products Bought": df["StockCode"].nunique(),
    "Number of Customers": df["CustomerID"].nunique(),
    "Average Quantity per Customer": round(per_customer["Quantity"].sum().mean(), 0),
    # The pound sign was mis-encoded ("Â£"-style mojibake) in the label.
    "Average Revenue per Customer (£)": round(per_customer["Amount"].sum().mean(), 2),
}
# Two-column table: one row per metric.
st.write(pd.DataFrame(list(metrics.items()), columns=["Metric", "Value"]))
# --- Monthly transactions ---------------------------------------------------
# NOTE(review): the leading "π" looks like a mis-encoded emoji; kept as-is.
st.subheader("π Monthly Transactions Analysis")

# Month number is still stored on the frame, as before.
df["month"] = df["InvoiceDate"].dt.month

# Count rows per calendar month *including the year*. Grouping on the month
# number alone merged e.g. December of one year with December of the next.
monthly_counts = df.groupby(df["InvoiceDate"].dt.to_period("M")).size()
monthly_counts.index = monthly_counts.index.astype(str)
monthly_table = monthly_counts.rename_axis("Month").reset_index(name="Transactions")

# Bar chart, one bar per year-month in chronological order.
fig_monthly = px.bar(
    monthly_table,
    x="Month",
    y="Transactions",
    title="Transactions Per Month",
)
# Treat the "YYYY-MM" labels as categories rather than parsed dates.
fig_monthly.update_xaxes(type="category")
st.plotly_chart(fig_monthly)
# --- RFM analysis -----------------------------------------------------------
# NOTE(review): the leading "π" looks like a mis-encoded emoji; kept as-is.
st.header("π RFM Analysis")

# Recency: days between the reference date and each customer's last purchase.
# The reference date is the most recent invoice day in the cleaned data
# instead of a hard-coded 2011-12-09, so uploads covering other periods do not
# get negative or inflated recency. For data whose latest invoice falls on
# 2011-12-09 the result is unchanged.
now = df["InvoiceDate"].max().normalize()
recency_df = df.groupby("CustomerID")["date"].max().reset_index()
recency_df["Recency"] = (now - pd.to_datetime(recency_df["date"])).dt.days

# Frequency: number of distinct invoices per customer.
frequency_df = (
    df.groupby("CustomerID")["InvoiceNo"]
    .nunique()
    .reset_index()
    .rename(columns={"InvoiceNo": "Frequency"})
)

# Monetary: total line-item amount per customer.
monetary_df = (
    df.groupby("CustomerID")["Amount"]
    .sum()
    .reset_index()
    .rename(columns={"Amount": "Monetary"})
)

# One row per customer: CustomerID, date, Recency, Frequency, Monetary.
rfm = recency_df.merge(frequency_df, on="CustomerID").merge(monetary_df, on="CustomerID")
st.write("### RFM Data")
st.write(rfm.head())
# --- RFM distribution plot --------------------------------------------------
# 3-D scatter with one marker per customer; marker colour and marker size
# both encode the Monetary value.
rfm_scatter_options = {
    "x": "Recency",
    "y": "Frequency",
    "z": "Monetary",
    "color": "Monetary",
    "size": "Monetary",
    "title": "RFM Scatter Plot",
}
fig_rfm = px.scatter_3d(rfm, **rfm_scatter_options)
st.plotly_chart(fig_rfm)
# --- K-Means clustering -----------------------------------------------------
# NOTE(review): the leading "π" looks like a mis-encoded emoji; kept as-is.
st.header("π K-Means Clustering")
st.sidebar.subheader("Clustering Parameters")
num_clusters = st.sidebar.slider("Number of Clusters", 2, 10, value=4)

# K-Means cannot form more clusters than there are rows to cluster.
if len(rfm) < num_clusters:
    st.error("Not enough customers to form the requested number of clusters.")
    st.stop()

# Standardise each feature (zero mean, unit variance) before clustering.
# K-Means uses Euclidean distance, so on the raw values the feature with the
# largest numeric range (typically Monetary) would dominate the assignment.
# A zero spread is replaced by 1 to avoid dividing by zero.
rfm_features = rfm[["Recency", "Frequency", "Monetary"]].astype(float)
feature_spread = rfm_features.std(ddof=0).replace(0, 1.0)
rfm_scaled = (rfm_features - rfm_features.mean()) / feature_spread

kmeans = KMeans(n_clusters=num_clusters, random_state=42)
rfm["Cluster"] = kmeans.fit_predict(rfm_scaled)

# Cluster visualisation, plotted in the original (unscaled) units.
fig_cluster = px.scatter_3d(
    rfm,
    x="Recency",
    y="Frequency",
    z="Monetary",
    color="Cluster",
    title=f"Customer Segmentation with {num_clusters} Clusters",
    symbol="Cluster",
    size="Monetary",
)
st.plotly_chart(fig_cluster)
# --- Enhanced RFM analysis --------------------------------------------------
# NOTE(review): the leading "π" looks like a mis-encoded emoji; kept as-is.
st.header("π Enhanced RFM Analysis")

# Heatmap of the mean Recency / Frequency / Monetary value of each cluster
# (a static Matplotlib/Seaborn figure).
heatmap_data = rfm[["Recency", "Frequency", "Monetary", "Cluster"]].groupby("Cluster").mean()
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(heatmap_data, annot=True, fmt=".1f", cmap="coolwarm", cbar=True, ax=ax)
ax.set_title("Average RFM Values per Cluster", fontsize=16)
st.pyplot(fig)
# The script is re-executed on every widget interaction; close the figure so
# pyplot does not keep one more open figure per rerun.
plt.close(fig)
# --- Animated RFM scatter ---------------------------------------------------
# NOTE(review): the leading "π" looks like a mis-encoded emoji; kept as-is.
st.subheader("π Animated RFM Scatter Plot")

# Each animation frame shows one cluster: the frame key is the "Cluster"
# column, not a time column, despite the wording of the chart title.
animated_scatter_options = dict(
    x="Recency",
    y="Frequency",
    z="Monetary",
    color="Cluster",
    size="Monetary",
    animation_frame="Cluster",
    title="RFM Clusters Over Time",
)
fig_rfm_animated = px.scatter_3d(rfm, **animated_scatter_options)
st.plotly_chart(fig_rfm_animated)
# --- Product recommendation: model training ---------------------------------
# NOTE(review): the "π―" / "π" prefixes look like mis-encoded emoji; kept
# as-is because the original characters cannot be recovered with certainty.
st.header("π― Product Recommendations")
st.subheader("π Train Word2Vec Model")

with st.spinner("Training Word2Vec model..."):
    # One "sentence" per invoice: the product descriptions bought together.
    # Rows without a description are dropped first so NaN never becomes a
    # vocabulary token (and never shows up as a recommendation).
    described_rows = df.dropna(subset=["Description"])
    invoices = described_rows.groupby("InvoiceNo")["Description"].apply(list)
    # Pass a plain list of token lists rather than a pandas Series.
    model = Word2Vec(
        sentences=invoices.tolist(),
        vector_size=50,
        window=5,
        min_count=1,
        workers=4,
        sg=1,
    )
st.success("Word2Vec model trained successfully!")
# --- Similar-product lookup -------------------------------------------------
# NOTE(review): the leading "π" looks like a mis-encoded emoji; kept as-is.
st.subheader("π Find Similar Products")

# Offer only real descriptions: a missing description (NaN) is not a product
# and cannot be looked up in the model.
product_options = df["Description"].dropna().unique()
selected_product = st.selectbox("Select a product to find recommendations:", product_options)

if st.button("Recommend Products for Customers"):
    try:
        # Five nearest products in the embedding space.
        similar_products = model.wv.most_similar(selected_product, topn=5)
    except KeyError:
        st.warning("The selected product is not in the vocabulary. Please choose another.")
    else:
        st.write("### Recommended Products")
        for product, similarity in similar_products:
            st.write(f"- **{product}** (Similarity: {similarity:.2f})")
# --- Recommendations restricted to one customer cluster ---------------------
# NOTE(review): the leading "π" looks like a mis-encoded emoji; kept as-is.
st.subheader("π Recommendations by Cluster")
cluster_to_recommend = st.selectbox("Select a cluster:", rfm["Cluster"].unique())

if st.button("Recommend for Cluster"):
    # Transactions of the customers assigned to the chosen cluster,
    # grouped into one product list per invoice.
    in_cluster = rfm["Cluster"] == cluster_to_recommend
    cluster_customers = rfm.loc[in_cluster, "CustomerID"]
    cluster_df = df.loc[df["CustomerID"].isin(cluster_customers)]
    cluster_invoices = cluster_df.groupby("InvoiceNo")["Description"].apply(list)

    # A separate Word2Vec model trained on this cluster's invoices only.
    with st.spinner("Training cluster-specific Word2Vec model..."):
        cluster_model = Word2Vec(
            sentences=cluster_invoices,
            sg=1,
            vector_size=50,
            window=5,
            min_count=1,
            workers=4,
        )

    # Look up the product picked in the selectbox of the previous section;
    # it may not occur in this cluster's vocabulary.
    try:
        cluster_similar_products = cluster_model.wv.most_similar(selected_product, topn=5)
        st.write(f"### Recommended Products for Cluster {cluster_to_recommend}")
        for name, score in cluster_similar_products:
            st.write(f"- **{name}** (Similarity: {score:.2f})")
    except KeyError:
        st.warning("The selected product is not in the vocabulary for this cluster.")
# --- Word2Vec embedding visualisation ---------------------------------------
# NOTE(review): the leading "π" looks like a mis-encoded emoji; kept as-is.
st.subheader("π Word2Vec Embedding Visualization")

# Every key in the model vocabulary and its vector, in vocabulary order.
product_names = list(model.wv.key_to_index)
vectors = model.wv[product_names]

# Project the product vectors onto their first two principal components.
pca_result = PCA(n_components=2).fit_transform(vectors)

# One row per product: 2-D coordinates plus the product name for hovering.
embedding_df = pd.DataFrame(pca_result, columns=["PCA1", "PCA2"])
embedding_df["Product"] = product_names

# Interactive scatter of the projected embeddings.
embedding_plot_options = dict(
    x="PCA1",
    y="PCA2",
    hover_data=["Product"],
    title="Word2Vec Product Embeddings",
    template="plotly_dark",
)
fig_embed = px.scatter(embedding_df, **embedding_plot_options)
st.plotly_chart(fig_embed)
# --- Export -----------------------------------------------------------------
# NOTE(review): the "π€" / trailing "π" look like mis-encoded emoji; kept
# as-is because the original characters cannot be recovered with certainty.
st.header("π€ Export Processed Data")

# Serve the RFM table as a browser download. Writing "rfm_data.csv" to the
# server's working directory left the file where a web user could not reach it.
rfm_csv_bytes = rfm.to_csv(index=False).encode("utf-8")
if st.download_button(
    "Export RFM Data",
    data=rfm_csv_bytes,
    file_name="rfm_data.csv",
    mime="text/csv",
):
    st.success("RFM data exported as `rfm_data.csv`!")

st.markdown("### Enjoy exploring your customer data! π")