Spaces:
Sleeping
Sleeping
Mahmoud Adel committed on
Commit ·
5e2aaa0
0
Parent(s):
Clean Hugging Face deployment
Browse files- .streamlit/config.toml +7 -0
- .streamlit/config.toml +13 -0
- README.md +423 -0
- data/Mall_Customers.csv +201 -0
- requirements.txt +8 -0
- run_app.py +51 -0
- src/__init__.py +1 -0
- src/__pycache__/__init__.cpython-39.pyc +0 -0
- src/__pycache__/clustering.cpython-312.pyc +0 -0
- src/__pycache__/clustering.cpython-39.pyc +0 -0
- src/__pycache__/data_loader.cpython-312.pyc +0 -0
- src/__pycache__/data_loader.cpython-39.pyc +0 -0
- src/__pycache__/visualizations.cpython-312.pyc +0 -0
- src/__pycache__/visualizations.cpython-39.pyc +0 -0
- src/clustering.py +260 -0
- src/data_loader.py +151 -0
- src/visualizations.py +780 -0
- streamlit_app/main.py +1112 -0
- utils/__init__.py +1 -0
- utils/__pycache__/data_generator.cpython-311.pyc +0 -0
- utils/data_generator.py +73 -0
.streamlit/config.toml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[theme]
|
| 2 |
+
base = "dark"
|
| 3 |
+
primaryColor = "#818CF8"
|
| 4 |
+
backgroundColor = "#0F172A"
|
| 5 |
+
secondaryBackgroundColor = "#111827"
|
| 6 |
+
textColor = "#E5E7EB"
|
| 7 |
+
font = "sans serif"
|
.streamlit/config.toml
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[theme]
|
| 2 |
+
primaryColor = "#3498db"
|
| 3 |
+
backgroundColor = "#0e1117"
|
| 4 |
+
secondaryBackgroundColor = "#262730"
|
| 5 |
+
textColor = "#ffffff"
|
| 6 |
+
|
| 7 |
+
[server]
|
| 8 |
+
headless = true
|
| 9 |
+
enableCORS = false
|
| 10 |
+
enableXsrfProtection = false
|
| 11 |
+
|
| 12 |
+
[browser]
|
| 13 |
+
gatherUsageStats = false
|
README.md
ADDED
|
@@ -0,0 +1,423 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🛍️ Customer Segmentation Analysis
|
| 2 |
+
|
| 3 |
+
[Open the live Streamlit app](https://customer-segmentation-mqnhet38emja8xtgffpzjt.streamlit.app/)
|
| 4 |
+
[Python 3.8+](https://www.python.org/downloads/)
|
| 5 |
+
[License: MIT](LICENSE)
|
| 6 |
+
[Built with Streamlit](https://streamlit.io/)
|
| 7 |
+
|
| 8 |
+
> **🎯 Live Application**: [Customer Segmentation Analysis](https://customer-segmentation-mqnhet38emja8xtgffpzjt.streamlit.app/)
|
| 9 |
+
|
| 10 |
+
A comprehensive, interactive web application for customer segmentation analysis using machine learning clustering algorithms. This project provides an end-to-end solution for identifying distinct customer groups based on purchasing behavior and demographic characteristics.
|
| 11 |
+
|
| 12 |
+
## 🌟 Live Demo
|
| 13 |
+
|
| 14 |
+
**🚀 Try the application now:** [Customer Segmentation Analysis](https://customer-segmentation-mqnhet38emja8xtgffpzjt.streamlit.app/)
|
| 15 |
+
|
| 16 |
+
The live application features:
|
| 17 |
+
- ✨ **Interactive Data Exploration** with real-time visualizations
|
| 18 |
+
- 🎯 **K-Means & DBSCAN Clustering** with optimal parameter selection
|
| 19 |
+
- 📊 **Beautiful Visualizations** with dark theme and modern UI
|
| 20 |
+
- 💡 **Business Insights** and actionable recommendations
|
| 21 |
+
- 📱 **Responsive Design** that works on all devices
|
| 22 |
+
|
| 23 |
+
---
|
| 24 |
+
|
| 25 |
+
## 📋 Table of Contents
|
| 26 |
+
|
| 27 |
+
- [🎯 Project Overview](#-project-overview)
|
| 28 |
+
- [✨ Key Features](#-key-features)
|
| 29 |
+
- [📊 Dataset Information](#-dataset-information)
|
| 30 |
+
- [🛠️ Technology Stack](#️-technology-stack)
|
| 31 |
+
- [🚀 Quick Start](#-quick-start)
|
| 32 |
+
- [📁 Project Structure](#-project-structure)
|
| 33 |
+
- [🔍 Analysis Workflow](#-analysis-workflow)
|
| 34 |
+
- [📈 Results & Insights](#-results--insights)
|
| 35 |
+
- [🎨 Screenshots](#-screenshots)
|
| 36 |
+
- [⚙️ Configuration](#️-configuration)
|
| 37 |
+
- [🤝 Contributing](#-contributing)
|
| 38 |
+
- [📝 License](#-license)
|
| 39 |
+
|
| 40 |
+
---
|
| 41 |
+
|
| 42 |
+
## 🎯 Project Overview
|
| 43 |
+
|
| 44 |
+
This project implements advanced customer segmentation using unsupervised machine learning techniques. It provides a complete solution for businesses to understand their customer base through data-driven insights and actionable recommendations.
|
| 45 |
+
|
| 46 |
+
### 🎯 Business Value
|
| 47 |
+
|
| 48 |
+
- **Customer Understanding**: Identify distinct customer segments based on behavior patterns
|
| 49 |
+
- **Targeted Marketing**: Develop personalized marketing strategies for each segment
|
| 50 |
+
- **Resource Optimization**: Allocate marketing budgets more effectively
|
| 51 |
+
- **Product Development**: Tailor products and services to specific customer needs
|
| 52 |
+
- **Customer Retention**: Implement segment-specific retention strategies
|
| 53 |
+
|
| 54 |
+
---
|
| 55 |
+
|
| 56 |
+
## ✨ Key Features
|
| 57 |
+
|
| 58 |
+
### 🎨 **Modern User Interface**
|
| 59 |
+
- **Dark Theme**: Beautiful, modern dark interface with gradient accents
|
| 60 |
+
- **Responsive Design**: Works seamlessly on desktop, tablet, and mobile
|
| 61 |
+
- **Interactive Elements**: Hover effects, animations, and smooth transitions
|
| 62 |
+
- **Real-time Updates**: Dynamic visualizations that update instantly
|
| 63 |
+
|
| 64 |
+
### 📊 **Comprehensive Data Analysis**
|
| 65 |
+
- **Data Exploration**: Interactive histograms, scatter plots, and correlation matrices
|
| 66 |
+
- **Statistical Summary**: Detailed descriptive statistics and data quality checks
|
| 67 |
+
- **Feature Relationships**: Visual analysis of correlations between variables
|
| 68 |
+
- **Missing Value Detection**: Automatic identification and handling of data issues
|
| 69 |
+
|
| 70 |
+
### 🎯 **Advanced Clustering Algorithms**
|
| 71 |
+
- **K-Means Clustering**: With optimal cluster determination using multiple metrics
|
| 72 |
+
- **DBSCAN Clustering**: Density-based clustering for comparison
|
| 73 |
+
- **Parameter Optimization**: Automatic selection of optimal clustering parameters
|
| 74 |
+
- **Performance Metrics**: Silhouette score, Calinski-Harabasz score, and inertia
|
| 75 |
+
|
| 76 |
+
### 📈 **Rich Visualizations**
|
| 77 |
+
- **2D Cluster Plots**: Interactive scatter plots with cluster assignments
|
| 78 |
+
- **Distribution Analysis**: Box plots and histograms for each segment
|
| 79 |
+
- **Comparative Analysis**: Side-by-side comparison of different algorithms
|
| 80 |
+
- **Business Metrics**: Spending analysis and customer profile visualizations
|
| 81 |
+
|
| 82 |
+
### 💡 **Business Intelligence**
|
| 83 |
+
- **Customer Profiles**: Detailed characteristics of each segment
|
| 84 |
+
- **Spending Analysis**: Average spending patterns and trends
|
| 85 |
+
- **Actionable Recommendations**: Specific strategies for each customer segment
|
| 86 |
+
- **Download Results**: Export analysis results for further processing
|
| 87 |
+
|
| 88 |
+
---
|
| 89 |
+
|
| 90 |
+
## 📊 Dataset Information
|
| 91 |
+
|
| 92 |
+
The application uses the **Mall Customer Segmentation** dataset, which simulates real-world customer data with the following features:
|
| 93 |
+
|
| 94 |
+
| Feature | Description | Type | Range |
|
| 95 |
+
|---------|-------------|------|-------|
|
| 96 |
+
| **CustomerID** | Unique customer identifier | Integer | 1-200 |
|
| 97 |
+
| **Gender** | Customer gender | Categorical | Male/Female |
|
| 98 |
+
| **Age** | Customer age in years | Integer | 18-70 |
|
| 99 |
+
| **Annual Income (k$)** | Annual income in thousands | Integer | 15-137 |
|
| 100 |
+
| **Spending Score (1-100)** | Mall-assigned spending score | Integer | 1-100 |
|
| 101 |
+
|
| 102 |
+
### 📈 **Dataset Characteristics**
|
| 103 |
+
- **Size**: 200 customers
|
| 104 |
+
- **Features**: 5 variables (1 integer identifier, 1 categorical, 3 numeric)
|
| 105 |
+
- **Quality**: Clean data with no missing values
|
| 106 |
+
- **Realism**: Simulates realistic customer behavior patterns
|
| 107 |
+
|
| 108 |
+
---
|
| 109 |
+
|
| 110 |
+
## 🛠️ Technology Stack
|
| 111 |
+
|
| 112 |
+
### **Core Technologies**
|
| 113 |
+
- **Python 3.8+**: Primary programming language
|
| 114 |
+
- **Streamlit 1.28+**: Interactive web application framework
|
| 115 |
+
- **Pandas**: Data manipulation and analysis
|
| 116 |
+
- **NumPy**: Numerical computing and array operations
|
| 117 |
+
|
| 118 |
+
### **Machine Learning**
|
| 119 |
+
- **Scikit-learn**: Clustering algorithms (K-Means, DBSCAN)
|
| 120 |
+
- **Silhouette Analysis**: Cluster quality evaluation
|
| 121 |
+
- **StandardScaler**: Feature normalization
|
| 122 |
+
|
| 123 |
+
### **Visualization**
|
| 124 |
+
- **Plotly**: Interactive charts and graphs
|
| 125 |
+
- **Custom CSS**: Modern dark theme styling
|
| 126 |
+
- **Responsive Design**: Mobile-friendly interface
|
| 127 |
+
|
| 128 |
+
### **Development Tools**
|
| 129 |
+
- **YAML**: Configuration management
|
| 130 |
+
- **Git**: Version control
|
| 131 |
+
- **Streamlit Cloud**: Deployment platform
|
| 132 |
+
|
| 133 |
+
---
|
| 134 |
+
|
| 135 |
+
## 🚀 Quick Start
|
| 136 |
+
|
| 137 |
+
### **Option 1: Use the Live Application**
|
| 138 |
+
1. Visit [Customer Segmentation Analysis](https://customer-segmentation-mqnhet38emja8xtgffpzjt.streamlit.app/)
|
| 139 |
+
2. Start exploring the data immediately
|
| 140 |
+
3. No installation required!
|
| 141 |
+
|
| 142 |
+
### **Option 2: Run Locally**
|
| 143 |
+
|
| 144 |
+
#### **Prerequisites**
|
| 145 |
+
```bash
|
| 146 |
+
# Ensure you have Python 3.8+ installed
|
| 147 |
+
python --version
|
| 148 |
+
|
| 149 |
+
# Install Git (if not already installed)
|
| 150 |
+
git --version
|
| 151 |
+
```
|
| 152 |
+
|
| 153 |
+
#### **Installation Steps**
|
| 154 |
+
|
| 155 |
+
1. **Clone the repository**
|
| 156 |
+
```bash
|
| 157 |
+
git clone https://github.com/yourusername/customer-segmentation.git
|
| 158 |
+
cd customer-segmentation
|
| 159 |
+
```
|
| 160 |
+
|
| 161 |
+
2. **Install dependencies**
|
| 162 |
+
```bash
|
| 163 |
+
pip install -r requirements.txt
|
| 164 |
+
```
|
| 165 |
+
|
| 166 |
+
3. **Launch the application**
|
| 167 |
+
```bash
|
| 168 |
+
python run_app.py
|
| 169 |
+
```
|
| 170 |
+
|
| 171 |
+
Or directly with Streamlit:
|
| 172 |
+
```bash
|
| 173 |
+
streamlit run streamlit_app/main.py
|
| 174 |
+
```
|
| 175 |
+
|
| 176 |
+
4. **Access the application**
|
| 177 |
+
- Open your browser and navigate to `http://localhost:8501`
|
| 178 |
+
- The application will automatically load the sample dataset
|
| 179 |
+
- Start exploring the different analysis sections
|
| 180 |
+
|
| 181 |
+
---
|
| 182 |
+
|
| 183 |
+
## 📁 Project Structure
|
| 184 |
+
|
| 185 |
+
```
|
| 186 |
+
Customer segmentation/
|
| 187 |
+
├── 📁 streamlit_app/
|
| 188 |
+
│ └── 🐍 main.py # Main Streamlit application
|
| 189 |
+
├── 📁 src/
|
| 190 |
+
│ ├── 🐍 __init__.py # Package initialization
|
| 191 |
+
│ ├── 🐍 data_loader.py # Data loading and preprocessing
|
| 192 |
+
│ ├── 🐍 clustering.py # Clustering algorithms
|
| 193 |
+
│ └── 🐍 visualizations.py # Visualization components
|
| 194 |
+
├── 📁 utils/
|
| 195 |
+
│ ├── 🐍 __init__.py # Utilities package
|
| 196 |
+
│ └── 🐍 data_generator.py # Sample data generation
|
| 197 |
+
├── 📁 config/
|
| 198 |
+
│ └── ⚙️ config.yaml # Configuration settings
|
| 199 |
+
├── 📁 data/
|
| 200 |
+
│ └── 📊 Mall_Customers.csv # Main dataset
|
| 201 |
+
├── 📁 .streamlit/
|
| 202 |
+
│ └── ⚙️ config.toml # Streamlit configuration
|
| 203 |
+
├── 📋 requirements.txt # Python dependencies
|
| 204 |
+
├── 🚀 run_app.py # Application launcher
|
| 205 |
+
└── 📖 README.md # Project documentation
|
| 206 |
+
```
|
| 207 |
+
|
| 208 |
+
---
|
| 209 |
+
|
| 210 |
+
## 🔍 Analysis Workflow
|
| 211 |
+
|
| 212 |
+
### **1. Data Exploration** 📊
|
| 213 |
+
- **Dataset Overview**: Basic statistics and data quality assessment
|
| 214 |
+
- **Distribution Analysis**: Histograms and density plots for all features
|
| 215 |
+
- **Correlation Analysis**: Heatmaps showing feature relationships
|
| 216 |
+
- **Visual Exploration**: Interactive scatter plots and box plots
|
| 217 |
+
|
| 218 |
+
### **2. Data Preprocessing** ⚙️
|
| 219 |
+
- **Feature Selection**: Choose relevant variables for clustering
|
| 220 |
+
- **Data Scaling**: Normalize features using StandardScaler
|
| 221 |
+
- **Missing Value Handling**: Automatic detection and treatment
|
| 222 |
+
- **Data Validation**: Ensure data quality and consistency
|
| 223 |
+
|
| 224 |
+
### **3. Optimal Cluster Determination** 🎯
|
| 225 |
+
- **Elbow Method**: Find optimal number of clusters using inertia
|
| 226 |
+
- **Silhouette Analysis**: Evaluate cluster quality and separation
|
| 227 |
+
- **Calinski-Harabasz Score**: Alternative cluster evaluation metric
|
| 228 |
+
- **Visual Assessment**: Interactive plots for parameter selection
|
| 229 |
+
|
| 230 |
+
### **4. K-Means Clustering** 🔵
|
| 231 |
+
- **Algorithm Application**: Apply K-Means with optimal parameters
|
| 232 |
+
- **Cluster Assignment**: Generate labels for each customer
|
| 233 |
+
- **Performance Metrics**: Calculate silhouette and Calinski scores
|
| 234 |
+
- **Center Visualization**: Plot cluster centroids
|
| 235 |
+
|
| 236 |
+
### **5. DBSCAN Clustering** 🌟
|
| 237 |
+
- **Density-Based Clustering**: Apply DBSCAN algorithm
|
| 238 |
+
- **Parameter Tuning**: Adjust epsilon and min_samples
|
| 239 |
+
- **Noise Detection**: Identify outlier points
|
| 240 |
+
- **Comparison Analysis**: Compare with K-Means results
|
| 241 |
+
|
| 242 |
+
### **6. Visualization & Analysis** 📈
|
| 243 |
+
- **2D Cluster Plots**: Interactive scatter plots with cluster assignments
|
| 244 |
+
- **Distribution Analysis**: Box plots showing feature distributions per cluster
|
| 245 |
+
- **Spending Analysis**: Detailed spending patterns for each segment
|
| 246 |
+
- **Comparative Visualizations**: Side-by-side algorithm comparison
|
| 247 |
+
|
| 248 |
+
### **7. Business Intelligence** 💡
|
| 249 |
+
- **Customer Profiling**: Detailed characteristics of each segment
|
| 250 |
+
- **Spending Patterns**: Average spending and variance analysis
|
| 251 |
+
- **Actionable Insights**: Specific recommendations for each segment
|
| 252 |
+
- **Export Results**: Download analysis results for further use
|
| 253 |
+
|
| 254 |
+
---
|
| 255 |
+
|
| 256 |
+
## 📈 Results & Insights
|
| 257 |
+
|
| 258 |
+
### **Typical Customer Segments Identified**
|
| 259 |
+
|
| 260 |
+
| Segment | Characteristics | Business Strategy |
|
| 261 |
+
|---------|----------------|-------------------|
|
| 262 |
+
| **💎 High Value** | High income, high spending | Premium products, VIP services |
|
| 263 |
+
| **💼 Conservative** | High income, low spending | Upselling, value propositions |
|
| 264 |
+
| **🎯 Budget Spenders** | Low income, high spending | Value-based offerings, loyalty programs |
|
| 265 |
+
| **📉 Low Engagement** | Low income, low spending | Retention strategies, engagement campaigns |
|
| 266 |
+
| **⚖️ Balanced** | Moderate income and spending | Personalized marketing, core offerings |
|
| 267 |
+
|
| 268 |
+
### **Performance Metrics**
|
| 269 |
+
|
| 270 |
+
The analysis provides comprehensive evaluation metrics:
|
| 271 |
+
|
| 272 |
+
- **Silhouette Score**: Measures cluster cohesion and separation (ranges from -1 to 1; higher is better, values near 0 indicate overlapping clusters)
|
| 273 |
+
- **Calinski-Harabasz Score**: Evaluates cluster definition quality
|
| 274 |
+
- **Inertia**: Within-cluster sum of squares for K-Means
|
| 275 |
+
- **Number of Clusters**: Optimal cluster count determined automatically
|
| 276 |
+
- **Noise Points**: Outlier detection in DBSCAN
|
| 277 |
+
|
| 278 |
+
### **Business Recommendations**
|
| 279 |
+
|
| 280 |
+
Based on clustering results, the application provides:
|
| 281 |
+
|
| 282 |
+
- **Marketing Strategies**: Segment-specific campaign recommendations
|
| 283 |
+
- **Product Positioning**: Align products with cluster preferences
|
| 284 |
+
- **Pricing Strategies**: Dynamic pricing based on segment characteristics
|
| 285 |
+
- **Customer Retention**: Targeted programs for each segment
|
| 286 |
+
- **Growth Opportunities**: Cross-selling and upselling strategies
|
| 287 |
+
|
| 288 |
+
---
|
| 289 |
+
|
| 290 |
+
## 🎨 Screenshots
|
| 291 |
+
|
| 292 |
+
### **Main Dashboard**
|
| 293 |
+

|
| 294 |
+
|
| 295 |
+
### **Data Exploration**
|
| 296 |
+

|
| 297 |
+
|
| 298 |
+
### **Clustering Results**
|
| 299 |
+

|
| 300 |
+
|
| 301 |
+
### **Business Insights**
|
| 302 |
+

|
| 303 |
+
|
| 304 |
+
---
|
| 305 |
+
|
| 306 |
+
## ⚙️ Configuration
|
| 307 |
+
|
| 308 |
+
### **Customizing Clustering Parameters**
|
| 309 |
+
|
| 310 |
+
#### **K-Means Parameters**
|
| 311 |
+
```python
|
| 312 |
+
# In the application interface
|
| 313 |
+
n_clusters = 5 # Number of clusters
|
| 314 |
+
random_state = 42 # For reproducible results
|
| 315 |
+
```
|
| 316 |
+
|
| 317 |
+
#### **DBSCAN Parameters**
|
| 318 |
+
```python
|
| 319 |
+
eps = 0.5 # Neighborhood distance
|
| 320 |
+
min_samples = 5 # Minimum samples in a point's neighborhood (including itself) for it to be a core point
|
| 321 |
+
```
|
| 322 |
+
|
| 323 |
+
### **Feature Selection**
|
| 324 |
+
```python
|
| 325 |
+
# Default features for clustering
|
| 326 |
+
features = ['Annual Income (k$)', 'Spending Score (1-100)']
|
| 327 |
+
|
| 328 |
+
# Custom feature selection
|
| 329 |
+
features = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
|
| 330 |
+
```
|
| 331 |
+
|
| 332 |
+
### **Visualization Settings**
|
| 333 |
+
```python
|
| 334 |
+
# Color schemes
|
| 335 |
+
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7']
|
| 336 |
+
|
| 337 |
+
# Chart dimensions
|
| 338 |
+
height = 450
|
| 339 |
+
width = '100%'
|
| 340 |
+
```
|
| 341 |
+
|
| 342 |
+
---
|
| 343 |
+
|
| 344 |
+
## 🤝 Contributing
|
| 345 |
+
|
| 346 |
+
We welcome contributions! Here's how you can help:
|
| 347 |
+
|
| 348 |
+
### **How to Contribute**
|
| 349 |
+
|
| 350 |
+
1. **Fork the repository**
|
| 351 |
+
2. **Create a feature branch**
|
| 352 |
+
```bash
|
| 353 |
+
git checkout -b feature/amazing-feature
|
| 354 |
+
```
|
| 355 |
+
3. **Make your changes**
|
| 356 |
+
4. **Test thoroughly**
|
| 357 |
+
5. **Commit your changes**
|
| 358 |
+
```bash
|
| 359 |
+
git commit -m 'Add amazing feature'
|
| 360 |
+
```
|
| 361 |
+
6. **Push to the branch**
|
| 362 |
+
```bash
|
| 363 |
+
git push origin feature/amazing-feature
|
| 364 |
+
```
|
| 365 |
+
7. **Open a Pull Request**
|
| 366 |
+
|
| 367 |
+
### **Areas for Improvement**
|
| 368 |
+
|
| 369 |
+
- **Additional Algorithms**: Hierarchical clustering, Gaussian Mixture Models
|
| 370 |
+
- **Enhanced Visualizations**: 3D plots, interactive dashboards
|
| 371 |
+
- **Advanced Analytics**: Customer lifetime value, churn prediction
|
| 372 |
+
- **Performance Optimization**: Faster processing for large datasets
|
| 373 |
+
- **Mobile Experience**: Improved mobile interface
|
| 374 |
+
- **API Integration**: REST API for programmatic access
|
| 375 |
+
|
| 376 |
+
### **Bug Reports**
|
| 377 |
+
|
| 378 |
+
Please use the [GitHub Issues](https://github.com/yourusername/customer-segmentation/issues) page to report bugs or request features.
|
| 379 |
+
|
| 380 |
+
---
|
| 381 |
+
|
| 382 |
+
## 📝 License
|
| 383 |
+
|
| 384 |
+
This project is licensed under the **MIT License** - see the [LICENSE](LICENSE) file for details.
|
| 385 |
+
|
| 386 |
+
### **MIT License Summary**
|
| 387 |
+
- ✅ **Commercial Use**: Allowed
|
| 388 |
+
- ✅ **Modification**: Allowed
|
| 389 |
+
- ✅ **Distribution**: Allowed
|
| 390 |
+
- ✅ **Private Use**: Allowed
|
| 391 |
+
- ❌ **Liability**: Limited
|
| 392 |
+
- ❌ **Warranty**: None
|
| 393 |
+
|
| 394 |
+
---
|
| 395 |
+
|
| 396 |
+
## 🙏 Acknowledgments
|
| 397 |
+
|
| 398 |
+
- **Dataset Source**: [Kaggle Mall Customer Segmentation](https://www.kaggle.com/vjchoudhary7/customer-segmentation-tutorial-in-python)
|
| 399 |
+
- **Streamlit**: For the amazing web application framework
|
| 400 |
+
- **Scikit-learn**: For robust machine learning algorithms
|
| 401 |
+
- **Plotly**: For beautiful interactive visualizations
|
| 402 |
+
- **Open Source Community**: For inspiration and support
|
| 403 |
+
|
| 404 |
+
---
|
| 405 |
+
|
| 406 |
+
## 📞 Support & Contact
|
| 407 |
+
|
| 408 |
+
- **Live Application**: [Customer Segmentation Analysis](https://customer-segmentation-mqnhet38emja8xtgffpzjt.streamlit.app/)
|
| 409 |
+
- **GitHub Repository**: [Customer Segmentation](https://github.com/yourusername/customer-segmentation)
|
| 410 |
+
- **Issues**: [GitHub Issues](https://github.com/yourusername/customer-segmentation/issues)
|
| 411 |
+
- **Email**: your.email@example.com
|
| 412 |
+
|
| 413 |
+
---
|
| 414 |
+
|
| 415 |
+
<div align="center">
|
| 416 |
+
|
| 417 |
+
**🎯 Happy Clustering! 📊**
|
| 418 |
+
|
| 419 |
+
[Open the live Streamlit app](https://customer-segmentation-mqnhet38emja8xtgffpzjt.streamlit.app/)
|
| 420 |
+
|
| 421 |
+
*Made with ❤️ using Streamlit and Python*
|
| 422 |
+
|
| 423 |
+
</div>
|
data/Mall_Customers.csv
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
|
| 2 |
+
1,Male,19,15,39
|
| 3 |
+
2,Male,21,15,81
|
| 4 |
+
3,Female,20,16,6
|
| 5 |
+
4,Female,23,16,77
|
| 6 |
+
5,Female,31,17,40
|
| 7 |
+
6,Female,22,17,76
|
| 8 |
+
7,Female,35,18,6
|
| 9 |
+
8,Female,23,18,94
|
| 10 |
+
9,Male,64,19,3
|
| 11 |
+
10,Female,30,19,72
|
| 12 |
+
11,Male,67,19,14
|
| 13 |
+
12,Female,35,19,99
|
| 14 |
+
13,Female,58,20,15
|
| 15 |
+
14,Female,24,20,77
|
| 16 |
+
15,Male,37,20,13
|
| 17 |
+
16,Male,22,20,79
|
| 18 |
+
17,Female,35,21,35
|
| 19 |
+
18,Male,20,21,66
|
| 20 |
+
19,Male,52,23,29
|
| 21 |
+
20,Female,35,23,98
|
| 22 |
+
21,Male,35,24,35
|
| 23 |
+
22,Male,25,24,73
|
| 24 |
+
23,Female,46,25,5
|
| 25 |
+
24,Male,31,25,73
|
| 26 |
+
25,Female,54,28,14
|
| 27 |
+
26,Male,29,28,82
|
| 28 |
+
27,Female,45,28,32
|
| 29 |
+
28,Male,35,28,61
|
| 30 |
+
29,Female,40,29,31
|
| 31 |
+
30,Female,23,29,87
|
| 32 |
+
31,Male,60,30,4
|
| 33 |
+
32,Female,21,30,73
|
| 34 |
+
33,Male,53,33,4
|
| 35 |
+
34,Male,18,33,92
|
| 36 |
+
35,Female,49,33,14
|
| 37 |
+
36,Female,21,33,81
|
| 38 |
+
37,Female,42,34,17
|
| 39 |
+
38,Female,30,34,73
|
| 40 |
+
39,Female,36,37,26
|
| 41 |
+
40,Female,20,37,75
|
| 42 |
+
41,Female,65,38,35
|
| 43 |
+
42,Male,24,38,92
|
| 44 |
+
43,Male,48,39,36
|
| 45 |
+
44,Female,31,39,61
|
| 46 |
+
45,Female,49,39,28
|
| 47 |
+
46,Female,24,39,65
|
| 48 |
+
47,Female,50,40,55
|
| 49 |
+
48,Female,27,40,47
|
| 50 |
+
49,Female,29,40,42
|
| 51 |
+
50,Female,31,40,42
|
| 52 |
+
51,Female,49,42,52
|
| 53 |
+
52,Male,33,42,60
|
| 54 |
+
53,Female,31,43,54
|
| 55 |
+
54,Male,59,43,60
|
| 56 |
+
55,Female,50,43,45
|
| 57 |
+
56,Male,47,43,41
|
| 58 |
+
57,Female,51,44,50
|
| 59 |
+
58,Male,69,44,46
|
| 60 |
+
59,Female,27,46,51
|
| 61 |
+
60,Male,53,46,46
|
| 62 |
+
61,Male,70,46,56
|
| 63 |
+
62,Male,19,46,55
|
| 64 |
+
63,Female,67,47,52
|
| 65 |
+
64,Female,54,47,59
|
| 66 |
+
65,Male,63,48,51
|
| 67 |
+
66,Male,18,48,59
|
| 68 |
+
67,Female,43,48,50
|
| 69 |
+
68,Female,68,48,48
|
| 70 |
+
69,Male,19,48,59
|
| 71 |
+
70,Female,32,48,47
|
| 72 |
+
71,Male,70,49,55
|
| 73 |
+
72,Female,47,49,42
|
| 74 |
+
73,Female,60,50,49
|
| 75 |
+
74,Female,60,50,56
|
| 76 |
+
75,Male,59,54,47
|
| 77 |
+
76,Male,26,54,54
|
| 78 |
+
77,Female,45,54,53
|
| 79 |
+
78,Male,40,54,48
|
| 80 |
+
79,Female,23,54,52
|
| 81 |
+
80,Female,49,54,42
|
| 82 |
+
81,Male,57,54,51
|
| 83 |
+
82,Male,38,54,55
|
| 84 |
+
83,Male,67,54,41
|
| 85 |
+
84,Female,46,54,44
|
| 86 |
+
85,Female,21,54,57
|
| 87 |
+
86,Male,48,54,46
|
| 88 |
+
87,Female,55,57,58
|
| 89 |
+
88,Female,22,57,55
|
| 90 |
+
89,Female,34,58,60
|
| 91 |
+
90,Female,50,58,46
|
| 92 |
+
91,Female,68,59,55
|
| 93 |
+
92,Male,18,59,41
|
| 94 |
+
93,Male,48,60,49
|
| 95 |
+
94,Female,40,60,40
|
| 96 |
+
95,Female,32,60,42
|
| 97 |
+
96,Male,24,60,52
|
| 98 |
+
97,Female,47,60,47
|
| 99 |
+
98,Female,27,60,50
|
| 100 |
+
99,Male,48,61,42
|
| 101 |
+
100,Male,20,61,49
|
| 102 |
+
101,Female,23,62,41
|
| 103 |
+
102,Female,49,62,48
|
| 104 |
+
103,Male,67,62,59
|
| 105 |
+
104,Male,26,62,55
|
| 106 |
+
105,Male,49,62,56
|
| 107 |
+
106,Female,21,62,42
|
| 108 |
+
107,Female,66,63,50
|
| 109 |
+
108,Male,54,63,46
|
| 110 |
+
109,Male,68,63,43
|
| 111 |
+
110,Male,66,63,48
|
| 112 |
+
111,Male,65,63,52
|
| 113 |
+
112,Female,19,63,54
|
| 114 |
+
113,Female,38,64,42
|
| 115 |
+
114,Male,19,64,46
|
| 116 |
+
115,Female,18,65,48
|
| 117 |
+
116,Female,19,65,50
|
| 118 |
+
117,Female,63,65,43
|
| 119 |
+
118,Female,49,65,59
|
| 120 |
+
119,Female,51,67,43
|
| 121 |
+
120,Female,50,67,57
|
| 122 |
+
121,Male,27,67,56
|
| 123 |
+
122,Female,38,67,40
|
| 124 |
+
123,Female,40,69,58
|
| 125 |
+
124,Male,39,69,91
|
| 126 |
+
125,Female,23,70,29
|
| 127 |
+
126,Female,31,70,77
|
| 128 |
+
127,Male,43,71,35
|
| 129 |
+
128,Male,40,71,95
|
| 130 |
+
129,Male,59,71,11
|
| 131 |
+
130,Male,38,71,75
|
| 132 |
+
131,Male,47,71,9
|
| 133 |
+
132,Male,39,71,75
|
| 134 |
+
133,Female,25,72,34
|
| 135 |
+
134,Female,31,72,71
|
| 136 |
+
135,Male,20,73,5
|
| 137 |
+
136,Female,29,73,88
|
| 138 |
+
137,Female,44,73,7
|
| 139 |
+
138,Male,32,73,73
|
| 140 |
+
139,Male,19,74,10
|
| 141 |
+
140,Female,35,74,72
|
| 142 |
+
141,Female,57,75,5
|
| 143 |
+
142,Male,32,75,93
|
| 144 |
+
143,Female,28,76,40
|
| 145 |
+
144,Female,32,76,87
|
| 146 |
+
145,Male,25,77,12
|
| 147 |
+
146,Male,28,77,97
|
| 148 |
+
147,Male,48,77,36
|
| 149 |
+
148,Female,32,77,74
|
| 150 |
+
149,Female,34,78,22
|
| 151 |
+
150,Male,34,78,90
|
| 152 |
+
151,Male,43,78,17
|
| 153 |
+
152,Male,39,78,88
|
| 154 |
+
153,Female,44,78,20
|
| 155 |
+
154,Female,38,78,76
|
| 156 |
+
155,Female,47,78,16
|
| 157 |
+
156,Female,27,78,89
|
| 158 |
+
157,Male,37,78,1
|
| 159 |
+
158,Female,30,78,78
|
| 160 |
+
159,Male,34,78,1
|
| 161 |
+
160,Female,30,78,73
|
| 162 |
+
161,Female,56,79,35
|
| 163 |
+
162,Female,29,79,83
|
| 164 |
+
163,Male,19,81,5
|
| 165 |
+
164,Female,31,81,93
|
| 166 |
+
165,Male,50,85,26
|
| 167 |
+
166,Female,36,85,75
|
| 168 |
+
167,Male,42,86,20
|
| 169 |
+
168,Female,33,86,95
|
| 170 |
+
169,Female,36,87,27
|
| 171 |
+
170,Male,32,87,63
|
| 172 |
+
171,Male,40,87,13
|
| 173 |
+
172,Male,28,87,75
|
| 174 |
+
173,Male,36,87,10
|
| 175 |
+
174,Male,36,87,92
|
| 176 |
+
175,Female,52,88,13
|
| 177 |
+
176,Female,30,88,86
|
| 178 |
+
177,Male,58,88,15
|
| 179 |
+
178,Male,27,88,69
|
| 180 |
+
179,Male,59,93,14
|
| 181 |
+
180,Male,35,93,90
|
| 182 |
+
181,Female,37,97,32
|
| 183 |
+
182,Female,32,97,86
|
| 184 |
+
183,Male,46,98,15
|
| 185 |
+
184,Female,29,98,88
|
| 186 |
+
185,Female,41,99,39
|
| 187 |
+
186,Male,30,99,97
|
| 188 |
+
187,Female,54,101,24
|
| 189 |
+
188,Male,28,101,68
|
| 190 |
+
189,Female,41,103,17
|
| 191 |
+
190,Female,36,103,85
|
| 192 |
+
191,Female,34,103,23
|
| 193 |
+
192,Female,32,103,69
|
| 194 |
+
193,Male,33,113,8
|
| 195 |
+
194,Female,38,113,91
|
| 196 |
+
195,Female,47,120,16
|
| 197 |
+
196,Female,35,120,79
|
| 198 |
+
197,Female,45,126,28
|
| 199 |
+
198,Male,32,126,74
|
| 200 |
+
199,Male,32,137,18
|
| 201 |
+
200,Male,30,137,83
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit>=1.28.0
|
| 2 |
+
pandas>=2.0.0
|
| 3 |
+
numpy>=1.24.0
|
| 4 |
+
matplotlib>=3.7.0
|
| 5 |
+
seaborn>=0.12.0
|
| 6 |
+
scikit-learn>=1.3.0
|
| 7 |
+
plotly>=5.15.0
|
| 8 |
+
pyyaml>=6.0
|
run_app.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Customer Segmentation App Launcher
|
| 4 |
+
=================================
|
| 5 |
+
|
| 6 |
+
Launch script for the Customer Segmentation Streamlit application.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import subprocess
|
| 10 |
+
import sys
|
| 11 |
+
import os
|
| 12 |
+
|
| 13 |
+
def main():
    """Start the Customer Segmentation Streamlit app in a child process.

    The working directory is switched to the folder containing this script
    so the relative path ``streamlit_app/main.py`` resolves regardless of
    where the launcher was invoked from.  The process exits with status 1
    when the app file is missing or when Streamlit fails to start or run.
    """
    # Resolve everything relative to this launcher's own location.
    root = os.path.dirname(os.path.abspath(__file__))
    os.chdir(root)

    entry_point = os.path.join("streamlit_app", "main.py")
    if not os.path.exists(entry_point):
        print(f"❌ Error: Streamlit app not found at {entry_point}")
        sys.exit(1)

    # Startup banner.
    for banner_line in (
        "🚀 Launching Customer Segmentation App...",
        f"📂 Project directory: {root}",
        f"🎯 App path: {entry_point}",
        "-" * 50,
    ):
        print(banner_line)

    # Run Streamlit through the current interpreter so the active
    # environment's installation is the one that gets used.
    command = [
        sys.executable, "-m", "streamlit", "run", entry_point,
        "--server.address", "localhost",
        "--server.port", "8501",
        "--browser.gatherUsageStats", "false",
    ]

    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as exc:
        # Streamlit returned a non-zero exit status.
        print(f"❌ Error launching Streamlit: {exc}")
        sys.exit(1)
    except KeyboardInterrupt:
        # Ctrl+C is the normal way to stop the server; not an error.
        print("\n👋 Application stopped by user.")
    except Exception as exc:
        print(f"❌ Unexpected error: {exc}")
        sys.exit(1)


if __name__ == "__main__":
    main()
|
src/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Customer Segmentation Package
|
src/__pycache__/__init__.cpython-39.pyc
ADDED
|
Binary file (174 Bytes). View file
|
|
|
src/__pycache__/clustering.cpython-312.pyc
ADDED
|
Binary file (10.3 kB). View file
|
|
|
src/__pycache__/clustering.cpython-39.pyc
ADDED
|
Binary file (6.58 kB). View file
|
|
|
src/__pycache__/data_loader.cpython-312.pyc
ADDED
|
Binary file (6.97 kB). View file
|
|
|
src/__pycache__/data_loader.cpython-39.pyc
ADDED
|
Binary file (4.58 kB). View file
|
|
|
src/__pycache__/visualizations.cpython-312.pyc
ADDED
|
Binary file (15.8 kB). View file
|
|
|
src/__pycache__/visualizations.cpython-39.pyc
ADDED
|
Binary file (17.2 kB). View file
|
|
|
src/clustering.py
ADDED
|
@@ -0,0 +1,260 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Clustering Analysis Module
|
| 3 |
+
=========================
|
| 4 |
+
|
| 5 |
+
This module implements various clustering algorithms for customer segmentation.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import numpy as np
|
| 9 |
+
import pandas as pd
|
| 10 |
+
from sklearn.cluster import KMeans, DBSCAN
|
| 11 |
+
from sklearn.metrics import silhouette_score, calinski_harabasz_score
|
| 12 |
+
import streamlit as st
|
| 13 |
+
|
| 14 |
+
class ClusteringAnalyzer:
    """
    Handles clustering analysis for customer segmentation.

    The analyzer keeps the most recently fitted K-Means / DBSCAN models and
    their label arrays (in ``cluster_labels`` under the keys ``'kmeans'`` and
    ``'dbscan'``) so that the analysis helpers can attach them to a
    DataFrame later.  ``optimization_results`` is only set once
    ``find_optimal_clusters`` has completed successfully.
    """

    # Column name under which each algorithm's labels are attached to data.
    _CLUSTER_COLUMNS = {
        'kmeans': 'Kmeans_Cluster',
        'dbscan': 'DBSCAN_Cluster',
    }

    def __init__(self):
        self.kmeans_model = None
        self.dbscan_model = None
        self.optimal_clusters = None
        self.cluster_labels = {}

    @staticmethod
    def _resolve_algorithm(algorithm):
        """Return ``(key, column)`` for a user-facing algorithm name.

        The key is the lower-cased name with dashes and spaces removed
        (so ``'K-Means'`` becomes ``'kmeans'``); the column is the name used
        for the label column, falling back to ``'<algorithm>_Cluster'`` for
        names without a predefined column.
        """
        key = algorithm.lower().replace('-', '').replace(' ', '')
        column = ClusteringAnalyzer._CLUSTER_COLUMNS.get(key, f'{algorithm}_Cluster')
        return key, column

    @staticmethod
    def _characterize(avg_income, avg_spending):
        """Map mean income / spending of a cluster to ``(type, description)``.

        Income above 70 or below 40 and spending above 70 or below 40 select
        one of the four corner segments; everything else is "balanced".
        """
        if avg_income > 70 and avg_spending > 70:
            return "💎 HIGH VALUE", "High income, high spending - Premium customers"
        if avg_income > 70 and avg_spending < 40:
            return "💼 CONSERVATIVE", "High income, low spending - Potential for upselling"
        if avg_income < 40 and avg_spending > 70:
            return "🎯 BUDGET SPENDERS", "Low income, high spending - Price-sensitive loyal customers"
        if avg_income < 40 and avg_spending < 40:
            return "📉 LOW ENGAGEMENT", "Low income, low spending - Need retention strategies"
        return "⚖️ BALANCED", "Moderate income and spending - Core customer base"

    def find_optimal_clusters(self, scaled_data, max_clusters=10):
        """Find optimal number of clusters using multiple methods.

        Fits K-Means for every k from 2 up to ``max_clusters`` and records
        inertia, silhouette score and Calinski-Harabasz score for each k.
        The k with the best silhouette score is stored in
        ``self.optimal_clusters``.

        Args:
            scaled_data: Feature matrix to cluster, or None.
            max_clusters: Largest k to evaluate.  It is capped at
                ``len(scaled_data) - 1`` because the silhouette score is
                only defined for fewer clusters than samples.

        Returns:
            The results dict (also stored in ``self.optimization_results``),
            or None when there is no data or no valid k to evaluate.
        """
        if scaled_data is None:
            st.error("No scaled data available. Please preprocess data first.")
            return None

        # Keep k inside the range where silhouette_score is defined
        # (2 <= k <= n_samples - 1); otherwise scikit-learn raises.
        upper_k = min(max_clusters, len(scaled_data) - 1)
        if upper_k < 2:
            st.error("Not enough data points (or max_clusters too small) to evaluate cluster counts.")
            return None

        cluster_range = range(2, upper_k + 1)
        inertias = []
        silhouette_scores = []
        calinski_scores = []

        progress_bar = st.progress(0)
        status_text = st.empty()

        for i, k in enumerate(cluster_range):
            status_text.text(f'Evaluating {k} clusters...')

            kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
            labels = kmeans.fit_predict(scaled_data)

            inertias.append(kmeans.inertia_)
            silhouette_scores.append(silhouette_score(scaled_data, labels))
            calinski_scores.append(calinski_harabasz_score(scaled_data, labels))

            progress_bar.progress((i + 1) / len(cluster_range))

        status_text.text('Optimization complete!')

        # Best k according to each score (higher is better for both).
        optimal_silhouette = cluster_range[np.argmax(silhouette_scores)]
        optimal_calinski = cluster_range[np.argmax(calinski_scores)]

        self.optimization_results = {
            'cluster_range': list(cluster_range),
            'inertias': inertias,
            'silhouette_scores': silhouette_scores,
            'calinski_scores': calinski_scores,
            'optimal_silhouette': optimal_silhouette,
            'optimal_calinski': optimal_calinski
        }

        self.optimal_clusters = optimal_silhouette

        st.success(f"✅ Optimal clusters found: {self.optimal_clusters} (based on Silhouette Score)")

        return self.optimization_results

    def apply_kmeans(self, scaled_data, n_clusters=None):
        """Apply K-Means clustering.

        Args:
            scaled_data: Feature matrix to cluster, or None.
            n_clusters: Number of clusters; defaults to the value found by
                ``find_optimal_clusters`` or 5 when none is stored.

        Returns:
            Dict with labels, cluster count, silhouette and
            Calinski-Harabasz scores, inertia and cluster centers, or None
            when no data was supplied.
        """
        if scaled_data is None:
            st.error("No scaled data available. Please preprocess data first.")
            return None

        if n_clusters is None:
            n_clusters = self.optimal_clusters or 5

        with st.spinner(f'Applying K-Means clustering with {n_clusters} clusters...'):
            self.kmeans_model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
            kmeans_labels = self.kmeans_model.fit_predict(scaled_data)

            silhouette_avg = silhouette_score(scaled_data, kmeans_labels)
            calinski_score = calinski_harabasz_score(scaled_data, kmeans_labels)

            self.cluster_labels['kmeans'] = kmeans_labels

            results = {
                'labels': kmeans_labels,
                'n_clusters': n_clusters,
                'silhouette_score': silhouette_avg,
                'calinski_score': calinski_score,
                'inertia': self.kmeans_model.inertia_,
                'centers': self.kmeans_model.cluster_centers_
            }

        st.success("✅ K-Means clustering completed!")
        st.info(f"Silhouette Score: {silhouette_avg:.3f} | Calinski-Harabasz Score: {calinski_score:.3f}")

        return results

    def apply_dbscan(self, scaled_data, eps=0.5, min_samples=5):
        """Apply DBSCAN clustering.

        Args:
            scaled_data: Feature matrix to cluster, or None.
            eps: DBSCAN neighbourhood radius.
            min_samples: DBSCAN minimum neighbourhood size.

        Returns:
            Dict with labels, number of clusters (noise excluded), number of
            noise points and the parameters used.  ``'silhouette_score'`` is
            present only when it is defined for the non-noise points.
            None when no data was supplied.
        """
        if scaled_data is None:
            st.error("No scaled data available. Please preprocess data first.")
            return None

        with st.spinner(f'Applying DBSCAN clustering (eps={eps}, min_samples={min_samples})...'):
            self.dbscan_model = DBSCAN(eps=eps, min_samples=min_samples)
            dbscan_labels = self.dbscan_model.fit_predict(scaled_data)

            # Label -1 marks noise and is not counted as a cluster.
            n_clusters = len(set(dbscan_labels)) - (1 if -1 in dbscan_labels else 0)
            n_noise = list(dbscan_labels).count(-1)

            self.cluster_labels['dbscan'] = dbscan_labels

            results = {
                'labels': dbscan_labels,
                'n_clusters': n_clusters,
                'n_noise': n_noise,
                'eps': eps,
                'min_samples': min_samples
            }

            # The silhouette score needs at least 2 clusters and strictly
            # fewer clusters than points; otherwise scikit-learn raises.
            non_noise_mask = dbscan_labels != -1
            if 1 < n_clusters < int(np.sum(non_noise_mask)):
                silhouette_avg = silhouette_score(scaled_data[non_noise_mask],
                                                  dbscan_labels[non_noise_mask])
                results['silhouette_score'] = silhouette_avg

        st.success("✅ DBSCAN clustering completed!")
        st.info(f"Clusters: {n_clusters} | Noise points: {n_noise}")

        return results

    def analyze_clusters(self, data, algorithm='kmeans'):
        """Analyze cluster characteristics.

        Args:
            data: DataFrame with one row per clustered sample; it is copied,
                never modified.
            algorithm: Algorithm name, e.g. ``'kmeans'``, ``'K-Means'`` or
                ``'DBSCAN'``.

        Returns:
            Dict with the labelled copy of ``data``, per-cluster
            mean/std/count of the numeric columns, a spending summary (None
            when the spending column is absent) and the cluster size
            distribution.  None when the algorithm has not been run yet.
        """
        algo_key, cluster_col = self._resolve_algorithm(algorithm)

        if algo_key not in self.cluster_labels:
            st.error(f"No {algorithm} clustering results found. Please run clustering first.")
            return None

        analysis_data = data.copy()
        analysis_data[cluster_col] = self.cluster_labels[algo_key]

        # Summarise numeric features only, leaving out any label columns.
        numeric_cols = [
            col for col in analysis_data.select_dtypes(include=[np.number]).columns
            if not col.endswith('_Cluster')
        ]
        grouped = analysis_data.groupby(cluster_col)
        cluster_stats = grouped[numeric_cols].agg(['mean', 'std', 'count'])

        spending_analysis = None
        if 'Spending Score (1-100)' in analysis_data.columns:
            spending_analysis = grouped['Spending Score (1-100)'].agg(['mean', 'std', 'min', 'max', 'count'])

        return {
            'data_with_clusters': analysis_data,
            'cluster_stats': cluster_stats,
            'spending_analysis': spending_analysis,
            'cluster_distribution': analysis_data[cluster_col].value_counts().sort_index()
        }

    def get_cluster_profiles(self, data, algorithm='kmeans'):
        """Generate customer profiles for each cluster.

        Args:
            data: DataFrame with one row per clustered sample; it is copied,
                never modified.
            algorithm: Algorithm name, e.g. ``'kmeans'``, ``'K-Means'`` or
                ``'DBSCAN'``.

        Returns:
            A list with one dict per cluster in ascending label order (the
            DBSCAN noise label -1 is skipped), or None when the algorithm
            has not been run yet.  Each dict holds the cluster label, size
            and percentage of all rows, plus mean/std of age, income and
            spending and the gender counts for whichever of those columns
            exist.  ``'type'`` and ``'description'`` are added when both
            income and spending are available.
        """
        algo_key, cluster_col = self._resolve_algorithm(algorithm)

        if algo_key not in self.cluster_labels:
            return None

        analysis_data = data.copy()
        analysis_data[cluster_col] = self.cluster_labels[algo_key]

        # (source column, key for the mean, key for the std)
        numeric_features = (
            ('Age', 'avg_age', 'age_std'),
            ('Annual Income (k$)', 'avg_income', 'income_std'),
            ('Spending Score (1-100)', 'avg_spending', 'spending_std'),
        )

        profiles = []

        for cluster in sorted(analysis_data[cluster_col].unique()):
            if cluster == -1:  # Skip noise points in DBSCAN
                continue

            cluster_data = analysis_data[analysis_data[cluster_col] == cluster]

            profile = {
                'cluster': cluster,
                'size': len(cluster_data),
                # Share of all rows, noise points included in the total.
                'percentage': len(cluster_data) / len(analysis_data) * 100
            }

            for column, mean_key, std_key in numeric_features:
                if column in cluster_data.columns:
                    profile[mean_key] = cluster_data[column].mean()
                    profile[std_key] = cluster_data[column].std()

            if 'Gender' in cluster_data.columns:
                profile['gender_dist'] = cluster_data['Gender'].value_counts().to_dict()

            if 'avg_income' in profile and 'avg_spending' in profile:
                profile['type'], profile['description'] = self._characterize(
                    profile['avg_income'], profile['avg_spending']
                )

            profiles.append(profile)

        return profiles
|
src/data_loader.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Data Loading and Preprocessing Module
|
| 3 |
+
====================================
|
| 4 |
+
|
| 5 |
+
This module handles data loading, preprocessing, and validation for customer segmentation.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import pandas as pd
|
| 9 |
+
import numpy as np
|
| 10 |
+
import os
|
| 11 |
+
from sklearn.preprocessing import StandardScaler
|
| 12 |
+
import streamlit as st
|
| 13 |
+
|
| 14 |
+
class DataLoader:
    """
    Handles data loading and preprocessing for customer segmentation analysis.

    State kept on the instance: the loaded DataFrame (``data``), the scaled
    feature matrix produced by ``preprocess_data`` (``scaled_data``), the
    ``StandardScaler`` used for it and the feature names that were scaled.
    """

    def __init__(self):
        self.data = None
        self.scaled_data = None
        self.scaler = StandardScaler()
        self.feature_names = None

    @staticmethod
    def _min_max_normalize(values):
        """Scale ``values`` linearly to the range [0, 1].

        When all values are equal the scale is undefined, so a neutral 0.5
        is returned for every element instead of dividing by zero.
        """
        values = np.asarray(values, dtype=float)
        if values.size == 0:
            return values
        lowest = values.min()
        span = values.max() - lowest
        if span == 0:
            return np.full(values.shape, 0.5)
        return (values - lowest) / span

    def create_sample_dataset(self, n_customers=200):
        """Create a realistic sample Mall Customers dataset.

        The data is generated from a private random generator seeded with
        42, so the result is reproducible and the process-wide NumPy random
        state is left untouched.

        Args:
            n_customers: Number of customer rows to generate.

        Returns:
            DataFrame with the columns ``CustomerID``, ``Gender``, ``Age``,
            ``Annual Income (k$)`` and ``Spending Score (1-100)``.
        """
        # RandomState(42) yields the same draws as np.random.seed(42) would,
        # without reseeding the global generator used by other code.
        rng = np.random.RandomState(42)

        customer_ids = range(1, n_customers + 1)

        # Gender distribution (approximately 56% Female, 44% Male)
        genders = rng.choice(['Male', 'Female'], n_customers, p=[0.44, 0.56])

        # Age distribution (mean ~39, std ~14), limited to 18-70
        ages = rng.normal(38.85, 13.97, n_customers).astype(int)
        ages = np.clip(ages, 18, 70)

        # Income distribution (mean ~61k, std ~26k), limited to 15-137
        annual_incomes = rng.normal(60.56, 26.26, n_customers)
        annual_incomes = np.clip(annual_incomes, 15, 137)

        base_spending = rng.normal(50, 25, n_customers)

        # Higher income shifts spending up, lower income shifts it down.
        income_effect = (self._min_max_normalize(annual_incomes) - 0.5) * 30

        # Youngest 30% of the age range spend more, oldest 30% spend less.
        age_normalized = self._min_max_normalize(ages)
        age_effect = np.where(age_normalized < 0.3, 10,
                              np.where(age_normalized > 0.7, -5, 0))

        spending_scores = (base_spending + income_effect * 0.6 + age_effect
                           + rng.normal(0, 10, n_customers))
        spending_scores = np.clip(spending_scores, 1, 100)

        return pd.DataFrame({
            'CustomerID': customer_ids,
            'Gender': genders,
            'Age': ages,
            'Annual Income (k$)': annual_incomes.round().astype(int),
            'Spending Score (1-100)': spending_scores.round().astype(int)
        })

    def load_data(self, file_path=None):
        """Load customer data from file or create sample data.

        Sources are tried in this order: ``file_path`` (when given and
        existing), ``data/Mall_Customers.csv``, then a generated sample
        dataset, which is also saved to the default location on a
        best-effort basis.

        Args:
            file_path: Optional path to a CSV file.

        Returns:
            The loaded DataFrame (also stored in ``self.data``), or None
            when reading an existing file failed.
        """
        default_path = os.path.join("data", "Mall_Customers.csv")

        if file_path and os.path.exists(file_path):
            try:
                self.data = pd.read_csv(file_path)
                st.success(f"✅ Data loaded successfully from {file_path}")
                return self.data
            except Exception as e:
                st.error(f"Error loading data: {e}")
                return None

        if file_path:
            # Tell the user their file was not used instead of silently
            # switching to another data source.
            st.warning(f"File not found: {file_path}. Falling back to the default dataset.")

        if os.path.exists(default_path):
            try:
                self.data = pd.read_csv(default_path)
                st.success(f"✅ Mall Customers dataset loaded from {default_path}")
                return self.data
            except Exception as e:
                st.error(f"Error loading default dataset: {e}")
                return None

        self.data = self.create_sample_dataset()
        st.info("📊 Using generated sample dataset (Mall Customer simulation)")
        # Saving is optional: a read-only filesystem must not break loading.
        try:
            os.makedirs("data", exist_ok=True)
            self.data.to_csv(default_path, index=False)
            st.info(f"💾 Sample dataset saved to {default_path}")
        except Exception as e:
            st.warning(f"Could not save sample dataset: {e}")
        return self.data

    def get_data_info(self):
        """Get comprehensive data information.

        Returns:
            Dict with shape, column names, dtypes, missing-value counts and
            descriptive statistics of the loaded data, or None when no data
            is loaded.
        """
        if self.data is None:
            return None

        return {
            'shape': self.data.shape,
            'columns': list(self.data.columns),
            'dtypes': self.data.dtypes.to_dict(),
            'missing_values': self.data.isnull().sum().to_dict(),
            'statistics': self.data.describe().to_dict()
        }

    def preprocess_data(self, features=None):
        """Preprocess and scale data for clustering.

        Args:
            features: Column names to use; defaults to annual income and
                spending score.  Names missing from the data are skipped
                with a warning.

        Returns:
            The standardised feature matrix (also stored in
            ``self.scaled_data``), or None when no data is loaded or none of
            the requested features exist.
        """
        if self.data is None:
            st.error("No data loaded. Please load data first.")
            return None

        if features is None:
            features = ['Annual Income (k$)', 'Spending Score (1-100)']

        available_features = [f for f in features if f in self.data.columns]
        if not available_features:
            st.error(f"None of the specified features {features} found in data.")
            return None

        skipped = [f for f in features if f not in self.data.columns]
        if skipped:
            st.warning(f"Features not found in data and skipped: {skipped}")

        X = self.data[available_features].copy()

        # Replace missing values with the column mean before scaling.
        if X.isnull().sum().sum() > 0:
            X = X.fillna(X.mean())
            st.warning("Missing values filled with mean values.")

        self.scaled_data = self.scaler.fit_transform(X)
        self.feature_names = available_features

        st.success(f"✅ Data preprocessed successfully using features: {available_features}")
        return self.scaled_data

    def get_feature_data(self):
        """Get the original (unscaled) feature data.

        Returns:
            The columns of the loaded data that were used by the last
            ``preprocess_data`` call, or None when data or features are not
            available yet.
        """
        if self.data is None or self.feature_names is None:
            return None
        return self.data[self.feature_names]
|
src/visualizations.py
ADDED
|
@@ -0,0 +1,780 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Visualization Module
|
| 3 |
+
===================
|
| 4 |
+
|
| 5 |
+
This module handles all visualization components for the customer segmentation analysis.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
# Matplotlib and Seaborn removed to avoid extra dependency
|
| 9 |
+
# All charts use Plotly for interactive visualization
|
| 10 |
+
import plotly.express as px
|
| 11 |
+
import plotly.graph_objects as go
|
| 12 |
+
from plotly.subplots import make_subplots
|
| 13 |
+
import plotly.io as pio
|
| 14 |
+
import pandas as pd
|
| 15 |
+
import numpy as np
|
| 16 |
+
import streamlit as st
|
| 17 |
+
|
| 18 |
+
# Use Plotly's built-in dark template for every figure and recolour its
# paper/plot backgrounds and font so charts blend with the app theme.
_APP_TEMPLATE = "plotly_dark"
_APP_BACKGROUND = "#0F172A"

pio.templates.default = _APP_TEMPLATE
pio.templates[_APP_TEMPLATE].layout.update(
    paper_bgcolor=_APP_BACKGROUND,
    plot_bgcolor=_APP_BACKGROUND,
    font={"color": "#E5E7EB"},
)
|
| 25 |
+
|
| 26 |
+
# Plot styling handled via Plotly theme settings per figure
|
| 27 |
+
|
| 28 |
+
class Visualizer:
    """
    Render the Streamlit/Plotly charts of the customer segmentation app.

    Every public ``plot_*`` method writes directly to the current Streamlit
    page and returns ``None``. Figures are styled explicitly for the dark
    theme and rendered with ``theme=None`` so Streamlit does not override
    that styling.

    Args:
        debug: When true, extra diagnostic output (column lists, data
            previews) is written to the page. Off by default so end users
            only see the charts.
    """

    # Shared dark-theme colours used by every figure.
    _BG = '#0F172A'
    _FG = '#E5E7EB'
    _GRID = 'rgba(229,231,235,0.12)'

    # Column names the charts look for in the customer data.
    _INCOME = 'Annual Income (k$)'
    _SPENDING = 'Spending Score (1-100)'

    # Display labels for the per-cluster spending statistics table.
    _STAT_LABELS = {
        'mean': '🎯 Avg Spending',
        'std': '📊 Std Dev',
        'min': '📉 Min',
        'max': '📈 Max',
        'count': '👥 Count',
    }

    def __init__(self, debug=False):
        self.debug = debug
        # Enhanced color palettes for better visual appeal
        self.colors = px.colors.qualitative.Set1  # More vibrant colors
        self.gradient_colors = [
            '#FF6B6B',  # Coral Red
            '#4ECDC4',  # Turquoise
            '#45B7D1',  # Sky Blue
            '#96CEB4',  # Mint Green
            '#FFEAA7',  # Warm Yellow
            '#DDA0DD',  # Plum
            '#98D8C8',  # Seafoam
            '#F7DC6F',  # Golden Yellow
            '#BB8FCE',  # Lavender
            '#85C1E9',  # Light Blue
        ]
        self.modern_colors = [
            '#6C5CE7',  # Purple
            '#00B894',  # Green
            '#E17055',  # Orange
            '#0984E3',  # Blue
            '#FDCB6E',  # Yellow
            '#E84393',  # Pink
            '#00CEC9',  # Cyan
            '#A29BFE',  # Light Purple
            '#FD79A8',  # Light Pink
            '#81ECEC',  # Light Cyan
        ]

    # ------------------------------------------------------------------
    # Pure helpers (no Streamlit / Plotly calls)
    # ------------------------------------------------------------------

    @staticmethod
    def _cluster_names(labels):
        """Return labels as display strings, naming the ``-1`` label 'Noise'.

        String labels make Plotly Express treat clusters as discrete
        categories instead of drawing a continuous colour bar.
        """
        values = np.asarray(labels).ravel().tolist()
        return ['Noise' if value == -1 else str(value) for value in values]

    @staticmethod
    def _ordered_names(names):
        """Return unique names sorted numerically, non-numeric names last."""
        def sort_key(name):
            try:
                return (0, float(name), name)
            except ValueError:
                return (1, 0.0, name)

        return sorted(set(names), key=sort_key)

    @staticmethod
    def _find_cluster_columns(columns):
        """Return the columns whose name contains 'cluster' (any case)."""
        # str() keeps the search safe for non-string column names.
        return [col for col in columns if 'cluster' in str(col).lower()]

    @staticmethod
    def _dbscan_summary(labels):
        """Return ``(n_clusters, n_noise, noise_percentage)`` for labels.

        The ``-1`` label is counted as noise, not as a cluster. Values are
        inspected directly because ``-1 in series`` on a pandas Series tests
        the index rather than the values.
        """
        values = np.asarray(labels).ravel().tolist()
        noise = sum(1 for value in values if value == -1)
        clusters = len({value for value in values if value != -1})
        percentage = (noise / len(values)) * 100 if values else 0.0
        return clusters, noise, percentage

    @classmethod
    def _center_columns(cls, scaler, n_features):
        """Return feature names for inverse-transformed centres, or None.

        Uses ``scaler.feature_names_in_`` when the scaler exposes it and its
        length matches; otherwise falls back to the historical assumption
        that two features mean income and spending score.
        """
        names = getattr(scaler, 'feature_names_in_', None)
        if names is not None and len(names) == n_features:
            return [str(name) for name in names]
        if n_features == 2:
            return [cls._INCOME, cls._SPENDING]
        return None

    @classmethod
    def _spending_table(cls, spending_analysis):
        """Build ``(headers, column_values, formats)`` for the stats table.

        The cluster id (the frame's index) becomes the first column so each
        row is identifiable. Statistic columns are labelled by name when
        they are the known statistics, positionally when there are exactly
        five of them, and left unchanged otherwise.
        """
        summary = spending_analysis.round(2)
        stats = [str(col) for col in summary.columns]
        known = cls._STAT_LABELS

        if all(stat in known for stat in stats):
            headers = [known[stat] for stat in stats]
        elif len(stats) == len(known):
            headers = list(known.values())
        else:
            headers = stats

        count_label = known['count']
        formats = [None] + ['.0f' if header == count_label else '.2f' for header in headers]
        values = [[str(cluster) for cluster in summary.index]]
        values += [summary.iloc[:, position].tolist() for position in range(len(stats))]
        return ['Cluster'] + headers, values, formats

    # ------------------------------------------------------------------
    # Styling / rendering helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _show(fig):
        """Render a figure full-width without Streamlit's theme override."""
        st.plotly_chart(fig, use_container_width=True, theme=None)

    @classmethod
    def _axis(cls):
        """Return the basic dark-theme axis style."""
        return dict(gridcolor=cls._GRID, title_font=dict(size=14, color=cls._FG))

    @classmethod
    def _detailed_axis(cls, text, title_size, tick_size, **extra):
        """Return an axis style with explicit title, tick font and grid."""
        return dict(
            title=dict(text=text, font=dict(size=title_size, color=cls._FG)),
            tickfont=dict(size=tick_size, color=cls._FG),
            gridcolor=cls._GRID,
            gridwidth=1,
            showgrid=True,
            **extra,
        )

    @classmethod
    def _apply_dark_layout(cls, fig, height=450, axes=True, **extra):
        """Apply the standard centred-title dark layout to a figure."""
        layout = dict(
            height=height,
            title=dict(font=dict(size=18, color=cls._FG), x=0.5),
            plot_bgcolor=cls._BG,
            paper_bgcolor=cls._BG,
        )
        if axes:
            layout['xaxis'] = cls._axis()
            layout['yaxis'] = cls._axis()
        layout.update(extra)
        fig.update_layout(**layout)
        return fig

    @staticmethod
    def _banner(inner_html, gradient, shadow, padding='1.5rem', margin='2rem 0 1rem 0'):
        """Write a gradient section banner containing ``inner_html``."""
        style = (
            f"background: linear-gradient(135deg, {gradient}); "
            f"padding: {padding}; border-radius: 15px; color: white; "
            f"text-align: center; margin: {margin}; box-shadow: {shadow};"
        )
        st.markdown(f'<div style="{style}">{inner_html}</div>', unsafe_allow_html=True)

    def _histogram(self, data, column, title, color, start_msg, done_msg):
        """Draw a 20-bin histogram of ``column`` if that column exists."""
        if column not in data.columns:
            return
        st.write(start_msg)
        fig = px.histogram(
            data, x=column, nbins=20,
            title=title,
            color_discrete_sequence=[color],
        )
        self._apply_dark_layout(fig)
        fig.update_traces(marker=dict(line=dict(width=1, color='white')))
        self._show(fig)
        st.success(done_msg)

    def _relationship_scatter(self, data, x, y, title, hover, color):
        """Draw a single-colour scatter of ``y`` against ``x``."""
        if x not in data.columns or y not in data.columns:
            return
        fig = px.scatter(
            data, x=x, y=y,
            title=title,
            hover_data=[hover] if hover in data.columns else None,
            color_discrete_sequence=[color],
        )
        self._apply_dark_layout(fig)
        fig.update_traces(
            marker=dict(size=8, opacity=0.7, line=dict(width=1, color='white'))
        )
        self._show(fig)

    def _cluster_box(self, plot_data, cluster_col, order, y_column, title, y_title):
        """Draw one box plot of ``y_column`` grouped by cluster."""
        fig = px.box(
            plot_data,
            x=cluster_col,
            y=y_column,
            title=title,
            color=cluster_col,
            color_discrete_sequence=self.modern_colors,
            category_orders={cluster_col: order},
        )
        fig.update_layout(
            height=600,
            title=dict(text=title, font=dict(size=20, color=self._FG), x=0.5, y=0.95),
            plot_bgcolor=self._BG,
            paper_bgcolor=self._BG,
            font=dict(size=14, family="Arial, sans-serif", color=self._FG),
            xaxis=self._detailed_axis('Cluster', 16, 14),
            yaxis=self._detailed_axis(y_title, 16, 14),
            showlegend=False,
            margin=dict(t=80, b=60, l=60, r=40),
        )
        fig.update_traces(
            marker=dict(size=6, opacity=0.8),
            line=dict(width=3),
            fillcolor='rgba(0,0,0,0)',
            boxpoints='outliers',
        )
        self._show(fig)

    def _comparison_panel(self, data, labels, title):
        """Draw one income-vs-spending scatter coloured by cluster label."""
        names = self._cluster_names(labels)
        plot_data = data.copy()
        plot_data['Cluster'] = names
        fig = px.scatter(
            plot_data,
            x=self._INCOME,
            y=self._SPENDING,
            color='Cluster',
            title=title,
            color_discrete_sequence=self.colors,
            category_orders={'Cluster': self._ordered_names(names)},
        )
        fig.update_layout(
            height=400,
            paper_bgcolor=self._BG,
            plot_bgcolor=self._BG,
            font=dict(color=self._FG),
        )
        self._show(fig)

    # ------------------------------------------------------------------
    # Public plotting API
    # ------------------------------------------------------------------

    def plot_data_exploration(self, data):
        """Create comprehensive data exploration plots with enhanced styling.

        Draws histograms, a gender pie chart, a correlation heat map and two
        scatter plots. Each chart is skipped when its columns are absent.

        Args:
            data: Customer DataFrame; ``None`` or empty shows an error.
        """
        if data is None or getattr(data, 'empty', False):
            st.error("❌ No data available for visualization.")
            return

        if getattr(self, 'debug', False):
            st.info(f"🔍 **Data shape:** {data.shape}")
            st.info(f"🔍 **Data columns:** {list(data.columns)}")

        st.subheader("📊 Data Distribution Analysis")
        left, right = st.columns(2)

        with left:
            self._histogram(
                data, 'Age', '👥 Age Distribution', self.gradient_colors[0],
                "📊 Creating Age distribution plot...",
                "✅ Age distribution plot created!",
            )
            self._histogram(
                data, self._INCOME, '💰 Annual Income Distribution', self.gradient_colors[1],
                "💰 Creating Income distribution plot...",
                "✅ Income distribution plot created!",
            )

        with right:
            self._histogram(
                data, self._SPENDING, '🛍️ Spending Score Distribution', self.gradient_colors[2],
                "🛍️ Creating Spending Score distribution plot...",
                "✅ Spending Score distribution plot created!",
            )

            if 'Gender' in data.columns:
                gender_counts = data['Gender'].value_counts()
                fig_gender = px.pie(
                    values=gender_counts.values,
                    names=gender_counts.index,
                    title='👫 Gender Distribution',
                    color_discrete_sequence=self.modern_colors[:len(gender_counts)],
                )
                self._apply_dark_layout(fig_gender, axes=False)
                fig_gender.update_traces(
                    textposition='inside',
                    textinfo='percent+label',
                    textfont_size=14,
                    marker=dict(line=dict(color='white', width=2)),
                )
                self._show(fig_gender)

        st.subheader("🔗 Feature Correlations")
        numeric_cols = data.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 1:
            fig_corr = px.imshow(
                data[numeric_cols].corr(),
                text_auto=True,
                title='🔗 Feature Correlation Matrix',
                color_continuous_scale='RdYlBu',
                aspect='auto',
            )
            self._apply_dark_layout(
                fig_corr, height=500, axes=False,
                font=dict(size=12, color=self._FG),
            )
            fig_corr.update_traces(
                textfont=dict(size=12, color=self._FG),
                hoverongaps=False,
            )
            self._show(fig_corr)

        st.subheader("🔍 Feature Relationships")
        left, right = st.columns(2)

        with left:
            self._relationship_scatter(
                data, self._INCOME, self._SPENDING,
                '💰 Income vs Spending Score', 'Age', self.modern_colors[3],
            )

        with right:
            self._relationship_scatter(
                data, 'Age', self._SPENDING,
                '👥 Age vs Spending Score', self._INCOME, self.modern_colors[4],
            )

    def plot_optimization_results(self, results):
        """Plot cluster optimization results.

        Args:
            results: Mapping with ``cluster_range``, ``inertias``,
                ``silhouette_scores``, ``calinski_scores``,
                ``optimal_silhouette`` and ``optimal_calinski``. ``None`` or
                a mapping missing any key shows an error instead.
        """
        if results is None:
            st.error("No optimization results available.")
            return

        required = (
            'cluster_range', 'inertias', 'silhouette_scores',
            'calinski_scores', 'optimal_silhouette', 'optimal_calinski',
        )
        missing = [key for key in required if key not in results]
        if missing:
            st.error(f"Optimization results are incomplete; missing: {', '.join(missing)}")
            return

        # (subplot title, results key, trace/axis name, line colour)
        panels = [
            ('Elbow Method', 'inertias', 'Inertia', 'blue'),
            ('Silhouette Score', 'silhouette_scores', 'Silhouette Score', 'red'),
            ('Calinski-Harabasz Score', 'calinski_scores', 'Calinski-Harabasz Score', 'green'),
        ]

        fig = make_subplots(
            rows=1, cols=len(panels),
            subplot_titles=tuple(panel[0] for panel in panels),
        )

        cluster_range = results['cluster_range']
        for position, (_, key, name, color) in enumerate(panels, start=1):
            fig.add_trace(
                go.Scatter(
                    x=cluster_range, y=results[key],
                    mode='lines+markers', name=name,
                    line=dict(color=color),
                ),
                row=1, col=position,
            )
            fig.update_yaxes(title_text=name, row=1, col=position)

        fig.update_layout(
            title_text="Cluster Optimization Results",
            height=400,
            showlegend=False,
            paper_bgcolor=self._BG,
            plot_bgcolor=self._BG,
            font=dict(color=self._FG),
        )
        fig.update_xaxes(title_text="Number of Clusters")
        self._show(fig)

        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Optimal Clusters (Silhouette)", results['optimal_silhouette'])
        with col2:
            st.metric("Optimal Clusters (Calinski-Harabasz)", results['optimal_calinski'])
        with col3:
            # The silhouette optimum doubles as the overall recommendation.
            st.metric("Recommended", results['optimal_silhouette'])

    def plot_clusters(self, data, cluster_labels, algorithm='K-Means', scaler=None, centers=None):
        """Plot cluster visualizations.

        Args:
            data: Customer DataFrame.
            cluster_labels: One label per row of ``data``; ``-1`` is shown
                as 'Noise'.
            algorithm: Algorithm name used in the titles.
            scaler: Optional fitted scaler exposing ``inverse_transform``.
            centers: Optional cluster centres in scaled space. They are drawn
                only when both ``scaler`` and ``centers`` are given and the
                centres can be mapped to the income and spending columns.
        """
        if data is None or cluster_labels is None:
            st.error("No data or cluster labels available for visualization.")
            return

        names = self._cluster_names(cluster_labels)
        if len(names) != len(data):
            st.error("Cluster labels do not match the number of rows in the data.")
            return

        # String labels give each cluster its own discrete colour.
        plot_data = data.copy()
        plot_data['Cluster'] = names
        order = {'Cluster': self._ordered_names(names)}
        scatter_layout = dict(
            height=500,
            paper_bgcolor=self._BG,
            plot_bgcolor=self._BG,
            font=dict(color=self._FG),
            xaxis=dict(gridcolor=self._GRID),
            yaxis=dict(gridcolor=self._GRID),
        )

        st.subheader(f"🎯 {algorithm} Clustering Results")
        left, right = st.columns(2)

        with left:
            if self._INCOME in data.columns and self._SPENDING in data.columns:
                fig_main = px.scatter(
                    plot_data,
                    x=self._INCOME,
                    y=self._SPENDING,
                    color='Cluster',
                    title=f'{algorithm}: Income vs Spending Score',
                    hover_data=['Age'] if 'Age' in data.columns else None,
                    color_discrete_sequence=self.colors,
                    category_orders=order,
                )

                if centers is not None and scaler is not None:
                    restored = np.asarray(scaler.inverse_transform(centers))
                    center_cols = None
                    if restored.ndim == 2:
                        center_cols = self._center_columns(scaler, restored.shape[1])
                    # Skip centres that cannot be placed on these two axes.
                    if (center_cols is not None
                            and self._INCOME in center_cols
                            and self._SPENDING in center_cols):
                        centers_df = pd.DataFrame(restored, columns=center_cols)
                        fig_main.add_scatter(
                            x=centers_df[self._INCOME],
                            y=centers_df[self._SPENDING],
                            mode='markers',
                            marker=dict(symbol='x', size=15, color='red', line=dict(width=2)),
                            name='Centers',
                            showlegend=True,
                        )

                fig_main.update_layout(**scatter_layout)
                self._show(fig_main)

        with right:
            if 'Age' in data.columns and self._SPENDING in data.columns:
                fig_age = px.scatter(
                    plot_data,
                    x='Age',
                    y=self._SPENDING,
                    color='Cluster',
                    title=f'{algorithm}: Age vs Spending Score',
                    color_discrete_sequence=self.colors,
                    category_orders=order,
                )
                fig_age.update_layout(**scatter_layout)
                self._show(fig_age)

        st.subheader("📊 Cluster Distribution")
        cluster_counts = pd.Series(cluster_labels).value_counts().sort_index()

        fig_dist = px.bar(
            x=cluster_counts.index, y=cluster_counts.values,
            title='📊 Number of Customers per Cluster',
            labels={'x': 'Cluster', 'y': 'Number of Customers'},
            color=cluster_counts.values,
            color_continuous_scale='Turbo',
        )
        self._apply_dark_layout(fig_dist)
        fig_dist.update_traces(marker=dict(line=dict(width=1, color='white')))
        self._show(fig_dist)

    def plot_cluster_analysis(self, analysis_results, algorithm='K-Means'):
        """Plot detailed cluster analysis with enhanced visualizations.

        Args:
            analysis_results: Mapping with ``data_with_clusters`` (a
                DataFrame holding a column whose name contains 'cluster')
                and ``spending_analysis`` (per-cluster statistics indexed by
                cluster with ``mean`` and ``std`` columns, or ``None``).
            algorithm: Algorithm name used in the page header.

        Any exception raised while building the charts is reported on the
        page rather than propagated, so one bad input cannot break the app.
        """
        if analysis_results is None:
            st.error("❌ No analysis results available.")
            return

        debug = getattr(self, 'debug', False)

        try:
            clustered = analysis_results['data_with_clusters']
            spending_analysis = analysis_results['spending_analysis']

            available_columns = list(clustered.columns)
            cluster_columns = self._find_cluster_columns(available_columns)
            if debug:
                st.info(f"🔍 **Available columns in data:** {available_columns}")
                st.info(f"🎯 **Found cluster columns:** {cluster_columns}")

            if not cluster_columns:
                st.error("❌ No cluster column found in the data!")
                st.write("Available columns:", available_columns)
                st.write("Please ensure clustering has been performed first.")
                return

            # Use the first cluster column found
            cluster_col = cluster_columns[0]
            if debug:
                st.success(f"✅ **Using cluster column:** `{cluster_col}`")

            self._banner(
                '<h2 style="margin: 0; font-size: 2.5rem; font-weight: 700;">'
                f'📈 {algorithm} Cluster Analysis</h2>'
                '<p style="margin: 0.5rem 0 0 0; font-size: 1.2rem; opacity: 0.9;">'
                'Interactive Cluster Visualization & Analysis</p>',
                gradient='#667eea 0%, #764ba2 100%',
                shadow='0 10px 25px rgba(0,0,0,0.1)',
                padding='2rem',
                margin='2rem 0',
            )

            # Quick stats
            num_clusters = clustered[cluster_col].nunique(dropna=False)
            total_customers = len(clustered)

            metric_col1, metric_col2, metric_col3, metric_col4 = st.columns(4)
            with metric_col1:
                st.metric("🎯 Total Clusters", num_clusters)
            with metric_col2:
                st.metric("👥 Total Customers", total_customers)
            with metric_col3:
                # Empty data has no clusters; avoid dividing by zero.
                avg_cluster_size = total_customers / num_clusters if num_clusters else 0
                st.metric("📊 Avg Cluster Size", f"{avg_cluster_size:.0f}")
            with metric_col4:
                if self._SPENDING in clustered.columns:
                    avg_spending = clustered[self._SPENDING].mean()
                    st.metric("💰 Avg Spending", f"{avg_spending:.1f}")

            st.markdown("---")

            st.subheader("📊 Distribution Analysis")
            # String cluster ids make Plotly treat them as categories.
            plot_data = clustered.copy()
            plot_data[cluster_col] = plot_data[cluster_col].astype(str)
            order = self._ordered_names(plot_data[cluster_col].tolist())

            col1, col2 = st.columns(2)
            with col1:
                if self._SPENDING in clustered.columns:
                    if debug:
                        st.write("🔍 **DEBUG - About to create box plot with:**")
                        st.write(f"- x column: `{cluster_col}`")
                        st.write(f"- Columns in plot_data: {list(plot_data.columns)}")
                        st.write("- First few rows of plot_data:")
                        st.dataframe(plot_data.head(3))
                    self._cluster_box(
                        plot_data, cluster_col, order, self._SPENDING,
                        '💰 Spending Score Distribution by Cluster', 'Spending Score',
                    )

            with col2:
                if self._INCOME in clustered.columns:
                    self._cluster_box(
                        plot_data, cluster_col, order, self._INCOME,
                        '💵 Annual Income Distribution by Cluster', 'Annual Income (k$)',
                    )

            if spending_analysis is not None:
                st.markdown("---")

                self._banner(
                    '<h3 style="margin: 0; font-size: 1.8rem; font-weight: 600;">'
                    '💰 Average Spending Analysis</h3>',
                    gradient='#f093fb 0%, #f5576c 100%',
                    shadow='0 8px 20px rgba(240, 147, 251, 0.3)',
                )

                # A single-member cluster has NaN std; use 0 so the error
                # bar and the value label still get a finite position.
                chart_df = pd.DataFrame({
                    'Cluster': [str(cluster) for cluster in spending_analysis.index],
                    'Average Spending Score': spending_analysis['mean'].to_numpy(),
                    'Std Dev': spending_analysis['std'].fillna(0).to_numpy(),
                })

                fig_avg_spending = px.bar(
                    chart_df,
                    x='Cluster',
                    y='Average Spending Score',
                    title='📊 Average Spending Score by Cluster',
                    error_y='Std Dev',
                    color='Average Spending Score',
                    color_continuous_scale='Viridis',
                )

                fig_avg_spending.update_layout(
                    height=650,
                    title=dict(
                        text='📊 Average Spending Score by Cluster',
                        font=dict(size=24, color=self._FG, family="Arial Black"),
                        x=0.5,
                        y=0.95,
                    ),
                    plot_bgcolor=self._BG,
                    paper_bgcolor=self._BG,
                    font=dict(size=16, family="Arial, sans-serif", color=self._FG),
                    xaxis=self._detailed_axis('Cluster', 18, 16, zeroline=False),
                    yaxis=self._detailed_axis('Average Spending Score', 18, 16, zeroline=False),
                    showlegend=False,
                    margin=dict(t=100, b=80, l=80, r=80),
                )

                # Value label placed just above each bar's error bar.
                for cluster, value, spread in zip(
                    chart_df['Cluster'],
                    chart_df['Average Spending Score'],
                    chart_df['Std Dev'],
                ):
                    fig_avg_spending.add_annotation(
                        x=cluster,
                        y=value + spread + 5,
                        text=f'<b>{value:.1f}</b>',
                        showarrow=False,
                        font=dict(size=16, color='white', family="Arial Black"),
                        bgcolor='rgba(44, 62, 80, 0.9)',
                        bordercolor='rgba(44, 62, 80, 1)',
                        borderwidth=2,
                        borderpad=8,
                    )

                fig_avg_spending.update_traces(
                    marker=dict(
                        line=dict(width=2, color='rgba(44, 62, 80, 0.8)'),
                        opacity=0.9,
                    ),
                    width=0.6,
                )
                self._show(fig_avg_spending)

                self._banner(
                    '<h3 style="margin: 0; font-size: 1.8rem; font-weight: 600;">'
                    '📋 Detailed Cluster Statistics</h3>',
                    gradient='#667eea 0%, #764ba2 100%',
                    shadow='0 8px 20px rgba(102, 126, 234, 0.3)',
                )

                headers, values, formats = self._spending_table(spending_analysis)
                fig_table = go.Figure(data=[go.Table(
                    header=dict(
                        values=headers,
                        fill_color='#1F2937',
                        font=dict(color=self._FG, size=14, family='Inter'),
                        align='center',
                        height=40,
                    ),
                    cells=dict(
                        values=values,
                        fill_color=self._BG,
                        font=dict(color=self._FG, size=12, family='Inter'),
                        align='center',
                        height=35,
                        format=formats,
                    ),
                )])

                # Grow with the row count so many clusters are not cut off.
                table_height = max(300, 140 + 35 * len(values[0]))
                fig_table.update_layout(
                    height=table_height,
                    title=dict(
                        text='📊 Cluster Spending Analysis',
                        font=dict(size=18, color=self._FG, family='Inter'),
                        x=0.5,
                    ),
                    plot_bgcolor=self._BG,
                    paper_bgcolor=self._BG,
                    margin=dict(t=60, b=20, l=20, r=20),
                )
                self._show(fig_table)

        except Exception as e:
            # UI boundary: report the failure on the page instead of crashing.
            st.error(f"❌ Error in cluster analysis visualization: {str(e)}")
            st.write("Please try the 'Clear Session' button in the sidebar and run clustering again.")

    def plot_comparison(self, data, kmeans_labels, dbscan_labels):
        """Plot comparison between K-Means and DBSCAN.

        Args:
            data: Customer DataFrame with the income and spending columns.
            kmeans_labels: K-Means label per row of ``data``.
            dbscan_labels: DBSCAN label per row of ``data``; ``-1`` is
                treated as noise.
        """
        if data is None or kmeans_labels is None or dbscan_labels is None:
            st.error("No data or cluster labels available for visualization.")
            return

        st.subheader("🔄 Algorithm Comparison")

        if self._INCOME in data.columns and self._SPENDING in data.columns:
            left, right = st.columns(2)
            with left:
                self._comparison_panel(data, kmeans_labels, 'K-Means Clustering')
            with right:
                self._comparison_panel(data, dbscan_labels, 'DBSCAN Clustering')
        else:
            st.warning(
                f"Columns '{self._INCOME}' and '{self._SPENDING}' are required "
                "for the comparison scatter plots."
            )

        kmeans_clusters = len(set(np.asarray(kmeans_labels).ravel().tolist()))
        dbscan_clusters, noise_points, noise_percentage = self._dbscan_summary(dbscan_labels)

        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.metric("K-Means Clusters", kmeans_clusters)
        with col2:
            st.metric("DBSCAN Clusters", dbscan_clusters)
        with col3:
            st.metric("DBSCAN Noise Points", noise_points)
        with col4:
            st.metric("Noise Percentage", f"{noise_percentage:.1f}%")
|
streamlit_app/main.py
ADDED
|
@@ -0,0 +1,1112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Customer Segmentation Streamlit App
|
| 3 |
+
==================================
|
| 4 |
+
|
| 5 |
+
A comprehensive web application for customer segmentation analysis using
|
| 6 |
+
K-Means and DBSCAN clustering algorithms.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import streamlit as st
|
| 10 |
+
import pandas as pd
|
| 11 |
+
import numpy as np
|
| 12 |
+
import sys
|
| 13 |
+
import os
|
| 14 |
+
|
| 15 |
+
# Add src to path for imports
|
| 16 |
+
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
|
| 17 |
+
|
| 18 |
+
from src.data_loader import DataLoader
|
| 19 |
+
from src.clustering import ClusteringAnalyzer
|
| 20 |
+
from src.visualizations import Visualizer
|
| 21 |
+
|
| 22 |
+
# Page configuration
|
| 23 |
+
# Page configuration; kept ahead of every other Streamlit call in this script.
st.set_page_config(
    page_title="Customer Segmentation Analysis",
    page_icon="🛍️",
    layout="wide",
    initial_sidebar_state="expanded"
)
import plotly.io as pio
# Every Plotly figure created after this point defaults to the dark template.
pio.templates.default = "plotly_dark"

# Modern Dark Mode Compatible CSS
#
# Maintenance notes on the stylesheet below:
# * CSS comments do not nest. The disabled `:root` variable block is therefore
#   kept as ONE comment without inner `/* ... */` markers; with nested markers
#   the comment ended early and the leftover text swallowed the
#   `.main .block-container` rule that follows it.
# * Only CSS belongs inside the <style> element. Python snippets that had been
#   pasted between the slider and DataFrame sections were never executed and
#   made the CSS parser discard the neighbouring rules; they are removed.
# * The decorative `::before` / `::after` bars need `position: absolute` for
#   their offsets and sizes to apply (their parents are `position: relative`).
# * NOTE(review): `:contains("...")` is not part of the CSS Selectors standard,
#   so browsers are expected to ignore the Apply / Find / Reload / Clear button
#   rules. They are kept unchanged; confirm and replace with another mechanism
#   if per-button colours are wanted.
st.markdown("""
<style>
/* Import Google Fonts */
@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500&family=Inter:wght@300;400;500;600;700&display=swap');

/*
 * CSS Variables for Dark Mode Support (disabled, kept for reference)
 *
 * :root {
 *     --bg-primary: #0F172A;        (slate-900)
 *     --bg-secondary: #111827;      (gray-900)
 *     --bg-tertiary: #1F2937;       (gray-800)
 *     --text-primary: #E5E7EB;      (gray-200)
 *     --text-secondary: #CBD5E1;    (slate-300)
 *     --text-tertiary: #94A3B8;     (slate-400)
 *     --border-color: #374151;      (gray-700)
 *     --accent-primary: #818CF8;    (indigo-300)
 *     --accent-secondary: #A78BFA;  (violet-300)
 *     --shadow-sm: 0 1px 2px 0 rgba(0, 0, 0, 0.4);
 *     --shadow-md: 0 4px 6px -1px rgba(0, 0, 0, 0.5);
 *     --shadow-lg: 0 10px 15px -3px rgba(0, 0, 0, 0.6);
 * }
 */

/* Dark mode support disabled intentionally */

/* Base styling */
.main .block-container {
    padding: 2rem 1rem;
    max-width: 1200px;
}

/* Apply CSS variables to Streamlit elements */
.stApp { background-color: #0F172A; color: #E5E7EB; }

/* Headers */
.main-header {
    font-family: 'Inter', sans-serif;
    font-size: clamp(2.5rem, 5vw, 4rem);
    font-weight: 800;
    text-align: center;
    margin-bottom: 3rem;
    background: linear-gradient(135deg, #818CF8 0%, #A78BFA 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    background-clip: text;
    letter-spacing: -0.02em;
}

.sub-header {
    font-family: 'Inter', sans-serif;
    font-size: 1.75rem;
    font-weight: 600;
    color: #E5E7EB;
    margin: 2rem 0 1rem 0;
    padding-bottom: 0.75rem;
    border-bottom: 2px solid #374151;
    position: relative;
}

.sub-header::after {
    content: '';
    position: absolute;
    bottom: -2px;
    left: 0;
    width: 60px;
    height: 2px;
    background: linear-gradient(135deg, #818CF8, #A78BFA);
}

/* Enhanced Tab Styling */
.stTabs [data-baseweb="tab-list"] {
    gap: 4px;
    background: #111827;
    padding: 8px;
    border-radius: 16px;
    border: 1px solid #374151;
    box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.4);
    margin-bottom: 2rem;
}

.stTabs [data-baseweb="tab"] {
    height: 48px;
    padding: 0 20px;
    background: transparent;
    border-radius: 12px;
    color: #CBD5E1;
    font-weight: 500;
    font-family: 'Inter', sans-serif;
    font-size: 0.875rem;
    border: none;
    transition: all 0.2s cubic-bezier(0.4, 0, 0.2, 1);
    position: relative;
    overflow: hidden;
}

.stTabs [data-baseweb="tab"]:hover {
    background: #1F2937;
    color: #E5E7EB;
    transform: translateY(-1px);
}

.stTabs [aria-selected="true"] {
    background: linear-gradient(135deg, #818CF8 0%, #A78BFA 100%);
    color: white !important;
    font-weight: 600;
    box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.5);
    transform: translateY(-1px);
}

/* Cards and containers */
.metric-card {
    background: #0F172A;
    border: 1px solid #374151;
    border-radius: 16px;
    padding: 1.5rem;
    margin: 1rem 0;
    box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.4);
    transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
    position: relative;
    overflow: hidden;
}

.metric-card::before {
    content: '';
    position: absolute;
    top: 0;
    left: 0;
    right: 0;
    height: 3px;
    background: linear-gradient(135deg, #818CF8, #A78BFA);
}

.metric-card:hover {
    transform: translateY(-4px);
    box-shadow: 0 10px 15px -3px rgba(0, 0, 0, 0.6);
    border-color: #818CF8;
}

.insight-box {
    background: #111827;
    border: 1px solid #818CF8;
    border-radius: 16px;
    padding: 1.5rem;
    margin: 1.5rem 0;
    box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.4);
    position: relative;
}

.insight-box::before {
    content: '';
    position: absolute;
    top: 0;
    left: 0;
    right: 0;
    height: 3px;
    background: linear-gradient(135deg, #818CF8, #A78BFA);
}

/* Sidebar */
.css-1d391kg {
    background: #111827;
    border-right: 1px solid #374151;
}

/* Text styling with proper contrast */
.stMarkdown, .stText, p, div, span, label {
    color: #E5E7EB !important;
    font-family: 'Inter', sans-serif;
}

[data-testid="stMarkdownContainer"] {
    color: #E5E7EB !important;
}

/* Enhanced message styling */
.stSuccess {
    background: rgba(34, 197, 94, 0.1) !important;
    border: 1px solid #22c55e !important;
    border-radius: 12px !important;
    color: #166534 !important;
}

.stInfo {
    background: rgba(59, 130, 246, 0.1) !important;
    border: 1px solid #3b82f6 !important;
    border-radius: 12px !important;
    color: #1e40af !important;
}

.stWarning {
    background: rgba(245, 158, 11, 0.1) !important;
    border: 1px solid #f59e0b !important;
    border-radius: 12px !important;
    color: #92400e !important;
}

.stError {
    background: rgba(239, 68, 68, 0.1) !important;
    border: 1px solid #ef4444 !important;
    border-radius: 12px !important;
    color: #dc2626 !important;
}

/* Enhanced Modern Button Styling */
.stButton > button {
    background: linear-gradient(135deg, #818CF8 0%, #A78BFA 100%);
    color: white !important;
    border: none;
    border-radius: 16px;
    padding: 1rem 2.5rem;
    font-weight: 700;
    font-family: 'Inter', sans-serif;
    font-size: 1rem;
    letter-spacing: 0.025em;
    transition: all 0.4s cubic-bezier(0.4, 0, 0.2, 1);
    box-shadow: 0 8px 25px rgba(129, 140, 248, 0.3);
    position: relative;
    overflow: hidden;
    text-transform: uppercase;
    min-height: 48px;
}

.stButton > button::before {
    content: '';
    position: absolute;
    top: 0;
    left: -100%;
    width: 100%;
    height: 100%;
    background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.2), transparent);
    transition: left 0.5s;
}

.stButton > button:hover {
    transform: translateY(-3px) scale(1.02);
    box-shadow: 0 15px 35px rgba(129, 140, 248, 0.4);
    filter: brightness(1.15);
    background: linear-gradient(135deg, #A78BFA 0%, #818CF8 100%);
}

.stButton > button:hover::before {
    left: 100%;
}

.stButton > button:active {
    transform: translateY(-1px) scale(0.98);
    box-shadow: 0 5px 15px rgba(129, 140, 248, 0.3);
}

/* Special styling for primary action buttons */
.stButton > button:contains("Apply") {
    background: linear-gradient(135deg, #10B981 0%, #059669 100%);
    box-shadow: 0 8px 25px rgba(16, 185, 129, 0.3);
}

.stButton > button:contains("Apply"):hover {
    background: linear-gradient(135deg, #059669 0%, #10B981 100%);
    box-shadow: 0 15px 35px rgba(16, 185, 129, 0.4);
}

/* Special styling for find/analyze buttons */
.stButton > button:contains("Find") {
    background: linear-gradient(135deg, #F59E0B 0%, #D97706 100%);
    box-shadow: 0 8px 25px rgba(245, 158, 11, 0.3);
}

.stButton > button:contains("Find"):hover {
    background: linear-gradient(135deg, #D97706 0%, #F59E0B 100%);
    box-shadow: 0 15px 35px rgba(245, 158, 11, 0.4);
}

/* Special styling for reload/clear buttons */
.stButton > button:contains("Reload"), .stButton > button:contains("Clear") {
    background: linear-gradient(135deg, #EF4444 0%, #DC2626 100%);
    box-shadow: 0 8px 25px rgba(239, 68, 68, 0.3);
}

.stButton > button:contains("Reload"):hover, .stButton > button:contains("Clear"):hover {
    background: linear-gradient(135deg, #DC2626 0%, #EF4444 100%);
    box-shadow: 0 15px 35px rgba(239, 68, 68, 0.4);
}

/* Form elements */
.stSelectbox > div > div,
.stTextInput > div > div > input,
.stNumberInput > div > div > input {
    background: #0F172A !important;
    border: 1px solid #374151 !important;
    border-radius: 12px !important;
    color: #E5E7EB !important;
    font-family: 'Inter', sans-serif !important;
    transition: all 0.2s ease;
}

.stSelectbox > div > div:focus-within,
.stTextInput > div > div:focus-within,
.stNumberInput > div > div:focus-within {
    border-color: #818CF8 !important;
    box-shadow: 0 0 0 3px rgba(99, 102, 241, 0.1) !important;
}

/* Slider styling */
.stSlider > div > div > div > div {
    background: linear-gradient(135deg, #818CF8, #A78BFA) !important;
}

.stSlider > div > div > div > div > div {
    background: white !important;
    border: 2px solid #818CF8 !important;
    box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.5) !important;
}

/* Plotly charts */
.element-container .stPlotlyChart {
    background: #0F172A !important;
}

/* DataFrames */
.stDataFrame {
    border: 1px solid #374151;
    border-radius: 12px;
    overflow: hidden;
    box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.4);
}

.stDataFrame > div {
    background: #0F172A;
}

/* Progress bars */
.stProgress > div > div > div {
    background: linear-gradient(135deg, #818CF8, #A78BFA) !important;
    border-radius: 8px !important;
}

/* Expanders */
.streamlit-expanderHeader {
    background: #111827 !important;
    border: 1px solid #374151 !important;
    border-radius: 12px !important;
    color: #E5E7EB !important;
    font-weight: 500 !important;
    font-family: 'Inter', sans-serif !important;
    transition: all 0.2s ease;
}

.streamlit-expanderHeader:hover {
    background: #1F2937 !important;
    border-color: #818CF8 !important;
}

.streamlit-expanderContent {
    background: #0F172A !important;
    border: 1px solid #374151 !important;
    border-top: none !important;
    color: #E5E7EB !important;
    border-radius: 0 0 12px 12px !important;
}

/* Metrics */
[data-testid="metric-container"] {
    background: #111827;
    border: 1px solid #374151;
    border-radius: 12px;
    padding: 1rem;
    box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.4);
    transition: all 0.2s ease;
}

[data-testid="metric-container"]:hover {
    transform: translateY(-2px);
    box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.5);
}

[data-testid="metric-container"] > div {
    color: #E5E7EB !important;
}

/* Code blocks */
.stCode {
    background: #111827 !important;
    border: 1px solid #374151 !important;
    border-radius: 12px !important;
    font-family: 'JetBrains Mono', monospace !important;
}

/* Headings */
h1, h2, h3, h4, h5, h6 {
    color: #E5E7EB !important;
    font-family: 'Inter', sans-serif !important;
    font-weight: 600 !important;
    letter-spacing: -0.01em;
}

/* File uploader */
.stFileUploader > div {
    background: #111827 !important;
    border: 2px dashed #374151 !important;
    border-radius: 12px !important;
    transition: all 0.2s ease;
}

.stFileUploader > div:hover {
    border-color: #818CF8 !important;
    background: #1F2937 !important;
}

/* Scrollbars */
::-webkit-scrollbar {
    width: 8px;
    height: 8px;
}

::-webkit-scrollbar-track {
    background: #111827;
    border-radius: 4px;
}

::-webkit-scrollbar-thumb {
    background: #94A3B8;
    border-radius: 4px;
}

::-webkit-scrollbar-thumb:hover {
    background: #CBD5E1;
}

/* Animation keyframes */
@keyframes fadeIn {
    from { opacity: 0; transform: translateY(20px); }
    to { opacity: 1; transform: translateY(0); }
}

.stTabs [data-baseweb="tabpanel"] {
    animation: fadeIn 0.5s ease-out;
}
</style>
""", unsafe_allow_html=True)
|
| 486 |
+
|
| 487 |
+
def initialize_session_state():
    """Seed ``st.session_state`` with the objects and flags the app relies on.

    Each entry is created only when its key is absent, so values that already
    exist in the session are left untouched.
    """
    # key -> zero-argument factory; a factory runs only for a missing key, and
    # the entries are visited in the order listed here.
    default_factories = (
        ('data_loader', DataLoader),
        ('clustering_analyzer', ClusteringAnalyzer),
        ('visualizer', Visualizer),
        ('data_loaded', lambda: False),
        ('data_preprocessed', lambda: False),
        ('clustering_done', lambda: {'kmeans': False, 'dbscan': False}),
    )
    for state_key, make_default in default_factories:
        if state_key not in st.session_state:
            st.session_state[state_key] = make_default()
|
| 501 |
+
|
| 502 |
+
def _reset_analysis_state():
    """Discard preprocessing and clustering results tied to the previous dataset."""
    st.session_state.data_preprocessed = False
    st.session_state.clustering_done = {'kmeans': False, 'dbscan': False}
    # Clear any cached clustering results
    st.session_state.clustering_analyzer = ClusteringAnalyzer()


def main():
    """Main application function.

    Draws the page header, creates the eight tabs, renders the sidebar data
    controls (dataset status, CSV upload, reload button, debug session reset)
    and finally fills each tab by calling its ``show_*`` page function.

    Returns:
        None. Everything is rendered through Streamlit calls; ``st.rerun()``
        is used after a dataset change or a session reset.
    """
    initialize_session_state()

    # Main header
    st.markdown('<h1 class="main-header">🛍️ Customer Segmentation Analysis</h1>', unsafe_allow_html=True)
    st.markdown("---")

    # Tab navigation
    tabs = st.tabs([
        "🏠 Home", "📊 Data Overview", "🔍 Data Exploration", "⚙️ Preprocessing",
        "🎯 K-Means", "🌟 DBSCAN", "📈 Comparison", "📋 Insights"
    ])

    # Data loading section in sidebar
    st.sidebar.markdown("---")
    st.sidebar.subheader("📂 Data Management")

    # Auto-load dataset on first run
    if not st.session_state.data_loaded:
        st.session_state.data_loader.load_data()
        # Only report the dataset as loaded when the loader actually holds
        # data; otherwise the pages keep showing their "load data" warning.
        st.session_state.data_loaded = st.session_state.data_loader.data is not None

    # Show current dataset status
    if st.session_state.data_loaded and st.session_state.data_loader.data is not None:
        data_info = st.session_state.data_loader.get_data_info()
        st.sidebar.success("📊 Dataset Loaded")
        st.sidebar.info(f"**Rows:** {data_info['shape'][0]}\n**Columns:** {data_info['shape'][1]}")

        # Show basic info about the dataset
        if 'Annual Income (k$)' in st.session_state.data_loader.data.columns:
            st.sidebar.write("**Dataset Type:** Mall Customers")

    # File upload option
    st.sidebar.markdown("### 📁 Upload Different Dataset")
    uploaded_file = st.sidebar.file_uploader("Choose a CSV file", type=['csv'])

    if uploaded_file is None:
        # Forget the last applied upload so the same file can be added again.
        st.session_state.pop('applied_upload', None)
    else:
        # The uploader keeps returning the same file on every rerun. Apply an
        # upload only once; re-reading it and calling st.rerun() on each run
        # would loop forever and keep wiping the clustering state.
        upload_signature = (
            uploaded_file.name,
            uploaded_file.size,
            getattr(uploaded_file, 'file_id', None),
        )
        if st.session_state.get('applied_upload') != upload_signature:
            try:
                data = pd.read_csv(uploaded_file)
            except Exception as e:
                st.sidebar.error(f"Error loading file: {e}")
            else:
                st.session_state.data_loader.data = data
                st.session_state.data_loaded = True
                _reset_analysis_state()  # Reset preprocessing and clustering
                st.session_state.applied_upload = upload_signature
                st.sidebar.success("✅ New file uploaded!")
                st.rerun()

    # Reload default dataset button
    if st.sidebar.button("🔄 Reload Default Dataset"):
        st.session_state.data_loader.load_data()
        st.session_state.data_loaded = st.session_state.data_loader.data is not None
        _reset_analysis_state()
        st.rerun()

    # Debug: Clear session state button (remove this after fixing)
    if st.sidebar.button("🧪 Clear Session (Debug)"):
        for key in list(st.session_state.keys()):
            del st.session_state[key]
        st.rerun()

    # Tab content: one page renderer per tab, in tab order.
    page_renderers = (
        show_home_page,
        show_data_overview,
        show_data_exploration,
        show_preprocessing,
        show_kmeans_clustering,
        show_dbscan_clustering,
        show_results_comparison,
        show_business_insights,
    )
    for tab, render_page in zip(tabs, page_renderers):
        with tab:
            render_page()
|
| 584 |
+
|
| 585 |
+
def show_home_page():
    """Render the Home tab.

    Shows the welcome heading, a centred project-overview card, three feature
    summaries side by side, the getting-started steps, a quick-start hint and
    a description of the sample dataset's columns.
    """
    st.markdown('<h2 class="sub-header">Welcome to Customer Segmentation Analysis</h2>', unsafe_allow_html=True)

    # The overview card sits in the wide middle column of a 1:2:1 layout.
    _, centre_column, _ = st.columns([1, 2, 1])
    with centre_column:
        st.markdown("""
        <div class="insight-box">
        <h3>🎯 Project Overview</h3>
        <p>This application provides a comprehensive customer segmentation analysis using machine learning clustering algorithms.</p>
        </div>
        """, unsafe_allow_html=True)

    # Feature overview
    st.markdown("### 🚀 Features")

    feature_summaries = (
        """
        **📊 Data Analysis**
        - Interactive data exploration
        - Statistical summaries
        - Correlation analysis
        - Missing value detection
        """,
        """
        **🎯 Clustering Algorithms**
        - K-Means clustering
        - DBSCAN clustering
        - Optimal cluster determination
        - Performance metrics
        """,
        """
        **📈 Visualizations**
        - 2D cluster plots
        - Distribution analysis
        - Comparative visualizations
        - Interactive charts
        """,
    )
    # One summary per equal-width column, left to right.
    for column, summary in zip(st.columns(3), feature_summaries):
        with column:
            st.markdown(summary)

    # Getting started
    st.markdown("### 🏁 Getting Started")
    st.markdown("""
    1. **📊 Data Overview**: Check your dataset information and statistics (automatically loaded from `data/Mall_Customers.csv`)
    2. **🔍 Data Exploration**: Explore distributions, correlations, and relationships
    3. **⚙️ Preprocessing**: Select features and scale your data for clustering
    4. **🎯 K-Means**: Apply K-Means clustering with optimal cluster determination
    5. **🌟 DBSCAN**: Try density-based clustering for comparison
    6. **📈 Comparison**: Compare results from both algorithms
    7. **📋 Insights**: Get business recommendations for each customer segment
    """)

    # Quick start note
    st.info("""
    💡 **Quick Start**: Your dataset is automatically loaded from the `data/` folder.
    Just click on the tabs above to start exploring and clustering your customer data!
    """)

    # Sample data info
    st.markdown("### 📋 Sample Dataset")
    st.info("""
    The sample dataset simulates mall customer data with the following features:
    - **CustomerID**: Unique identifier
    - **Gender**: Customer gender (Male/Female)
    - **Age**: Customer age (18-70 years)
    - **Annual Income (k$)**: Annual income in thousands
    - **Spending Score (1-100)**: Mall-assigned spending score
    """)
|
| 659 |
+
|
| 660 |
+
def show_data_overview():
    """Render the "Data Overview" page.

    Shows four headline metrics (row count, column count, total missing
    values, number of int64/float64 columns), a 10-row preview of the loaded
    data, per-column dtype and missing-value tables, and the ``describe()``
    summary. Emits a warning and returns early when no data is loaded.
    """
    st.markdown('<h2 class="sub-header">📊 Data Overview</h2>', unsafe_allow_html=True)

    # Guard clause: nothing to show until the loader has data.
    if not st.session_state.data_loaded:
        st.warning("⚠️ Please load data first using the sidebar.")
        return

    frame = st.session_state.data_loader.data
    info = st.session_state.data_loader.get_data_info()

    # --- Headline metrics -------------------------------------------------
    rows_col, features_col, missing_col, numeric_col = st.columns(4)

    with rows_col:
        st.metric("Total Customers", info['shape'][0])
    with features_col:
        st.metric("Features", info['shape'][1])
    with missing_col:
        st.metric("Missing Values", sum(info['missing_values'].values()))
    with numeric_col:
        # Only columns whose dtype compares equal to 'int64' or 'float64'
        # are counted as numeric here.
        numeric_count = sum(
            1 for _, dtype in info['dtypes'].items() if dtype in ('int64', 'float64')
        )
        st.metric("Numeric Features", numeric_count)

    # --- Preview ----------------------------------------------------------
    st.subheader("📋 Data Preview")
    st.dataframe(frame.head(10), use_container_width=True)

    # --- Per-column dtype and missing-value tables ------------------------
    types_col, gaps_col = st.columns(2)

    with types_col:
        st.subheader("🔧 Data Types")
        type_table = pd.DataFrame(
            list(info['dtypes'].items()),
            columns=['Column', 'Data Type'],
        )
        st.dataframe(type_table, use_container_width=True)

    with gaps_col:
        st.subheader("❓ Missing Values")
        gap_table = pd.DataFrame(
            list(info['missing_values'].items()),
            columns=['Column', 'Missing Count'],
        )
        # Share of rows (in percent) that are missing for each column.
        gap_share = gap_table['Missing Count'] / info['shape'][0] * 100
        gap_table['Missing %'] = gap_share.round(2)
        st.dataframe(gap_table, use_container_width=True)

    # --- Descriptive statistics -------------------------------------------
    st.subheader("📈 Statistical Summary")
    st.dataframe(frame.describe(), use_container_width=True)
|
| 706 |
+
|
| 707 |
+
def show_data_exploration():
    """Render the "Data Exploration" page.

    Delegates all plotting to the session's visualizer via
    ``plot_data_exploration``. Emits a warning and returns early when no
    data has been loaded.
    """
    st.markdown('<h2 class="sub-header">🔍 Data Exploration</h2>', unsafe_allow_html=True)

    state = st.session_state
    if not state.data_loaded:
        st.warning("⚠️ Please load data first using the sidebar.")
        return

    frame = state.data_loader.data
    plotter = state.visualizer

    # The visualizer owns the exploration charts; this page only wires the data in.
    plotter.plot_data_exploration(frame)
|
| 720 |
+
|
| 721 |
+
def show_preprocessing():
    """Render the "Data Preprocessing" page.

    Lets the user choose numeric feature columns (``CustomerID`` excluded),
    requires at least two of them, and on button press calls
    ``data_loader.preprocess_data(selected_features)``. When that returns a
    non-``None`` result the page marks ``data_preprocessed`` in session state
    and shows the original vs. scaled data side by side, followed by their
    ``describe()`` statistics.
    """
    st.markdown('<h2 class="sub-header">⚙️ Data Preprocessing</h2>', unsafe_allow_html=True)

    if not st.session_state.data_loaded:
        st.warning("⚠️ Please load data first using the sidebar.")
        return

    raw = st.session_state.data_loader.data

    # --- Feature selection ------------------------------------------------
    st.subheader("🎯 Feature Selection")

    candidates = raw.select_dtypes(include=[np.number]).columns.tolist()
    try:
        # The identifier column carries no clustering signal.
        candidates.remove('CustomerID')
    except ValueError:
        pass

    # Prefer the classic income/spending pair when both columns exist.
    preferred = ['Annual Income (k$)', 'Spending Score (1-100)']
    if all(name in candidates for name in preferred):
        initial_choice = preferred
    else:
        initial_choice = candidates[:2]

    selected_features = st.multiselect(
        "Select features for clustering:",
        candidates,
        default=initial_choice,
    )

    if len(selected_features) < 2:
        st.error("⚠️ Please select at least 2 features for clustering.")
        return

    # --- Preprocessing options --------------------------------------------
    st.subheader("🔧 Preprocessing Options")

    # NOTE(review): the two selections below are shown to the user but are not
    # forwarded to preprocess_data() in this function -- confirm whether the
    # loader is supposed to receive them.
    missing_col, scaling_col = st.columns(2)
    with missing_col:
        st.selectbox("Handle missing values:", ["Fill with mean", "Drop rows", "No action"])
    with scaling_col:
        st.selectbox("Scaling method:", ["StandardScaler", "MinMaxScaler", "No scaling"])

    # --- Apply --------------------------------------------------------------
    if not st.button("🚀 Apply Preprocessing"):
        return

    scaled_data = st.session_state.data_loader.preprocess_data(selected_features)
    if scaled_data is None:
        return

    st.session_state.data_preprocessed = True
    st.success("✅ Data preprocessing completed!")

    original_view = raw[selected_features]
    before_col, after_col = st.columns(2)

    with before_col:
        st.subheader("📊 Original Data")
        st.dataframe(original_view.head(), use_container_width=True)

    with after_col:
        st.subheader("🔄 Scaled Data")
        scaled_df = pd.DataFrame(scaled_data, columns=selected_features)
        st.dataframe(scaled_df.head(), use_container_width=True)

    # --- Before/after statistics -------------------------------------------
    st.subheader("📈 Feature Statistics")
    before_stats_col, after_stats_col = st.columns(2)

    with before_stats_col:
        st.write("**Original Data Statistics:**")
        st.dataframe(raw[selected_features].describe(), use_container_width=True)

    with after_stats_col:
        st.write("**Scaled Data Statistics:**")
        st.dataframe(scaled_df.describe(), use_container_width=True)
|
| 789 |
+
|
| 790 |
+
def show_kmeans_clustering():
    """Render the "K-Means Clustering" page.

    The page has three parts:

    1. Optimal-cluster search: runs ``find_optimal_clusters`` on the scaled
       data up to a user-chosen maximum and plots the result.
    2. K-Means run: on button press, resets previous clustering state, calls
       ``apply_kmeans`` and shows silhouette, Calinski-Harabasz and inertia.
    3. Visualisation: whenever K-Means is marked done in session state, plots
       the clusters and the per-cluster analysis.

    Emits a warning and returns early when the data has not been preprocessed.
    """
    st.markdown('<h2 class="sub-header">🎯 K-Means Clustering</h2>', unsafe_allow_html=True)

    if not st.session_state.data_preprocessed:
        st.warning("⚠️ Please preprocess data first.")
        return

    data_loader = st.session_state.data_loader
    clustering_analyzer = st.session_state.clustering_analyzer
    visualizer = st.session_state.visualizer

    # Optimal cluster determination
    st.subheader("🔍 Optimal Cluster Determination")

    col1, col2 = st.columns([1, 1])

    with col1:
        max_clusters = st.slider("Maximum clusters to test:", 2, 15, 10)

    with col2:
        if st.button("🔍 Find Optimal Clusters"):
            with st.spinner("Finding optimal number of clusters..."):
                optimization_results = clustering_analyzer.find_optimal_clusters(
                    data_loader.scaled_data, max_clusters
                )
                if optimization_results:
                    visualizer.plot_optimization_results(optimization_results)

    # K-Means clustering
    st.subheader("🎯 K-Means Clustering")

    col1, col2 = st.columns([1, 1])

    with col1:
        # The search above may test up to 15 clusters while this slider only
        # spans 2..10. st.slider rejects a default outside its bounds, so the
        # suggested value is clamped into range before being used.
        min_k, max_k = 2, 10
        suggested_k = int(clustering_analyzer.optimal_clusters or 5)
        default_k = max(min_k, min(max_k, suggested_k))
        n_clusters = st.slider("Number of clusters:", min_k, max_k, default_k)

    with col2:
        if st.button("🚀 Apply K-Means"):
            # Clear any existing clustering results first to avoid column naming issues
            clustering_analyzer.cluster_labels = {}
            st.session_state.clustering_done = {'kmeans': False, 'dbscan': False}

            # Clear any cached data
            if hasattr(st.session_state, 'cluster_analysis_cache'):
                del st.session_state.cluster_analysis_cache

            with st.spinner("🔄 Applying K-Means clustering..."):
                kmeans_results = clustering_analyzer.apply_kmeans(data_loader.scaled_data, n_clusters)

            if kmeans_results:
                st.session_state.clustering_done['kmeans'] = True

                # Display metrics
                col1, col2, col3 = st.columns(3)
                with col1:
                    st.metric("Silhouette Score", f"{kmeans_results['silhouette_score']:.3f}")
                with col2:
                    st.metric("Calinski-Harabasz Score", f"{kmeans_results['calinski_score']:.1f}")
                with col3:
                    st.metric("Inertia", f"{kmeans_results['inertia']:.1f}")

    # Visualizations (shown on every rerun once K-Means has completed)
    if st.session_state.clustering_done['kmeans']:
        feature_data = data_loader.get_feature_data()
        kmeans_labels = clustering_analyzer.cluster_labels['kmeans']

        visualizer.plot_clusters(
            feature_data,
            kmeans_labels,
            'K-Means',
            data_loader.scaler,
            clustering_analyzer.kmeans_model.cluster_centers_
        )

        # Cluster analysis
        analysis_results = clustering_analyzer.analyze_clusters(feature_data, 'kmeans')
        if analysis_results:
            visualizer.plot_cluster_analysis(analysis_results, 'K-Means')
|
| 867 |
+
|
| 868 |
+
def show_dbscan_clustering():
    """Render the "DBSCAN Clustering" page.

    Collects ``eps`` and ``min_samples`` from sliders, runs ``apply_dbscan``
    on the scaled data when the button is pressed, and reports the number of
    clusters, the number of noise points and (when present in the result)
    the silhouette score. Once DBSCAN is marked done in session state the
    cluster plot and per-cluster analysis are drawn on every rerun.

    Emits a warning and returns early when the data has not been preprocessed.
    """
    st.markdown('<h2 class="sub-header">🌟 DBSCAN Clustering</h2>', unsafe_allow_html=True)

    state = st.session_state
    if not state.data_preprocessed:
        st.warning("⚠️ Please preprocess data first.")
        return

    loader = state.data_loader
    analyzer = state.clustering_analyzer
    plotter = state.visualizer

    # --- Parameters ---------------------------------------------------------
    st.subheader("⚙️ DBSCAN Parameters")

    eps_col, samples_col = st.columns(2)
    with eps_col:
        eps = st.slider("Epsilon (neighborhood distance):", 0.1, 2.0, 0.5, 0.1)
    with samples_col:
        min_samples = st.slider("Minimum samples per cluster:", 2, 20, 5)

    st.info("""
    **Parameter Guidance:**
    - **Epsilon**: Maximum distance between points in the same cluster. Smaller values create more clusters.
    - **Min Samples**: Minimum number of points required to form a cluster. Higher values create fewer, denser clusters.
    """)

    # --- Run ----------------------------------------------------------------
    if st.button("🚀 Apply DBSCAN"):
        outcome = analyzer.apply_dbscan(loader.scaled_data, eps, min_samples)

        if outcome:
            state.clustering_done['dbscan'] = True

            clusters_col, noise_col, silhouette_col = st.columns(3)
            with clusters_col:
                st.metric("Number of Clusters", outcome['n_clusters'])
            with noise_col:
                st.metric("Noise Points", outcome['n_noise'])
            with silhouette_col:
                # The score key is optional in the result; fall back to "N/A".
                if 'silhouette_score' in outcome:
                    silhouette_text = f"{outcome['silhouette_score']:.3f}"
                else:
                    silhouette_text = "N/A"
                st.metric("Silhouette Score", silhouette_text)

    # --- Visualisation (persists across reruns once DBSCAN has run) ---------
    if not state.clustering_done['dbscan']:
        return

    feature_data = loader.get_feature_data()
    labels = analyzer.cluster_labels['dbscan']

    plotter.plot_clusters(feature_data, labels, 'DBSCAN')

    analysis = analyzer.analyze_clusters(feature_data, 'dbscan')
    if analysis:
        plotter.plot_cluster_analysis(analysis, 'DBSCAN')
|
| 928 |
+
|
| 929 |
+
def _silhouette_text(points, labels):
    """Return the silhouette score formatted to 3 decimals, or "N/A" if it cannot be computed."""
    labels = np.asarray(labels)
    n_distinct = np.unique(labels).size
    # The score is only defined when there are at least 2 clusters and fewer
    # clusters than samples; anything else would make scikit-learn raise.
    if not 2 <= n_distinct <= labels.size - 1:
        return "N/A"
    try:
        from sklearn.metrics import silhouette_score
        return f"{silhouette_score(points, labels):.3f}"
    except (ImportError, ValueError, TypeError):
        # Best effort: the table still renders when the metric is unavailable.
        return "N/A"


def show_results_comparison():
    """Render the "Results Comparison" page for K-Means vs. DBSCAN.

    Draws the side-by-side comparison plot and a four-row metrics table
    (number of clusters, silhouette score, noise points, largest cluster
    size). DBSCAN noise points (label ``-1``) are excluded from its cluster
    count, silhouette score and largest-cluster size.

    Each silhouette score is computed independently, so a failure for one
    algorithm yields "N/A" for that algorithm only and both table columns
    always have exactly one entry per metric. All table values are strings so
    each column has a single dtype.

    Emits a warning and returns early unless both algorithms have been run.
    """
    st.markdown('<h2 class="sub-header">📈 Results Comparison</h2>', unsafe_allow_html=True)

    if not (st.session_state.clustering_done['kmeans'] and st.session_state.clustering_done['dbscan']):
        st.warning("⚠️ Please complete both K-Means and DBSCAN clustering first.")
        return

    data_loader = st.session_state.data_loader
    clustering_analyzer = st.session_state.clustering_analyzer
    visualizer = st.session_state.visualizer

    feature_data = data_loader.get_feature_data()
    kmeans_labels = clustering_analyzer.cluster_labels['kmeans']
    dbscan_labels = clustering_analyzer.cluster_labels['dbscan']

    # Comparison visualization
    visualizer.plot_comparison(feature_data, kmeans_labels, dbscan_labels)

    # Performance comparison
    st.subheader("📊 Performance Metrics Comparison")

    # NOTE(review): the return values of these two calls were never used by
    # this page. The calls are kept in case analyze_clusters has side effects
    # the rest of the app relies on -- confirm and remove if it is pure.
    clustering_analyzer.analyze_clusters(feature_data, 'kmeans')
    clustering_analyzer.analyze_clusters(feature_data, 'dbscan')

    points = np.asarray(data_loader.scaled_data)
    kmeans_arr = np.asarray(kmeans_labels)
    dbscan_arr = np.asarray(dbscan_labels)

    # DBSCAN marks noise with -1; every DBSCAN metric except the noise count
    # is computed on the clustered (non-noise) points only.
    clustered = dbscan_arr != -1
    dbscan_clustered = dbscan_arr[clustered]

    kmeans_counts = pd.Series(kmeans_arr).value_counts()
    dbscan_counts = pd.Series(dbscan_clustered).value_counts()

    comparison_data = {
        'Metric': ['Number of Clusters', 'Silhouette Score', 'Noise Points', 'Largest Cluster Size'],
        'K-Means': [
            str(np.unique(kmeans_arr).size),
            _silhouette_text(points, kmeans_arr),
            "0",
            str(kmeans_counts.max()),
        ],
        'DBSCAN': [
            str(np.unique(dbscan_clustered).size),
            _silhouette_text(points[clustered], dbscan_clustered),
            str(int((~clustered).sum())),
            str(dbscan_counts.max()) if len(dbscan_counts) > 0 else "0",
        ],
    }

    comparison_df = pd.DataFrame(comparison_data)
    st.dataframe(comparison_df, use_container_width=True)
|
| 1002 |
+
|
| 1003 |
+
def show_business_insights():
    """Render the "Business Insights" page.

    For every K-Means cluster profile returned by ``get_cluster_profiles``
    an expander shows the segment overview, the financial profile and a set
    of recommendations chosen from the cluster's average income and spending
    score. The page then shows generic strategy notes and offers the feature
    data, with a ``KMeans_Cluster`` column added, as a CSV download.

    Emits a warning and returns early when K-Means has not been run.
    """
    st.markdown('<h2 class="sub-header">📋 Business Insights</h2>', unsafe_allow_html=True)

    state = st.session_state
    if not state.clustering_done['kmeans']:
        st.warning("⚠️ Please complete K-Means clustering first to generate insights.")
        return

    loader = state.data_loader
    analyzer = state.clustering_analyzer

    feature_data = loader.get_feature_data()

    # --- Per-segment profiles -------------------------------------------------
    profiles = analyzer.get_cluster_profiles(feature_data, 'kmeans')

    if profiles:
        st.subheader("👥 Customer Segment Profiles")

        for profile in profiles:
            with st.expander(f"🏷️ Cluster {profile['cluster']} - {profile.get('type', 'Unknown Type')}"):
                overview_col, money_col = st.columns(2)

                with overview_col:
                    st.markdown("**📊 Segment Overview**")
                    st.write(f"- **Size**: {profile['size']} customers ({profile['percentage']:.1f}%)")
                    if 'description' in profile:
                        st.write(f"- **Profile**: {profile['description']}")

                    if 'avg_age' in profile:
                        st.write(f"- **Average Age**: {profile['avg_age']:.1f} ± {profile['age_std']:.1f} years")

                    if 'gender_dist' in profile:
                        st.write(f"- **Gender Distribution**: {profile['gender_dist']}")

                with money_col:
                    st.markdown("**💰 Financial Profile**")
                    if 'avg_income' in profile:
                        st.write(f"- **Average Income**: ${profile['avg_income']:.1f}k ± ${profile['income_std']:.1f}k")

                    if 'avg_spending' in profile:
                        st.write(f"- **Average Spending Score**: {profile['avg_spending']:.1f} ± {profile['spending_std']:.1f}")

                # Recommendations are keyed on the income/spending quadrant;
                # anything between the 40 and 70 cut-offs gets the balanced set.
                st.markdown("**📈 Recommendations**")
                if 'avg_income' in profile and 'avg_spending' in profile:
                    income = profile['avg_income']
                    spending = profile['avg_spending']

                    if income > 70 and spending > 70:
                        advice = (
                            "- Focus on premium products and exclusive services",
                            "- Implement VIP loyalty programs",
                            "- Offer personalized shopping experiences",
                        )
                    elif income > 70 and spending < 40:
                        advice = (
                            "- Develop targeted upselling strategies",
                            "- Showcase value propositions",
                            "- Create incentive programs to increase spending",
                        )
                    elif income < 40 and spending > 70:
                        advice = (
                            "- Offer value-based products and promotions",
                            "- Focus on customer retention programs",
                            "- Provide flexible payment options",
                        )
                    elif income < 40 and spending < 40:
                        advice = (
                            "- Implement engagement and retention strategies",
                            "- Offer budget-friendly options",
                            "- Focus on building brand loyalty",
                        )
                    else:
                        advice = (
                            "- Balanced marketing approach",
                            "- Personalized offers based on preferences",
                            "- Regular engagement campaigns",
                        )

                    for line in advice:
                        st.write(line)

    # --- Generic strategy notes -------------------------------------------------
    st.subheader("🎯 Overall Business Strategy")

    marketing_col, growth_col = st.columns(2)

    with marketing_col:
        st.markdown("""
        **🎯 Marketing Strategies**
        - **Segment-specific campaigns**: Tailor marketing messages to each cluster
        - **Product positioning**: Align products with cluster preferences
        - **Channel optimization**: Use preferred communication channels per segment
        - **Pricing strategies**: Implement dynamic pricing based on segment characteristics
        """)

    with growth_col:
        st.markdown("""
        **💡 Growth Opportunities**
        - **Cross-selling**: Identify products popular in high-spending segments
        - **Retention programs**: Focus on segments with declining engagement
        - **New product development**: Create offerings for underserved segments
        - **Customer lifetime value**: Invest more in high-value segments
        """)

    # --- CSV export ---------------------------------------------------------------
    st.subheader("💾 Download Results")

    # Work on a copy so the loader's feature data is not given an extra column.
    export_frame = feature_data.copy()
    export_frame['KMeans_Cluster'] = analyzer.cluster_labels['kmeans']

    st.download_button(
        label="📥 Download Customer Segments (CSV)",
        data=export_frame.to_csv(index=False),
        file_name="customer_segments_results.csv",
        mime="text/csv"
    )
|
| 1110 |
+
|
| 1111 |
+
if __name__ == "__main__":
|
| 1112 |
+
main()
|
utils/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Utilities Package
|
utils/__pycache__/data_generator.cpython-311.pyc
ADDED
|
Binary file (3.34 kB). View file
|
|
|
utils/data_generator.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Data Generation Utilities
|
| 3 |
+
========================
|
| 4 |
+
|
| 5 |
+
Utility functions for generating sample datasets.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import pandas as pd
|
| 9 |
+
import numpy as np
|
| 10 |
+
|
| 11 |
+
def _unit_scale(values):
    """Min-max scale *values* to [0, 1]; return the neutral 0.5 when the range is zero."""
    values = np.asarray(values, dtype=float)
    if values.size == 0:
        return values
    low = values.min()
    span = values.max() - low
    if span == 0:
        # A single customer (or identical values) has no spread; 0.5 makes
        # the downstream income and age effects evaluate to "no effect".
        return np.full(values.shape, 0.5)
    return (values - low) / span


def create_sample_mall_customers(n_customers=200, random_seed=42):
    """
    Create a realistic sample Mall Customers dataset.

    Spending scores are drawn from a normal base and then shifted by an
    income effect, an age effect, a gender effect and extra noise before
    being clipped to 1-100. Degenerate inputs (``n_customers`` of 0 or 1,
    or any draw where all ages/incomes coincide) are handled by treating
    the normalised value as the neutral midpoint instead of dividing by
    zero.

    Parameters:
    -----------
    n_customers : int, default=200
        Number of customers to generate
    random_seed : int, default=42
        Random seed for reproducibility

    Returns:
    --------
    pd.DataFrame
        Generated customer dataset with columns ``CustomerID``, ``Gender``,
        ``Age``, ``Annual Income (k$)`` and ``Spending Score (1-100)``
    """
    # Seeds NumPy's global generator, so the same seed always yields the
    # same dataset (this also affects later np.random calls by the caller).
    np.random.seed(random_seed)

    customer_ids = range(1, n_customers + 1)

    # Gender distribution (approximately 56% Female, 44% Male)
    genders = np.random.choice(['Male', 'Female'], n_customers, p=[0.44, 0.56])

    # Age distribution (mean ~39, std ~14), limited to 18-70
    ages = np.random.normal(38.85, 13.97, n_customers).astype(int)
    ages = np.clip(ages, 18, 70)

    # Income distribution in k$ (mean ~61, std ~26), limited to 15-137
    annual_incomes = np.random.normal(60.56, 26.26, n_customers)
    annual_incomes = np.clip(annual_incomes, 15, 137)

    # Base spending score before any demographic adjustment
    base_spending = np.random.normal(50, 25, n_customers)

    # Income correlation: centred on 0, spanning -15..+15 before weighting
    income_effect = (_unit_scale(annual_incomes) - 0.5) * 30

    # Age effect: youngest 30% of the age range +10, oldest 30% -5
    age_normalized = _unit_scale(ages)
    age_effect = np.where(age_normalized < 0.3, 10,
                          np.where(age_normalized > 0.7, -5, 0))

    # Gender effect (slight difference in spending patterns)
    gender_effect = np.where(genders == 'Female', 3, -3)

    spending_scores = (base_spending +
                       income_effect * 0.6 +
                       age_effect +
                       gender_effect +
                       np.random.normal(0, 10, n_customers))
    spending_scores = np.clip(spending_scores, 1, 100)

    # Create DataFrame
    sample_data = pd.DataFrame({
        'CustomerID': customer_ids,
        'Gender': genders,
        'Age': ages,
        'Annual Income (k$)': annual_incomes.round().astype(int),
        'Spending Score (1-100)': spending_scores.round().astype(int)
    })

    return sample_data
|