|
|
import streamlit as st |
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
import matplotlib.pyplot as plt |
|
|
import seaborn as sns |
|
|
from sklearn.preprocessing import StandardScaler |
|
|
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering |
|
|
from sklearn.decomposition import PCA |
|
|
from sklearn.metrics import silhouette_score |
|
|
from scipy.cluster.hierarchy import dendrogram, linkage |
|
|
from io import BytesIO |
|
|
import base64 |
|
|
|
|
|
|
|
|
def load_data(file='./Mall_Customers.csv'):
    """Load a customer CSV into a DataFrame and drop incomplete rows.

    Args:
        file: Anything ``pandas.read_csv`` accepts (path or file-like object).
            A falsy value (``None`` or ``''``) falls back to
            ``./Mall_Customers.csv``.

    Returns:
        The loaded ``DataFrame`` with every row containing a missing value
        removed, or ``None`` when reading fails (the error is reported to the
        page through ``st.error``).
    """
    default_path = './Mall_Customers.csv'
    source = file if file else default_path
    try:
        # Rows with missing values are dropped for every source, so the
        # fallback file is cleaned exactly like an explicitly supplied one.
        return pd.read_csv(source).dropna()
    except Exception as e:  # UI boundary: report the problem instead of crashing
        st.error(f"Error loading data: {e}")
        return None
|
|
|
|
|
|
|
|
def preprocess_data(data):
    """Turn the raw customer table into numeric features and a scaled matrix.

    Args:
        data: DataFrame that contains a ``Gender`` column holding ``'Male'`` /
            ``'Female'``. A ``CustomerID`` column is removed when present.
            The caller's frame is not modified.

    Returns:
        A tuple ``(scaled_data, data)`` where ``data`` is the frame without
        ``CustomerID`` and with ``Gender`` encoded as 0 (Male) / 1 (Female),
        and ``scaled_data`` is the ``StandardScaler.fit_transform`` output
        for that frame.

    Raises:
        KeyError: If the ``Gender`` column is missing.
    """
    # errors='ignore' lets tables without an ID column through; drop() returns
    # a new frame, so the assignment below never touches the caller's data.
    data = data.drop(columns=['CustomerID'], errors='ignore')

    # Values other than 'Male'/'Female' become NaN under Series.map.
    data['Gender'] = data['Gender'].map({'Male': 0, 'Female': 1})

    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data)
    return scaled_data, data
|
|
|
|
|
|
|
|
def kmeans_clustering(scaled_data, n_clusters):
    """Cluster ``scaled_data`` with KMeans (``random_state=42``).

    Returns:
        A tuple ``(labels, inertia)`` taken from the fitted estimator's
        ``labels_`` and ``inertia_`` attributes.
    """
    model = KMeans(n_clusters=n_clusters, random_state=42).fit(scaled_data)
    return model.labels_, model.inertia_
|
|
|
|
|
|
|
|
def dbscan_clustering(scaled_data, eps, min_samples):
    """Cluster ``scaled_data`` with DBSCAN and return the fitted ``labels_``."""
    model = DBSCAN(eps=eps, min_samples=min_samples).fit(scaled_data)
    return model.labels_
|
|
|
|
|
|
|
|
def hierarchical_clustering(scaled_data, n_clusters):
    """Cluster ``scaled_data`` with AgglomerativeClustering and return ``labels_``."""
    model = AgglomerativeClustering(n_clusters=n_clusters).fit(scaled_data)
    return model.labels_
|
|
|
|
|
|
|
|
def perform_pca(scaled_data, n_components):
    """Project ``scaled_data`` onto ``n_components`` principal components.

    Returns:
        A tuple ``(projected, reducer)``: the transformed data and the fitted
        ``PCA`` estimator.
    """
    reducer = PCA(n_components=n_components)
    projected = reducer.fit_transform(scaled_data)
    return projected, reducer
|
|
|
|
|
|
|
|
def plot_elbow_curve(scaled_data, max_clusters):
    """Plot KMeans inertia for k = 1..max_clusters and show it in Streamlit.

    Args:
        scaled_data: Feature matrix passed to ``KMeans.fit``.
        max_clusters: Largest number of clusters to evaluate (inclusive).
    """
    cluster_counts = list(range(1, max_clusters + 1))
    inertias = [
        KMeans(n_clusters=k, random_state=42).fit(scaled_data).inertia_
        for k in cluster_counts
    ]

    # Use an explicit figure instead of pyplot's global "current figure" so the
    # plot cannot mix with other charts, and close it once rendered so figures
    # do not pile up across Streamlit reruns.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(cluster_counts, inertias, marker='o')
    ax.set_title('Elbow Curve')
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('Inertia')
    st.pyplot(fig)
    plt.close(fig)
|
|
|
|
|
|
|
|
def plot_dendrogram(scaled_data):
    """Draw a Ward-linkage dendrogram of ``scaled_data`` and show it in Streamlit."""
    linked = linkage(scaled_data, 'ward')

    # Draw on a dedicated figure/axes rather than pyplot's global state, and
    # close the figure after rendering so reruns do not leak figures.
    fig, ax = plt.subplots(figsize=(10, 6))
    dendrogram(
        linked,
        orientation='top',
        distance_sort='descending',
        show_leaf_counts=True,
        ax=ax,
    )
    ax.set_title('Dendrogram')
    ax.set_xlabel('Sample Index')
    ax.set_ylabel('Distance')
    st.pyplot(fig)
    plt.close(fig)
|
|
|
|
|
|
|
|
def plot_scatter(data, labels, title):
    """Scatter the first two columns of ``data`` coloured by ``labels``.

    Args:
        data: 2-D array; column 0 is plotted on x and column 1 on y (the axes
            are labelled as PCA components 1 and 2).
        labels: One hue value per row of ``data``.
        title: Title placed above the plot.
    """
    # A dedicated figure keeps this chart independent of pyplot's global
    # current figure; closing it afterwards avoids leaking figures on reruns.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(
        x=data[:, 0],
        y=data[:, 1],
        hue=labels,
        palette='viridis',
        s=100,
        ax=ax,
    )
    ax.set_title(title)
    ax.set_xlabel('PCA Component 1')
    ax.set_ylabel('PCA Component 2')
    st.pyplot(fig)
    plt.close(fig)
|
|
|
|
|
|
|
|
def calculate_silhouette_score(scaled_data, labels):
    """Return the silhouette score for a labelling, or ``None`` if undefined.

    ``silhouette_score`` needs at least 2 distinct labels and at most
    ``n_samples - 1``; outside that range it raises, so ``None`` is returned
    instead of calling it.

    Args:
        scaled_data: Feature matrix the labels were computed on.
        labels: One cluster label per row of ``scaled_data``.

    Returns:
        The score as returned by ``silhouette_score``, or ``None`` when the
        number of distinct labels makes the score undefined.
    """
    n_samples = len(labels)
    n_labels = len(set(labels))
    if not 2 <= n_labels <= n_samples - 1:
        return None
    return silhouette_score(scaled_data, labels)
|
|
|
|
|
|
|
|
def display_cluster_assignments(data, labels):
    """Attach ``labels`` to ``data`` as a ``Cluster`` column and show the frame.

    The column is written into ``data`` itself (no copy is made), so the
    caller's frame carries the ``Cluster`` column after this call.
    """
    data['Cluster'] = labels  # in-place on purpose; see docstring
    st.write(data)
|
|
|
|
|
|
|
|
def input_new_data():
    """Collect one customer's features from Streamlit widgets.

    Returns:
        A single-row DataFrame with the columns ``Gender`` (encoded as
        0 for Male, 1 for Female), ``Age``, ``Annual Income (k$)`` and
        ``Spending Score (1-100)``.
    """
    gender_codes = {'Male': 0, 'Female': 1}

    selected_gender = st.selectbox('Gender', ['Male', 'Female'])
    customer_age = st.number_input('Age', min_value=0, max_value=100, value=30)
    income = st.number_input('Annual Income (k$)', min_value=0, value=60)
    score = st.number_input('Spending Score (1-100)', min_value=1, max_value=100, value=50)

    row = pd.DataFrame({
        'Gender': [selected_gender],
        'Age': [customer_age],
        'Annual Income (k$)': [income],
        'Spending Score (1-100)': [score],
    })
    # Same numeric encoding as the one applied to the training table.
    row['Gender'] = row['Gender'].map(gender_codes)
    return row
|
|
|
|
|
|
|
|
def predict_cluster(model, scaler, new_data):
    """Assign new observations to clusters of an already fitted model.

    Args:
        model: A fitted estimator. Anything that is not a ``DBSCAN`` instance
            must provide ``predict``.
        scaler: The fitted scaler that produced the model's training data.
        new_data: Raw (unscaled) observations, one per row.

    Returns:
        A sequence with one cluster label per row of ``new_data``. For DBSCAN
        the label ``-1`` marks a point that is not within ``eps`` of any core
        sample (noise).
    """
    scaled_new_data = np.asarray(scaler.transform(new_data))

    if not isinstance(model, DBSCAN):
        return model.predict(scaled_new_data)

    # DBSCAN has no predict(). Refitting on core samples plus the new point
    # would overwrite the fitted model and may renumber clusters, so each new
    # point instead inherits the label of its nearest core sample when that
    # sample lies within eps.
    # NOTE: distances are Euclidean, matching DBSCAN's default metric; a model
    # built with a different ``metric`` would need that metric here.
    n_new = scaled_new_data.shape[0]
    core_points = model.components_
    if core_points.shape[0] == 0:
        # No core samples means the fit found no clusters at all.
        return [-1] * n_new

    core_labels = model.labels_[model.core_sample_indices_]
    offsets = scaled_new_data[:, np.newaxis, :] - core_points[np.newaxis, :, :]
    distances = np.linalg.norm(offsets, axis=2)
    nearest = distances.argmin(axis=1)
    nearest_distance = distances[np.arange(n_new), nearest]
    assigned = np.where(nearest_distance <= model.eps, core_labels[nearest], -1)
    return list(assigned)
|
|
|
|
|
|
|
|
def download_results(data, filename='cluster_results.csv'):
    """Build an HTML download link that embeds ``data`` as a base64 CSV.

    Args:
        data: DataFrame to serialise (written without the index).
        filename: Name suggested to the browser for the downloaded file.

    Returns:
        An ``<a>`` element whose ``href`` is a ``data:text/csv`` URI holding
        the CSV, intended for ``st.markdown(..., unsafe_allow_html=True)``.
    """
    from html import escape  # local import: only needed for the attribute below

    csv_text = data.to_csv(index=False)
    payload = base64.b64encode(csv_text.encode('utf-8')).decode('ascii')
    # The filename ends up inside a double-quoted HTML attribute.
    safe_name = escape(filename, quote=True)
    # text/csv is the registered media type for CSV ("file/csv" is not one).
    return (
        f'<a href="data:text/csv;base64,{payload}" '
        f'download="{safe_name}">Download CSV File</a>'
    )
|
|
|
|
|
|
|
|
def _render_figure(fig):
    """Show ``fig`` on the page, then close it so pyplot does not keep it alive."""
    st.pyplot(fig)
    plt.close(fig)


def _show_clustering_results(scaled_data, original_data, labels, title, extra_plot=None):
    """Render score, labelled table, PCA scatter, optional extra plot and download link."""
    st.write('Silhouette Score:', calculate_silhouette_score(scaled_data, labels))

    # Work on a labelled copy so the feature frame used for prediction and
    # correlation never gains a 'Cluster' column.
    labelled = original_data.assign(Cluster=labels)
    display_cluster_assignments(labelled, labels)

    pca_data, _ = perform_pca(scaled_data, 2)
    plot_scatter(pca_data, labels, title)
    if extra_plot is not None:
        extra_plot()
    st.markdown(download_results(labelled), unsafe_allow_html=True)


def _run_selected_algorithm(algorithm, scaled_data, original_data):
    """Render the controls/results of ``algorithm`` and return its widget parameters."""
    params = {}

    if algorithm == 'KMeans Clustering':
        st.title('KMeans Clustering')
        params['n_clusters'] = st.slider('Number of Clusters', min_value=2, max_value=10, value=5)
        if st.button('Run KMeans'):
            labels, inertia = kmeans_clustering(scaled_data, params['n_clusters'])
            st.write('Cluster Labels:', labels)
            st.write('Inertia:', inertia)
            _show_clustering_results(
                scaled_data, original_data, labels, 'KMeans Clustering',
                extra_plot=lambda: plot_elbow_curve(scaled_data, 10),
            )

    elif algorithm == 'DBSCAN Clustering':
        st.title('DBSCAN Clustering')
        params['eps'] = st.slider('Epsilon', min_value=0.1, max_value=1.0, value=0.5, step=0.1)
        params['min_samples'] = st.slider('Minimum Samples', min_value=2, max_value=10, value=5)
        if st.button('Run DBSCAN'):
            labels = dbscan_clustering(scaled_data, params['eps'], params['min_samples'])
            st.write('Cluster Labels:', labels)
            _show_clustering_results(scaled_data, original_data, labels, 'DBSCAN Clustering')

    elif algorithm == 'Hierarchical Clustering':
        st.title('Hierarchical Clustering')
        params['n_clusters'] = st.slider('Number of Clusters', min_value=2, max_value=10, value=5)
        if st.button('Run Hierarchical Clustering'):
            labels = hierarchical_clustering(scaled_data, params['n_clusters'])
            st.write('Cluster Labels:', labels)
            _show_clustering_results(
                scaled_data, original_data, labels, 'Hierarchical Clustering',
                extra_plot=lambda: plot_dendrogram(scaled_data),
            )

    elif algorithm == 'PCA':
        st.title('Principal Component Analysis')
        params['n_components'] = st.slider('Number of Components', min_value=2, max_value=4, value=2)
        if st.button('Run PCA'):
            n_components = params['n_components']
            pca_data, pca = perform_pca(scaled_data, n_components)
            st.write('PCA Components:', pca.components_)
            st.write('Explained Variance Ratio:', pca.explained_variance_ratio_)
            # PCA produces no clusters, so every point gets the same hue.
            plot_scatter(pca_data, np.zeros(pca_data.shape[0]), 'PCA')
            pca_frame = pd.DataFrame(pca_data, columns=[f'PC{i+1}' for i in range(n_components)])
            st.markdown(download_results(pca_frame), unsafe_allow_html=True)

    return params


def _predict_for_new_point(algorithm, params, scaler, scaled_data):
    """Read one new observation from the page and place it with ``algorithm``."""
    new_data = input_new_data()

    if algorithm == 'KMeans Clustering':
        kmeans = KMeans(n_clusters=params['n_clusters'], random_state=42)
        kmeans.fit(scaled_data)
        cluster = predict_cluster(kmeans, scaler, new_data)
        st.write('Predicted Cluster:', cluster[0])

    elif algorithm == 'DBSCAN Clustering':
        dbscan = DBSCAN(eps=params['eps'], min_samples=params['min_samples'])
        dbscan.fit(scaled_data)
        cluster = predict_cluster(dbscan, scaler, new_data)
        st.write('Predicted Cluster:', cluster[0])

    elif algorithm == 'Hierarchical Clustering':
        # AgglomerativeClustering cannot label unseen points, so the new point
        # is clustered together with the existing data.
        # NOTE(review): label numbers from this refit are presumably not
        # guaranteed to match those of an earlier run without the point.
        scaled_new_data = scaler.transform(new_data)
        combined_data = np.vstack([scaled_data, scaled_new_data])
        hierarchical = AgglomerativeClustering(n_clusters=params['n_clusters'])
        labels = hierarchical.fit_predict(combined_data)
        st.write('Predicted Cluster:', labels[-1])

    elif algorithm == 'PCA':
        scaled_new_data = scaler.transform(new_data)
        pca = PCA(n_components=params['n_components'])
        pca.fit(scaled_data)
        pca_new_data = pca.transform(scaled_new_data)
        st.write('PCA transformed data:', pca_new_data[0])

        # Only the first two components are drawn, whatever n_components is.
        pca_data = pca.transform(scaled_data)
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.scatter(pca_data[:, 0], pca_data[:, 1], c='blue', alpha=0.5, label='Existing Data')
        ax.scatter(pca_new_data[0, 0], pca_new_data[0, 1], c='red', marker='*', s=200, label='New Data')
        ax.set_title('PCA Visualization with New Data Point')
        ax.set_xlabel('PC1')
        ax.set_ylabel('PC2')
        ax.legend()
        _render_figure(fig)


def _show_correlation(original_data):
    """Show the feature correlation matrix as a table and as a heatmap."""
    corr_matrix = original_data.corr()
    st.write('Correlation Matrix:')
    st.write(corr_matrix)
    # Dedicated axes: the heatmap must not be drawn over whatever figure
    # happened to be pyplot's current one.
    fig, ax = plt.subplots()
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
    _render_figure(fig)


def main():
    """Run the Streamlit page: load data, run an algorithm, predict, correlate.

    The page has four parts, rendered in this order: data upload/preview,
    the selected algorithm's controls and results, prediction for a manually
    entered observation (once 'Predict Cluster' has been pressed in the
    session), and an on-demand feature correlation heatmap.
    """
    st.title('Unsupervised Learning Web Application')
    st.sidebar.title('Upload Data')
    file = st.sidebar.file_uploader('Upload a CSV file', type=['csv'])

    # Pass the upload through; load_data falls back to its default CSV when
    # nothing has been uploaded (file is None).
    data = load_data(file)
    if data is None:
        # load_data already reported the error; nothing below can work.
        return

    try:
        _, original_data = preprocess_data(data)
        # preprocess_data does not hand back its scaler, so fit one here to be
        # able to transform new observations later.
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(original_data)
    except (KeyError, ValueError) as exc:
        # Typically an uploaded file lacking expected columns or holding
        # non-numeric features.
        st.error(f"Error preprocessing data: {exc}")
        return

    st.write('Preprocessed Data:')
    st.write(original_data)

    st.sidebar.title('Unsupervised Learning Algorithms')
    algorithm = st.sidebar.selectbox(
        'Select Algorithm',
        ['KMeans Clustering', 'DBSCAN Clustering', 'Hierarchical Clustering', 'PCA'],
    )
    params = _run_selected_algorithm(algorithm, scaled_data, original_data)

    st.sidebar.title('Input New Data')
    pressed = st.sidebar.button('Predict Cluster')
    # A button is only True on the rerun it triggers; remember the press so the
    # input widgets stay visible while the user edits them.
    st.session_state.button_pressed = st.session_state.get('button_pressed', False) or pressed
    if st.session_state.button_pressed:
        _predict_for_new_point(algorithm, params, scaler, scaled_data)

    st.sidebar.title('Feature Correlation Analysis')
    if st.sidebar.button('Analyze Correlation'):
        _show_correlation(original_data)
|
|
|
|
|
# Script entry point: build the Streamlit page when the file is run directly.
if __name__ == "__main__":
    main()