# xxnithicxx's picture
# Enhance KMeans analysis visualization using Plotly and update requirements
# 346590b
import gradio as gr
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import vlai_template
# ──────────────────────────── Functions ─────────────────────────
def simple_pca(X, n_components=2):
    """Project ``X`` onto its leading principal components.

    The data is mean-centered per column, the covariance matrix of the
    columns is eigen-decomposed, and the eigenvectors belonging to the
    largest eigenvalues are used as the projection basis.

    Args:
        X: 2-D array with one sample per row and one feature per column.
        n_components: Number of principal components to keep.

    Returns:
        A tuple ``(X_pca, components, explained_variance_ratio)``:
        the centered data projected onto the kept components, the kept
        eigenvectors (one per column), and each kept eigenvalue divided by
        the sum of all eigenvalues.
    """
    centered = X - np.mean(X, axis=0)

    # A covariance matrix is symmetric, so the symmetric solver applies.
    eigvals, eigvecs = np.linalg.eigh(np.cov(centered.T))

    # Reorder so the direction with the largest variance comes first.
    order = np.argsort(eigvals)[::-1]
    eigvals, eigvecs = eigvals[order], eigvecs[:, order]

    basis = eigvecs[:, :n_components]
    projected = centered @ basis
    variance_share = eigvals[:n_components] / np.sum(eigvals)
    return projected, basis, variance_share
def simple_kmeans(X, k, max_iters=100, random_state=42):
    """Cluster the rows of ``X`` into ``k`` groups with Lloyd's algorithm.

    Initial centroids are ``k`` distinct rows of ``X`` sampled without
    replacement. The algorithm then alternates between assigning every row
    to its nearest centroid and moving each centroid to the mean of its
    rows, until the centroids stop moving or ``max_iters`` is reached.

    Args:
        X: 2-D array with one sample per row and one feature per column.
        k: Number of clusters; must satisfy ``1 <= k <= n_samples``.
        max_iters: Maximum number of centroid updates.
        random_state: Seed for the centroid initialisation. A private
            generator is used, so NumPy's global random state is untouched.

    Returns:
        A tuple ``(labels, centroids, inertia)``: the cluster index of every
        row, the ``k`` centroid coordinates, and the sum of squared
        distances from each row to its assigned centroid.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number of
            rows in ``X``.
    """
    X = np.asarray(X, dtype=float)
    n_samples = X.shape[0]
    if not 1 <= k <= n_samples:
        raise ValueError(
            f"k must be between 1 and the number of samples ({n_samples}), got {k}"
        )

    # A local RandomState yields the same draw as seeding the global RNG,
    # without reseeding it for every other caller in the process.
    rng = np.random.RandomState(random_state)
    centroids = X[rng.choice(n_samples, k, replace=False)]

    def _assign(current):
        # Squared distances are enough for argmin; result has shape (k, n).
        sq_dist = ((X - current[:, np.newaxis]) ** 2).sum(axis=2)
        return np.argmin(sq_dist, axis=0)

    # Assign before the loop so labels exist even when max_iters == 0 and
    # always correspond to the centroids that are returned.
    labels = _assign(centroids)
    for _ in range(max_iters):
        new_centroids = centroids.copy()
        for i in range(k):
            members = X[labels == i]
            # An empty cluster keeps its previous centroid; taking the mean
            # of zero rows would poison every later iteration with NaN.
            if len(members):
                new_centroids[i] = members.mean(axis=0)

        if np.allclose(centroids, new_centroids):
            break
        centroids = new_centroids
        labels = _assign(centroids)

    inertia = float(np.sum((X - centroids[labels]) ** 2))
    return labels, centroids, inertia
def standardize_data(X):
    """Scale every column of ``X`` to zero mean and unit standard deviation.

    Columns whose standard deviation is zero (constant features) are only
    centered, so they become all zeros instead of NaN/inf from a division
    by zero.

    Args:
        X: 2-D array with one sample per row and one feature per column.

    Returns:
        The standardized data, same shape as ``X``.
    """
    std = np.std(X, axis=0)
    # Dividing a constant column by 1 leaves its centered values (all 0).
    safe_std = np.where(std == 0, 1.0, std)
    return (X - np.mean(X, axis=0)) / safe_std
def _marker_trace(points, color, name):
    """Build a small, semi-transparent marker trace from 2-column points."""
    return go.Scatter(
        x=points[:, 0],
        y=points[:, 1],
        mode='markers',
        marker=dict(color=color, size=4, opacity=0.6),
        name=name,
        showlegend=True,
    )


def create_plotly_visualization(X_pca, wine_types, labels, centroids_pca, k, explained_var):
    """Create a side-by-side Plotly figure: true wine types vs. KMeans clusters.

    The left panel colours the projected points by wine type; the right
    panel colours the same points by cluster label and overlays the
    centroids, each annotated with its cluster index.

    Args:
        X_pca: 2-D array of projected samples; columns 0 and 1 are plotted.
        wine_types: Per-sample type labels. "red" and "white" get dedicated
            colours; any other value is drawn as a grey "Other / Unknown"
            trace so that no sample silently disappears from the left panel.
        labels: Per-sample cluster index in ``range(k)``.
        centroids_pca: Centroid coordinates in the same projected space;
            columns 0 and 1 are plotted.
        k: Number of clusters.
        explained_var: Sequence whose first two entries are the variance
            fractions shown in the PC1 / PC2 axis titles.

    Returns:
        The assembled ``plotly.graph_objects.Figure``.
    """
    X_pca = np.asarray(X_pca)
    labels = np.asarray(labels)
    # dtype=object keeps the comparisons element-wise even for empty or
    # mixed-type input.
    wine_types = np.asarray(wine_types, dtype=object)

    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=(
            "Original Data (Red vs White Wine)",
            f"After KMeans Clustering (K={k})"
        ),
        horizontal_spacing=0.1
    )

    # Plot 1: original wine types
    red_mask = wine_types == "red"
    white_mask = wine_types == "white"
    other_mask = ~(red_mask | white_mask)
    type_traces = (
        (red_mask, '#d62728', 'Red Wine'),
        (white_mask, '#1f77b4', 'White Wine'),
        (other_mask, '#7f7f7f', 'Other / Unknown'),
    )
    for mask, color, name in type_traces:
        if np.any(mask):
            fig.add_trace(_marker_trace(X_pca[mask], color, name), row=1, col=1)

    # Plot 2: KMeans clusters (the palette repeats if k exceeds its length)
    cluster_colors = ["#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22"]
    for i in range(k):
        cluster_mask = labels == i
        if np.any(cluster_mask):
            fig.add_trace(
                _marker_trace(
                    X_pca[cluster_mask],
                    cluster_colors[i % len(cluster_colors)],
                    f'Cluster {i}',
                ),
                row=1, col=2
            )

    # Centroids, labelled with their cluster index
    fig.add_trace(
        go.Scatter(
            x=centroids_pca[:, 0],
            y=centroids_pca[:, 1],
            mode='markers+text',
            marker=dict(color='black', size=12, line=dict(color='white', width=2)),
            text=[str(i) for i in range(k)],
            textfont=dict(color='white', size=10),
            textposition="middle center",
            name='Centroids',
            showlegend=True
        ),
        row=1, col=2
    )

    fig.update_layout(
        title="Wine Quality Dataset - KMeans Clustering with PCA",
        title_x=0.5,
        height=600,
        plot_bgcolor='white',
        paper_bgcolor='white',
        font=dict(size=12),
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.2,
            xanchor="center",
            x=0.5
        )
    )

    # Both panels share the same axis titles and grid styling.
    axis_style = dict(
        showgrid=True,
        gridcolor='lightgray',
        gridwidth=1,
        zeroline=True,
        zerolinecolor='lightgray',
    )
    x_title = f"PC1 ({explained_var[0]:.1%} variance)"
    y_title = f"PC2 ({explained_var[1]:.1%} variance)"
    for col in (1, 2):
        fig.update_xaxes(title_text=x_title, row=1, col=col, **axis_style)
        fig.update_yaxes(title_text=y_title, row=1, col=col, **axis_style)

    return fig
def _error_figure(message):
    """Return a blank Plotly figure showing ``message`` in red at its centre."""
    fig = go.Figure()
    fig.add_annotation(
        text=message,
        xref="paper", yref="paper",
        x=0.5, y=0.5,
        showarrow=False,
        font=dict(color="red", size=16)
    )
    fig.update_layout(
        plot_bgcolor='white',
        paper_bgcolor='white',
        xaxis=dict(visible=False),
        yaxis=dict(visible=False)
    )
    return fig


def run_kmeans_analysis(n_clusters, random_state):
    """Run the full pipeline and return the resulting Plotly figure.

    Loads ``data/winequality-merged.csv``, standardizes its numeric columns,
    projects them to two principal components, clusters the standardized
    data with KMeans and plots the result.

    Args:
        n_clusters: Number of clusters; coerced to ``int`` because UI numeric
            widgets may deliver whole numbers as floats.
        random_state: Seed for the KMeans initialisation; coerced to ``int``
            unless it is ``None``.

    Returns:
        The visualization figure, or a blank figure carrying the error
        message if any step fails.
    """
    try:
        # range() and NumPy seeding both reject floats such as 3.0.
        n_clusters = int(n_clusters)
        if random_state is not None:
            random_state = int(random_state)

        df = pd.read_csv('data/winequality-merged.csv')

        # Only numeric columns are used as features; a string-typed
        # wine_type column is dropped by this dtype filter.
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        X = df[numeric_cols].values

        X_scaled = standardize_data(X)
        X_pca, components, explained_var = simple_pca(X_scaled, n_components=2)
        labels, centroids, _inertia = simple_kmeans(X_scaled, n_clusters, random_state=random_state)

        # Apply the same centering + projection to the centroids that PCA
        # applied to the samples, so both live in the same 2-D space.
        centroids_pca = (centroids - np.mean(X_scaled, axis=0)) @ components

        if 'wine_type' in df.columns:
            wine_types = df['wine_type'].tolist()
        else:
            wine_types = ['unknown'] * len(df)

        return create_plotly_visualization(
            X_pca, wine_types, labels, centroids_pca, n_clusters, explained_var
        )
    except Exception as e:
        # UI boundary: surface the failure inside the plot area rather than
        # letting the callback raise.
        return _error_figure(f"Error: {e}")
# ──────────────────────────── Main ─────────────────────────
# Build the UI: controls on the left, plot on the right, wired to
# run_kmeans_analysis both on button click and on initial page load.
with gr.Blocks(theme='gstaff/sketch', css=vlai_template.custom_css, title="Wine Quality KMeans Demo") as demo:
    vlai_template.create_header()
    gr.Markdown("""
## 🍷 Wine Quality Dataset - KMeans Clustering with PCA
This demo applies **KMeans clustering** to the merged wine quality dataset using **Principal Component Analysis (PCA)**
for dimensionality reduction. We visualize how the data looks before and after clustering.
""")
    with gr.Row(equal_height=True, variant="panel"):
        with gr.Column(scale=1):
            n_clusters = gr.Slider(
                minimum=2, maximum=6, step=1, value=3,
                label="Number of Clusters (K)",
                info="Choose how many clusters KMeans should find"
            )
            random_state = gr.Slider(
                minimum=1, maximum=100, step=1, value=42,
                label="Random Seed",
                info="For reproducible results"
            )
            # The emoji in the next three labels were stored as mojibake
            # (UTF-8 bytes decoded with a single-byte codec) and are restored.
            run_btn = gr.Button("🔍 Run KMeans Analysis", variant="primary", size="lg")
            gr.Markdown("""
### 💡 How it works:
1. **Load Data**: Wine quality features from merged dataset.
2. **Standardize**: Scale all features to same range.
3. **PCA**: Reduce to 2 dimensions for visualization.
4. **KMeans**: Group wines into K clusters.
5. **Visualize**: Compare original vs. clustered data.
""")
        with gr.Column(scale=7):
            output_plot = gr.Plot(label="📈 PCA Visualization & KMeans Results")
    run_btn.click(
        run_kmeans_analysis,
        inputs=[n_clusters, random_state],
        outputs=[output_plot],
    )
    # Auto-run on page load; hidden Number components supply the fixed
    # defaults (K=3, seed=42), matching the sliders' initial values.
    demo.load(
        run_kmeans_analysis,
        inputs=[gr.Number(3, visible=False), gr.Number(42, visible=False)],
        outputs=[output_plot]
    )
    vlai_template.create_footer()
if __name__ == "__main__":
    # Paths passed to Gradio's allowed_paths when the app is launched as a
    # script: the logo assets, the static directory and the data directory.
    served_paths = ["static/aivn_logo.png", "static/vlai_logo.png", "static", "data"]
    demo.launch(allowed_paths=served_paths)