|
|
import gradio as gr |
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
import plotly.graph_objects as go |
|
|
from plotly.subplots import make_subplots |
|
|
|
|
|
import vlai_template |
|
|
|
|
|
|
|
|
|
|
|
def simple_pca(X, n_components=2):
    """Project ``X`` onto its leading principal components.

    Args:
        X: Array-like of shape (n_samples, n_features).
        n_components: Number of leading components to keep.

    Returns:
        A tuple ``(X_pca, components, explained_variance_ratio)`` where
        ``X_pca`` is the centered data projected onto the kept components,
        ``components`` holds the kept eigenvectors as columns, and
        ``explained_variance_ratio`` is each kept eigenvalue divided by the
        sum of all eigenvalues (all zeros when the data has no variance).
    """
    X = np.asarray(X, dtype=float)
    X_centered = X - np.mean(X, axis=0)

    # np.cov collapses to a 0-d array for a single feature, but eigh
    # requires a square 2-D matrix.
    cov_matrix = np.atleast_2d(np.cov(X_centered.T))

    # The covariance matrix is symmetric, so eigh is the appropriate solver.
    eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

    # Reorder so the largest-variance direction comes first.
    idx = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[idx]
    eigenvectors = eigenvectors[:, idx]

    components = eigenvectors[:, :n_components]
    X_pca = X_centered @ components

    kept = eigenvalues[:n_components]
    total_variance = np.sum(eigenvalues)
    if total_variance > 0:
        explained_variance_ratio = kept / total_variance
    else:
        # Constant data: avoid 0/0 -> NaN and report "no variance explained".
        explained_variance_ratio = np.zeros_like(kept)

    return X_pca, components, explained_variance_ratio
|
|
|
|
|
def _nearest_centroid(X, centroids):
    """Return, for every row of ``X``, the index of its closest centroid."""
    # Broadcasts to (k, n_samples, n_features). Squared distance is enough
    # for argmin because sqrt is monotonic.
    squared = ((X[np.newaxis, :, :] - centroids[:, np.newaxis, :]) ** 2).sum(axis=2)
    return np.argmin(squared, axis=0)


def simple_kmeans(X, k, max_iters=100, random_state=42):
    """Cluster the rows of ``X`` into ``k`` groups with Lloyd's algorithm.

    Initial centroids are ``k`` distinct rows of ``X`` drawn with a private
    random generator seeded by ``random_state``, so the process-wide NumPy
    random state is left untouched.

    Args:
        X: Array-like of shape (n_samples, n_features).
        k: Number of clusters.
        max_iters: Maximum number of assign/update iterations.
        random_state: Seed for the centroid initialisation.

    Returns:
        A tuple ``(labels, centroids, inertia)``: the cluster index of each
        sample, the centroid coordinates of shape (k, n_features), and the
        sum of squared distances from each sample to its assigned centroid.

    Raises:
        ValueError: If ``k`` exceeds the number of samples (raised by the
            sampling without replacement) or if ``k`` is 0.
    """
    X = np.asarray(X, dtype=float)
    n_samples = X.shape[0]

    # A local RandomState draws the same values as seeding the global
    # generator would, without the global side effect.
    rng = np.random.RandomState(random_state)
    centroids = X[rng.choice(n_samples, k, replace=False)]

    for _ in range(max_iters):
        labels = _nearest_centroid(X, centroids)

        new_centroids = centroids.copy()
        for i in range(k):
            members = X[labels == i]
            # An empty cluster keeps its previous centroid; the mean of an
            # empty slice would be NaN and poison every later iteration.
            if len(members) > 0:
                new_centroids[i] = members.mean(axis=0)

        if np.allclose(centroids, new_centroids):
            break

        centroids = new_centroids

    # Assign against the centroids actually returned, so labels and inertia
    # stay consistent even when the loop ran out of iterations (or never ran).
    labels = _nearest_centroid(X, centroids)
    inertia = np.sum((X - centroids[labels]) ** 2)

    return labels, centroids, inertia
|
|
|
|
|
def standardize_data(X):
    """Standardize each column of ``X`` to mean 0 and standard deviation 1.

    Columns with zero standard deviation (constant features) are only
    centered, so they come out as all zeros instead of NaN/inf.

    Args:
        X: Array-like of shape (n_samples, n_features).

    Returns:
        A float array of the same shape as ``X`` with standardized columns.
    """
    X = np.asarray(X, dtype=float)
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    # Dividing by a zero std would produce NaN (0/0); use 1 for those columns.
    safe_std = np.where(std == 0, 1.0, std)
    return (X - mean) / safe_std
|
|
|
|
|
def create_plotly_visualization(X_pca, wine_types, labels, centroids_pca, k, explained_var):
    """Build a two-panel Plotly figure of the PCA-projected data.

    The left panel colours points by wine type ("red" / "white"); the right
    panel colours them by KMeans cluster and overlays the numbered centroids.

    Args:
        X_pca: Projected samples; columns 0 and 1 are plotted as x and y.
        wine_types: Per-sample wine type strings.
        labels: Per-sample cluster indices.
        centroids_pca: Projected centroids; columns 0 and 1 are plotted.
        k: Number of clusters.
        explained_var: Explained-variance ratios; entries 0 and 1 are shown
            in the axis titles.

    Returns:
        The assembled Plotly figure.
    """
    figure = make_subplots(
        rows=1,
        cols=2,
        subplot_titles=(
            "Original Data (Red vs White Wine)",
            f"After KMeans Clustering (K={k})",
        ),
        horizontal_spacing=0.1,
    )

    # Left panel: one trace per wine type that is actually present.
    type_array = np.array(wine_types)
    wine_styles = (
        ("red", "Red Wine", "#d62728"),
        ("white", "White Wine", "#1f77b4"),
    )
    for type_key, legend_name, colour in wine_styles:
        type_mask = type_array == type_key
        if not np.any(type_mask):
            continue
        figure.add_trace(
            go.Scatter(
                x=X_pca[type_mask, 0],
                y=X_pca[type_mask, 1],
                mode='markers',
                marker=dict(color=colour, size=4, opacity=0.6),
                name=legend_name,
                showlegend=True,
            ),
            row=1,
            col=1,
        )

    # Right panel: one trace per non-empty cluster, cycling through the palette.
    palette = [
        "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
        "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22",
    ]
    for cluster_id in range(k):
        members = labels == cluster_id
        if not np.any(members):
            continue
        point_style = dict(
            color=palette[cluster_id % len(palette)],
            size=4,
            opacity=0.6,
        )
        figure.add_trace(
            go.Scatter(
                x=X_pca[members, 0],
                y=X_pca[members, 1],
                mode='markers',
                marker=point_style,
                name=f'Cluster {cluster_id}',
                showlegend=True,
            ),
            row=1,
            col=2,
        )

    # Centroids drawn last so they sit on top of the cluster points.
    centroid_trace = go.Scatter(
        x=centroids_pca[:, 0],
        y=centroids_pca[:, 1],
        mode='markers+text',
        marker=dict(color='black', size=12, line=dict(color='white', width=2)),
        text=[str(cluster_id) for cluster_id in range(k)],
        textfont=dict(color='white', size=10),
        textposition="middle center",
        name='Centroids',
        showlegend=True,
    )
    figure.add_trace(centroid_trace, row=1, col=2)

    figure.update_layout(
        title="Wine Quality Dataset - KMeans Clustering with PCA",
        title_x=0.5,
        height=600,
        plot_bgcolor='white',
        paper_bgcolor='white',
        font=dict(size=12),
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.2,
            xanchor="center",
            x=0.5,
        ),
    )

    # Both panels share the same grid styling and axis titles.
    grid_style = dict(
        showgrid=True,
        gridcolor='lightgray',
        gridwidth=1,
        zeroline=True,
        zerolinecolor='lightgray',
    )
    x_title = f"PC1 ({explained_var[0]:.1%} variance)"
    for panel in (1, 2):
        figure.update_xaxes(title_text=x_title, row=1, col=panel, **grid_style)
    y_title = f"PC2 ({explained_var[1]:.1%} variance)"
    for panel in (1, 2):
        figure.update_yaxes(title_text=y_title, row=1, col=panel, **grid_style)

    return figure
|
|
|
|
|
def run_kmeans_analysis(n_clusters, random_state):
    """Run the full pipeline and return a Plotly figure for the UI.

    Loads the merged wine CSV, standardizes its numeric columns, projects
    them to two principal components, clusters the standardized data with
    KMeans, and plots the result.

    Args:
        n_clusters: Number of clusters requested by the UI; coerced to int.
        random_state: Seed requested by the UI; coerced to int.

    Returns:
        The clustering figure, or a blank figure carrying a red
        ``Error: ...`` annotation if any step raises.
    """
    try:
        # UI widgets may hand over floats (e.g. 3.0), which range() and the
        # random seeding reject; normalise them once at the boundary.
        n_clusters = int(n_clusters)
        random_state = int(random_state)

        df = pd.read_csv('data/winequality-merged.csv')

        # Only numeric columns take part in scaling, PCA and clustering.
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        X = df[numeric_cols].values

        X_scaled = standardize_data(X)

        X_pca, components, explained_var = simple_pca(X_scaled, n_components=2)

        # Clustering runs in the full standardized space, not the 2-D projection.
        labels, centroids, inertia = simple_kmeans(X_scaled, n_clusters, random_state=random_state)

        # Project the centroids exactly as simple_pca projects the samples:
        # subtract the data mean, then multiply by the components.
        centroids_centered = centroids - np.mean(X_scaled, axis=0)
        centroids_pca = centroids_centered @ components

        wine_types = df['wine_type'].tolist() if 'wine_type' in df.columns else ['unknown'] * len(df)
        plot_fig = create_plotly_visualization(X_pca, wine_types, labels, centroids_pca, n_clusters, explained_var)

        return plot_fig

    except Exception as e:
        # UI boundary: show the failure inside the plot area rather than
        # letting the callback crash.
        fig = go.Figure()
        fig.add_annotation(
            text=f"Error: {str(e)}",
            xref="paper", yref="paper",
            x=0.5, y=0.5,
            showarrow=False,
            font=dict(color="red", size=16)
        )
        fig.update_layout(
            plot_bgcolor='white',
            paper_bgcolor='white',
            xaxis=dict(visible=False),
            yaxis=dict(visible=False)
        )
        return fig
|
|
|
|
|
|
|
|
# Gradio UI: controls on the left, the PCA/KMeans plot on the right.
# NOTE(review): the emoji in the user-facing strings below were restored from
# mis-encoded text; confirm they are the intended glyphs.
with gr.Blocks(theme='gstaff/sketch', css=vlai_template.custom_css, title="Wine Quality KMeans Demo") as demo:
    vlai_template.create_header()

    gr.Markdown("""
    ## 🍷 Wine Quality Dataset - KMeans Clustering with PCA

    This demo applies **KMeans clustering** to the merged wine quality dataset using **Principal Component Analysis (PCA)**
    for dimensionality reduction. We visualize how the data looks before and after clustering.
    """)

    with gr.Row(equal_height=True, variant="panel"):
        with gr.Column(scale=1):
            n_clusters = gr.Slider(
                minimum=2, maximum=6, step=1, value=3,
                label="Number of Clusters (K)",
                info="Choose how many clusters KMeans should find"
            )
            random_state = gr.Slider(
                minimum=1, maximum=100, step=1, value=42,
                label="Random Seed",
                info="For reproducible results"
            )

            run_btn = gr.Button("🚀 Run KMeans Analysis", variant="primary", size="lg")

            gr.Markdown("""
            ### 💡 How it works:
            1. **Load Data**: Wine quality features from merged dataset.
            2. **Standardize**: Scale all features to same range.
            3. **PCA**: Reduce to 2 dimensions for visualization.
            4. **KMeans**: Group wines into K clusters.
            5. **Visualize**: Compare original vs. clustered data.
            """)

        with gr.Column(scale=7):
            output_plot = gr.Plot(label="📊 PCA Visualization & KMeans Results")

    run_btn.click(
        run_kmeans_analysis,
        inputs=[n_clusters, random_state],
        outputs=[output_plot],
    )

    # Render an initial plot on page load. The sliders already default to
    # K=3 and seed=42, so they are reused instead of creating extra hidden
    # input components just to carry those two constants.
    demo.load(
        run_kmeans_analysis,
        inputs=[n_clusters, random_state],
        outputs=[output_plot]
    )

    vlai_template.create_footer()

if __name__ == "__main__":
    demo.launch(allowed_paths=["static/aivn_logo.png", "static/vlai_logo.png", "static", "data"])
|
|
|