import gradio as gr
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import vlai_template


# ──────────────────────────── Functions ─────────────────────────
def simple_pca(X, n_components=2):
    """Project X onto its top principal components.

    Args:
        X: 2-D array-like of shape (n_samples, n_features).
        n_components: Number of leading components to keep.

    Returns:
        A tuple ``(X_pca, components, explained_variance_ratio)``:
        the centered data projected onto the selected eigenvectors, the
        selected eigenvectors as columns, and each selected eigenvalue
        divided by the sum of all eigenvalues.
    """
    X = np.asarray(X, dtype=float)

    # Center the data
    X_centered = X - np.mean(X, axis=0)

    # np.cov returns a 0-d array for a single feature; eigh needs a matrix.
    cov_matrix = np.atleast_2d(np.cov(X_centered.T))

    # eigh is appropriate because a covariance matrix is symmetric.
    eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

    # Sort by eigenvalues (descending); eigh returns them ascending.
    order = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[order]
    eigenvectors = eigenvectors[:, order]

    # Select top n_components and transform the data
    components = eigenvectors[:, :n_components]
    X_pca = X_centered @ components

    # Guard against constant data, where the total variance is zero.
    total_variance = np.sum(eigenvalues)
    if total_variance > 0:
        explained_variance_ratio = eigenvalues[:n_components] / total_variance
    else:
        explained_variance_ratio = np.zeros_like(eigenvalues[:n_components])

    return X_pca, components, explained_variance_ratio


def _assign_labels(X, centroids):
    """Return, for each row of X, the index of its nearest centroid."""
    # Squared distances are enough: sqrt is monotonic, so argmin is unchanged.
    sq_distances = ((X - centroids[:, np.newaxis]) ** 2).sum(axis=2)
    return np.argmin(sq_distances, axis=0)


def simple_kmeans(X, k, max_iters=100, random_state=42):
    """Cluster the rows of X into k groups with Lloyd's algorithm.

    Args:
        X: 2-D array-like of shape (n_samples, n_features).
        k: Number of clusters; must satisfy ``1 <= k <= n_samples``.
        max_iters: Maximum number of centroid updates.
        random_state: Seed for the initial centroid selection.

    Returns:
        A tuple ``(labels, centroids, inertia)`` where ``labels`` is the
        nearest-centroid index per sample, ``centroids`` has shape
        (k, n_features) and ``inertia`` is the sum of squared distances of
        the samples to their assigned centroid.

    Raises:
        ValueError: If k is outside ``1..n_samples``.
    """
    X = np.asarray(X, dtype=float)
    k = int(k)
    n_samples = X.shape[0]
    if not 1 <= k <= n_samples:
        raise ValueError(
            f"k must be between 1 and the number of samples ({n_samples}), got {k}"
        )

    # A local generator yields the same draw as np.random.seed(...) followed
    # by np.random.choice(...), without touching NumPy's global RNG state.
    rng = np.random.RandomState(int(random_state))

    # Initialize centroids from k distinct random samples
    centroids = X[rng.choice(n_samples, k, replace=False)]
    labels = _assign_labels(X, centroids)

    for _ in range(max_iters):
        # Update centroids. An empty cluster keeps its previous centroid:
        # the mean of an empty slice is NaN, and a NaN centroid would make
        # argmin assign every point to that cluster.
        new_centroids = centroids.copy()
        for i in range(k):
            members = X[labels == i]
            if len(members):
                new_centroids[i] = members.mean(axis=0)

        # Check for convergence
        if np.allclose(centroids, new_centroids):
            break

        centroids = new_centroids
        # Re-assign so the returned labels always match the returned centroids.
        labels = _assign_labels(X, centroids)

    # Calculate inertia
    inertia = sum(np.sum((X[labels == i] - centroids[i]) ** 2) for i in range(k))

    return labels, centroids, inertia


def standardize_data(X):
    """Standardize each column of X to mean 0 and standard deviation 1.

    Columns with zero standard deviation are only centered, so they come out
    as all zeros instead of NaN/inf from a division by zero.
    """
    X = np.asarray(X, dtype=float)
    std = np.std(X, axis=0)
    safe_std = np.where(std == 0, 1.0, std)
    return (X - np.mean(X, axis=0)) / safe_std


def create_plotly_visualization(X_pca, wine_types, labels, centroids_pca, k, explained_var):
    """Create a plotly figure with two side-by-side scatter subplots.

    The left subplot colors the PCA-projected points by wine type
    ("red" / "white"); the right subplot colors them by KMeans cluster and
    overlays the cluster centroids.

    Args:
        X_pca: Array of shape (n_samples, 2) with the projected points.
        wine_types: Sequence of per-sample wine type strings.
        labels: Per-sample cluster indices.
        centroids_pca: Array of shape (k, 2) with the projected centroids.
        k: Number of clusters.
        explained_var: Explained variance ratios of PC1 and PC2.

    Returns:
        The assembled ``plotly.graph_objects.Figure``.
    """
    labels = np.asarray(labels)
    wine_types = np.asarray(wine_types)

    # Create subplots
    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=(
            "Original Data (Red vs White Wine)",
            f"After KMeans Clustering (K={k})"
        ),
        horizontal_spacing=0.1
    )

    # Plot 1: Original Wine Types
    wine_styles = (
        ("red", "Red Wine", '#d62728'),
        ("white", "White Wine", '#1f77b4'),
    )
    for wine_type, trace_name, color in wine_styles:
        mask = wine_types == wine_type
        if np.any(mask):
            fig.add_trace(
                go.Scatter(
                    x=X_pca[mask, 0],
                    y=X_pca[mask, 1],
                    mode='markers',
                    marker=dict(color=color, size=4, opacity=0.6),
                    name=trace_name,
                    showlegend=True
                ),
                row=1, col=1
            )

    # Plot 2: KMeans Clusters
    cluster_colors = ["#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
                      "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22"]

    # Add cluster points; colors wrap around if k exceeds the palette size.
    for i in range(k):
        cluster_mask = labels == i
        if np.any(cluster_mask):
            fig.add_trace(
                go.Scatter(
                    x=X_pca[cluster_mask, 0],
                    y=X_pca[cluster_mask, 1],
                    mode='markers',
                    marker=dict(color=cluster_colors[i % len(cluster_colors)], size=4, opacity=0.6),
                    name=f'Cluster {i}',
                    showlegend=True
                ),
                row=1, col=2
            )

    # Add centroids
    fig.add_trace(
        go.Scatter(
            x=centroids_pca[:, 0],
            y=centroids_pca[:, 1],
            mode='markers+text',
            marker=dict(color='black', size=12, line=dict(color='white', width=2)),
            text=[str(i) for i in range(k)],
            textfont=dict(color='white', size=10),
            textposition="middle center",
            name='Centroids',
            showlegend=True
        ),
        row=1, col=2
    )

    # Update layout
    fig.update_layout(
        title="Wine Quality Dataset - KMeans Clustering with PCA",
        title_x=0.5,
        height=600,
        plot_bgcolor='white',
        paper_bgcolor='white',
        font=dict(size=12),
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.2,
            xanchor="center",
            x=0.5
        )
    )

    # Update x and y axes labels and styling (identical for both subplots)
    axis_style = dict(
        showgrid=True,
        gridcolor='lightgray',
        gridwidth=1,
        zeroline=True,
        zerolinecolor='lightgray',
    )
    for col in (1, 2):
        fig.update_xaxes(
            title_text=f"PC1 ({explained_var[0]:.1%} variance)",
            row=1, col=col,
            **axis_style
        )
        fig.update_yaxes(
            title_text=f"PC2 ({explained_var[1]:.1%} variance)",
            row=1, col=col,
            **axis_style
        )

    return fig


def _error_figure(message):
    """Return an empty white figure showing ``message`` in red at its center."""
    fig = go.Figure()
    fig.add_annotation(
        text=f"Error: {message}",
        xref="paper", yref="paper",
        x=0.5, y=0.5,
        showarrow=False,
        font=dict(color="red", size=16)
    )
    fig.update_layout(
        plot_bgcolor='white',
        paper_bgcolor='white',
        xaxis=dict(visible=False),
        yaxis=dict(visible=False)
    )
    return fig


def run_kmeans_analysis(n_clusters, random_state):
    """Main function to run the KMeans analysis.

    Loads ``data/winequality-merged.csv``, standardizes its numeric columns,
    clusters them with KMeans, projects points and centroids to 2-D with PCA
    and returns the comparison figure.

    Args:
        n_clusters: Number of clusters; coerced to int because UI components
            may deliver whole numbers as floats.
        random_state: Seed for centroid initialization; coerced to int.

    Returns:
        A plotly figure. Any exception is caught here, since this is the UI
        callback boundary, and a figure showing the error text is returned.
    """
    try:
        n_clusters = int(n_clusters)
        random_state = int(random_state)

        # Load data
        df = pd.read_csv('data/winequality-merged.csv')

        # Prepare features (exclude wine_type, even if it is numerically encoded)
        numeric_cols = [
            col for col in df.select_dtypes(include=[np.number]).columns
            if col != 'wine_type'
        ]
        # Rows with missing feature values would turn every statistic into NaN.
        df = df.dropna(subset=numeric_cols)
        X = df[numeric_cols].values

        # Standardize data
        X_scaled = standardize_data(X)

        # Apply PCA
        X_pca, components, explained_var = simple_pca(X_scaled, n_components=2)

        # Apply KMeans
        labels, centroids, inertia = simple_kmeans(X_scaled, n_clusters, random_state=random_state)

        # Transform centroids to PCA space with the same centering PCA applied
        centroids_centered = centroids - np.mean(X_scaled, axis=0)
        centroids_pca = centroids_centered @ components

        # Create plot
        wine_types = df['wine_type'].tolist() if 'wine_type' in df.columns else ['unknown'] * len(df)
        return create_plotly_visualization(
            X_pca, wine_types, labels, centroids_pca, n_clusters, explained_var
        )

    except Exception as e:
        # Return an empty plotly figure with error message
        return _error_figure(str(e))


# ──────────────────────────── Main ─────────────────────────
with gr.Blocks(theme='gstaff/sketch', css=vlai_template.custom_css, title="Wine Quality KMeans Demo") as demo:
    vlai_template.create_header()

    gr.Markdown("""
    ## 🍷 Wine Quality Dataset - KMeans Clustering with PCA

    This demo applies **KMeans clustering** to the merged wine quality dataset using **Principal Component Analysis (PCA)**
    for dimensionality reduction. We visualize how the data looks before and after clustering.
    """)

    with gr.Row(equal_height=True, variant="panel"):
        with gr.Column(scale=1):
            n_clusters = gr.Slider(
                minimum=2, maximum=6, step=1, value=3,
                label="Number of Clusters (K)",
                info="Choose how many clusters KMeans should find"
            )

            random_state = gr.Slider(
                minimum=1, maximum=100, step=1, value=42,
                label="Random Seed",
                info="For reproducible results"
            )

            run_btn = gr.Button("🔍 Run KMeans Analysis", variant="primary", size="lg")

            gr.Markdown("""
            ### 💡 How it works:
            1. **Load Data**: Wine quality features from merged dataset.
            2. **Standardize**: Scale all features to same range.
            3. **PCA**: Reduce to 2 dimensions for visualization.
            4. **KMeans**: Group wines into K clusters.
            5. **Visualize**: Compare original vs. clustered data.
            """)

        with gr.Column(scale=7):
            output_plot = gr.Plot(label="📈 PCA Visualization & KMeans Results")

    run_btn.click(
        run_kmeans_analysis,
        inputs=[n_clusters, random_state],
        outputs=[output_plot],
    )

    # Auto-run on page load, reusing the sliders (defaults: K=3, seed=42)
    # instead of creating extra hidden input components.
    demo.load(
        run_kmeans_analysis,
        inputs=[n_clusters, random_state],
        outputs=[output_plot]
    )

    vlai_template.create_footer()

if __name__ == "__main__":
    demo.launch(allowed_paths=["static/aivn_logo.png", "static/vlai_logo.png", "static", "data"])