"""Streamlit app: exploratory analysis of the UCI red-wine quality dataset.

The app offers four sidebar sections:

* Dataset Overview   - head, shape, missing values and histograms.
* PCA Analysis       - cumulative explained variance and component loadings.
* Clustering Analysis - elbow/silhouette diagnostics and interactive K-Means.
* Cluster Insights   - fixed 3-cluster K-Means with descriptive labels.
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

WINE_URL = (
    'https://archive.ics.uci.edu/ml/machine-learning-databases/'
    'wine-quality/winequality-red.csv'
)
RANDOM_STATE = 42
# Cumulative explained variance used to pick the "optimal" component count.
PCA_VARIANCE_THRESHOLD = 0.80
# Fraction of variance retained by the PCA that feeds K-Means.
CLUSTERING_VARIANCE = 0.85
# Candidate cluster counts evaluated by the elbow / silhouette diagnostics.
K_RANGE = range(2, 11)
# Pinned explicitly: the library default for ``n_init`` differs between
# scikit-learn releases, which would otherwise change the fitted clusters.
KMEANS_N_INIT = 10

# Set page configuration (kept ahead of every other Streamlit call).
st.set_page_config(
    page_title="Wine Quality Analysis",
    page_icon="🍷",
    layout="wide"
)


@st.cache_data
def load_data():
    """Download the red-wine quality CSV and return it as a DataFrame."""
    return pd.read_csv(WINE_URL, sep=';')


@st.cache_data
def _scale_features(data):
    """Return the standardized feature matrix (every column except 'quality')."""
    features = data.drop('quality', axis=1)
    return StandardScaler().fit_transform(features)


@st.cache_data
def _pca_for_clustering(data):
    """Project the standardized features onto the PCA space used for K-Means."""
    reducer = PCA(n_components=CLUSTERING_VARIANCE)
    return reducer.fit_transform(_scale_features(data))


def _make_kmeans(n_clusters):
    """Build a K-Means estimator with the app-wide seed and ``n_init``."""
    return KMeans(
        n_clusters=n_clusters,
        random_state=RANDOM_STATE,
        n_init=KMEANS_N_INIT,
    )


@st.cache_data
def _cluster_diagnostics(pca_features):
    """Return ``(inertia, silhouette)`` lists, one entry per k in ``K_RANGE``."""
    inertia = []
    silhouette = []
    for k in K_RANGE:
        model = _make_kmeans(k)
        labels = model.fit_predict(pca_features)
        inertia.append(model.inertia_)
        # K_RANGE starts at 2, so the silhouette score is always defined.
        silhouette.append(silhouette_score(pca_features, labels))
    return inertia, silhouette


@st.cache_data
def _cluster_labels(pca_features, n_clusters):
    """Fit K-Means with ``n_clusters`` clusters and return the label per row."""
    return _make_kmeans(n_clusters).fit_predict(pca_features)


def _render_dataset_overview(wine_data):
    """Render the head, basic info and distribution plots of the dataset."""
    st.header("Dataset Overview")

    st.subheader("First few rows of the dataset")
    st.dataframe(wine_data.head())

    st.subheader("Dataset Information")
    col1, col2 = st.columns(2)
    with col1:
        st.write("**Shape:**", wine_data.shape)
        st.write("**Columns:**", list(wine_data.columns))
    with col2:
        st.write("**Missing values:**")
        st.write(wine_data.isnull().sum())

    st.subheader("Feature Distributions")
    # The last column is 'quality'; it gets its own chart below.
    selected_feature = st.selectbox("Select a feature to visualize:",
                                    wine_data.columns[:-1])
    fig = px.histogram(wine_data, x=selected_feature,
                       title=f"Distribution of {selected_feature}")
    st.plotly_chart(fig)

    st.subheader("Quality Distribution")
    quality_counts = wine_data['quality'].value_counts().sort_index()
    fig = px.bar(x=quality_counts.index, y=quality_counts.values,
                 labels={'x': 'Quality Score', 'y': 'Count'},
                 title="Distribution of Wine Quality Scores")
    st.plotly_chart(fig)


def _render_pca_analysis(wine_data):
    """Render explained variance, top loadings and a 2-D PCA scatter plot."""
    st.header("Principal Component Analysis (PCA)")

    feature_names = wine_data.columns.drop('quality')
    scaled_features = _scale_features(wine_data)

    # Full PCA (all components) so the cumulative variance curve is complete.
    pca = PCA()
    pca_result = pca.fit_transform(scaled_features)
    explained_variance = np.cumsum(pca.explained_variance_ratio_)
    component_numbers = list(range(1, len(explained_variance) + 1))

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=component_numbers,
                             y=explained_variance,
                             mode='lines+markers',
                             name='Cumulative Explained Variance'))
    fig.add_trace(go.Scatter(x=component_numbers,
                             y=[PCA_VARIANCE_THRESHOLD] * len(explained_variance),
                             mode='lines',
                             name='80% Variance Threshold',
                             line=dict(dash='dash')))
    fig.update_layout(title='PCA Explained Variance',
                      xaxis_title='Number of Principal Components',
                      yaxis_title='Cumulative Explained Variance')
    st.plotly_chart(fig)

    # First component count whose cumulative variance reaches the threshold.
    optimal_components = int(
        np.argmax(explained_variance >= PCA_VARIANCE_THRESHOLD)
    ) + 1
    st.write(f"**Optimal number of principal components:** {optimal_components} (explains ~80% of variance)")

    pca_components = pd.DataFrame(pca.components_, columns=feature_names)
    main_components = pca_components.iloc[:optimal_components]

    st.subheader("Main Principal Components Interpretation")
    for i, row in main_components.iterrows():
        st.write(f"**PC{i+1}** represents major influence from:")
        # Rank features by absolute loading and report the three largest.
        top_features = row.abs().sort_values(ascending=False).head(3)
        for feature, value in top_features.items():
            st.write(f" - {feature} (weight {value:.2f})")

    st.subheader("PCA Visualization")
    component_names = [f"PC{i+1}" for i in range(optimal_components)]
    col1, col2 = st.columns(2)
    with col1:
        x_component = st.selectbox("X-axis component", component_names, index=0)
    with col2:
        # Clamp the default so a single retained component cannot produce an
        # out-of-range index.
        y_component = st.selectbox("Y-axis component", component_names,
                                   index=min(1, optimal_components - 1))

    x_idx = component_names.index(x_component)
    y_idx = component_names.index(y_component)

    fig = px.scatter(x=pca_result[:, x_idx], y=pca_result[:, y_idx],
                     color=wine_data['quality'],
                     labels={'x': x_component, 'y': y_component, 'color': 'Quality'},
                     title=f"{y_component} vs {x_component} Colored by Quality")
    st.plotly_chart(fig)


def _render_clustering_analysis(wine_data):
    """Render K-Means diagnostics and an interactive clustering of the wines."""
    st.header("Clustering Analysis")

    pca_features = _pca_for_clustering(wine_data)
    inertia, silhouette = _cluster_diagnostics(pca_features)
    k_values = list(K_RANGE)

    col1, col2 = st.columns(2)
    with col1:
        fig = go.Figure()
        fig.add_trace(go.Scatter(x=k_values, y=inertia, mode='lines+markers'))
        fig.update_layout(title='Elbow Method',
                          xaxis_title='Number of Clusters',
                          yaxis_title='Inertia')
        st.plotly_chart(fig)
    with col2:
        fig = go.Figure()
        # Every k in K_RANGE has a valid score, so plot all of them (k=2 too).
        fig.add_trace(go.Scatter(x=k_values, y=silhouette, mode='lines+markers'))
        fig.update_layout(title='Silhouette Method',
                          xaxis_title='Number of Clusters',
                          yaxis_title='Silhouette Score')
        st.plotly_chart(fig)

    k_optimal = st.slider("Select number of clusters:",
                          min_value=2, max_value=10, value=3)
    cluster_labels = _cluster_labels(pca_features, k_optimal)

    wine_data_clustered = wine_data.copy()
    wine_data_clustered['Cluster'] = cluster_labels

    st.subheader("Cluster Visualization")
    fig = px.scatter(x=pca_features[:, 0], y=pca_features[:, 1],
                     color=cluster_labels,
                     labels={'x': 'PC1', 'y': 'PC2', 'color': 'Cluster'},
                     title="Clusters Visualized in PCA Space")
    st.plotly_chart(fig)

    st.subheader("Cluster Profiles")
    st.dataframe(wine_data_clustered.groupby('Cluster').mean())


def _render_cluster_insights(wine_data):
    """Render per-cluster descriptions and key feature means for 3 clusters."""
    st.header("Cluster Business Insights")

    # Same pipeline as the clustering section, fixed at 3 clusters.
    pca_features = _pca_for_clustering(wine_data)
    wine_data_clustered = wine_data.copy()
    wine_data_clustered['Cluster'] = _cluster_labels(pca_features, 3)

    # NOTE(review): these descriptions are tied to the label numbers of one
    # particular fit; K-Means label numbering is arbitrary, so confirm they
    # still match the per-cluster statistics shown below.
    cluster_insights = {
        0: "Premium Taste Wines: High alcohol, balanced acidity, high quality",
        1: "Sweet & Mild Wines: High sugar, low acidity, moderate quality",
        2: "Sharp & Preservative-heavy Wines: High acidity, high sulfates, lower quality"
    }
    key_features = ['alcohol', 'residual sugar', 'volatile acidity',
                    'citric acid', 'sulphates']

    for cluster, desc in cluster_insights.items():
        st.subheader(f"Cluster {cluster}")
        st.write(desc)

        cluster_data = wine_data_clustered[wine_data_clustered['Cluster'] == cluster]
        st.write(f"Number of wines in this cluster: {len(cluster_data)}")
        st.write(f"Average quality: {cluster_data['quality'].mean():.2f}")

        cluster_means = cluster_data[key_features].mean()
        fig = go.Figure()
        fig.add_trace(go.Bar(x=key_features, y=cluster_means.values,
                             name=f"Cluster {cluster}"))
        fig.update_layout(title=f"Key Features for Cluster {cluster}",
                          yaxis_title="Average Value")
        st.plotly_chart(fig)

        st.write("---")


# Title and description
st.title("🍷 Wine Quality Analysis")
st.markdown("""
This app analyzes the Wine Quality dataset using unsupervised learning techniques.
Explore the dataset, visualize PCA components, and see clustering results.
""")

# Load the dataset; show a readable error instead of a traceback when the
# download or parse fails (URL errors are OSError subclasses).
try:
    wine_data = load_data()
except (OSError, pd.errors.ParserError) as err:
    st.error(f"Could not load the Wine Quality dataset: {err}")
    st.stop()

# Sidebar for navigation
st.sidebar.title("Navigation")
options = st.sidebar.radio("Select a section:",
                           ["Dataset Overview", "PCA Analysis",
                            "Clustering Analysis", "Cluster Insights"])

if options == "Dataset Overview":
    _render_dataset_overview(wine_data)
elif options == "PCA Analysis":
    _render_pca_analysis(wine_data)
elif options == "Clustering Analysis":
    _render_clustering_analysis(wine_data)
elif options == "Cluster Insights":
    _render_cluster_insights(wine_data)