Spaces:
No application file
No application file
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| from sklearn.preprocessing import StandardScaler | |
| from sklearn.decomposition import PCA | |
| from sklearn.cluster import KMeans | |
| from sklearn.metrics import silhouette_score | |
| import matplotlib.pyplot as plt | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
# Page chrome: browser-tab title, tab icon, and a full-width layout.
page_settings = {
    "page_title": "Wine Quality Analysis",
    "page_icon": "🍷",
    "layout": "wide",
}
st.set_page_config(**page_settings)

# Headline followed by a short blurb describing what the app offers.
st.title("🍷 Wine Quality Analysis")
intro_text = """
This app analyzes the Wine Quality dataset using unsupervised learning techniques.
Explore the dataset, visualize PCA components, and see clustering results.
"""
st.markdown(intro_text)
# Load the dataset
@st.cache_data
def load_data():
    """Download the red-wine quality CSV and return it as a DataFrame.

    The file at the UCI URL below is parsed with ``;`` as the field
    separator. The function is wrapped in ``st.cache_data`` so the download
    and parse happen once and later script reruns reuse the cached result
    instead of hitting the network again.

    Returns:
        pandas.DataFrame: the parsed CSV contents.
    """
    wine_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
    return pd.read_csv(wine_url, sep=';')


wine_data = load_data()
# Sidebar navigation: a radio group decides which section is rendered.
st.sidebar.title("Navigation")
section_names = [
    "Dataset Overview",
    "PCA Analysis",
    "Clustering Analysis",
    "Cluster Insights",
]
options = st.sidebar.radio("Select a section:", section_names)
# Dataset Overview Section
if options == "Dataset Overview":
    st.header("Dataset Overview")

    # Preview of the raw table.
    st.subheader("First few rows of the dataset")
    st.dataframe(wine_data.head())

    # Two-column summary: shape and column names on the left,
    # per-column null counts on the right.
    st.subheader("Dataset Information")
    left_col, right_col = st.columns(2)
    left_col.write("**Shape:**", wine_data.shape)
    left_col.write("**Columns:**", list(wine_data.columns))
    right_col.write("**Missing values:**")
    right_col.write(wine_data.isnull().sum())

    # Histogram of one user-chosen column; the last column of the frame is
    # left out of the choices.
    st.subheader("Feature Distributions")
    feature_choices = wine_data.columns[:-1]
    selected_feature = st.selectbox("Select a feature to visualize:", feature_choices)
    histogram = px.histogram(
        wine_data,
        x=selected_feature,
        title=f"Distribution of {selected_feature}",
    )
    st.plotly_chart(histogram)

    # Bar chart of how many rows carry each quality score, ordered by score.
    st.subheader("Quality Distribution")
    quality_counts = wine_data['quality'].value_counts().sort_index()
    quality_bars = px.bar(
        x=quality_counts.index,
        y=quality_counts.values,
        labels={'x': 'Quality Score', 'y': 'Count'},
        title="Distribution of Wine Quality Scores",
    )
    st.plotly_chart(quality_bars)
# PCA Analysis Section
# A plain `if` is used: `options` holds a single selected label, so at most
# one section's condition can be true.
if options == "PCA Analysis":
    st.header("Principal Component Analysis (PCA)")

    # Standardize every column except the 'quality' target.
    features = wine_data.drop('quality', axis=1)
    scaled_features = StandardScaler().fit_transform(features)

    # Fit PCA with its default settings and project the scaled data.
    pca = PCA()
    pca_result = pca.fit_transform(scaled_features)

    # Running total of the per-component explained-variance ratios.
    explained_variance = np.cumsum(pca.explained_variance_ratio_)
    component_numbers = list(range(1, len(explained_variance) + 1))

    # Cumulative curve plus a dashed horizontal reference line at 0.80.
    variance_fig = go.Figure()
    variance_fig.add_trace(go.Scatter(
        x=component_numbers,
        y=explained_variance,
        mode='lines+markers',
        name='Cumulative Explained Variance',
    ))
    variance_fig.add_trace(go.Scatter(
        x=component_numbers,
        y=[0.80] * len(explained_variance),
        mode='lines',
        name='80% Variance Threshold',
        line=dict(dash='dash'),
    ))
    variance_fig.update_layout(
        title='PCA Explained Variance',
        xaxis_title='Number of Principal Components',
        yaxis_title='Cumulative Explained Variance',
    )
    st.plotly_chart(variance_fig)

    # First position where the cumulative ratio reaches 0.80, as a 1-based count.
    optimal_components = np.argmax(explained_variance >= 0.80) + 1
    st.write(f"**Optimal number of principal components:** {optimal_components} (explains ~80% of variance)")

    # One row of loadings per component, columns named after the features.
    pca_components = pd.DataFrame(pca.components_, columns=features.columns)
    main_components = pca_components.iloc[:optimal_components]

    st.subheader("Main Principal Components Interpretation")
    for i, loadings in main_components.iterrows():
        st.write(f"**PC{i+1}** represents major influence from:")
        # Three features with the largest absolute loading on this component.
        strongest = loadings.abs().sort_values(ascending=False).head(3)
        for feature, value in strongest.items():
            st.write(f" - {feature} (weight {value:.2f})")

    # Scatter plot of two user-selected components.
    st.subheader("PCA Visualization")
    pc_labels = [f"PC{i+1}" for i in range(optimal_components)]
    x_slot, y_slot = st.columns(2)
    with x_slot:
        x_component = st.selectbox("X-axis component", pc_labels, index=0)
    with y_slot:
        y_component = st.selectbox("Y-axis component", pc_labels, index=1)

    # Map the chosen "PC<n>" label back to its zero-based column index.
    x_idx = pc_labels.index(x_component)
    y_idx = pc_labels.index(y_component)

    scatter_fig = px.scatter(
        x=pca_result[:, x_idx],
        y=pca_result[:, y_idx],
        color=wine_data['quality'],
        labels={'x': x_component, 'y': y_component, 'color': 'Quality'},
        title=f"{y_component} vs {x_component} Colored by Quality",
    )
    st.plotly_chart(scatter_fig)
# Clustering Analysis Section
# A plain `if` is used: `options` holds a single selected label, so at most
# one section's condition can be true.
if options == "Clustering Analysis":
    st.header("Clustering Analysis")

    # Standardize every column except the 'quality' target.
    features = wine_data.drop('quality', axis=1)
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(features)

    # Reduce dimensionality with PCA(n_components=0.85) before clustering.
    pca = PCA(n_components=0.85)
    pca_features = pca.fit_transform(scaled_features)

    # Sweep k = 2..10, recording inertia and silhouette score for each k.
    # Every k in the range is >= 2, so a silhouette score is computed for
    # all of them and no placeholder entries are needed.
    k_range = range(2, 11)
    inertia = []
    silhouette = []
    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(pca_features)
        inertia.append(kmeans.inertia_)
        silhouette.append(silhouette_score(pca_features, labels))

    # Side-by-side diagnostics: elbow curve (left), silhouette curve (right).
    col1, col2 = st.columns(2)
    with col1:
        fig = go.Figure()
        fig.add_trace(go.Scatter(x=list(k_range), y=inertia, mode='lines+markers'))
        fig.update_layout(title='Elbow Method',
                          xaxis_title='Number of Clusters',
                          yaxis_title='Inertia')
        st.plotly_chart(fig)
    with col2:
        fig = go.Figure()
        # Plot the full sweep, including k=2; `silhouette` lines up
        # one-to-one with `k_range`.
        fig.add_trace(go.Scatter(x=list(k_range), y=silhouette, mode='lines+markers'))
        fig.update_layout(title='Silhouette Method',
                          xaxis_title='Number of Clusters',
                          yaxis_title='Silhouette Score')
        st.plotly_chart(fig)

    # Let the user pick k, then fit the final model on the PCA features.
    k_optimal = st.slider("Select number of clusters:", min_value=2, max_value=10, value=3)
    kmeans = KMeans(n_clusters=k_optimal, random_state=42)
    cluster_labels = kmeans.fit_predict(pca_features)

    # Attach the labels to a copy so the shared `wine_data` frame is untouched.
    wine_data_clustered = wine_data.copy()
    wine_data_clustered['Cluster'] = cluster_labels

    # Scatter of the first two PCA columns. Labels are passed as strings so
    # Plotly treats the cluster id as a category (discrete colors) rather
    # than as a number on a continuous color scale.
    st.subheader("Cluster Visualization")
    fig = px.scatter(x=pca_features[:, 0], y=pca_features[:, 1],
                     color=cluster_labels.astype(str),
                     labels={'x': 'PC1', 'y': 'PC2', 'color': 'Cluster'},
                     title="Clusters Visualized in PCA Space")
    st.plotly_chart(fig)

    # Per-cluster mean of every column of the clustered frame.
    st.subheader("Cluster Profiles")
    cluster_profiles = wine_data_clustered.groupby('Cluster').mean()
    st.dataframe(cluster_profiles)
# Cluster Insights Section
# A plain `if` is used: `options` holds a single selected label, so at most
# one section's condition can be true.
if options == "Cluster Insights":
    st.header("Cluster Business Insights")

    # Same preparation pipeline as the clustering section:
    # drop the target, standardize, PCA(n_components=0.85).
    features = wine_data.drop('quality', axis=1)
    scaled_features = StandardScaler().fit_transform(features)
    pca_features = PCA(n_components=0.85).fit_transform(scaled_features)

    # Fixed three-cluster K-Means fit on the PCA features.
    cluster_labels = KMeans(n_clusters=3, random_state=42).fit_predict(pca_features)
    wine_data_clustered = wine_data.assign(Cluster=cluster_labels)

    # Hand-written description per cluster id.
    # NOTE(review): these texts are hard-coded against label numbers 0-2;
    # nothing here checks that the fitted clusters match them — confirm.
    cluster_insights = {
        0: "Premium Taste Wines: High alcohol, balanced acidity, high quality",
        1: "Sweet & Mild Wines: High sugar, low acidity, moderate quality",
        2: "Sharp & Preservative-heavy Wines: High acidity, high sulfates, lower quality"
    }

    # Columns summarized in each cluster's bar chart.
    key_features = ['alcohol', 'residual sugar', 'volatile acidity', 'citric acid', 'sulphates']

    for cluster, desc in cluster_insights.items():
        st.subheader(f"Cluster {cluster}")
        st.write(desc)

        # Rows assigned to this cluster, with their size and mean quality.
        in_cluster = wine_data_clustered['Cluster'] == cluster
        cluster_data = wine_data_clustered[in_cluster]
        st.write(f"Number of wines in this cluster: {len(cluster_data)}")
        st.write(f"Average quality: {cluster_data['quality'].mean():.2f}")

        # Bar chart of the mean of each key feature within the cluster.
        cluster_means = cluster_data[key_features].mean()
        bars = go.Bar(
            x=key_features,
            y=cluster_means.values,
            name=f"Cluster {cluster}",
        )
        fig = go.Figure()
        fig.add_trace(bars)
        fig.update_layout(
            title=f"Key Features for Cluster {cluster}",
            yaxis_title="Average Value",
        )
        st.plotly_chart(fig)

        # Horizontal rule between clusters.
        st.write("---")
| import pandas as pd | |
# Read the local "wine_data.csv" file into a DataFrame.
# NOTE(review): `df` is not referenced anywhere else in the visible file —
# confirm whether this load is still needed.
df = pd.read_csv(filepath_or_buffer="wine_data.csv")