"""Streamlit app that explores the UCI red-wine quality dataset.

The script loads the data, plots the cumulative PCA explained variance,
compares K-Means cluster counts with elbow and silhouette plots, profiles a
three-cluster solution, and offers interactive alcohol / pH / cluster filters.
"""

import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"

# Analysis settings. They are passed explicitly into the cached helpers so a
# changed value produces a new cache key instead of a stale cached result.
PCA_VARIANCE_TARGET = 0.85
CANDIDATE_K = tuple(range(2, 11))
N_CLUSTERS = 3
RANDOM_STATE = 42
N_INIT = 10


@st.cache_data
def load_data():
    """Download the semicolon-separated red-wine CSV into a DataFrame."""
    return pd.read_csv(DATA_URL, sep=";")


@st.cache_data
def reduce_features(features, variance_target):
    """Standardize ``features`` and run PCA on the result.

    Args:
        features: Numeric feature table (the dataset without ``quality``).
        variance_target: Fraction of variance the reduced projection keeps.

    Returns:
        ``(cumulative_variance, reduced)`` where ``cumulative_variance`` is
        the running sum of explained-variance ratios of a full PCA and
        ``reduced`` is the projection produced by
        ``PCA(n_components=variance_target)``.
    """
    scaled = StandardScaler().fit_transform(features)
    cumulative_variance = np.cumsum(PCA().fit(scaled).explained_variance_ratio_)
    reduced = PCA(n_components=variance_target).fit_transform(scaled)
    return cumulative_variance, reduced


@st.cache_data
def score_cluster_counts(points, k_values, random_state, n_init):
    """Fit K-Means once per candidate ``k`` and collect quality metrics.

    Returns:
        ``(inertia, silhouette)``: two lists aligned with ``k_values``.
    """
    inertia, silhouette = [], []
    for k in k_values:
        km = KMeans(n_clusters=k, random_state=random_state, n_init=n_init)
        labels = km.fit_predict(points)
        inertia.append(km.inertia_)
        silhouette.append(silhouette_score(points, labels))
    return inertia, silhouette


@st.cache_data
def assign_clusters(points, n_clusters, random_state, n_init):
    """Return the K-Means label of every row in ``points``."""
    model = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=n_init)
    return model.fit_predict(points)


# Title
st.title("🍷 Wine Quality Analysis App")

# Load dataset
try:
    # st.cache_data hands back a fresh copy on each call, so adding the
    # "Cluster" column further down does not modify the cached frame.
    df = load_data()
except (OSError, pd.errors.ParserError, pd.errors.EmptyDataError) as exc:
    # Network or parse failure: show a readable message instead of a traceback.
    st.error(f"Could not load the wine-quality dataset: {exc}")
    st.stop()

st.subheader("Preview of Dataset")
st.write(df.head())

# PCA
features = df.drop("quality", axis=1)
explained_variance, pca_features = reduce_features(features, PCA_VARIANCE_TARGET)

st.subheader("PCA Explained Variance")
fig, ax = plt.subplots()
ax.plot(range(1, len(explained_variance) + 1), explained_variance, marker="o")
ax.set_xlabel("Number of Principal Components")
ax.set_ylabel("Cumulative Explained Variance")
st.pyplot(fig)
# The script reruns on every widget interaction; close the figure so pyplot
# does not keep accumulating open figures across reruns.
plt.close(fig)

# Clustering
inertia, silhouette = score_cluster_counts(
    pca_features, CANDIDATE_K, RANDOM_STATE, N_INIT
)

st.subheader("Elbow & Silhouette Method")
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
ax1.plot(CANDIDATE_K, inertia, marker="o")
ax1.set_title("Elbow Method")
ax1.set_xlabel("Clusters")
ax1.set_ylabel("Inertia")
ax2.plot(CANDIDATE_K, silhouette, marker="o", color="orange")
ax2.set_title("Silhouette Score")
ax2.set_xlabel("Clusters")
ax2.set_ylabel("Score")
st.pyplot(fig)
plt.close(fig)

# Apply clustering with 3 clusters
df["Cluster"] = assign_clusters(pca_features, N_CLUSTERS, RANDOM_STATE, N_INIT)

st.subheader("Cluster Profiles")
st.write(df.groupby("Cluster").mean())

# Business Insights
# NOTE(review): these descriptions are hard-coded per label id, but K-Means
# label numbering is arbitrary. Confirm they still match the "Cluster
# Profiles" table above whenever the data, seed, or scikit-learn version
# changes.
cluster_insights = {
    0: "Premium Taste Wines: High alcohol, balanced acidity, high quality",
    1: "Sweet & Mild Wines: High sugar, low acidity, moderate quality",
    2: "Sharp & Preservative-heavy Wines: High acidity, high sulfates, lower quality",
}

st.subheader("Business Insights")
for cluster, desc in cluster_insights.items():
    st.write(f"**Cluster {cluster}:** {desc}")

# ----------------------
# Interactive Section
# ----------------------
st.subheader("🍷 Explore Wines Interactively")

# Slider for alcohol content (defaults to the minimum, i.e. no filtering)
alcohol_val = st.slider(
    "Select minimum alcohol content",
    float(df["alcohol"].min()),
    float(df["alcohol"].max()),
    float(df["alcohol"].min()),
)
filtered_df = df[df["alcohol"] >= alcohol_val]
st.write(f"Wines with alcohol ≥ {alcohol_val}")
st.dataframe(filtered_df)

# Slider for pH (defaults to the maximum, i.e. no filtering); this filter is
# applied on top of the alcohol filter.
ph_val = st.slider(
    "Select maximum pH",
    float(df["pH"].min()),
    float(df["pH"].max()),
    float(df["pH"].max()),
)
ph_filtered = filtered_df[filtered_df["pH"] <= ph_val]
st.write(f"Wines with alcohol ≥ {alcohol_val} and pH ≤ {ph_val}")
st.dataframe(ph_filtered)

# Dropdown for cluster selection (independent of the two sliders above)
cluster_select = st.selectbox(
    "Select Cluster to View", options=sorted(df["Cluster"].unique())
)
cluster_filtered = df[df["Cluster"] == cluster_select]
st.write(f"Wines in Cluster {cluster_select}")
st.dataframe(cluster_filtered)