Spaces:
Sleeping
Sleeping
Upload 2 files
Browse files- assignmentwine.py +111 -0
assignmentwine.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import matplotlib.pyplot as plt
|
| 4 |
+
import numpy as np
|
| 5 |
+
from sklearn.preprocessing import StandardScaler
|
| 6 |
+
from sklearn.decomposition import PCA
|
| 7 |
+
from sklearn.cluster import KMeans
|
| 8 |
+
from sklearn.metrics import silhouette_score
|
| 9 |
+
|
| 10 |
+
# Page heading rendered at the top of the Streamlit app.
APP_TITLE = "🍷 Wine Quality Analysis App"
st.title(APP_TITLE)
|
| 12 |
+
|
| 13 |
+
# Load dataset
@st.cache_data
def load_data():
    """Fetch the UCI red-wine quality CSV and return it as a DataFrame.

    The remote file is read with ``;`` as the field separator. The
    function is wrapped in ``st.cache_data``, so Streamlit can reuse the
    result instead of downloading the file again.
    """
    return pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv",
        sep=";",
    )
|
| 18 |
+
|
| 19 |
+
# Load the dataset and display its first rows under a section heading.
df = load_data()
st.subheader("Preview of Dataset")
head_rows = df.head()
st.write(head_rows)
|
| 22 |
+
|
| 23 |
+
# PCA
# Use every column except the "quality" target as input, standardised so
# that each feature contributes on the same scale.
features = df.drop("quality", axis=1)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Fit a PCA that keeps all components; the cumulative sum of the
# per-component variance ratios is what the chart below plots.
pca = PCA()
pca_result = pca.fit_transform(scaled_features)
explained_variance = np.cumsum(pca.explained_variance_ratio_)

st.subheader("PCA Explained Variance")
fig, ax = plt.subplots()
# x axis is the 1-based number of components retained.
ax.plot(range(1, len(explained_variance) + 1), explained_variance, marker="o")
ax.set_xlabel("Number of Principal Components")
ax.set_ylabel("Cumulative Explained Variance")
st.pyplot(fig)
# pyplot keeps a reference to every figure it creates until it is closed.
# Streamlit re-executes this script on each interaction, so without an
# explicit close the figures pile up in the server process.
plt.close(fig)
|
| 38 |
+
|
| 39 |
+
# Clustering
# A float n_components asks PCA to keep as many components as are needed
# to reach that fraction (here 85%) of explained variance.
pca_features = PCA(n_components=0.85).fit_transform(scaled_features)


@st.cache_data
def _evaluate_cluster_counts(data, k_min, k_max):
    """Fit KMeans for each k in ``[k_min, k_max]`` and score the result.

    Args:
        data: Feature matrix to cluster (the PCA-reduced wine features).
        k_min: Smallest number of clusters to try (inclusive).
        k_max: Largest number of clusters to try (inclusive).

    Returns:
        A ``(inertias, silhouettes)`` pair of lists, one entry per k in
        ascending order of k.

    The sweep is the most expensive step in the script. It is cached so
    that widget interactions, which re-run the whole script, do not
    repeat the fits for unchanged input.
    """
    inertias, silhouettes = [], []
    for k in range(k_min, k_max + 1):
        km = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = km.fit_predict(data)
        inertias.append(km.inertia_)
        silhouettes.append(silhouette_score(data, labels))
    return inertias, silhouettes


K = range(2, 11)
inertia, silhouette = _evaluate_cluster_counts(pca_features, K.start, K.stop - 1)

st.subheader("Elbow & Silhouette Method")
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: within-cluster sum of squares against k.
ax1.plot(K, inertia, marker="o")
ax1.set_title("Elbow Method")
ax1.set_xlabel("Clusters")
ax1.set_ylabel("Inertia")

# Right panel: mean silhouette coefficient against k.
ax2.plot(K, silhouette, marker="o", color="orange")
ax2.set_title("Silhouette Score")
ax2.set_xlabel("Clusters")
ax2.set_ylabel("Score")
st.pyplot(fig)
# Close the figure once rendered; pyplot otherwise retains it across the
# script reruns Streamlit performs on every interaction.
plt.close(fig)
|
| 62 |
+
|
| 63 |
+
# Apply clustering with 3 clusters
# Fit the final model on the PCA-reduced features and store each wine's
# cluster label as a new "Cluster" column on the dataframe.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(pca_features)
df["Cluster"] = cluster_labels

st.subheader("Cluster Profiles")
# Per-cluster mean of every column.
cluster_profiles = df.groupby("Cluster").mean()
st.write(cluster_profiles)
|
| 69 |
+
|
| 70 |
+
# Business Insights
# Fixed, hand-written description for each cluster label.
# NOTE(review): these texts are hard-coded and not derived from the
# cluster profiles computed by this script; confirm they still match the
# profile table whenever the data or clustering settings change.
cluster_insights = {
    0: "Premium Taste Wines: High alcohol, balanced acidity, high quality",
    1: "Sweet & Mild Wines: High sugar, low acidity, moderate quality",
    2: "Sharp & Preservative-heavy Wines: High acidity, high sulfates, lower quality",
}
st.subheader("Business Insights")
for cluster in cluster_insights:
    desc = cluster_insights[cluster]
    st.write(f"**Cluster {cluster}:** {desc}")
|
| 79 |
+
|
| 80 |
+
# ----------------------
# Interactive Section
# ----------------------
st.subheader("🍷 Explore Wines Interactively")

# Slider for alcohol content: spans the observed range and starts at the
# minimum, so no wine is excluded initially.
alcohol_low = float(df['alcohol'].min())
alcohol_high = float(df['alcohol'].max())
alcohol_val = st.slider(
    "Select minimum alcohol content",
    min_value=alcohol_low,
    max_value=alcohol_high,
    value=alcohol_low,
)
alcohol_matches = df.loc[df['alcohol'] >= alcohol_val]
st.write(f"Wines with alcohol ≥ {alcohol_val}")
st.dataframe(alcohol_matches)

# Slider for pH: spans the observed range and starts at the maximum. The
# pH cut is applied on top of the alcohol filter above.
ph_low = float(df['pH'].min())
ph_high = float(df['pH'].max())
ph_val = st.slider(
    "Select maximum pH",
    min_value=ph_low,
    max_value=ph_high,
    value=ph_high,
)
ph_matches = alcohol_matches.loc[alcohol_matches['pH'] <= ph_val]
st.write(f"Wines with alcohol ≥ {alcohol_val} and pH ≤ {ph_val}")
st.dataframe(ph_matches)

# Dropdown for cluster selection: lists the cluster labels present in the
# data in ascending order; this view ignores the two sliders.
cluster_choices = sorted(df['Cluster'].unique())
cluster_select = st.selectbox("Select Cluster to View", options=cluster_choices)
cluster_members = df.loc[df['Cluster'] == cluster_select]
st.write(f"Wines in Cluster {cluster_select}")
st.dataframe(cluster_members)
|