Riya1217 committed on
Commit
3d35f65
·
verified ·
1 Parent(s): e01340c

Upload 2 files

Browse files
Files changed (2) hide show
  1. requirements.txt +5 -0
  2. wine_analysis_app.py +268 -0
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
streamlit
pandas
numpy
scikit-learn
seaborn
matplotlib
plotly
wine_analysis_app.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.preprocessing import StandardScaler
5
+ from sklearn.decomposition import PCA
6
+ from sklearn.cluster import KMeans
7
+ from sklearn.metrics import silhouette_score
8
+ import matplotlib.pyplot as plt
9
+ import plotly.express as px
10
+ import plotly.graph_objects as go
11
+
# Page-level settings, passed to Streamlit in a single call.
page_settings = {
    "page_title": "Wine Quality Analysis",
    "page_icon": "🍷",
    "layout": "wide",
}
st.set_page_config(**page_settings)

# Heading followed by a short introduction to the app.
intro_text = """
This app analyzes the Wine Quality dataset using unsupervised learning techniques.
Explore the dataset, visualize PCA components, and see clustering results.
"""
st.title("🍷 Wine Quality Analysis")
st.markdown(intro_text)
25
+
# Dataset loader
@st.cache_data
def load_data() -> pd.DataFrame:
    """Fetch the red-wine quality CSV from the UCI archive as a DataFrame.

    The remote file is parsed with ``;`` as the field separator. The function
    is decorated with ``st.cache_data`` so the result can be reused instead
    of being downloaded again on every script rerun.
    """
    source_url = (
        'https://archive.ics.uci.edu/ml/machine-learning-databases/'
        'wine-quality/winequality-red.csv'
    )
    return pd.read_csv(source_url, sep=';')
32
+
wine_data = load_data()

# Sidebar navigation: the chosen label decides which section is rendered below.
section_names = [
    "Dataset Overview",
    "PCA Analysis",
    "Clustering Analysis",
    "Cluster Insights",
]
st.sidebar.title("Navigation")
options = st.sidebar.radio("Select a section:", section_names)
39
+
# Dataset Overview Section
if options == "Dataset Overview":
    st.header("Dataset Overview")

    # Preview of the raw table.
    st.subheader("First few rows of the dataset")
    preview_rows = wine_data.head()
    st.dataframe(preview_rows)

    # Basic facts in two columns: shape/column names on the left,
    # per-column null counts on the right.
    st.subheader("Dataset Information")
    left_col, right_col = st.columns(2)

    with left_col:
        st.write("**Shape:**", wine_data.shape)
        st.write("**Columns:**", list(wine_data.columns))

    with right_col:
        st.write("**Missing values:**")
        null_counts = wine_data.isnull().sum()
        st.write(null_counts)

    # Histogram of one user-selected column. The last column is left out of
    # the choices; it gets its own chart below.
    st.subheader("Feature Distributions")
    feature_choices = wine_data.columns[:-1]
    selected_feature = st.selectbox("Select a feature to visualize:", feature_choices)

    histogram = px.histogram(
        wine_data,
        x=selected_feature,
        title=f"Distribution of {selected_feature}",
    )
    st.plotly_chart(histogram)

    # Bar chart of how many wines received each quality score, in score order.
    st.subheader("Quality Distribution")
    score_counts = wine_data['quality'].value_counts().sort_index()
    bar_chart = px.bar(
        x=score_counts.index,
        y=score_counts.values,
        labels={'x': 'Quality Score', 'y': 'Count'},
        title="Distribution of Wine Quality Scores",
    )
    st.plotly_chart(bar_chart)
71
+
72
+ # PCA Analysis Section
73
+ elif options == "PCA Analysis":
74
+ st.header("Principal Component Analysis (PCA)")
75
+
76
+ # Prepare the data
77
+ features = wine_data.drop('quality', axis=1)
78
+ scaler = StandardScaler()
79
+ scaled_features = scaler.fit_transform(features)
80
+
81
+ # Perform PCA
82
+ pca = PCA()
83
+ pca_result = pca.fit_transform(scaled_features)
84
+
85
+ # Explained variance
86
+ explained_variance = np.cumsum(pca.explained_variance_ratio_)
87
+
88
+ # Plot explained variance
89
+ fig = go.Figure()
90
+ fig.add_trace(go.Scatter(x=list(range(1, len(explained_variance)+1)),
91
+ y=explained_variance,
92
+ mode='lines+markers',
93
+ name='Cumulative Explained Variance'))
94
+ fig.add_trace(go.Scatter(x=list(range(1, len(explained_variance)+1)),
95
+ y=[0.80]*len(explained_variance),
96
+ mode='lines',
97
+ name='80% Variance Threshold',
98
+ line=dict(dash='dash')))
99
+ fig.update_layout(title='PCA Explained Variance',
100
+ xaxis_title='Number of Principal Components',
101
+ yaxis_title='Cumulative Explained Variance')
102
+ st.plotly_chart(fig)
103
+
104
+ # Choose optimal components
105
+ optimal_components = np.argmax(explained_variance >= 0.80) + 1
106
+ st.write(f"**Optimal number of principal components:** {optimal_components} (explains ~80% of variance)")
107
+
108
+ # PCA component interpretation
109
+ pca_components = pd.DataFrame(pca.components_, columns=features.columns)
110
+ main_components = pca_components.iloc[:optimal_components]
111
+
112
+ st.subheader("Main Principal Components Interpretation")
113
+
114
+ for i, row in main_components.iterrows():
115
+ st.write(f"**PC{i+1}** represents major influence from:")
116
+ sorted_features = row.abs().sort_values(ascending=False)
117
+ top_features = list(sorted_features.items())[:3]
118
+
119
+ for feature, value in top_features:
120
+ st.write(f" - {feature} (weight {value:.2f})")
121
+
122
+ # Visualize PCA results
123
+ st.subheader("PCA Visualization")
124
+
125
+ # Select components to visualize
126
+ col1, col2 = st.columns(2)
127
+
128
+ with col1:
129
+ x_component = st.selectbox("X-axis component",
130
+ [f"PC{i+1}" for i in range(optimal_components)],
131
+ index=0)
132
+ with col2:
133
+ y_component = st.selectbox("Y-axis component",
134
+ [f"PC{i+1}" for i in range(optimal_components)],
135
+ index=1)
136
+
137
+ x_idx = int(x_component[2:]) - 1
138
+ y_idx = int(y_component[2:]) - 1
139
+
140
+ # Create scatter plot
141
+ fig = px.scatter(x=pca_result[:, x_idx], y=pca_result[:, y_idx],
142
+ color=wine_data['quality'],
143
+ labels={'x': x_component, 'y': y_component, 'color': 'Quality'},
144
+ title=f"{y_component} vs {x_component} Colored by Quality")
145
+ st.plotly_chart(fig)
146
+
147
+ # Clustering Analysis Section
148
+ elif options == "Clustering Analysis":
149
+ st.header("Clustering Analysis")
150
+
151
+ # Prepare the data
152
+ features = wine_data.drop('quality', axis=1)
153
+ scaler = StandardScaler()
154
+ scaled_features = scaler.fit_transform(features)
155
+
156
+ # Perform PCA for dimensionality reduction
157
+ pca = PCA(n_components=0.85)
158
+ pca_features = pca.fit_transform(scaled_features)
159
+
160
+ # Determine optimal number of clusters
161
+ inertia = []
162
+ silhouette = []
163
+ k_range = range(2, 11)
164
+
165
+ for k in k_range:
166
+ kmeans = KMeans(n_clusters=k, random_state=42)
167
+ labels = kmeans.fit_predict(pca_features)
168
+ inertia.append(kmeans.inertia_)
169
+
170
+ if k > 1: # Silhouette score requires at least 2 clusters
171
+ silhouette.append(silhouette_score(pca_features, labels))
172
+ else:
173
+ silhouette.append(0)
174
+
175
+ # Plot elbow and silhouette methods
176
+ col1, col2 = st.columns(2)
177
+
178
+ with col1:
179
+ fig = go.Figure()
180
+ fig.add_trace(go.Scatter(x=list(k_range), y=inertia, mode='lines+markers'))
181
+ fig.update_layout(title='Elbow Method',
182
+ xaxis_title='Number of Clusters',
183
+ yaxis_title='Inertia')
184
+ st.plotly_chart(fig)
185
+
186
+ with col2:
187
+ fig = go.Figure()
188
+ fig.add_trace(go.Scatter(x=list(k_range)[1:], y=silhouette[1:], mode='lines+markers'))
189
+ fig.update_layout(title='Silhouette Method',
190
+ xaxis_title='Number of Clusters',
191
+ yaxis_title='Silhouette Score')
192
+ st.plotly_chart(fig)
193
+
194
+ # Let user select number of clusters
195
+ k_optimal = st.slider("Select number of clusters:", min_value=2, max_value=10, value=3)
196
+
197
+ # Apply K-Means with selected clusters
198
+ kmeans = KMeans(n_clusters=k_optimal, random_state=42)
199
+ cluster_labels = kmeans.fit_predict(pca_features)
200
+
201
+ # Add cluster labels to the dataframe
202
+ wine_data_clustered = wine_data.copy()
203
+ wine_data_clustered['Cluster'] = cluster_labels
204
+
205
+ # Visualize clusters
206
+ st.subheader("Cluster Visualization")
207
+
208
+ # Create scatter plot of clusters
209
+ fig = px.scatter(x=pca_features[:, 0], y=pca_features[:, 1],
210
+ color=cluster_labels,
211
+ labels={'x': 'PC1', 'y': 'PC2', 'color': 'Cluster'},
212
+ title="Clusters Visualized in PCA Space")
213
+ st.plotly_chart(fig)
214
+
215
+ # Show cluster profiles
216
+ st.subheader("Cluster Profiles")
217
+ cluster_profiles = wine_data_clustered.groupby('Cluster').mean()
218
+ st.dataframe(cluster_profiles)
219
+
220
+ # Cluster Insights Section
221
+ elif options == "Cluster Insights":
222
+ st.header("Cluster Business Insights")
223
+
224
+ # Prepare the data (same as in clustering section)
225
+ features = wine_data.drop('quality', axis=1)
226
+ scaler = StandardScaler()
227
+ scaled_features = scaler.fit_transform(features)
228
+
229
+ pca = PCA(n_components=0.85)
230
+ pca_features = pca.fit_transform(scaled_features)
231
+
232
+ # Use 3 clusters as in the original analysis
233
+ kmeans = KMeans(n_clusters=3, random_state=42)
234
+ cluster_labels = kmeans.fit_predict(pca_features)
235
+
236
+ wine_data_clustered = wine_data.copy()
237
+ wine_data_clustered['Cluster'] = cluster_labels
238
+
239
+ # Define cluster insights (based on the original analysis)
240
+ cluster_insights = {
241
+ 0: "Premium Taste Wines: High alcohol, balanced acidity, high quality",
242
+ 1: "Sweet & Mild Wines: High sugar, low acidity, moderate quality",
243
+ 2: "Sharp & Preservative-heavy Wines: High acidity, high sulfates, lower quality"
244
+ }
245
+
246
+ # Display insights
247
+ for cluster, desc in cluster_insights.items():
248
+ st.subheader(f"Cluster {cluster}")
249
+ st.write(desc)
250
+
251
+ # Show statistics for this cluster
252
+ cluster_data = wine_data_clustered[wine_data_clustered['Cluster'] == cluster]
253
+ st.write(f"Number of wines in this cluster: {len(cluster_data)}")
254
+ st.write(f"Average quality: {cluster_data['quality'].mean():.2f}")
255
+
256
+ # Show key characteristics
257
+ key_features = ['alcohol', 'residual sugar', 'volatile acidity', 'citric acid', 'sulphates']
258
+ cluster_means = cluster_data[key_features].mean()
259
+
260
+ fig = go.Figure()
261
+ fig.add_trace(go.Bar(x=key_features, y=cluster_means.values,
262
+ name=f"Cluster {cluster}"))
263
+ fig.update_layout(title=f"Key Features for Cluster {cluster}",
264
+ yaxis_title="Average Value")
265
+ st.plotly_chart(fig)
266
+
267
+ st.write("---")
268
+