Riya1217 committed on
Commit
ec41e13
·
verified ·
1 Parent(s): e19311f

Upload 2 files

Browse files
Files changed (2) hide show
  1. assignment3.py +190 -0
  2. requriements.txt +0 -0
assignment3.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from sklearn.preprocessing import StandardScaler
4
+ from sklearn.decomposition import PCA
5
+ from sklearn.cluster import KMeans
6
+ import matplotlib.pyplot as plt
7
+ import numpy as np
8
+
9
# Select the Agg matplotlib backend for Streamlit compatibility.
plt.switch_backend('Agg')

# --- Application Title and Introduction ---
# The introduction is kept in a named constant so the page layout below
# reads as a short sequence of Streamlit calls.
intro_text = """
This application explores insights derived from applying unsupervised learning (PCA and K-Means clustering)
to a dataset of red wines based on their chemical properties. The goal is to identify distinct
segments of wines that can inform business strategies related to marketing, production,
and product development.
"""

st.title('Wine Quality Clustering Insights')
st.markdown(intro_text)
20
+
21
+ # --- Data Loading and Preparation ---
22
@st.cache_data  # Cache the data loading and preprocessing steps
def load_data(
    url: str = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv',
    n_clusters: int = 3,
    variance_to_keep: float = 0.80,
    random_state: int = 42,
) -> pd.DataFrame:
    """Load the wine dataset and label every row with a K-Means cluster.

    The pipeline is: read the semicolon-separated CSV, drop the ``quality``
    column to obtain the features, standardize them, reduce them with PCA,
    and cluster the reduced features with K-Means.

    Args:
        url: Location of the semicolon-separated CSV file to read.
        n_clusters: Number of K-Means clusters to fit.
        variance_to_keep: Value passed to ``PCA(n_components=...)``; as a
            fraction it asks PCA to keep enough components to explain at
            least that share of the variance.
        random_state: Seed passed to K-Means so cluster labels are
            reproducible between runs.

    Returns:
        The loaded DataFrame (all original columns, including ``quality``)
        with an additional integer ``Cluster`` column.
    """
    wine_data = pd.read_csv(url, sep=';')

    # Cluster on the chemical features only; the quality rating is the target
    # and stays in the returned frame for filtering in the UI.
    features = wine_data.drop(columns='quality')

    # Standardize features so that no single one dominates PCA / K-Means
    # purely because of its scale.
    scaled_features = StandardScaler().fit_transform(features)

    # Apply PCA, keeping the requested share of variance.
    pca_features = PCA(n_components=variance_to_keep).fit_transform(scaled_features)

    # Apply K-Means on the reduced features and store the labels.
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
    wine_data['Cluster'] = kmeans.fit_predict(pca_features)

    return wine_data
44
+
45
# Fetch the clustered dataset (cached by Streamlit after the first run).
wine_data = load_data()

# --- Methodology Explanation (Expandable Section) ---
# The explanation lives in a collapsed expander so it does not crowd the
# main page.
methodology_panel = st.expander("Explain the Methodology")
with methodology_panel:
    st.markdown("""
    This analysis used the following steps:
    1. **Data Preparation:** The dataset was loaded and chemical features were standardized to ensure they
    are on a similar scale.
    2. **Dimensionality Reduction (PCA):** Principal Component Analysis was used to reduce the number
    of features while retaining most of the original data's variance. This helps in handling
    multicollinearity and preparing data for clustering.
    3. **Clustering (K-Means):** K-Means clustering was applied to the reduced data to group wines
    with similar chemical properties into distinct clusters. We chose 3 clusters based on
    evaluation metrics like the Elbow and Silhouette methods (performed in the notebook).
    """)
61
+
62
# --- User Interface Controls ---
st.sidebar.header('Explore Clusters')

# Cluster picker: one entry per distinct cluster label, in ascending order.
cluster_numbers = sorted(wine_data['Cluster'].unique())
selected_cluster = st.sidebar.selectbox('Select Cluster', options=cluster_numbers)

# Quality picker: a slider spanning the observed quality ratings.
quality_ratings = sorted(wine_data['quality'].unique())
# The list is sorted, so its ends are the minimum and maximum ratings.
lowest_quality, highest_quality = quality_ratings[0], quality_ratings[-1]
selected_quality = st.sidebar.slider(
    'Select Quality Rating',
    min_value=lowest_quality,
    max_value=highest_quality,
    value=lowest_quality,  # Start at the lowest rating
    step=1,  # Quality ratings are whole numbers
)
85
+
86
# --- Implement Visualizations ---
st.subheader(f'Characteristics for Cluster {selected_cluster}, Quality {selected_quality}')

# Keep only the wines that match both sidebar selections.
in_selected_cluster = wine_data['Cluster'] == selected_cluster
at_selected_quality = wine_data['quality'] == selected_quality
filtered_data = wine_data[in_selected_cluster & at_selected_quality]

if not filtered_data.empty:
    # Bar chart: mean of every chemical feature for the selected segment.
    avg_features = filtered_data.drop(['quality', 'Cluster'], axis=1).mean()

    fig1, ax1 = plt.subplots(figsize=(10, 5))
    avg_features.plot(kind='bar', ax=ax1, color='skyblue')
    ax1.set(
        xlabel='Chemical Features',
        ylabel='Average Value',
        title=f'Average Chemical Features for Cluster {selected_cluster}, Quality {selected_quality}',
    )
    # fig1 is still the current pyplot figure here, so this rotates its labels.
    plt.xticks(rotation=45, ha='right')
    fig1.tight_layout()  # Prevent the rotated labels from being clipped

    st.pyplot(fig1)
    plt.close(fig1)  # Release the figure once Streamlit has rendered it

    # Scatter plot: alcohol against volatile acidity for the same segment.
    fig2, ax2 = plt.subplots(figsize=(8, 5))
    ax2.scatter(
        filtered_data['alcohol'],
        filtered_data['volatile acidity'],
        alpha=0.6,
        color='lightcoral',
    )
    ax2.set(
        xlabel='Alcohol',
        ylabel='Volatile Acidity',
        title=f'Alcohol vs Volatile Acidity for Cluster {selected_cluster}, Quality {selected_quality}',
    )
    ax2.grid(True, linestyle='--', alpha=0.6)

    st.pyplot(fig2)
    plt.close(fig2)  # Release the figure once Streamlit has rendered it
else:
    # Nothing to plot for this cluster/quality combination.
    st.warning(f"No data found for Cluster {selected_cluster} with Quality {selected_quality}.")
129
+
130
# --- Display Dynamic Insights ---
st.subheader("Cluster Insights and Recommendations")

# Hand-written description and business recommendation per cluster label.
cluster_insights = {
    0: {
        "Description": "Premium Taste Wines: Balanced acidity, high alcohol, high quality",
        "Recommendation": "Market to wine connoisseurs; premium pricing; emphasize quality in promotions.",
    },
    1: {
        "Description": "Sweet & Mild Wines: Higher sugar, lower acidity, moderate quality",
        "Recommendation": "Target casual drinkers; affordable pricing; highlight smooth and approachable taste.",
    },
    2: {
        "Description": "Sharp & Preservative-heavy Wines: High acidity, higher sulfates, lower quality",
        "Recommendation": "Target budget-conscious customers; optimize production to reduce sulfates; focus on cost-efficiency.",
    },
}

if selected_cluster not in cluster_insights:
    # No entry is defined for this cluster label.
    st.write("Select a cluster to see insights.")
else:
    insight = cluster_insights[selected_cluster]
    st.markdown(f"**Cluster {selected_cluster}:**")
    st.markdown(f"- **Description:** {insight['Description']}")
    st.markdown(f"- **Recommendation:** {insight['Recommendation']}")

    # Pick the quality-dependent note: ratings of 6 and above are treated as
    # higher quality, everything else as moderate to lower quality.
    if selected_quality >= 6:
        quality_note = f"Based on your selection, wines in this segment (Cluster {selected_cluster}, Quality {selected_quality}) show characteristics often associated with *higher quality* wines."
    else:
        quality_note = f"Based on your selection, wines in this segment (Cluster {selected_cluster}, Quality {selected_quality}) show characteristics often associated with *moderate to lower quality* wines. This segment might be suitable for value-focused markets or present opportunities for quality improvement."
    st.info(quality_note)
163
+
164
# --- Concluding Section ---
# Static closing notes shown below the interactive content; the leading
# '---' renders as a horizontal rule in markdown.
closing_notes = """
---
**Key Takeaways:**
* The clustering analysis reveals distinct groups of wines based on their chemical composition.
* Understanding these clusters allows for targeted marketing and product strategies.
* Wines in Cluster 0 tend to align with 'Premium Taste', Cluster 1 with 'Sweet & Mild', and Cluster 2 with 'Sharp & Preservative-heavy'.
* Quality ratings within each cluster can vary, providing further granularity for decision-making.

**Next Steps:**
* Validate these clusters with sensory evaluation data.
* Integrate these insights into marketing campaigns and production planning.
* Explore other clustering algorithms or feature engineering techniques.
"""
st.markdown(closing_notes)
178
+
179
# --- requirements.txt content ---
# One package per line, each terminated by a newline, in the layout expected
# for a requirements.txt file.
requirements_content = "".join(
    f"{package}\n"
    for package in ("streamlit", "pandas", "scikit-learn", "matplotlib", "numpy")
)

# Echo the requirements to stdout so they can be copied for deployment.
print("\n--- requirements.txt content ---")
print(requirements_content)
print("--- end requirements.txt content ---")
requriements.txt ADDED
Binary file (202 Bytes). View file