# assignment33 / assignment3.py
# Riya1217's picture
# Upload 2 files
# ec41e13 verified
import streamlit as st
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
# Switch pyplot to the 'Agg' backend (Streamlit compatibility)
plt.switch_backend('Agg')

# --- Application Title and Introduction ---
# The heading and the introductory paragraph are named first, then rendered
# in the same order as before: title, then description.
page_title = 'Wine Quality Clustering Insights'
intro_text = """
This application explores insights derived from applying unsupervised learning (PCA and K-Means clustering)
to a dataset of red wines based on their chemical properties. The goal is to identify distinct
segments of wines that can inform business strategies related to marketing, production,
and product development.
"""
st.title(page_title)
st.markdown(intro_text)
# --- Data Loading and Preparation ---
@st.cache_data  # Cache the data loading and preprocessing steps
def load_data() -> pd.DataFrame:
    """Load the red-wine dataset and assign every wine to a K-Means cluster.

    All columns except ``quality`` are standardized, reduced with PCA
    (``n_components=0.80``) and clustered into three groups with K-Means.

    Returns:
        The loaded table with one additional ``Cluster`` column containing
        the K-Means label predicted for each row.
    """
    wine_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
    # The file is semicolon-separated
    wine_data = pd.read_csv(wine_url, sep=';')

    # The quality score is the target, so it is excluded from the model input
    features = wine_data.drop(columns='quality')

    # Standardize features before PCA / K-Means
    scaled_features = StandardScaler().fit_transform(features)

    # Apply PCA (keeping components explaining >= 80% variance)
    pca_features = PCA(n_components=0.80).fit_transform(scaled_features)

    # Three clusters (chosen in earlier analysis); a fixed seed and explicit
    # n_init keep the fit repeatable from run to run
    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
    wine_data['Cluster'] = kmeans.fit_predict(pca_features)
    return wine_data
# Load the processed data (wines with their cluster labels attached)
wine_data = load_data()

# --- Methodology Explanation (Expandable Section) ---
# The markdown call is nested inside the expander so the explanation is
# rendered within the expandable section.
with st.expander("Explain the Methodology"):
    st.markdown("""
This analysis used the following steps:
1. **Data Preparation:** The dataset was loaded and chemical features were standardized to ensure they
are on a similar scale.
2. **Dimensionality Reduction (PCA):** Principal Component Analysis was used to reduce the number
of features while retaining most of the original data's variance. This helps in handling
multicollinearity and preparing data for clustering.
3. **Clustering (K-Means):** K-Means clustering was applied to the reduced data to group wines
with similar chemical properties into distinct clusters. We chose 3 clusters based on
evaluation metrics like the Elbow and Silhouette methods (performed in the notebook).
""")
# --- User Interface Controls ---
st.sidebar.header('Explore Clusters')

# Distinct cluster labels in ascending order. Series.unique() yields NumPy
# scalars; they are converted to built-in ints so the widget options (and the
# value the selectbox returns) are plain Python numbers.
cluster_numbers = sorted(int(label) for label in wine_data['Cluster'].unique())

# Cluster picker in the sidebar
selected_cluster = st.sidebar.selectbox('Select Cluster', cluster_numbers)

# Distinct quality ratings in ascending order, also as built-in ints so that
# min_value, max_value and value passed to the slider share one integer type.
quality_ratings = sorted(int(rating) for rating in wine_data['quality'].unique())
lowest_quality, highest_quality = quality_ratings[0], quality_ratings[-1]

# Quality picker in the sidebar, spanning the ratings present in the data
selected_quality = st.sidebar.slider(
    'Select Quality Rating',
    min_value=lowest_quality,
    max_value=highest_quality,
    value=lowest_quality,  # Default to the lowest rating present
    step=1,  # Ensure only integer quality values are selected
)
# --- Implement Visualizations ---
st.subheader(f'Characteristics for Cluster {selected_cluster}, Quality {selected_quality}')

# Keep only the wines that match both sidebar selections
filtered_data = wine_data[
    (wine_data['Cluster'] == selected_cluster) &
    (wine_data['quality'] == selected_quality)
]

if filtered_data.empty:
    # Nothing to plot for this combination; tell the user instead
    st.warning(f"No data found for Cluster {selected_cluster} with Quality {selected_quality}.")
else:
    # Bar chart: mean of every chemical feature within the selection
    avg_features = filtered_data.drop(columns=['quality', 'Cluster']).mean()
    fig1, ax1 = plt.subplots(figsize=(10, 5))
    avg_features.plot(kind='bar', ax=ax1, color='skyblue')
    ax1.set_xlabel('Chemical Features')
    ax1.set_ylabel('Average Value')
    ax1.set_title(f'Average Chemical Features for Cluster {selected_cluster}, Quality {selected_quality}')
    # Rotate the labels of this axes explicitly rather than through pyplot's
    # implicit "current axes" state
    plt.setp(ax1.get_xticklabels(), rotation=45, ha='right')
    fig1.tight_layout()  # Adjust layout to prevent labels overlapping
    st.pyplot(fig1)
    plt.close(fig1)  # Close the figure to free memory

    # Scatter plot: 'alcohol' against 'volatile acidity' for the same wines
    fig2, ax2 = plt.subplots(figsize=(8, 5))
    ax2.scatter(
        filtered_data['alcohol'],
        filtered_data['volatile acidity'],
        alpha=0.6,
        color='lightcoral',
    )
    ax2.set_xlabel('Alcohol')
    ax2.set_ylabel('Volatile Acidity')
    ax2.set_title(f'Alcohol vs Volatile Acidity for Cluster {selected_cluster}, Quality {selected_quality}')
    ax2.grid(True, linestyle='--', alpha=0.6)
    st.pyplot(fig2)
    plt.close(fig2)  # Close the figure to free memory
# --- Display Dynamic Insights ---
st.subheader("Cluster Insights and Recommendations")

# Description and recommendation text, keyed by cluster label
cluster_insights = {
    0: {
        "Description": "Premium Taste Wines: Balanced acidity, high alcohol, high quality",
        "Recommendation": "Market to wine connoisseurs; premium pricing; emphasize quality in promotions."
    },
    1: {
        "Description": "Sweet & Mild Wines: Higher sugar, lower acidity, moderate quality",
        "Recommendation": "Target casual drinkers; affordable pricing; highlight smooth and approachable taste."
    },
    2: {
        "Description": "Sharp & Preservative-heavy Wines: High acidity, higher sulfates, lower quality",
        "Recommendation": "Target budget-conscious customers; optimize production to reduce sulfates; focus on cost-efficiency."
    }
}

insight = cluster_insights.get(selected_cluster)
if insight is None:
    # No text is defined for this cluster label
    st.write("Select a cluster to see insights.")
else:
    st.markdown(f"**Cluster {selected_cluster}:**")
    st.markdown(f"- **Description:** {insight['Description']}")
    st.markdown(f"- **Recommendation:** {insight['Recommendation']}")

    # Add a dynamic message based on quality. It describes "wines in this
    # segment", so it is skipped when the current cluster/quality selection
    # contains no wines at all.
    if filtered_data.empty:
        quality_note = None
    elif selected_quality >= 6:
        quality_note = f"Based on your selection, wines in this segment (Cluster {selected_cluster}, Quality {selected_quality}) show characteristics often associated with *higher quality* wines."
    else:
        quality_note = f"Based on your selection, wines in this segment (Cluster {selected_cluster}, Quality {selected_quality}) show characteristics often associated with *moderate to lower quality* wines. This segment might be suitable for value-focused markets or present opportunities for quality improvement."

    if quality_note is not None:
        st.info(quality_note)
# --- Concluding Section ---
# A blank line separates the last "Key Takeaways" bullet from the
# "Next Steps" heading; without it Markdown treats the heading as a
# continuation of that bullet and merges both lists into one.
st.markdown("""
---
**Key Takeaways:**
* The clustering analysis reveals distinct groups of wines based on their chemical composition.
* Understanding these clusters allows for targeted marketing and product strategies.
* Wines in Cluster 0 tend to align with 'Premium Taste', Cluster 1 with 'Sweet & Mild', and Cluster 2 with 'Sharp & Preservative-heavy'.
* Quality ratings within each cluster can vary, providing further granularity for decision-making.

**Next Steps:**
* Validate these clusters with sensory evaluation data.
* Integrate these insights into marketing campaigns and production planning.
* Explore other clustering algorithms or feature engineering techniques.
""")
# --- requirements.txt content ---
# One package name per line, each line newline-terminated
requirements_content = "".join(
    f"{package}\n"
    for package in ("streamlit", "pandas", "scikit-learn", "matplotlib", "numpy")
)

# Print the requirements.txt content for deployment, framed by marker lines
print(
    "\n--- requirements.txt content ---",
    requirements_content,
    "--- end requirements.txt content ---",
    sep="\n",
)