import streamlit as st
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np

# Streamlit dashboard: clusters the UCI red-wine dataset (standardize -> PCA ->
# K-Means) and lets the user inspect one (cluster, quality) segment at a time.
# NOTE: documentation is kept in `#` comments on purpose; a bare top-level
# string literal may be rendered into the page by Streamlit "magic".

# Set matplotlib backend for Streamlit compatibility (no GUI backend needed).
plt.switch_backend('Agg')

# --- Application Title and Introduction ---
st.title('Wine Quality Clustering Insights')
st.markdown("""
This application explores insights derived from applying unsupervised learning (PCA and K-Means clustering)
to a dataset of red wines based on their chemical properties. The goal is to identify distinct segments
of wines that can inform business strategies related to marketing, production, and product development.
""")


# --- Data Loading and Preparation ---
@st.cache_data  # Cache the data loading and preprocessing steps
def load_data() -> pd.DataFrame:
    """Load the red-wine dataset and attach a K-Means cluster label.

    The CSV is downloaded from the UCI repository, every column except
    ``quality`` is standardized, PCA keeps the components needed to explain
    at least 80% of the variance, and K-Means (3 clusters, fixed seed) is
    fitted on the PCA output.

    Returns:
        The original wine DataFrame with an added ``Cluster`` column.
    """
    wine_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
    wine_data = pd.read_csv(wine_url, sep=';')

    # Separate features and target: 'quality' is not used for clustering.
    features = wine_data.drop('quality', axis=1)

    # Standardize features so no single chemical property dominates distances.
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(features)

    # Apply PCA (keeping components explaining >= 80% variance)
    pca = PCA(n_components=0.80)
    pca_features = pca.fit_transform(scaled_features)

    # Apply KMeans clustering with 3 clusters (based on previous analysis).
    # random_state is fixed so the label numbering stays stable between runs;
    # the hard-coded insight texts further down are keyed on these labels.
    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
    wine_data['Cluster'] = kmeans.fit_predict(pca_features)

    return wine_data


# Load the processed data. A failed download (URLError is an OSError) or an
# unparsable file is reported in the page instead of surfacing a raw traceback.
try:
    wine_data = load_data()
except (OSError, pd.errors.ParserError, pd.errors.EmptyDataError) as exc:
    st.error(f"Could not load the wine quality dataset: {exc}")
    st.stop()

# --- Methodology Explanation (Expandable Section) ---
# Defined at column 0 so the Markdown does not depend on indentation stripping.
methodology_text = """
This analysis used the following steps:
1. **Data Preparation:** The dataset was loaded and chemical features were standardized to ensure they are on a similar scale.
2. **Dimensionality Reduction (PCA):** Principal Component Analysis was used to reduce the number of features while retaining most of the original data's variance. This helps in handling multicollinearity and preparing data for clustering.
3. **Clustering (K-Means):** K-Means clustering was applied to the reduced data to group wines with similar chemical properties into distinct clusters. We chose 3 clusters based on evaluation metrics like the Elbow and Silhouette methods (performed in the notebook).
"""
with st.expander("Explain the Methodology"):
    st.markdown(methodology_text)

# --- User Interface Controls ---
st.sidebar.header('Explore Clusters')

# Unique cluster labels, sorted. Cast to built-in int so widget options and
# the dictionary lookup below work with plain Python integers rather than
# NumPy scalar types.
cluster_numbers = sorted(int(label) for label in wine_data['Cluster'].unique())

# Create a selectbox for cluster selection in the sidebar
selected_cluster = st.sidebar.selectbox(
    'Select Cluster',
    cluster_numbers,
)

# Unique quality ratings, sorted. Cast to built-in int so min_value, max_value,
# value and step passed to the slider all share the same Python type.
quality_ratings = sorted(int(rating) for rating in wine_data['quality'].unique())

# Create a slider for quality rating selection in the sidebar
selected_quality = st.sidebar.slider(
    'Select Quality Rating',
    min_value=quality_ratings[0],
    max_value=quality_ratings[-1],
    value=quality_ratings[0],  # Default to the lowest rating present
    step=1,  # Ensure only integer quality values are selected
)

# --- Implement Visualizations ---
st.subheader(f'Characteristics for Cluster {selected_cluster}, Quality {selected_quality}')

# Filter the wine_data DataFrame based on user selection
filtered_data = wine_data[
    (wine_data['Cluster'] == selected_cluster)
    & (wine_data['quality'] == selected_quality)
]

# Not every (cluster, quality) combination necessarily has wines in it.
if filtered_data.empty:
    st.warning(f"No data found for Cluster {selected_cluster} with Quality {selected_quality}.")
else:
    # Bar chart: mean of every chemical feature within the selected segment.
    avg_features = filtered_data.drop(['quality', 'Cluster'], axis=1).mean()
    fig1, ax1 = plt.subplots(figsize=(10, 5))
    avg_features.plot(kind='bar', ax=ax1, color='skyblue')

    ax1.set_xlabel('Chemical Features')
    ax1.set_ylabel('Average Value')
    ax1.set_title(f'Average Chemical Features for Cluster {selected_cluster}, Quality {selected_quality}')
    # Rotate labels for readability. Target ax1 explicitly rather than going
    # through pyplot's implicit "current axes".
    plt.setp(ax1.get_xticklabels(), rotation=45, ha='right')
    fig1.tight_layout()  # Adjust layout to prevent labels overlapping

    st.pyplot(fig1)
    plt.close(fig1)  # Close the figure to free memory

    # Scatter plot of 'alcohol' vs 'volatile acidity' for the same segment.
    fig2, ax2 = plt.subplots(figsize=(8, 5))
    ax2.scatter(
        filtered_data['alcohol'],
        filtered_data['volatile acidity'],
        alpha=0.6,
        color='lightcoral',
    )

    ax2.set_xlabel('Alcohol')
    ax2.set_ylabel('Volatile Acidity')
    ax2.set_title(f'Alcohol vs Volatile Acidity for Cluster {selected_cluster}, Quality {selected_quality}')
    ax2.grid(True, linestyle='--', alpha=0.6)

    st.pyplot(fig2)
    plt.close(fig2)  # Close the figure to free memory

# --- Display Dynamic Insights ---
st.subheader("Cluster Insights and Recommendations")

# Hand-written description and recommendation per K-Means label.
cluster_insights = {
    0: {
        "Description": "Premium Taste Wines: Balanced acidity, high alcohol, high quality",
        "Recommendation": "Market to wine connoisseurs; premium pricing; emphasize quality in promotions.",
    },
    1: {
        "Description": "Sweet & Mild Wines: Higher sugar, lower acidity, moderate quality",
        "Recommendation": "Target casual drinkers; affordable pricing; highlight smooth and approachable taste.",
    },
    2: {
        "Description": "Sharp & Preservative-heavy Wines: High acidity, higher sulfates, lower quality",
        "Recommendation": "Target budget-conscious customers; optimize production to reduce sulfates; focus on cost-efficiency.",
    },
}

if selected_cluster in cluster_insights:
    insight = cluster_insights[selected_cluster]
    st.markdown(f"**Cluster {selected_cluster}:**")
    st.markdown(f"- **Description:** {insight['Description']}")
    st.markdown(f"- **Recommendation:** {insight['Recommendation']}")

    # Add a dynamic message based on quality: ratings of 6 and above are
    # presented as "higher quality", everything below as "moderate to lower".
    if selected_quality >= 6:
        st.info(
            f"Based on your selection, wines in this segment "
            f"(Cluster {selected_cluster}, Quality {selected_quality}) show "
            f"characteristics often associated with *higher quality* wines."
        )
    else:
        st.info(
            f"Based on your selection, wines in this segment "
            f"(Cluster {selected_cluster}, Quality {selected_quality}) show "
            f"characteristics often associated with *moderate to lower quality* wines. "
            f"This segment might be suitable for value-focused markets or "
            f"present opportunities for quality improvement."
        )
else:
    st.write("Select a cluster to see insights.")

# --- Concluding Section ---
st.markdown("""
---

**Key Takeaways:**

* The clustering analysis reveals distinct groups of wines based on their chemical composition.
* Understanding these clusters allows for targeted marketing and product strategies.
* Wines in Cluster 0 tend to align with 'Premium Taste', Cluster 1 with 'Sweet & Mild', and Cluster 2 with 'Sharp & Preservative-heavy'.
* Quality ratings within each cluster can vary, providing further granularity for decision-making.

**Next Steps:**

* Validate these clusters with sensory evaluation data.
* Integrate these insights into marketing campaigns and production planning.
* Explore other clustering algorithms or feature engineering techniques.
""")

# --- requirements.txt content ---
requirements_content = """streamlit
pandas
scikit-learn
matplotlib
numpy
"""

# Print the requirements.txt content for deployment. This goes to the
# process's stdout (not the page) each time the script body executes.
print("\n--- requirements.txt content ---")
print(requirements_content)
print("--- end requirements.txt content ---")