Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| from sklearn.preprocessing import StandardScaler | |
| from sklearn.decomposition import PCA | |
| from sklearn.cluster import KMeans | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
# Use the non-interactive Agg backend for Streamlit compatibility.
plt.switch_backend('Agg')

# --- Application Title and Introduction ---
# The introduction is kept in a named constant so the rendering calls below
# read as a short sequence: title first, then the descriptive paragraph.
app_intro = """
This application explores insights derived from applying unsupervised learning (PCA and K-Means clustering)
to a dataset of red wines based on their chemical properties. The goal is to identify distinct
segments of wines that can inform business strategies related to marketing, production,
and product development.
"""

st.title('Wine Quality Clustering Insights')
st.markdown(app_intro)
# --- Data Loading and Preparation ---
# Streamlit re-executes the whole script on every widget interaction, so the
# download and model fitting are cached to run once instead of on each rerun.
@st.cache_data
def load_data() -> pd.DataFrame:
    """Load the red-wine dataset and label every wine with a K-Means cluster.

    All columns except ``quality`` are treated as features. They are
    standardized, projected with PCA (keeping enough components to explain
    at least 80% of the variance), and clustered into three groups.

    Returns:
        The downloaded DataFrame with one extra ``Cluster`` column holding
        the K-Means label assigned to each row.

    Raises:
        Any exception raised by ``pandas.read_csv`` when the CSV cannot be
        fetched or parsed; it is not handled here.
    """
    wine_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
    wine_data = pd.read_csv(wine_url, sep=';')

    # Separate features and target: 'quality' is excluded from clustering.
    features = wine_data.drop('quality', axis=1)

    # Standardize features so no single column dominates PCA or K-Means.
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(features)

    # Apply PCA (keeping components explaining >= 80% variance).
    pca = PCA(n_components=0.80)
    pca_features = pca.fit_transform(scaled_features)

    # Apply KMeans clustering with 3 clusters (based on previous analysis).
    # A fixed random_state keeps the label assignment reproducible.
    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
    wine_data['Cluster'] = kmeans.fit_predict(pca_features)

    return wine_data
# Load the processed data (features plus the 'Cluster' label column).
wine_data = load_data()

# --- Methodology Explanation (Expandable Section) ---
# The explanatory text is defined up front; the expander below only renders it.
methodology_text = """
This analysis used the following steps:
1. **Data Preparation:** The dataset was loaded and chemical features were standardized to ensure they
are on a similar scale.
2. **Dimensionality Reduction (PCA):** Principal Component Analysis was used to reduce the number
of features while retaining most of the original data's variance. This helps in handling
multicollinearity and preparing data for clustering.
3. **Clustering (K-Means):** K-Means clustering was applied to the reduced data to group wines
with similar chemical properties into distinct clusters. We chose 3 clusters based on
evaluation metrics like the Elbow and Silhouette methods (performed in the notebook).
"""

with st.expander("Explain the Methodology"):
    st.markdown(methodology_text)
# --- User Interface Controls ---
st.sidebar.header('Explore Clusters')

# Offer the cluster labels in ascending order. ``unique()`` yields NumPy
# integers; they are converted to built-in ints so the selected value is a
# plain Python int wherever it is later compared, formatted or used as a key.
cluster_numbers = sorted(int(label) for label in wine_data['Cluster'].unique())

# Create a selectbox for cluster selection in the sidebar
selected_cluster = st.sidebar.selectbox(
    'Select Cluster',
    cluster_numbers,
)

# Get unique quality ratings and sort them
quality_ratings = sorted(wine_data['quality'].unique())

# The slider is given a plain-int ``step``; its bounds and default are cast
# to built-in ints as well so every numeric argument has the same native type
# instead of mixing NumPy integers with Python ints.
lowest_quality = int(min(quality_ratings))
highest_quality = int(max(quality_ratings))

# Create a slider for quality rating selection in the sidebar
selected_quality = st.sidebar.slider(
    'Select Quality Rating',
    min_value=lowest_quality,
    max_value=highest_quality,
    value=lowest_quality,  # Default to the lowest rating present in the data
    step=1,  # Ensure only integer quality values are selected
)
# --- Implement Visualizations ---
st.subheader(f'Characteristics for Cluster {selected_cluster}, Quality {selected_quality}')

# Keep only the wines that match both sidebar selections.
filtered_data = wine_data[
    (wine_data['Cluster'] == selected_cluster) &
    (wine_data['quality'] == selected_quality)
]

if filtered_data.empty:
    # Not every cluster/quality combination occurs; say so instead of
    # drawing empty charts.
    st.warning(f"No data found for Cluster {selected_cluster} with Quality {selected_quality}.")
else:
    # Bar chart: mean of each chemical feature within the selection.
    avg_features = filtered_data.drop(['quality', 'Cluster'], axis=1).mean()
    fig1, ax1 = plt.subplots(figsize=(10, 5))
    avg_features.plot(kind='bar', ax=ax1, color='skyblue')
    ax1.set_xlabel('Chemical Features')
    ax1.set_ylabel('Average Value')
    ax1.set_title(f'Average Chemical Features for Cluster {selected_cluster}, Quality {selected_quality}')
    # Rotate the tick labels through ax1 itself. plt.xticks() would act on
    # pyplot's implicit "current axes", which is global state and is not
    # guaranteed to be this figure.
    for tick_label in ax1.get_xticklabels():
        tick_label.set_rotation(45)
        tick_label.set_horizontalalignment('right')
    fig1.tight_layout()  # Adjust layout to prevent labels overlapping
    st.pyplot(fig1)
    plt.close(fig1)  # Close the figure to free memory

    # Scatter plot: alcohol against volatile acidity for the same selection.
    fig2, ax2 = plt.subplots(figsize=(8, 5))
    ax2.scatter(
        filtered_data['alcohol'],
        filtered_data['volatile acidity'],
        alpha=0.6,
        color='lightcoral',
    )
    ax2.set_xlabel('Alcohol')
    ax2.set_ylabel('Volatile Acidity')
    ax2.set_title(f'Alcohol vs Volatile Acidity for Cluster {selected_cluster}, Quality {selected_quality}')
    ax2.grid(True, linestyle='--', alpha=0.6)
    st.pyplot(fig2)
    plt.close(fig2)  # Close the figure to free memory
# --- Display Dynamic Insights ---
st.subheader("Cluster Insights and Recommendations")

# Hand-written description and recommendation for each cluster label.
cluster_insights = {
    0: {
        "Description": "Premium Taste Wines: Balanced acidity, high alcohol, high quality",
        "Recommendation": "Market to wine connoisseurs; premium pricing; emphasize quality in promotions."
    },
    1: {
        "Description": "Sweet & Mild Wines: Higher sugar, lower acidity, moderate quality",
        "Recommendation": "Target casual drinkers; affordable pricing; highlight smooth and approachable taste."
    },
    2: {
        "Description": "Sharp & Preservative-heavy Wines: High acidity, higher sulfates, lower quality",
        "Recommendation": "Target budget-conscious customers; optimize production to reduce sulfates; focus on cost-efficiency."
    }
}

# Look the selection up once; a missing label falls through to the prompt.
insight = cluster_insights.get(selected_cluster)
if insight is None:
    st.write("Select a cluster to see insights.")
else:
    st.markdown(f"**Cluster {selected_cluster}:**")
    # Each bullet is labelled with the same text as its dictionary key.
    for field in ("Description", "Recommendation"):
        st.markdown(f"- **{field}:** {insight[field]}")

    # Pick the quality-dependent message, then display it in a single call.
    if selected_quality >= 6:
        quality_note = f"Based on your selection, wines in this segment (Cluster {selected_cluster}, Quality {selected_quality}) show characteristics often associated with *higher quality* wines."
    else:
        quality_note = f"Based on your selection, wines in this segment (Cluster {selected_cluster}, Quality {selected_quality}) show characteristics often associated with *moderate to lower quality* wines. This segment might be suitable for value-focused markets or present opportunities for quality improvement."
    st.info(quality_note)
# --- Concluding Section ---
# Static closing notes: a horizontal rule, the key takeaways, and next steps.
closing_notes = """
---
**Key Takeaways:**
* The clustering analysis reveals distinct groups of wines based on their chemical composition.
* Understanding these clusters allows for targeted marketing and product strategies.
* Wines in Cluster 0 tend to align with 'Premium Taste', Cluster 1 with 'Sweet & Mild', and Cluster 2 with 'Sharp & Preservative-heavy'.
* Quality ratings within each cluster can vary, providing further granularity for decision-making.
**Next Steps:**
* Validate these clusters with sensory evaluation data.
* Integrate these insights into marketing campaigns and production planning.
* Explore other clustering algorithms or feature engineering techniques.
"""
st.markdown(closing_notes)
# --- requirements.txt content ---
# One package per line, with a trailing newline, as a requirements file expects.
requirements_content = """streamlit
pandas
scikit-learn
matplotlib
numpy
"""

# Print the requirements.txt content for deployment: a header line preceded
# by a blank line, the package list, then a footer line.
print(
    "\n--- requirements.txt content ---",
    requirements_content,
    "--- end requirements.txt content ---",
    sep="\n",
)