Spaces:

Riya1217
/

assignment33

Sleeping

App Files Files Community

assignment33 / assignment3.py

Riya1217

Upload 2 files

ec41e13 verified 5 months ago

raw

history blame contribute delete

8.1 kB

	import streamlit as st
	import pandas as pd
	from sklearn.preprocessing import StandardScaler
	from sklearn.decomposition import PCA
	from sklearn.cluster import KMeans
	import matplotlib.pyplot as plt
	import numpy as np

	# Select the non-interactive Agg backend so figures are rendered off-screen
	plt.switch_backend("Agg")

	# --- Page title and introductory blurb ---
	# The introduction is bound to a name first so the rendering calls below stay short.
	_INTRO_MD = """
	This application explores insights derived from applying unsupervised learning (PCA and K-Means clustering)
	to a dataset of red wines based on their chemical properties. The goal is to identify distinct
	segments of wines that can inform business strategies related to marketing, production,
	and product development.
	"""
	st.title("Wine Quality Clustering Insights")
	st.markdown(_INTRO_MD)

	# --- Data Loading and Preparation ---
	@st.cache_data  # Cache the data loading and preprocessing steps
	def load_data() -> pd.DataFrame:
	    """Download the red-wine dataset and label each wine with a K-Means cluster.

	    Steps, in order:
	      1. Read the semicolon-separated CSV from the UCI repository URL.
	      2. Drop the 'quality' column so only the remaining columns are used
	         as clustering features.
	      3. Standardize the features with StandardScaler.
	      4. Reduce them with PCA, keeping enough components to explain at
	         least 80% of the variance.
	      5. Fit K-Means (3 clusters, fixed random_state) on the PCA output.

	    Returns:
	        The original DataFrame (including 'quality') with one extra integer
	        column, 'Cluster', holding the K-Means label of each row.

	    Raises:
	        Whatever pandas.read_csv raises when the URL cannot be fetched or
	        parsed; the error is not handled here.
	    """
	    wine_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
	    wine_data = pd.read_csv(wine_url, sep=';')

	    # Cluster on the measured properties only; 'quality' is kept in
	    # wine_data for filtering but excluded from the feature matrix.
	    features = wine_data.drop(columns='quality')

	    # Put every feature on a comparable scale before PCA
	    scaled_features = StandardScaler().fit_transform(features)

	    # A fractional n_components asks PCA for the smallest number of
	    # components whose cumulative explained variance reaches that share.
	    pca_features = PCA(n_components=0.80).fit_transform(scaled_features)

	    # Fixed random_state and explicit n_init keep the labelling reproducible
	    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
	    wine_data['Cluster'] = kmeans.fit_predict(pca_features)

	    return wine_data


	# Load the processed data
	wine_data = load_data()

	# --- Methodology Explanation (Expandable Section) ---
	# Everything rendered inside the `with` block appears in a collapsible panel.
	with st.expander("Explain the Methodology"):
	    st.markdown("""
	This analysis used the following steps:
	1. Data Preparation: The dataset was loaded and chemical features were standardized to ensure they
	are on a similar scale.
	2. Dimensionality Reduction (PCA): Principal Component Analysis was used to reduce the number
	of features while retaining most of the original data's variance. This helps in handling
	multicollinearity and preparing data for clustering.
	3. Clustering (K-Means): K-Means clustering was applied to the reduced data to group wines
	with similar chemical properties into distinct clusters. We chose 3 clusters based on
	evaluation metrics like the Elbow and Silhouette methods (performed in the notebook).
	""")

	# --- User Interface Controls ---
	st.sidebar.header('Explore Clusters')

	# pandas hands back NumPy integer scalars from .unique(); convert them to
	# plain Python ints once so the widgets, the dictionary lookup and the
	# f-strings further down all work with ordinary ints.
	cluster_numbers = sorted(int(label) for label in wine_data['Cluster'].unique())

	# Drop-down in the sidebar for choosing which cluster to inspect
	selected_cluster = st.sidebar.selectbox(
	    'Select Cluster',
	    cluster_numbers,
	)

	# Distinct quality ratings present in the data, ascending
	quality_ratings = sorted(int(rating) for rating in wine_data['quality'].unique())

	# Slider spanning the lowest to the highest rating present. The list is
	# sorted, so its ends are the bounds; all three values and the step are
	# plain ints, which keeps the slider in integer mode.
	selected_quality = st.sidebar.slider(
	    'Select Quality Rating',
	    min_value=quality_ratings[0],
	    max_value=quality_ratings[-1],
	    value=quality_ratings[0],  # default to the lowest rating
	    step=1,  # only whole-number ratings can be selected
	)

	# --- Implement Visualizations ---
	st.subheader(f'Characteristics for Cluster {selected_cluster}, Quality {selected_quality}')

	# Keep only the wines that match both sidebar selections
	filtered_data = wine_data[
	    (wine_data['Cluster'] == selected_cluster)
	    & (wine_data['quality'] == selected_quality)
	]

	if filtered_data.empty:
	    # Not every (cluster, quality) combination exists; say so instead of
	    # drawing empty charts.
	    st.warning(f"No data found for Cluster {selected_cluster} with Quality {selected_quality}.")
	else:
	    # --- Bar chart: mean of every feature column in the selection ---
	    avg_features = filtered_data.drop(['quality', 'Cluster'], axis=1).mean()

	    fig1, ax1 = plt.subplots(figsize=(10, 5))
	    avg_features.plot(kind='bar', ax=ax1, color='skyblue')

	    ax1.set_xlabel('Chemical Features')
	    ax1.set_ylabel('Average Value')
	    ax1.set_title(f'Average Chemical Features for Cluster {selected_cluster}, Quality {selected_quality}')
	    # Rotate the tick labels on this specific axes rather than going
	    # through pyplot's implicit "current axes" state.
	    plt.setp(ax1.get_xticklabels(), rotation=45, ha='right')
	    fig1.tight_layout()  # keep the rotated labels inside the figure

	    st.pyplot(fig1)
	    plt.close(fig1)  # release the figure once it has been rendered

	    # --- Scatter plot: alcohol against volatile acidity ---
	    fig2, ax2 = plt.subplots(figsize=(8, 5))
	    ax2.scatter(
	        filtered_data['alcohol'],
	        filtered_data['volatile acidity'],
	        alpha=0.6,
	        color='lightcoral',
	    )

	    ax2.set_xlabel('Alcohol')
	    ax2.set_ylabel('Volatile Acidity')
	    ax2.set_title(f'Alcohol vs Volatile Acidity for Cluster {selected_cluster}, Quality {selected_quality}')
	    ax2.grid(True, linestyle='--', alpha=0.6)

	    st.pyplot(fig2)
	    plt.close(fig2)  # release the figure once it has been rendered

	# --- Display Dynamic Insights ---
	st.subheader("Cluster Insights and Recommendations")

	# Hand-written profile and recommendation for each cluster label.
	# NOTE(review): this assumes labels 0/1/2 produced by the K-Means fit line up
	# with these profiles; the mapping is hard-coded here, so confirm it against
	# the analysis it was derived from whenever the clustering changes.
	cluster_insights = {
	    0: {
	        "Description": "Premium Taste Wines: Balanced acidity, high alcohol, high quality",
	        "Recommendation": "Market to wine connoisseurs; premium pricing; emphasize quality in promotions.",
	    },
	    1: {
	        "Description": "Sweet & Mild Wines: Higher sugar, lower acidity, moderate quality",
	        "Recommendation": "Target casual drinkers; affordable pricing; highlight smooth and approachable taste.",
	    },
	    2: {
	        "Description": "Sharp & Preservative-heavy Wines: High acidity, higher sulfates, lower quality",
	        "Recommendation": "Target budget-conscious customers; optimize production to reduce sulfates; focus on cost-efficiency.",
	    },
	}

	# One lookup decides both whether there is anything to show and what to show
	insight = cluster_insights.get(selected_cluster)

	if insight is None:
	    # Fallback for a cluster label with no written profile
	    st.write("Select a cluster to see insights.")
	else:
	    st.markdown(f"Cluster {selected_cluster}:")
	    st.markdown(f"- Description: {insight['Description']}")
	    st.markdown(f"- Recommendation: {insight['Recommendation']}")

	    # Extra message that depends only on the chosen quality rating:
	    # ratings of 6 and above get the "higher quality" wording.
	    if selected_quality >= 6:
	        st.info(f"Based on your selection, wines in this segment (Cluster {selected_cluster}, Quality {selected_quality}) show characteristics often associated with higher quality wines.")
	    else:
	        st.info(f"Based on your selection, wines in this segment (Cluster {selected_cluster}, Quality {selected_quality}) show characteristics often associated with moderate to lower quality wines. This segment might be suitable for value-focused markets or present opportunities for quality improvement.")

	# --- Concluding Section ---
	# Static closing summary (key takeaways and next steps) rendered as Markdown.
	# The text is a plain string literal, so it does not change with the sidebar
	# selections.
	st.markdown("""
	---
	Key Takeaways:
	* The clustering analysis reveals distinct groups of wines based on their chemical composition.
	* Understanding these clusters allows for targeted marketing and product strategies.
	* Wines in Cluster 0 tend to align with 'Premium Taste', Cluster 1 with 'Sweet & Mild', and Cluster 2 with 'Sharp & Preservative-heavy'.
	* Quality ratings within each cluster can vary, providing further granularity for decision-making.

	Next Steps:
	* Validate these clusters with sensory evaluation data.
	* Integrate these insights into marketing campaigns and production planning.
	* Explore other clustering algorithms or feature engineering techniques.
	""")

	# --- requirements.txt content ---
	# Package names the app imports, kept as one string so they can be copied
	# into a requirements.txt file.
	requirements_content = """streamlit
	pandas
	scikit-learn
	matplotlib
	numpy
	"""

	# Echo the list to stdout between two marker lines. A single print call with
	# a newline separator emits the same text as three consecutive prints.
	print(
	    "\n--- requirements.txt content ---",
	    requirements_content,
	    "--- end requirements.txt content ---",
	    sep="\n",
	)