Spaces:

aikanava
/

wine_quality_pca_explorer

Sleeping

App Files Files Community

wine_quality_pca_explorer / app.py

aikanava

upload files

2baa578 9 months ago

raw

history blame contribute delete

4.24 kB

	# app.py

	import streamlit as st
	import pandas as pd
	import numpy as np
	from sklearn.decomposition import PCA
	from sklearn.preprocessing import StandardScaler
	import matplotlib.pyplot as plt
	import seaborn as sns

	# Configure the browser tab (title, icon) and use the wide page layout
	st.set_page_config(
	    page_title="PCA Explorer - Wine Quality",
	    page_icon="🍷",
	    layout="wide",
	)

	# Page heading, followed by a one-paragraph summary of what the app shows
	st.title("🍷 Principal Component Analysis (PCA) on Wine Quality Dataset")
	st.write("""
	This app demonstrates Principal Component Analysis (PCA) for dimensionality reduction and visualization of the Wine Quality Dataset.
	""")

	# Load Wine Quality dataset (local file)
	@st.cache_data
	def load_data(path: str = "winequality-red.csv") -> pd.DataFrame:
	    """Read the Wine Quality CSV into a DataFrame.

	    The function is wrapped in ``st.cache_data`` so Streamlit can reuse the
	    parsed result instead of re-reading the file on every script rerun.

	    Args:
	        path: Location of the CSV file. Defaults to the file name the app
	            has always used, resolved relative to the working directory.

	    Returns:
	        The parsed dataset. The rest of the script expects it to contain a
	        'quality' column.
	    """
	    return pd.read_csv(path)

	data = load_data()

	# Features selection (every column except the 'quality' target)
	features = data.drop(columns=['quality'])

	# Sidebar settings
	st.sidebar.header("Settings")
	# PCA accepts at most min(n_samples, n_features) components, so the slider
	# is bounded by the feature matrix rather than by `data`, which still
	# includes the 'quality' column. The cap of 10 is kept from the original.
	max_components = min(*features.shape, 10)
	n_components = st.sidebar.slider("Select number of PCA components", 2, max_components, 2)

	# Standardize the data so every feature contributes on the same scale
	scaler = StandardScaler()
	scaled_features = scaler.fit_transform(features)

	# Perform PCA
	pca = PCA(n_components=n_components)
	principal_components = pca.fit_transform(scaled_features)

	# Create DataFrame for PCA result, one column per component (PC1, PC2, ...)
	pca_df = pd.DataFrame(
	    data=principal_components,
	    columns=[f"PC{i+1}" for i in range(n_components)],
	)

	# Add the 'quality' column to the PCA DataFrame. Rows of
	# `principal_components` are in the same order as `data`, so attach the
	# values positionally instead of relying on index-label alignment.
	pca_df['Quality'] = data['quality'].to_numpy()

	# Tabs: one per view of the analysis, in display order
	tab1, tab2, tab3, tab4 = st.tabs([
	    "📄 Raw Dataset",
	    "📉 PCA Scatter Plot",
	    "📈 Explained Variance",
	    "📥 Download Reduced Dataset",
	])

	# First tab: the loaded table exactly as read from disk
	with tab1:
	    st.subheader("📄 Raw Dataset")
	    st.dataframe(data)

	# Second tab: samples projected onto the first two principal components
	with tab2:
	    st.subheader("📉 PCA Scatter Plot")
	    # Defensive guard: the plot reads the PC1 and PC2 columns of pca_df.
	    # The sidebar slider currently starts at 2, so the else branch only
	    # matters if that minimum is ever lowered.
	    if n_components >= 2:
	        fig, ax = plt.subplots()
	        sns.scatterplot(
	            x="PC1",
	            y="PC2",
	            data=pca_df,
	            hue="Quality",
	            palette="viridis",
	            s=70,
	            edgecolor="black",
	            alpha=0.7,
	            ax=ax,  # draw on this figure explicitly, not the implicit current axes
	        )
	        ax.set_xlabel("Principal Component 1")
	        ax.set_ylabel("Principal Component 2")
	        ax.set_title("PCA - First Two Components")
	        st.pyplot(fig)
	        # pyplot keeps a reference to every figure it creates until it is
	        # closed; without this, each script rerun would leave one behind.
	        plt.close(fig)

	        st.write("""
	The scatter plot above shows how the wine samples are distributed in the space of the first two principal components.
	Points are colored based on their wine quality, which ranges from 3 (poor) to 8 (excellent).
	- Clusters: Notice how wines of similar quality tend to group together in the plot.
	- Separation: High-quality wines (higher quality scores) tend to be more spread out, while lower-quality wines are often more tightly clustered.
	""")

	    else:
	        st.warning("Please select at least 2 components to plot a scatter plot.")

	# Third tab: share of total variance captured by each retained component
	with tab3:
	    st.subheader("📈 Explained Variance Ratio")
	    exp_var = pca.explained_variance_ratio_
	    fig2, ax2 = plt.subplots()
	    sns.barplot(x=[f"PC{i+1}" for i in range(n_components)], y=exp_var, color="skyblue", ax=ax2)
	    ax2.set_ylabel('Explained Variance Ratio')
	    ax2.set_xlabel('Principal Components')
	    ax2.set_title('Variance Explained by Each Principal Component')
	    st.pyplot(fig2)
	    # pyplot keeps a reference to every figure it creates until it is
	    # closed; without this, each script rerun would leave one behind.
	    plt.close(fig2)

	    # Sum over the retained components only, shown to two decimals
	    st.markdown(f"Total Variance Explained: {np.sum(exp_var):.2f}")
	    st.write("""
	The bar plot shows the explained variance ratio of each principal component.
	- Higher variance means that component carries more information.
	- In this case, the first few components explain the majority of the variance in the dataset, with later components contributing less.
	- By selecting fewer components, we reduce dimensionality but still retain most of the data's information.
	""")

	# Fourth tab: export the component scores together with the Quality column
	with tab4:
	    st.subheader("📥 Download Reduced Dataset")
	    st.write("You can download the PCA-reduced dataset as a CSV file.")

	    # Serialize pca_df to CSV text, leaving out the row index
	    pca_reduced = pca_df.to_csv(index=False)

	    # Offer the CSV text as a file download
	    st.download_button(
	        label="Download PCA Reduced Data",
	        data=pca_reduced,
	        file_name="pca_reduced_wine_quality.csv",
	        mime="text/csv",
	    )

	# Footer: a markdown "---" line followed by a caption crediting Streamlit
	st.markdown("---")
	st.caption("Made with ❤️ using Streamlit")