aikanava's picture
upload files
2baa578
# app.py
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
# Configure the browser tab (title, icon) and switch to the wide layout.
st.set_page_config(
    page_title="PCA Explorer - Wine Quality",
    page_icon="🍷",
    layout="wide",
)

# Page heading followed by a short introduction rendered as Markdown.
st.title("🍷 Principal Component Analysis (PCA) on Wine Quality Dataset")
intro_markdown = """
This app demonstrates **Principal Component Analysis (PCA)** for dimensionality reduction and visualization of the **Wine Quality Dataset**.
"""
st.write(intro_markdown)
# Load Wine Quality dataset (local file)
@st.cache_data
def load_data(path: str = "winequality-red.csv") -> pd.DataFrame:
    """Read the wine-quality CSV into a DataFrame.

    The function is wrapped in ``st.cache_data`` so the file is not re-read
    on every Streamlit rerun.

    Args:
        path: Location of the CSV file. Defaults to ``winequality-red.csv``
            in the working directory, so existing ``load_data()`` calls are
            unaffected.

    Returns:
        The parsed dataset.
    """
    # NOTE(review): the file is parsed with pandas' default comma separator;
    # some distributions of this dataset are semicolon-delimited — confirm.
    return pd.read_csv(path)
data = load_data()

# Feature matrix: every column of the dataset except the 'quality' target.
features = data.drop(columns=['quality'])

# Sidebar settings
st.sidebar.header("Settings")

# PCA is fitted on `features`, so the number of components it can return is
# limited by that frame's shape — not by `data`, which still contains the
# 'quality' column. The upper bound stays capped at 10 as before.
max_components = min(features.shape[0], features.shape[1], 10)
n_components = st.sidebar.slider(
    "Select number of PCA components",
    2,
    max_components,
    2,
)
# Standardize the feature columns before projecting them.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Fit PCA with the requested number of components and project the data.
pca = PCA(n_components=n_components)
principal_components = pca.fit_transform(scaled_features)

# Wrap the projection in a DataFrame with columns named PC1, PC2, ...
component_names = [f"PC{idx}" for idx in range(1, n_components + 1)]
pca_df = pd.DataFrame(principal_components, columns=component_names)

# Carry the 'quality' label along so the plots can colour points by it.
pca_df['Quality'] = data['quality']
# Tabs — one per view of the data. The labels previously contained
# mis-encoded (mojibake) bytes instead of the intended emoji.
tab1, tab2, tab3, tab4 = st.tabs([
    "📄 Raw Dataset",
    "📉 PCA Scatter Plot",
    "📈 Explained Variance",
    "📥 Download Reduced Dataset",
])

# Tab 1: show the unmodified input table.
with tab1:
    st.subheader("📄 Raw Dataset")
    st.dataframe(data)
# Tab 2: scatter plot of the first two principal components, coloured by
# the 'Quality' column of `pca_df`.
with tab2:
    st.subheader("📉 PCA Scatter Plot")
    # The sidebar slider has a minimum of 2, so this guard is normally true;
    # it is kept as a safety net in case that bound is ever lowered.
    if n_components >= 2:
        fig, ax = plt.subplots()
        sns.scatterplot(
            x="PC1",
            y="PC2",
            data=pca_df,
            hue="Quality",
            palette="viridis",
            s=70,
            edgecolor="black",
            alpha=0.7,
            ax=ax,  # draw on this figure explicitly, not pyplot's "current" axes
        )
        ax.set_xlabel("Principal Component 1")
        ax.set_ylabel("Principal Component 2")
        ax.set_title("PCA - First Two Components")
        st.pyplot(fig)
        # The script reruns on every widget interaction; close the figure so
        # pyplot does not keep accumulating open figures across reruns.
        plt.close(fig)
        st.write("""
The scatter plot above shows how the wine samples are distributed in the space of the first two principal components.
Points are colored based on their **wine quality**, which ranges from 3 (poor) to 8 (excellent).
- **Clusters**: Notice how wines of similar quality tend to group together in the plot.
- **Separation**: High-quality wines (higher quality scores) tend to be more spread out, while lower-quality wines are often more tightly clustered.
""")
    else:
        st.warning("Please select at least 2 components to plot a scatter plot.")
# Tab 3: bar chart of the explained-variance ratio of each fitted component,
# plus the total across the selected components.
with tab3:
    st.subheader("📈 Explained Variance Ratio")
    exp_var = pca.explained_variance_ratio_
    fig2, ax2 = plt.subplots()
    sns.barplot(
        x=[f"PC{i+1}" for i in range(n_components)],
        y=exp_var,
        color="skyblue",
        ax=ax2,
    )
    ax2.set_ylabel('Explained Variance Ratio')
    ax2.set_xlabel('Principal Components')
    ax2.set_title('Variance Explained by Each Principal Component')
    st.pyplot(fig2)
    # The script reruns on every widget interaction; close the figure so
    # pyplot does not keep accumulating open figures across reruns.
    plt.close(fig2)
    st.markdown(f"**Total Variance Explained:** {np.sum(exp_var):.2f}")
    st.write("""
The bar plot shows the **explained variance ratio** of each principal component.
- **Higher variance** means that component carries more information.
- In this case, the first few components explain the majority of the variance in the dataset, with later components contributing less.
- By selecting fewer components, we reduce dimensionality but still retain most of the data's information.
""")
# Tab 4: offer the PCA-reduced table (component columns plus 'Quality')
# as a CSV download.
with tab4:
    st.subheader("📥 Download Reduced Dataset")
    st.write("You can download the PCA-reduced dataset as a CSV file.")
    # Serialize without the index so the file holds only the data columns.
    pca_reduced = pca_df.to_csv(index=False)
    st.download_button(
        label="Download PCA Reduced Data",
        data=pca_reduced,
        file_name="pca_reduced_wine_quality.csv",
        mime="text/csv",
    )
# Footer: horizontal rule followed by a caption. The heart emoji was
# previously mis-encoded (mojibake) and has been restored.
st.markdown("---")
st.caption("Made with ❤️ using Streamlit")