# app.py
"""Streamlit app: explore PCA on the red Wine Quality dataset.

The script loads ``winequality-red.csv`` from the working directory,
standardizes every column except ``quality``, projects the result onto a
user-selected number of principal components, and presents the raw data, a
scatter plot of the first two components, the explained-variance ratios, and
a CSV download of the reduced data in four tabs.
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# User-facing markdown is kept flush-left at module level so its rendering
# does not depend on the indentation of the block that displays it.
INTRO_TEXT = """
This app demonstrates **Principal Component Analysis (PCA)** for dimensionality reduction
and visualization of the **Wine Quality Dataset**.
"""

SCATTER_TEXT = """
The scatter plot above shows how the wine samples are distributed in the space of the first two principal components.
Points are colored based on their **wine quality**, which ranges from 3 (poor) to 8 (excellent).

- **Clusters**: Notice how wines of similar quality tend to group together in the plot.
- **Separation**: High-quality wines (higher quality scores) tend to be more spread out, while lower-quality wines are often more tightly clustered.
"""

VARIANCE_TEXT = """
The bar plot shows the **explained variance ratio** of each principal component.

- **Higher variance** means that component carries more information.
- In this case, the first few components explain the majority of the variance in the dataset, with later components contributing less.
- By selecting fewer components, we reduce dimensionality but still retain most of the data's information.
"""

# Streamlit page setup (must run before any other Streamlit command)
st.set_page_config(page_title="PCA Explorer - Wine Quality", page_icon="🍷", layout="wide")

# Title and short description
st.title("🍷 Principal Component Analysis (PCA) on Wine Quality Dataset")
st.write(INTRO_TEXT)


# Load Wine Quality dataset (local file)
@st.cache_data
def load_data() -> pd.DataFrame:
    """Read ``winequality-red.csv`` from the working directory.

    Returns:
        The CSV contents as a DataFrame. The rest of the script expects a
        ``quality`` column to be present.
    """
    # Make sure the dataset is named correctly
    return pd.read_csv("winequality-red.csv")


data = load_data()

# Features selection (every column except 'quality')
features = data.drop(columns=["quality"])

# PCA accepts at most min(n_samples, n_features) components, so the slider is
# bounded by the feature matrix itself rather than by the raw column count
# (which still includes the 'quality' target). The cap of 10 is kept.
max_components = min(*features.shape, 10)
if max_components < 2:
    st.error("The dataset needs at least 2 samples and 2 feature columns to run PCA.")
    st.stop()

# Sidebar settings
st.sidebar.header("Settings")
n_components = st.sidebar.slider("Select number of PCA components", 2, max_components, 2)

# Standardize the data
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Perform PCA
pca = PCA(n_components=n_components)
principal_components = pca.fit_transform(scaled_features)

# Create DataFrame for PCA result
component_labels = [f"PC{i + 1}" for i in range(n_components)]
pca_df = pd.DataFrame(data=principal_components, columns=component_labels)

# Add the 'quality' column positionally: pca_df has a fresh default index, so
# index-based alignment would yield NaN if `data` ever carried another index.
pca_df["Quality"] = data["quality"].to_numpy()

# Tabs
tab1, tab2, tab3, tab4 = st.tabs(
    [
        "📄 Raw Dataset",
        "📉 PCA Scatter Plot",
        "📈 Explained Variance",
        "📥 Download Reduced Dataset",
    ]
)

with tab1:
    st.subheader("📄 Raw Dataset")
    st.dataframe(data)

with tab2:
    st.subheader("📉 PCA Scatter Plot")
    # The slider minimum is 2, so PC1 and PC2 always exist here.
    fig, ax = plt.subplots()
    sns.scatterplot(
        x="PC1",
        y="PC2",
        data=pca_df,
        hue="Quality",
        palette="viridis",
        s=70,
        edgecolor="black",
        alpha=0.7,
        ax=ax,  # draw on this figure explicitly instead of the implicit current axes
    )
    ax.set_xlabel("Principal Component 1")
    ax.set_ylabel("Principal Component 2")
    ax.set_title("PCA - First Two Components")
    st.pyplot(fig)
    # pyplot keeps figures registered until closed; the script reruns on every
    # interaction, so close the figure once it has been rendered.
    plt.close(fig)

    st.write(SCATTER_TEXT)

with tab3:
    st.subheader("📈 Explained Variance Ratio")
    exp_var = pca.explained_variance_ratio_

    fig2, ax2 = plt.subplots()
    sns.barplot(x=component_labels, y=exp_var, color="skyblue", ax=ax2)
    ax2.set_ylabel("Explained Variance Ratio")
    ax2.set_xlabel("Principal Components")
    ax2.set_title("Variance Explained by Each Principal Component")
    st.pyplot(fig2)
    plt.close(fig2)  # same reason as above: avoid accumulating open figures

    st.markdown(f"**Total Variance Explained:** {np.sum(exp_var):.2f}")
    st.write(VARIANCE_TEXT)

with tab4:
    st.subheader("📥 Download Reduced Dataset")
    st.write("You can download the PCA-reduced dataset as a CSV file.")

    # Create a CSV for the PCA-reduced data
    pca_reduced = pca_df.to_csv(index=False)
    st.download_button(
        label="Download PCA Reduced Data",
        data=pca_reduced,
        file_name="pca_reduced_wine_quality.csv",
        mime="text/csv",
    )

# Footer
st.markdown("---")
st.caption("Made with ❤️ using Streamlit")