Spaces:
Sleeping
Sleeping
| # app.py | |
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| from sklearn.decomposition import PCA | |
| from sklearn.preprocessing import StandardScaler | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
# Streamlit page setup. The page icon is the wine-glass emoji; it had been
# mis-encoded into the two-character string "π·".
st.set_page_config(page_title="PCA Explorer - Wine Quality", page_icon="🍷", layout="wide")

# Title and short description shown at the top of the page.
st.title("🍷 Principal Component Analysis (PCA) on Wine Quality Dataset")
st.write("""
This app demonstrates **Principal Component Analysis (PCA)** for dimensionality reduction and visualization of the **Wine Quality Dataset**.
""")
def load_data(path: str = "winequality-red.csv") -> pd.DataFrame:
    """Read the Wine Quality CSV into a DataFrame.

    Args:
        path: Location of the CSV file. Defaults to ``winequality-red.csv``,
            resolved against the current working directory, so the dataset
            file must carry exactly that name when no path is given.

    Returns:
        The file's contents as parsed by ``pd.read_csv`` with its default
        settings.

    Raises:
        FileNotFoundError: If no file exists at ``path``.
    """
    return pd.read_csv(path)
data = load_data()

# Feature matrix: every column except the 'quality' target (the remaining
# columns are assumed to be numeric; StandardScaler is applied to all of them).
features = data.drop(columns=['quality'])

# PCA can extract at most min(n_samples, n_features) components, and the UI
# caps the choice at 10. The bound is taken from the feature matrix rather
# than the full table, because 'quality' is not a PCA input and counting it
# could let the slider offer one component too many.
max_components = min(features.shape[0], features.shape[1], 10)
if max_components < 2:
    # The rest of the page plots PC1 against PC2, so fewer than two
    # components cannot be shown.
    st.error("PCA needs at least 2 samples and 2 feature columns besides 'quality'.")
    st.stop()

# Sidebar settings
st.sidebar.header("Settings")
if max_components > 2:
    n_components = st.sidebar.slider("Select number of PCA components", 2, max_components, 2)
else:
    # Only offer a choice when there is more than one valid value.
    n_components = 2

# Standardize the data so each feature has zero mean and unit variance
# before PCA.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Perform PCA
pca = PCA(n_components=n_components)
principal_components = pca.fit_transform(scaled_features)

# Create DataFrame for PCA result, one column per component (PC1, PC2, ...).
pca_df = pd.DataFrame(
    data=principal_components,
    columns=[f"PC{i+1}" for i in range(n_components)],
)

# Carry the target over so plots and the download can reference it.
pca_df['Quality'] = data['quality']
# Tabs: one per view of the analysis.
# NOTE(review): the emoji in these labels were mis-encoded in the file. Only
# the download icon could be recovered exactly; the other three are best-fit
# replacements -- confirm against the intended design.
tab1, tab2, tab3, tab4 = st.tabs([
    "📄 Raw Dataset",
    "📈 PCA Scatter Plot",
    "📊 Explained Variance",
    "📥 Download Reduced Dataset",
])

with tab1:
    # Show the dataset exactly as loaded, including the 'quality' column.
    st.subheader("📄 Raw Dataset")
    st.dataframe(data)
with tab2:
    # NOTE(review): the subheader emoji was mis-encoded in the file; this is a
    # best-fit replacement -- confirm.
    st.subheader("📈 PCA Scatter Plot")

    # The sidebar slider's minimum is 2, so PC1 and PC2 always exist; the
    # former `n_components >= 2` check could never fail and was removed.
    fig, ax = plt.subplots()
    sns.scatterplot(
        x="PC1",
        y="PC2",
        data=pca_df,
        hue="Quality",
        palette="viridis",
        s=70,
        edgecolor="black",
        alpha=0.7,
        ax=ax,  # draw on this figure explicitly, not on the implicit current axes
    )
    ax.set_xlabel("Principal Component 1")
    ax.set_ylabel("Principal Component 2")
    ax.set_title("PCA - First Two Components")
    st.pyplot(fig)
    # pyplot keeps every figure it creates until it is closed; the script is
    # re-run on each interaction, so release the figure once rendered.
    plt.close(fig)

    st.write(
        "\n"
        "The scatter plot above shows how the wine samples are distributed in the space of the first two principal components.\n"
        "Points are colored based on their **wine quality**, which ranges from 3 (poor) to 8 (excellent).\n"
        "- **Clusters**: Notice how wines of similar quality tend to group together in the plot.\n"
        "- **Separation**: High-quality wines (higher quality scores) tend to be more spread out, while lower-quality wines are often more tightly clustered.\n"
    )
with tab3:
    # NOTE(review): the subheader emoji was mis-encoded in the file; this is a
    # best-fit replacement -- confirm.
    st.subheader("📊 Explained Variance Ratio")

    # Fraction of the total variance captured by each fitted component.
    exp_var = pca.explained_variance_ratio_
    component_labels = [f"PC{i}" for i in range(1, len(exp_var) + 1)]

    fig2, ax2 = plt.subplots()
    sns.barplot(x=component_labels, y=exp_var, color="skyblue", ax=ax2)
    ax2.set_ylabel('Explained Variance Ratio')
    ax2.set_xlabel('Principal Components')
    ax2.set_title('Variance Explained by Each Principal Component')
    st.pyplot(fig2)
    # pyplot keeps every figure it creates until it is closed; the script is
    # re-run on each interaction, so release the figure once rendered.
    plt.close(fig2)

    # Sum over the selected components only, shown as a fraction of 1.
    st.markdown(f"**Total Variance Explained:** {np.sum(exp_var):.2f}")
    st.write(
        "\n"
        "The bar plot shows the **explained variance ratio** of each principal component.\n"
        "- **Higher variance** means that component carries more information.\n"
        "- In this case, the first few components explain the majority of the variance in the dataset, with later components contributing less.\n"
        "- By selecting fewer components, we reduce dimensionality but still retain most of the data's information.\n"
    )
with tab4:
    # The subheader icon is the inbox-tray emoji; it had been mis-encoded
    # into the two-character string "π₯".
    st.subheader("📥 Download Reduced Dataset")
    st.write("You can download the PCA-reduced dataset as a CSV file.")

    # Serialize the component columns plus 'Quality' as CSV text, without
    # the row index.
    pca_reduced = pca_df.to_csv(index=False)
    st.download_button(
        label="Download PCA Reduced Data",
        data=pca_reduced,
        file_name="pca_reduced_wine_quality.csv",
        mime="text/csv",
    )
# Footer: a horizontal rule followed by a caption. The heart emoji in the
# caption had been mis-encoded into "β€οΈ".
st.markdown("---")
st.caption("Made with ❤️ using Streamlit")