Spaces:

Dun3Co
/

week3_assignment

Sleeping

File size: 10,497 Bytes

cf5ff90

# start by importing the necessary packages
#standard
import numpy as np
import pandas as pd

#plt packages
import seaborn as sns
import altair as alt
import matplotlib.pyplot as plt
#streamlit
import streamlit as st

#sklearn
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score



# Page-level configuration: browser tab title, icon and wide layout.
st.set_page_config(page_title="StressedOUT – Cached/DB", page_icon=":skull:", layout="wide")
# Visible heading; spelling now matches the page_title above ("StressedOUT").
st.title("StressedOUT – Looking into a dataset of stressed students (Cached)")
st.caption("Reads .csv files. Uses Streamlit caching and a form submit gate.")

# NOTE: despite its name, BASE_DIR holds the path of the CSV file itself, not a directory.
BASE_DIR = "StressLevelDataset.csv"

@st.cache_data
def load_data(path):
    """Read the CSV file at ``path`` and return it as a pandas DataFrame.

    The function is wrapped in ``st.cache_data``, so the parsed result is
    cached by Streamlit.
    """
    return pd.read_csv(path)

# Columns removed from the dataset before any scaling, PCA or clustering.
_DROPPED_COLUMNS = [
    'future_career_concerns',
    'anxiety_level',
    'depression',
    'bullying',
    'peer_pressure',
]
data = load_data(BASE_DIR).drop(columns=_DROPPED_COLUMNS)


# All controls sit inside one sidebar form and are applied via the "Apply" button.
with st.sidebar:
    st.header("Filters")
    with st.form("filters"):
        # The same tuple serves as both the options and their captions.
        dataset_choices = ('PCA reduced', 'No dimensionality reduction')
        analysis = st.radio(
            "Select your dataset",
            dataset_choices,
            captions=dataset_choices,
        )
        k = st.slider(
            "Select number of clusters (k)",
            min_value=2, max_value=10, value=4, step=1,
        )
        iterations = st.slider(
            "Select number of iterations to show",
            min_value=1, max_value=10, value=5, step=1,
        )
        seed = st.number_input(
            "Random seed",
            min_value=0, max_value=100, value=42, step=1,
        )
        st.write("For no dimensionality reduction, the first two features will be used for visualization.")
        feature_columns = data.columns
        feature_x = st.selectbox("Select X-axis feature", feature_columns, index=0)
        feature_y = st.selectbox("Select Y-axis feature", feature_columns, index=1)

        submitted = st.form_submit_button("Apply")

# Submit gate: nothing below this point runs until the form has been applied.
if not submitted:
    st.info("Adjust filters and click **Apply**.")
    st.stop()

def kmeans_iteration_demo(X, k, max_iters=iterations, xlabel='PC1', ylabel='PC2'):
    """Draw a step-by-step K-Means run and render it with Streamlit.

    Panel 0 shows the randomly chosen starting centers. Every later panel
    shows the points coloured by their nearest current center; afterwards the
    centers are moved to the mean of their assigned points. From the second
    update onwards, red arrows show how far each center moves.

    Args:
        X: NumPy array of samples. Distances use every column, while only the
            first two columns (``X[:, 0]`` and ``X[:, 1]``) are plotted.
        k: Number of clusters / centers.
        max_iters: Number of assignment/update steps to draw. The default is
            the module-level ``iterations`` value at function definition time.
        xlabel: X-axis label applied to every panel.
        ylabel: Y-axis label applied to every panel.

    Raises:
        ValueError: Raised by the random choice when ``k`` exceeds ``len(X)``
            (sampling without replacement).
    """
    # The first five colours match the original palette; the list is extended
    # so that k up to 10 (the slider maximum) no longer raises IndexError, and
    # indexing wraps around for anything larger.
    palette = ['blue', 'green', 'red', 'purple', 'orange',
               'brown', 'pink', 'gray', 'olive', 'cyan']

    # A private RandomState seeded with the module-level `seed` yields the same
    # draws as np.random.seed(seed) without touching NumPy's global RNG state.
    rng = np.random.RandomState(seed)
    centers = X[rng.choice(len(X), k, replace=False)]

    fig, axes = plt.subplots(1, max_iters + 1, figsize=(20, 4))

    for iteration in range(max_iters + 1):
        ax = axes[iteration]

        if iteration == 0:
            # Show initial random centers
            ax.scatter(X[:, 0], X[:, 1], c='lightgray', alpha=0.6, s=30)
            ax.scatter(centers[:, 0], centers[:, 1], c='red', s=200, marker='X',
                       edgecolors='black', linewidths=2)
            ax.set_title(f'Iteration {iteration}\n(Random Initialization)')
        else:
            # Assign points to nearest center: distances has shape (k, n_samples).
            distances = np.sqrt(((X - centers[:, np.newaxis]) ** 2).sum(axis=2))
            labels = np.argmin(distances, axis=0)

            # Plot current clustering
            for j in range(k):
                mask = labels == j
                ax.scatter(X[mask, 0], X[mask, 1],
                           c=palette[j % len(palette)], alpha=0.6, s=30,
                           label=f'Cluster {j+1}')

            ax.scatter(centers[:, 0], centers[:, 1], c='black', s=200, marker='X',
                       edgecolors='white', linewidths=2)
            ax.set_title(f'Iteration {iteration}')

            # Update centers. A cluster that received no points keeps its
            # previous center; taking the mean of an empty selection would
            # give NaN and corrupt all subsequent iterations.
            new_centers = np.array([
                X[labels == j].mean(axis=0) if np.any(labels == j) else centers[j]
                for j in range(k)
            ])

            # Show center movement with arrows
            if iteration > 1:
                for j in range(k):
                    ax.annotate('', xy=new_centers[j], xytext=centers[j],
                                arrowprops=dict(arrowstyle='->', lw=2, color='red', alpha=0.7))

            centers = new_centers

        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.grid(True, alpha=0.3)

    fig.tight_layout()
    st.pyplot(fig)
    # Release the figure so repeated Streamlit reruns do not accumulate open figures.
    plt.close(fig)

# Main analysis flow: either PCA followed by K-Means on the leading components,
# or K-Means directly on the standardized original features.
if analysis == 'PCA reduced':
    # Standardize the features, then fit a PCA that keeps every component.
    data_scaled = StandardScaler().fit_transform(data)
    data_reduced_df = pd.DataFrame(data_scaled, columns=data.columns)
    st.write('You selected PCA reduced')
    pca = PCA()
    pca_data = pca.fit_transform(data_reduced_df)
    pca_data_pd = pd.DataFrame(
        pca_data,
        columns=[f'PC{i+1}' for i in range(pca_data.shape[1])],
    )
    st.write('The PCA reduced data is shown below')
    st.dataframe(pca_data_pd.head(10))

    explained_variance = pca.explained_variance_ratio_
    cumulative_variance = np.cumsum(explained_variance)

    st.write("Explained Variance by Component:")
    for i, ratio in enumerate(explained_variance[:10]):
        st.write(f"PC{i+1}: {ratio:.3f} ({ratio*100:.1f}%)")

    # NOTE: the indices below require at least 5 principal components.
    st.write(f"\nFirst 3 components explain {cumulative_variance[2]*100:.1f}% of total variance")
    st.write(f"First 5 components explain {cumulative_variance[4]*100:.1f}% of total variance")

    # Visualizations: scree plot (left) and cumulative variance (right).
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    # Scree plot
    ax1.plot(range(1, len(explained_variance) + 1), explained_variance,
             marker='o', linestyle='--')
    ax1.set_title('Scree Plot')
    ax1.set_xlabel('Principal Component')
    ax1.set_ylabel('Variance Explained')
    ax1.axvline(x=3, color='r', linestyle='--', label='3 components')
    ax1.axvline(x=5, color='g', linestyle='--', label='5 components')
    ax1.legend()
    ax1.grid()

    # Cumulative variance plot
    ax2.plot(range(1, len(cumulative_variance) + 1), cumulative_variance,
             marker='o', linestyle='--', color='orange')
    ax2.set_title('Cumulative Variance Explained')
    ax2.set_xlabel('Number of Principal Components')
    ax2.set_ylabel('Cumulative Variance Explained')
    ax2.axhline(y=0.9, color='r', linestyle='--', label='90% variance')
    ax2.axhline(y=0.95, color='g', linestyle='--', label='95% variance')
    ax2.legend()
    ax2.grid()
    st.pyplot(fig)
    # Close the figure so Streamlit reruns do not pile up open figures.
    plt.close(fig)

    # Loadings of each original feature on the first 5 components.
    components_df = pd.DataFrame(
        pca.components_[:5].T,
        columns=[f'PC{i+1}' for i in range(5)],
        index=data_reduced_df.columns,
    )
    st.write("PCA Component Loadings (first 5 components):")
    st.dataframe(components_df)

    # Visualize component loadings for interpretation
    fig, axes = plt.subplots(3, 2, figsize=(16, 12))

    # Bar charts of the loadings for PC1..PC3, sorted by absolute magnitude.
    # 'C0' is matplotlib's first default cycle colour.
    loading_panels = [
        ('PC1', axes[0, 0], 'C0'),
        ('PC2', axes[0, 1], 'orange'),
        ('PC3', axes[1, 0], 'green'),
    ]
    for idx, (pc_name, ax, bar_color) in enumerate(loading_panels):
        loadings = components_df[pc_name].sort_values(key=abs, ascending=False)
        positions = range(len(loadings))
        ax.barh(positions, loadings.values, color=bar_color)
        ax.set_yticks(positions)
        ax.set_yticklabels(loadings.index, fontsize=9)
        ax.set_title(f'{pc_name} Loadings (Explains {explained_variance[idx]*100:.1f}% of variance)')
        ax.axvline(x=0, color='black', linestyle='-', alpha=0.3)

    # Pairwise scatter plots of the students in the space of the first three PCs.
    scatter_panels = [
        (axes[1, 1], 0, 1, 'Students  in PC1-PC2 Space'),
        (axes[2, 0], 0, 2, 'Students in PC1-PC3 Space'),
        (axes[2, 1], 1, 2, 'Students in PC2-PC3 Space'),
    ]
    for ax, x_idx, y_idx, panel_title in scatter_panels:
        ax.scatter(pca_data[:, x_idx], pca_data[:, y_idx], alpha=0.6)
        ax.set_xlabel(f'PC{x_idx + 1}')
        ax.set_ylabel(f'PC{y_idx + 1}')
        ax.set_title(panel_title)
        ax.grid(True, alpha=0.3)
    fig.tight_layout()
    st.pyplot(fig)
    plt.close(fig)

    # KMeans clustering on the first 5 principal components. The user-selected
    # seed is used (previously hard-coded to 42), matching the other branch.
    kmeans = KMeans(n_clusters=k, random_state=seed)
    cluster_labels = kmeans.fit_predict(pca_data[:, :5])
    silhouette_avg = silhouette_score(pca_data[:, :5], cluster_labels)
    st.write(f"Silhouette Score for k={k}: {silhouette_avg:.3f}")
    pca_data_pd['Cluster'] = cluster_labels

    # Project the component scores down to two dimensions for plotting.
    pca_2d_model = PCA(n_components=2, random_state=seed)
    pca_2d = pca_2d_model.fit_transform(pca_data_pd.drop(columns=['Cluster']))
    pca_2d_df = pd.DataFrame(pca_2d, columns=['PC1', 'PC2'])
    pca_2d_df['Cluster'] = cluster_labels
    st.write("2D PCA plot with KMeans clusters:")

    # NOTE: the demo runs its own K-Means iterations on the 2-D projection; it
    # does not reuse `cluster_labels` computed above.
    kmeans_iteration_demo(pca_2d_df[['PC1', 'PC2']].values, k)

else:
    st.write('You selected No dimensionality reduction')
    st.write('The original data is shown below')
    st.dataframe(data.head(10))

    # Standardize the data
    data_scaled = StandardScaler().fit_transform(data)
    data_scaled_df = pd.DataFrame(data_scaled, columns=data.columns)
    st.dataframe(data_scaled_df.head(10))

    # KMeans clustering on original scaled data
    kmeans = KMeans(n_clusters=k, random_state=seed)
    cluster_labels = kmeans.fit_predict(data_scaled_df)
    silhouette_avg = silhouette_score(data_scaled_df, cluster_labels)
    st.write(f"Silhouette Score for k={k}: {silhouette_avg:.3f}")

    # Add cluster labels for plotting (done after scoring so the label column
    # is not treated as a feature).
    data_scaled_df['Cluster'] = cluster_labels

    # 2D scatter plot of the two user-selected features, coloured by cluster.
    fig, ax = plt.subplots(figsize=(8, 6))
    scatter = ax.scatter(
        data_scaled_df[feature_x], data_scaled_df[feature_y],
        c=cluster_labels, cmap='tab10', alpha=0.7, s=50
    )
    ax.set_xlabel(feature_x)
    ax.set_ylabel(feature_y)
    ax.set_title('KMeans Clusters (Original Scaled Features)')
    plt.colorbar(scatter, ax=ax, label='Cluster')
    st.pyplot(fig)
    # Close the figure so Streamlit reruns do not pile up open figures.
    plt.close(fig)