Commit · ee40e2b
Parent(s): 134db2b
Initial Commit for the Mall Customers Prediction
- Student-Employability-Datasets(Data).csv +0 -0
- app.py +293 -0
- requirements.txt +6 -0
Student-Employability-Datasets(Data).csv
ADDED
The diff for this file is too large to render. See raw diff.
app.py
ADDED
@@ -0,0 +1,293 @@
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, Birch, MeanShift
from sklearn.metrics import silhouette_score, calinski_harabasz_score
import matplotlib.pyplot as plt
import seaborn as sns
import base64

# Function to load and preprocess the data
def load_data(file_path):
    """
    Load and preprocess the dataset from a CSV file.

    Parameters:
    - file_path: str, path to the CSV file

    Returns:
    - df: DataFrame, preprocessed dataset
    """
    try:
        df = pd.read_csv(file_path)
        # Drop the 'Name of Student' column as it is not numerical
        df = df.drop(columns=['Name of Student'])
        # Convert categorical 'CLASS' to numerical codes
        df['CLASS'] = df['CLASS'].astype('category').cat.codes
        return df
    except Exception as e:
        st.error(f"Error loading data: {e}")
        return None

# Function to scale the data (z-score standardization)
def scale_normalize_data(df):
    """
    Standardize the dataset so each feature has zero mean and unit variance.

    Parameters:
    - df: DataFrame, dataset to be scaled

    Returns:
    - scaled_df: DataFrame, standardized dataset
    """
    scaler = StandardScaler()
    # Drop 'Cluster' column if it exists so labels never leak into the features
    if 'Cluster' in df.columns:
        df = df.drop(columns=['Cluster'])
    scaled_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
    return scaled_df
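
# Note: K-Means, DBSCAN, and Mean Shift all compare points by Euclidean
# distance, so standardizing first keeps one wide-ranging feature from
# dominating. For example, a feature with mean 3 and standard deviation 1
# maps a rating of 5 to (5 - 3) / 1 = 2.0.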

# Function to create a scatter plot
def create_scatter_plot(df, x_col, y_col, cluster_labels):
    """
    Create a scatter plot for visualization.

    Parameters:
    - df: DataFrame, dataset
    - x_col: str, column for x-axis
    - y_col: str, column for y-axis
    - cluster_labels: array, cluster labels
    """
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=x_col, y=y_col, hue=cluster_labels, data=df, palette='viridis')
    plt.title(f'Scatter Plot of {x_col} vs {y_col}')
    st.pyplot(plt)

# Function to create an elbow curve
def create_elbow_curve(df, max_clusters):
    """
    Create an elbow curve to determine the optimal number of clusters.

    Parameters:
    - df: DataFrame, dataset
    - max_clusters: int, maximum number of clusters to consider
    """
    wcss = []
    for i in range(1, max_clusters + 1):
        kmeans = KMeans(n_clusters=i, random_state=42)
        kmeans.fit(df)
        wcss.append(kmeans.inertia_)
    plt.figure(figsize=(10, 6))
    plt.plot(range(1, max_clusters + 1), wcss, marker='o')
    plt.title('Elbow Curve')
    plt.xlabel('Number of Clusters')
    plt.ylabel('WCSS')
    st.pyplot(plt)
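
# Note: WCSS is the within-cluster sum of squares (KMeans' inertia_). It
# decreases monotonically as clusters are added, so the "elbow" where the
# curve flattens is the usual heuristic for picking n_clusters.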

# Function to perform clustering and display results
def perform_clustering(df, algorithm, params):
    """
    Perform clustering using the specified algorithm and parameters.

    Parameters:
    - df: DataFrame, dataset (features only)
    - algorithm: str, clustering algorithm ('kmeans', 'dbscan', 'agglomerative', 'birch', 'meanshift')
    - params: dict, parameters for the algorithm

    Returns:
    - cluster_labels: array, cluster label assigned to each row
    """
    if algorithm == 'kmeans':
        model = KMeans(n_clusters=params['n_clusters'], random_state=42)
    elif algorithm == 'dbscan':
        model = DBSCAN(eps=params['eps'], min_samples=params['min_samples'])
    elif algorithm == 'agglomerative':
        model = AgglomerativeClustering(n_clusters=params['n_clusters'])
    elif algorithm == 'birch':
        model = Birch(n_clusters=params['n_clusters'])
    elif algorithm == 'meanshift':
        model = MeanShift(bandwidth=params['bandwidth'])
    else:
        st.error("Invalid algorithm")
        return None

    cluster_labels = model.fit_predict(df)

    # Create the elbow curve if applicable, before any label column is
    # attached, so the curve is computed on the features only
    if algorithm == 'kmeans' and 'max_clusters' in params:
        create_elbow_curve(df, params['max_clusters'])

    # Attach the labels to a copy for display so the caller's feature matrix
    # is not mutated (a 'Cluster' column would otherwise leak into later
    # distance computations such as the silhouette score)
    labeled_df = df.copy()
    labeled_df['Cluster'] = cluster_labels
    st.write("Cluster Assignments:")
    st.dataframe(labeled_df)

    return cluster_labels
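
# Note: DBSCAN marks noise points with the label -1, and Mean Shift infers the
# number of clusters from the bandwidth, so the number of distinct labels can
# differ from what the sliders suggest.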

def display_performance_metrics(df, cluster_labels):
    """
    Display performance metrics for clustering results.

    Parameters:
    - df: DataFrame, dataset (features only, without the 'Cluster' column)
    - cluster_labels: array, cluster labels
    """
    # Both metrics are undefined for a single cluster, so guard against that
    if len(np.unique(cluster_labels)) > 1:
        silhouette = silhouette_score(df, cluster_labels)
        calinski_harabasz = calinski_harabasz_score(df, cluster_labels)
        st.write(f"Silhouette Score: {silhouette:.2f}")
        st.write(f"Calinski-Harabasz Score: {calinski_harabasz:.2f}")
    else:
        st.write("Performance metrics require at least two clusters.")
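
# Note: silhouette scores range from -1 to 1 (higher means tighter,
# better-separated clusters); Calinski-Harabasz is a between- to
# within-cluster dispersion ratio, also higher-is-better, with no upper bound.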

# Function to allow users to input new data points
def input_new_data(df):
    """
    Allow the user to input a new data point for prediction.

    Parameters:
    - df: DataFrame, the raw (unscaled) feature data

    Returns:
    - scaled_new_df: DataFrame, the new point projected into the same scaled space
    """
    st.sidebar.write("Input new data for prediction:")
    new_data = {}
    feature_cols = [col for col in df.columns if col != 'Cluster']
    for col in feature_cols:
        new_data[col] = st.sidebar.slider(f"Enter {col}", 1, 5)
    new_df = pd.DataFrame([new_data])
    # Fit the scaler on the full dataset rather than on the single new row:
    # calling fit_transform on one row would zero out every feature
    scaler = StandardScaler().fit(df[feature_cols])
    scaled_new_df = pd.DataFrame(scaler.transform(new_df), columns=feature_cols)
    return scaled_new_df

# Function to download results
def download_results(df):
    """
    Provide a downloadable CSV file of the results.

    Parameters:
    - df: DataFrame, results to be downloaded
    """
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode()).decode()
    href = f'<a href="data:file/csv;base64,{b64}" download="cluster_results.csv">Download CSV File</a>'
    st.markdown(href, unsafe_allow_html=True)
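
# Note: embedding the CSV as a base64 data URI avoids writing a temporary file
# on the server; st.download_button is a more idiomatic alternative in recent
# Streamlit releases.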

# Main function to create the Streamlit app
def main():
    st.title("Unsupervised Learning on Student Performance Data")
    st.write("This application applies five unsupervised learning algorithms to a dataset of student performance: K-Means, DBSCAN, Agglomerative Clustering, Birch, and Mean Shift. It provides interactive visualizations, parameter tuning, and performance metrics.")

    # Load and preprocess the data
    file_path = './Student-Employability-Datasets(Data).csv'
    df = load_data(file_path)
    if df is not None:
        st.write("Preprocessed Data:")
        st.dataframe(df)

        # Scale the data, excluding the 'CLASS' label column
        df_for_scaling = df.drop(columns=['CLASS'])
        scaled_df = scale_normalize_data(df_for_scaling)
        st.write("Scaled Data:")
        st.dataframe(scaled_df)
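
        # Note: 'CLASS' is the employability label, so it is held out of the
        # feature matrix to keep the clustering unsupervised; it can still be
        # compared against the discovered clusters afterwards.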

        # Feature correlation analysis
        st.write("Feature Correlation Analysis:")
        # Exclude 'CLASS' and 'Cluster' columns from correlation analysis
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        numeric_cols = [col for col in numeric_cols if col not in ['CLASS', 'Cluster']]
        corr_matrix = df[numeric_cols].corr()
        st.write(corr_matrix)
        plt.figure(figsize=(10, 8))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
        st.pyplot(plt)
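
        # Note: DataFrame.corr() defaults to Pearson correlation; highly
        # correlated rating columns carry redundant signal, which is worth
        # keeping in mind when interpreting distance-based clusters.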

        # Create a radio button for algorithm selection
        st.sidebar.header('Algorithms')
        selected_algorithm = st.sidebar.radio("Select Algorithm", ["K-Means", "DBSCAN", "Agglomerative Clustering", "Birch", "Mean Shift"])
        # Show parameters based on the selected algorithm
        st.sidebar.header('Parameters')

        st.title("Algorithms Tab")
        st.write("Select an algorithm in the sidebar first; the parameter controls for that algorithm will then appear. :)")
        # Create tabs for each algorithm
        tab1, tab2, tab3, tab4, tab5 = st.tabs(["K-Means", "DBSCAN", "Agglomerative Clustering", "Birch", "Mean Shift"])

        with tab1:
            st.header("K-Means Clustering")
            if selected_algorithm == "K-Means":
                n_clusters = st.sidebar.slider("Number of Clusters", 2, 10, 3, key='n_clusters')
                max_clusters = st.sidebar.slider("Maximum Number of Clusters for Elbow Curve", 2, 15, 10, key='max')
                cluster_labels = perform_clustering(scaled_df, 'kmeans', {'n_clusters': n_clusters, 'max_clusters': max_clusters})
                display_performance_metrics(scaled_df, cluster_labels)

        with tab2:
            st.header("DBSCAN Clustering")
            if selected_algorithm == 'DBSCAN':
                eps = st.sidebar.slider("Epsilon", 0.1, 1.0, 0.5, 0.1, key='eps')
                min_samples = st.sidebar.slider("Minimum Samples", 1, 10, 5, key='min_dbscan')
                cluster_labels = perform_clustering(scaled_df, 'dbscan', {'eps': eps, 'min_samples': min_samples})
                display_performance_metrics(scaled_df, cluster_labels)

        with tab3:
            st.header("Agglomerative Clustering")
            if selected_algorithm == 'Agglomerative Clustering':
                n_clusters = st.sidebar.slider("Number of Clusters", 2, 10, 3, key='agg_cluster')
                cluster_labels = perform_clustering(scaled_df, 'agglomerative', {'n_clusters': n_clusters})
                display_performance_metrics(scaled_df, cluster_labels)

        with tab4:
            st.header("Birch Clustering")
            if selected_algorithm == 'Birch':
                n_clusters = st.sidebar.slider("Number of Clusters", 2, 10, 3, key='birch_cluster')
                cluster_labels = perform_clustering(scaled_df, 'birch', {'n_clusters': n_clusters})
                display_performance_metrics(scaled_df, cluster_labels)

        with tab5:
            st.header("Mean Shift Clustering")
            if selected_algorithm == 'Mean Shift':
                bandwidth = st.sidebar.slider("Bandwidth", 0.1, 1.0, 0.5, 0.1, key='bandwidth')
                cluster_labels = perform_clustering(scaled_df, 'meanshift', {'bandwidth': bandwidth})
                display_performance_metrics(scaled_df, cluster_labels)

        # Allow the user to input a new data point
        new_data = input_new_data(df_for_scaling)
        if st.sidebar.button("Predict Cluster for New Data"):
            # Re-fit on the dataset with the new row appended and report the
            # label assigned to that last row (the estimators are re-fit
            # rather than queried, since not all of them expose predict())
            scaled_df_no_cluster = scaled_df.drop(columns=['Cluster']) if 'Cluster' in scaled_df.columns else scaled_df
            combined_df = pd.concat([scaled_df_no_cluster, new_data], ignore_index=True)

            if selected_algorithm == "K-Means":
                with tab1:
                    cluster_label = perform_clustering(combined_df, 'kmeans', {'n_clusters': n_clusters})
                    st.write(f"Predicted Cluster for K-Means: {cluster_label[-1]}")

            elif selected_algorithm == "DBSCAN":
                with tab2:
                    cluster_label = perform_clustering(combined_df, 'dbscan', {'eps': eps, 'min_samples': min_samples})
                    st.write(f"Predicted Cluster for DBSCAN: {cluster_label[-1]}")

            elif selected_algorithm == "Agglomerative Clustering":
                with tab3:
                    cluster_label = perform_clustering(combined_df, 'agglomerative', {'n_clusters': n_clusters})
                    st.write(f"Predicted Cluster for Agglomerative Clustering: {cluster_label[-1]}")

            elif selected_algorithm == "Birch":
                with tab4:
                    cluster_label = perform_clustering(combined_df, 'birch', {'n_clusters': n_clusters})
                    st.write(f"Predicted Cluster for Birch: {cluster_label[-1]}")

            elif selected_algorithm == "Mean Shift":
                with tab5:
                    cluster_label = perform_clustering(combined_df, 'meanshift', {'bandwidth': bandwidth})
                    st.write(f"Predicted Cluster for Mean Shift: {cluster_label[-1]}")
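
            # Caveat: cluster IDs are not stable across re-fits (appending a
            # row can permute the labels), so the predicted ID is only
            # meaningful relative to the assignment table rendered above.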

        # Download results: the preprocessed data with the latest cluster assignments attached
        results_df = df.copy()
        results_df['Cluster'] = cluster_labels
        download_results(results_df)

if __name__ == "__main__":
    main()
requirements.txt
ADDED
@@ -0,0 +1,6 @@
streamlit
pandas
numpy
scikit-learn
matplotlib
seaborn