import base64

import streamlit as st
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, Birch, MeanShift
from sklearn.metrics import silhouette_score, calinski_harabasz_score
import matplotlib.pyplot as plt
import seaborn as sns


def load_data(file_path):
    """
    Load and preprocess the dataset from a CSV file.

    Parameters:
    - file_path: str, path to the CSV file

    Returns:
    - df: DataFrame, preprocessed dataset, or None if loading failed
    """
    try:
        df = pd.read_csv(file_path)
        # Drop the 'Name of Student' column as it is not numerical
        df = df.drop(columns=['Name of Student'])
        # Convert categorical 'CLASS' to numerical codes
        df['CLASS'] = df['CLASS'].astype('category').cat.codes
        return df
    except Exception as e:
        # Surface the failure in the UI instead of crashing the app
        st.error(f"Error loading data: {e}")
        return None


def scale_normalize_data(df):
    """
    Standardize the dataset (zero mean, unit variance per column).

    Parameters:
    - df: DataFrame, dataset to be scaled and normalized

    Returns:
    - scaled_df: DataFrame, standardized dataset (any 'Cluster' column removed)
    """
    scaler = StandardScaler()
    # A previously assigned cluster label must never be scaled as a feature
    if 'Cluster' in df.columns:
        df = df.drop(columns=['Cluster'])
    scaled_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
    return scaled_df


def create_scatter_plot(df, x_col, y_col, cluster_labels):
    """
    Create a scatter plot of two features, colored by cluster.

    Parameters:
    - df: DataFrame, dataset
    - x_col: str, column for x-axis
    - y_col: str, column for y-axis
    - cluster_labels: array, cluster labels used for the hue
    """
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=x_col, y=y_col, hue=cluster_labels, data=df, palette='viridis')
    plt.title(f'Scatter Plot of {x_col} vs {y_col}')
    st.pyplot(plt)


def create_elbow_curve(df, max_clusters):
    """
    Create an elbow curve to determine the optimal number of clusters.

    Parameters:
    - df: DataFrame, dataset (features only)
    - max_clusters: int, maximum number of clusters to consider
    """
    wcss = []
    for i in range(1, max_clusters + 1):
        # n_init pinned so results stay stable across scikit-learn versions
        kmeans = KMeans(n_clusters=i, n_init=10, random_state=42)
        kmeans.fit(df)
        wcss.append(kmeans.inertia_)
    plt.figure(figsize=(10, 6))
    plt.plot(range(1, max_clusters + 1), wcss, marker='o')
    plt.title('Elbow Curve')
    plt.xlabel('Number of Clusters')
    plt.ylabel('WCSS')
    st.pyplot(plt)


def perform_clustering(df, algorithm, params):
    """
    Fit the selected clustering algorithm and display the assignments.

    Parameters:
    - df: DataFrame, dataset (a pre-existing 'Cluster' column is ignored)
    - algorithm: str, clustering algorithm
      ('kmeans', 'dbscan', 'agglomerative', 'birch', 'meanshift')
    - params: dict, parameters for the algorithm

    Returns:
    - cluster_labels: array of labels, or None for an unknown algorithm

    Note: the input frame is NOT mutated; labels are shown on a copy so that
    downstream metric computations see feature columns only.
    """
    if algorithm == 'kmeans':
        model = KMeans(n_clusters=params['n_clusters'], n_init=10, random_state=42)
    elif algorithm == 'dbscan':
        model = DBSCAN(eps=params['eps'], min_samples=params['min_samples'])
    elif algorithm == 'agglomerative':
        model = AgglomerativeClustering(n_clusters=params['n_clusters'])
    elif algorithm == 'birch':
        model = Birch(n_clusters=params['n_clusters'])
    elif algorithm == 'meanshift':
        model = MeanShift(bandwidth=params['bandwidth'])
    else:
        st.error("Invalid algorithm")
        return None

    # Fit on feature columns only; stale labels must not influence clustering
    features = df.drop(columns=['Cluster']) if 'Cluster' in df.columns else df
    cluster_labels = model.fit_predict(features)

    labeled = features.copy()
    labeled['Cluster'] = cluster_labels
    st.write("Cluster Assignments:")
    st.dataframe(labeled)

    # Create elbow curve if applicable
    if algorithm == 'kmeans' and 'max_clusters' in params:
        create_elbow_curve(features, params['max_clusters'])
    return cluster_labels


def display_performance_metrics(df, cluster_labels):
    """
    Display performance metrics for clustering results.

    Parameters:
    - df: DataFrame, dataset the labels were fit on
    - cluster_labels: array, cluster labels (may be None if clustering failed)
    """
    if cluster_labels is None:
        return
    # Both metrics are undefined for fewer than two clusters
    # (e.g. DBSCAN marking everything as noise)
    if len(np.unique(cluster_labels)) > 1:
        # Score on features only; a 'Cluster' column would bias both metrics
        features = df.drop(columns=['Cluster']) if 'Cluster' in df.columns else df
        silhouette = silhouette_score(features, cluster_labels)
        calinski_harabasz = calinski_harabasz_score(features, cluster_labels)
        st.write(f"Silhouette Score: {silhouette:.2f}")
        st.write(f"Calinski-Harabasz Score: {calinski_harabasz:.2f}")


def input_new_data(df, scaler=None):
    """
    Allow users to input new data points for prediction.

    Parameters:
    - df: DataFrame, reference dataset (its feature columns drive the sliders)
    - scaler: optional fitted scaler; when given, the new point is transformed
      with it so it lives in the same space as the training data.

    Returns:
    - DataFrame with a single row holding the scaled new data point
    """
    st.sidebar.write("Input new data for prediction:")
    new_data = {}
    for col in df.columns:
        if col != 'Cluster':
            new_data[col] = st.sidebar.slider(f"Enter {col}", 1, 5)
    new_df = pd.DataFrame([new_data])
    if scaler is not None:
        return pd.DataFrame(scaler.transform(new_df), columns=new_df.columns)
    # Fallback: standardizing a single row in isolation collapses every value
    # to zero (mean of one sample is itself) — prefer passing a fitted scaler.
    return scale_normalize_data(new_df)


def download_results(df):
    """
    Provide a downloadable CSV file of the results.

    Parameters:
    - df: DataFrame, results to be downloaded
    """
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode()).decode()
    # Embed the CSV as a data URI so nothing is written server-side
    href = f'<a href="data:file/csv;base64,{b64}" download="results.csv">Download CSV File</a>'
    st.markdown(href, unsafe_allow_html=True)


def main():
    """Build the Streamlit UI: load data, cluster, report metrics, predict."""
    st.title("Unsupervised Learning on Student Performance Data")
    st.write(
        "This application implements five unsupervised learning algorithms on a "
        "dataset of student performance. The algorithms include K-Means, DBSCAN, "
        "Agglomerative Clustering, Birch, and Mean Shift. The application provides "
        "interactive visualizations, parameter tuning, and performance metrics."
    )

    # Load and preprocess the data
    file_path = './Student-Employability-Datasets(Data).csv'
    df = load_data(file_path)
    if df is None:
        return

    st.write("Preprocessed Data:")
    st.dataframe(df)

    # Scale and normalize the features; 'CLASS' is the known label, so exclude it.
    # Keep the fitted scaler so new data points can be projected into the same space.
    df_for_scaling = df.drop(columns=['CLASS'])
    scaler = StandardScaler().fit(df_for_scaling)
    scaled_df = pd.DataFrame(scaler.transform(df_for_scaling),
                             columns=df_for_scaling.columns)
    st.write("Scaled and Normalized Data:")
    st.dataframe(scaled_df)

    # Feature correlation analysis
    st.write("Feature Correlation Analysis:")
    # Exclude 'CLASS' and 'Cluster' columns from correlation analysis
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    numeric_cols = [col for col in numeric_cols if col not in ['CLASS', 'Cluster']]
    corr_matrix = df[numeric_cols].corr()
    st.write(corr_matrix)
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
    st.pyplot(plt)

    # Create a radio button for algorithm selection
    st.sidebar.header('Algorithms')
    selected_algorithm = st.sidebar.radio(
        "Select Algorithm",
        ["K-Means", "DBSCAN", "Agglomerative Clustering", "Birch", "Mean Shift"])

    # Show parameters based on selected algorithm
    st.sidebar.header('Parameters')
    st.title("Algorithms Tab")
    st.write("Choose the algorithm above first so that the options will show "
             "about the algorithm of choice! :)")

    # Create tabs for each algorithm
    tab1, tab2, tab3, tab4, tab5 = st.tabs(
        ["K-Means", "DBSCAN", "Agglomerative Clustering", "Birch", "Mean Shift"])

    with tab1:
        st.header("K-Means Clustering")
        if selected_algorithm == "K-Means":
            n_clusters = st.sidebar.slider("Number of Clusters", 2, 10, 3,
                                           key='n_clusters')
            max_clusters = st.sidebar.slider(
                "Maximum Number of Clusters for Elbow Curve", 2, 15, 10, key='max')
            cluster_labels = perform_clustering(
                scaled_df, 'kmeans',
                {'n_clusters': n_clusters, 'max_clusters': max_clusters})
            display_performance_metrics(scaled_df, cluster_labels)

    with tab2:
        st.header("DBSCAN Clustering")
        if selected_algorithm == 'DBSCAN':
            eps = st.sidebar.slider("Epsilon", 0.1, 1.0, 0.5, 0.1, key='eps')
            min_samples = st.sidebar.slider("Minimum Samples", 1, 10, 5,
                                            key='min_dbscan')
            cluster_labels = perform_clustering(
                scaled_df, 'dbscan', {'eps': eps, 'min_samples': min_samples})
            display_performance_metrics(scaled_df, cluster_labels)

    with tab3:
        st.header("Agglomerative Clustering")
        if selected_algorithm == 'Agglomerative Clustering':
            n_clusters = st.sidebar.slider("Number of Clusters", 2, 10, 3,
                                           key='agg_cluster')
            cluster_labels = perform_clustering(
                scaled_df, 'agglomerative', {'n_clusters': n_clusters})
            display_performance_metrics(scaled_df, cluster_labels)

    with tab4:
        st.header("Birch Clustering")
        if selected_algorithm == 'Birch':
            n_clusters = st.sidebar.slider("Number of Clusters", 2, 10, 3,
                                           key='birch_cluster')
            cluster_labels = perform_clustering(
                scaled_df, 'birch', {'n_clusters': n_clusters})
            display_performance_metrics(scaled_df, cluster_labels)

    with tab5:
        st.header("Mean Shift Clustering")
        if selected_algorithm == 'Mean Shift':
            bandwidth = st.sidebar.slider("Bandwidth", 0.1, 1.0, 0.5, 0.1,
                                          key='bandwidth')
            cluster_labels = perform_clustering(
                scaled_df, 'meanshift', {'bandwidth': bandwidth})
            display_performance_metrics(scaled_df, cluster_labels)

    # Allow users to input new data points (scaled with the training scaler so
    # the single row is not standardized in isolation, which would zero it out)
    new_data = input_new_data(scaled_df, scaler=scaler)
    if st.sidebar.button("Predict Cluster for New Data"):
        # These algorithms cannot label unseen points directly, so re-cluster
        # with the new point appended; its row (the last one) gets a real label.
        combined = pd.concat([scaled_df, new_data], ignore_index=True)
        if selected_algorithm == "K-Means":
            algo_key, tab, params = 'kmeans', tab1, {'n_clusters': n_clusters}
        elif selected_algorithm == "DBSCAN":
            algo_key, tab, params = 'dbscan', tab2, {'eps': eps,
                                                     'min_samples': min_samples}
        elif selected_algorithm == "Agglomerative Clustering":
            algo_key, tab, params = 'agglomerative', tab3, {'n_clusters': n_clusters}
        elif selected_algorithm == "Birch":
            algo_key, tab, params = 'birch', tab4, {'n_clusters': n_clusters}
        else:  # Mean Shift
            algo_key, tab, params = 'meanshift', tab5, {'bandwidth': bandwidth}
        with tab:
            labels = perform_clustering(combined, algo_key, params)
            if labels is not None:
                # The appended row is last, so its label is the prediction
                st.write(f"Predicted Cluster for {selected_algorithm}: {labels[-1]}")

    # Download results
    download_results(df)


if __name__ == "__main__":
    main()