import streamlit as st
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, Birch, MeanShift
from sklearn.metrics import silhouette_score, calinski_harabasz_score
import matplotlib.pyplot as plt
import seaborn as sns
import base64
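
# The app relies on the libraries imported above; a typical environment setup
# (package names taken from the imports, versions not pinned) would be:
#   pip install streamlit pandas numpy scikit-learn matplotlib seaborn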

# Function to load and preprocess the data
def load_data(file_path):
    """
    Load and preprocess the dataset from a CSV file.

    Parameters:
    - file_path: str, path to the CSV file

    Returns:
    - df: DataFrame, preprocessed dataset
    """
    try:
        df = pd.read_csv(file_path)
        # Drop the 'Name of Student' column as it is not numerical
        df = df.drop(columns=['Name of Student'])
        # Convert categorical 'CLASS' to numerical
        df['CLASS'] = df['CLASS'].astype('category').cat.codes
        return df
    except Exception as e:
        st.error(f"Error loading data: {e}")
        return None

# Function to scale and normalize the data
def scale_normalize_data(df):
    """
    Scale and normalize the dataset.

    Parameters:
    - df: DataFrame, dataset to be scaled and normalized

    Returns:
    - scaled_df: DataFrame, scaled and normalized dataset
    """
    scaler = StandardScaler()
    # Drop 'Cluster' column if it exists
    if 'Cluster' in df.columns:
        df = df.drop(columns=['Cluster'])
    scaled_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
    return scaled_df

# Function to create a scatter plot
def create_scatter_plot(df, x_col, y_col, cluster_labels):
    """
    Create a scatter plot for visualization.

    Parameters:
    - df: DataFrame, dataset
    - x_col: str, column for x-axis
    - y_col: str, column for y-axis
    - cluster_labels: array, cluster labels
    """
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=x_col, y=y_col, hue=cluster_labels, data=df, palette='viridis')
    plt.title(f'Scatter Plot of {x_col} vs {y_col}')
    st.pyplot(plt)

# Function to create an elbow curve
def create_elbow_curve(df, max_clusters):
    """
    Create an elbow curve to determine the optimal number of clusters.

    Parameters:
    - df: DataFrame, dataset
    - max_clusters: int, maximum number of clusters to consider
    """
    wcss = []
    for i in range(1, max_clusters + 1):
        kmeans = KMeans(n_clusters=i, random_state=42)
        kmeans.fit(df)
        wcss.append(kmeans.inertia_)
    plt.figure(figsize=(10, 6))
    plt.plot(range(1, max_clusters + 1), wcss, marker='o')
    plt.title('Elbow Curve')
    plt.xlabel('Number of Clusters')
    plt.ylabel('WCSS')
    st.pyplot(plt)

# Function to perform clustering and display results
def perform_clustering(df, algorithm, params):
    """
    Perform clustering using the specified algorithm and parameters.

    Parameters:
    - df: DataFrame, dataset
    - algorithm: str, clustering algorithm ('kmeans', 'dbscan', 'agglomerative', 'birch', 'meanshift')
    - params: dict, parameters for the algorithm

    Returns:
    - cluster_labels: array, cluster label assigned to each row
    """
    if algorithm == 'kmeans':
        model = KMeans(n_clusters=params['n_clusters'], random_state=42)
    elif algorithm == 'dbscan':
        model = DBSCAN(eps=params['eps'], min_samples=params['min_samples'])
    elif algorithm == 'agglomerative':
        model = AgglomerativeClustering(n_clusters=params['n_clusters'])
    elif algorithm == 'birch':
        model = Birch(n_clusters=params['n_clusters'])
    elif algorithm == 'meanshift':
        model = MeanShift(bandwidth=params['bandwidth'])
    else:
        st.error("Invalid algorithm")
        return None

    cluster_labels = model.fit_predict(df)

    # Create the elbow curve if applicable, before the label column is appended,
    # so the curve is computed on the features only
    if algorithm == 'kmeans' and 'max_clusters' in params:
        create_elbow_curve(df, params['max_clusters'])

    df['Cluster'] = cluster_labels
    st.write("Cluster Assignments:")
    st.dataframe(df)
    return cluster_labels

def display_performance_metrics(df, cluster_labels):
    """
    Display performance metrics for clustering results.

    Parameters:
    - df: DataFrame, dataset (any 'Cluster' column is ignored)
    - cluster_labels: array, cluster labels
    """
    # Score on the feature columns only, not the appended label column
    features = df.drop(columns=['Cluster']) if 'Cluster' in df.columns else df
    if len(np.unique(cluster_labels)) > 1:
        silhouette = silhouette_score(features, cluster_labels)
        calinski_harabasz = calinski_harabasz_score(features, cluster_labels)
        st.write(f"Silhouette Score: {silhouette:.2f}")
        st.write(f"Calinski-Harabasz Score: {calinski_harabasz:.2f}")
    else:
        st.write("Performance metrics require at least two clusters.")

# Function to allow users to input new data points
def input_new_data(df):
    """
    Allow users to input a new data point for prediction.

    Parameters:
    - df: DataFrame, dataset whose feature columns define the sliders

    Returns:
    - new_df: DataFrame, single-row DataFrame with the raw (unscaled) values
    """
    st.sidebar.write("Input new data for prediction:")
    new_data = {}
    for col in df.columns:
        if col != 'Cluster':
            new_data[col] = st.sidebar.slider(f"Enter {col}", 1, 5)
    # Return the raw values; a single row cannot be meaningfully standardized on
    # its own, so it is rescaled together with the full dataset before clustering
    new_df = pd.DataFrame([new_data])
    return new_df

# Function to download results
def download_results(df):
    """
    Provide a downloadable CSV file of the results.

    Parameters:
    - df: DataFrame, results to be downloaded
    """
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode()).decode()
    href = f'<a href="data:file/csv;base64,{b64}" download="cluster_results.csv">Download CSV File</a>'
    st.markdown(href, unsafe_allow_html=True)

# Main function to create the Streamlit app
def main():
    st.title("Unsupervised Learning on Student Performance Data")
    st.write("This application implements five unsupervised learning algorithms on a dataset of student performance. The algorithms include K-Means, DBSCAN, Agglomerative Clustering, Birch, and Mean Shift. The application provides interactive visualizations, parameter tuning, and performance metrics.")

    # Load and preprocess the data
    file_path = './Student-Employability-Datasets(Data).csv'
    df = load_data(file_path)
    if df is not None:
        st.write("Preprocessed Data:")
        st.dataframe(df)

        # Scale and normalize the data
        df_for_scaling = df.drop(columns=['CLASS'])
        scaled_df = scale_normalize_data(df_for_scaling)
        st.write("Scaled and Normalized Data:")
        st.dataframe(scaled_df)

        # Feature correlation analysis
        st.write("Feature Correlation Analysis:")
        # Exclude 'CLASS' and 'Cluster' columns from correlation analysis
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        numeric_cols = [col for col in numeric_cols if col not in ['CLASS', 'Cluster']]
        corr_matrix = df[numeric_cols].corr()
        st.write(corr_matrix)
        plt.figure(figsize=(10, 8))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
        st.pyplot(plt)

        # Create a radio button for algorithm selection
        st.sidebar.header('Algorithms')
        selected_algorithm = st.sidebar.radio("Select Algorithm", ["K-Means", "DBSCAN", "Agglomerative Clustering", "Birch", "Mean Shift"])

        # Show parameters based on the selected algorithm
        st.sidebar.header('Parameters')
        st.title("Algorithms Tab")
        st.write("Choose an algorithm in the sidebar first so that its options appear in the matching tab below! :)")

        # Create tabs for each algorithm
        tab1, tab2, tab3, tab4, tab5 = st.tabs(["K-Means", "DBSCAN", "Agglomerative Clustering", "Birch", "Mean Shift"])

        with tab1:
            st.header("K-Means Clustering")
            if selected_algorithm == "K-Means":
                n_clusters = st.sidebar.slider("Number of Clusters", 2, 10, 3, key='n_clusters')
                max_clusters = st.sidebar.slider("Maximum Number of Clusters for Elbow Curve", 2, 15, 10, key='max')
                cluster_labels = perform_clustering(scaled_df, 'kmeans', {'n_clusters': n_clusters, 'max_clusters': max_clusters})
                display_performance_metrics(scaled_df, cluster_labels)

        with tab2:
            st.header("DBSCAN Clustering")
            if selected_algorithm == 'DBSCAN':
                eps = st.sidebar.slider("Epsilon", 0.1, 1.0, 0.5, 0.1, key='eps')
                # Keep all parameter controls in the sidebar, consistent with the other algorithms
                min_samples = st.sidebar.slider("Minimum Samples", 1, 10, 5, key='min_dbscan')
                cluster_labels = perform_clustering(scaled_df, 'dbscan', {'eps': eps, 'min_samples': min_samples})
                display_performance_metrics(scaled_df, cluster_labels)

        with tab3:
            st.header("Agglomerative Clustering")
            if selected_algorithm == 'Agglomerative Clustering':
                n_clusters = st.sidebar.slider("Number of Clusters", 2, 10, 3, key='agg_cluster')
                cluster_labels = perform_clustering(scaled_df, 'agglomerative', {'n_clusters': n_clusters})
                display_performance_metrics(scaled_df, cluster_labels)

        with tab4:
            st.header("Birch Clustering")
            if selected_algorithm == 'Birch':
                n_clusters = st.sidebar.slider("Number of Clusters", 2, 10, 3, key='birch_cluster')
                cluster_labels = perform_clustering(scaled_df, 'birch', {'n_clusters': n_clusters})
                display_performance_metrics(scaled_df, cluster_labels)

        with tab5:
            st.header("Mean Shift Clustering")
            if selected_algorithm == 'Mean Shift':
                bandwidth = st.sidebar.slider("Bandwidth", 0.1, 1.0, 0.5, 0.1, key='bandwidth')
                cluster_labels = perform_clustering(scaled_df, 'meanshift', {'bandwidth': bandwidth})
                display_performance_metrics(scaled_df, cluster_labels)

        # Allow users to input a new data point
        new_data = input_new_data(df_for_scaling)
        if st.sidebar.button("Predict Cluster for New Data"):
            # Append the new point to the original features, rescale everything
            # together, re-cluster, and report the label assigned to the last row
            # (the new point)
            combined_df = pd.concat([df_for_scaling, new_data], ignore_index=True)
            scaled_combined_df = scale_normalize_data(combined_df)
            if selected_algorithm == "K-Means":
                params = {'n_clusters': n_clusters}
                with tab1:
                    cluster_label = perform_clustering(scaled_combined_df, 'kmeans', params)
                    st.write(f"Predicted Cluster for K-Means: {cluster_label[-1]}")
            elif selected_algorithm == "DBSCAN":
                params = {'eps': eps, 'min_samples': min_samples}
                with tab2:
                    cluster_label = perform_clustering(scaled_combined_df, 'dbscan', params)
                    st.write(f"Predicted Cluster for DBSCAN: {cluster_label[-1]}")
            elif selected_algorithm == "Agglomerative Clustering":
                params = {'n_clusters': n_clusters}
                with tab3:
                    cluster_label = perform_clustering(scaled_combined_df, 'agglomerative', params)
                    st.write(f"Predicted Cluster for Agglomerative Clustering: {cluster_label[-1]}")
            elif selected_algorithm == "Birch":
                params = {'n_clusters': n_clusters}
                with tab4:
                    cluster_label = perform_clustering(scaled_combined_df, 'birch', params)
                    st.write(f"Predicted Cluster for Birch: {cluster_label[-1]}")
            elif selected_algorithm == "Mean Shift":
                params = {'bandwidth': bandwidth}
                with tab5:
                    cluster_label = perform_clustering(scaled_combined_df, 'meanshift', params)
                    st.write(f"Predicted Cluster for Mean Shift: {cluster_label[-1]}")

        # Download results (attach the cluster labels to the original data when available)
        if 'Cluster' in scaled_df.columns:
            df['Cluster'] = scaled_df['Cluster'].values
        download_results(df)


if __name__ == "__main__":
    main()
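
# To launch the app locally (assuming this script is saved as app.py):
#   streamlit run app.py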