import base64

import streamlit as st
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, Birch, MeanShift
from sklearn.metrics import silhouette_score, calinski_harabasz_score
import matplotlib.pyplot as plt
import seaborn as sns


def load_data(file_path):
    """
    Load and preprocess the dataset from a CSV file.

    Parameters:
    - file_path: str, path to the CSV file

    Returns:
    - df: DataFrame, preprocessed dataset, or None if loading failed
    """
    try:
        df = pd.read_csv(file_path)
        # Drop the 'Name of Student' column as it is not numerical
        df = df.drop(columns=['Name of Student'])
        # Convert categorical 'CLASS' to numerical codes
        df['CLASS'] = df['CLASS'].astype('category').cat.codes
        return df
    except Exception as e:
        # Surface the failure in the UI instead of crashing the app
        st.error(f"Error loading data: {e}")
        return None


def scale_normalize_data(df):
    """
    Standardize the dataset (zero mean, unit variance per column).

    Parameters:
    - df: DataFrame, dataset to be scaled and normalized

    Returns:
    - scaled_df: DataFrame, standardized dataset (any 'Cluster' column removed)
    """
    scaler = StandardScaler()
    # A previously assigned cluster label must never be scaled as a feature
    if 'Cluster' in df.columns:
        df = df.drop(columns=['Cluster'])
    scaled_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
    return scaled_df


def create_scatter_plot(df, x_col, y_col, cluster_labels):
    """
    Create a scatter plot of two features, colored by cluster.

    Parameters:
    - df: DataFrame, dataset
    - x_col: str, column for x-axis
    - y_col: str, column for y-axis
    - cluster_labels: array, cluster labels used for the hue
    """
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=x_col, y=y_col, hue=cluster_labels, data=df, palette='viridis')
    plt.title(f'Scatter Plot of {x_col} vs {y_col}')
    st.pyplot(plt)


def create_elbow_curve(df, max_clusters):
    """
    Create an elbow curve to determine the optimal number of clusters.

    Parameters:
    - df: DataFrame, dataset (features only)
    - max_clusters: int, maximum number of clusters to consider
    """
    wcss = []
    for i in range(1, max_clusters + 1):
        # n_init pinned so results stay stable across scikit-learn versions
        kmeans = KMeans(n_clusters=i, n_init=10, random_state=42)
        kmeans.fit(df)
        wcss.append(kmeans.inertia_)
    plt.figure(figsize=(10, 6))
    plt.plot(range(1, max_clusters + 1), wcss, marker='o')
    plt.title('Elbow Curve')
    plt.xlabel('Number of Clusters')
    plt.ylabel('WCSS')
    st.pyplot(plt)


def perform_clustering(df, algorithm, params):
    """
    Fit the selected clustering algorithm and display the assignments.

    Parameters:
    - df: DataFrame, dataset (a pre-existing 'Cluster' column is ignored)
    - algorithm: str, clustering algorithm
      ('kmeans', 'dbscan', 'agglomerative', 'birch', 'meanshift')
    - params: dict, parameters for the algorithm

    Returns:
    - cluster_labels: array of labels, or None for an unknown algorithm

    Note: the input frame is NOT mutated; labels are shown on a copy so that
    downstream metric computations see feature columns only.
    """
    if algorithm == 'kmeans':
        model = KMeans(n_clusters=params['n_clusters'], n_init=10, random_state=42)
    elif algorithm == 'dbscan':
        model = DBSCAN(eps=params['eps'], min_samples=params['min_samples'])
    elif algorithm == 'agglomerative':
        model = AgglomerativeClustering(n_clusters=params['n_clusters'])
    elif algorithm == 'birch':
        model = Birch(n_clusters=params['n_clusters'])
    elif algorithm == 'meanshift':
        model = MeanShift(bandwidth=params['bandwidth'])
    else:
        st.error("Invalid algorithm")
        return None

    # Fit on feature columns only; stale labels must not influence clustering
    features = df.drop(columns=['Cluster']) if 'Cluster' in df.columns else df
    cluster_labels = model.fit_predict(features)

    labeled = features.copy()
    labeled['Cluster'] = cluster_labels
    st.write("Cluster Assignments:")
    st.dataframe(labeled)

    # Create elbow curve if applicable
    if algorithm == 'kmeans' and 'max_clusters' in params:
        create_elbow_curve(features, params['max_clusters'])
    return cluster_labels


def display_performance_metrics(df, cluster_labels):
    """
    Display performance metrics for clustering results.

    Parameters:
    - df: DataFrame, dataset the labels were fit on
    - cluster_labels: array, cluster labels (may be None if clustering failed)
    """
    if cluster_labels is None:
        return
    # Both metrics are undefined for fewer than two clusters
    # (e.g. DBSCAN marking everything as noise)
    if len(np.unique(cluster_labels)) > 1:
        # Score on features only; a 'Cluster' column would bias both metrics
        features = df.drop(columns=['Cluster']) if 'Cluster' in df.columns else df
        silhouette = silhouette_score(features, cluster_labels)
        calinski_harabasz = calinski_harabasz_score(features, cluster_labels)
        st.write(f"Silhouette Score: {silhouette:.2f}")
        st.write(f"Calinski-Harabasz Score: {calinski_harabasz:.2f}")


def input_new_data(df, scaler=None):
    """
    Allow users to input new data points for prediction.

    Parameters:
    - df: DataFrame, reference dataset (its feature columns drive the sliders)
    - scaler: optional fitted scaler; when given, the new point is transformed
      with it so it lives in the same space as the training data.

    Returns:
    - DataFrame with a single row holding the scaled new data point
    """
    st.sidebar.write("Input new data for prediction:")
    new_data = {}
    for col in df.columns:
        if col != 'Cluster':
            new_data[col] = st.sidebar.slider(f"Enter {col}", 1, 5)
    new_df = pd.DataFrame([new_data])
    if scaler is not None:
        return pd.DataFrame(scaler.transform(new_df), columns=new_df.columns)
    # Fallback: standardizing a single row in isolation collapses every value
    # to zero (mean of one sample is itself) — prefer passing a fitted scaler.
    return scale_normalize_data(new_df)


def download_results(df):
    """
    Provide a downloadable CSV file of the results.

    Parameters:
    - df: DataFrame, results to be downloaded
    """
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode()).decode()
    # Embed the CSV as a data URI so nothing is written server-side
    href = f'<a href="data:file/csv;base64,{b64}" download="results.csv">Download CSV File</a>'
    st.markdown(href, unsafe_allow_html=True)


def main():
    """Build the Streamlit UI: load data, cluster, report metrics, predict."""
    st.title("Unsupervised Learning on Student Performance Data")
    st.write(
        "This application implements five unsupervised learning algorithms on a "
        "dataset of student performance. The algorithms include K-Means, DBSCAN, "
        "Agglomerative Clustering, Birch, and Mean Shift. The application provides "
        "interactive visualizations, parameter tuning, and performance metrics."
    )

    # Load and preprocess the data
    file_path = './Student-Employability-Datasets(Data).csv'
    df = load_data(file_path)
    if df is None:
        return

    st.write("Preprocessed Data:")
    st.dataframe(df)

    # Scale and normalize the features; 'CLASS' is the known label, so exclude it.
    # Keep the fitted scaler so new data points can be projected into the same space.
    df_for_scaling = df.drop(columns=['CLASS'])
    scaler = StandardScaler().fit(df_for_scaling)
    scaled_df = pd.DataFrame(scaler.transform(df_for_scaling),
                             columns=df_for_scaling.columns)
    st.write("Scaled and Normalized Data:")
    st.dataframe(scaled_df)

    # Feature correlation analysis
    st.write("Feature Correlation Analysis:")
    # Exclude 'CLASS' and 'Cluster' columns from correlation analysis
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    numeric_cols = [col for col in numeric_cols if col not in ['CLASS', 'Cluster']]
    corr_matrix = df[numeric_cols].corr()
    st.write(corr_matrix)
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
    st.pyplot(plt)

    # Create a radio button for algorithm selection
    st.sidebar.header('Algorithms')
    selected_algorithm = st.sidebar.radio(
        "Select Algorithm",
        ["K-Means", "DBSCAN", "Agglomerative Clustering", "Birch", "Mean Shift"])

    # Show parameters based on selected algorithm
    st.sidebar.header('Parameters')
    st.title("Algorithms Tab")
    st.write("Choose the algorithm above first so that the options will show "
             "about the algorithm of choice! :)")

    # Create tabs for each algorithm
    tab1, tab2, tab3, tab4, tab5 = st.tabs(
        ["K-Means", "DBSCAN", "Agglomerative Clustering", "Birch", "Mean Shift"])

    with tab1:
        st.header("K-Means Clustering")
        if selected_algorithm == "K-Means":
            n_clusters = st.sidebar.slider("Number of Clusters", 2, 10, 3,
                                           key='n_clusters')
            max_clusters = st.sidebar.slider(
                "Maximum Number of Clusters for Elbow Curve", 2, 15, 10, key='max')
            cluster_labels = perform_clustering(
                scaled_df, 'kmeans',
                {'n_clusters': n_clusters, 'max_clusters': max_clusters})
            display_performance_metrics(scaled_df, cluster_labels)

    with tab2:
        st.header("DBSCAN Clustering")
        if selected_algorithm == 'DBSCAN':
            eps = st.sidebar.slider("Epsilon", 0.1, 1.0, 0.5, 0.1, key='eps')
            min_samples = st.sidebar.slider("Minimum Samples", 1, 10, 5,
                                            key='min_dbscan')
            cluster_labels = perform_clustering(
                scaled_df, 'dbscan', {'eps': eps, 'min_samples': min_samples})
            display_performance_metrics(scaled_df, cluster_labels)

    with tab3:
        st.header("Agglomerative Clustering")
        if selected_algorithm == 'Agglomerative Clustering':
            n_clusters = st.sidebar.slider("Number of Clusters", 2, 10, 3,
                                           key='agg_cluster')
            cluster_labels = perform_clustering(
                scaled_df, 'agglomerative', {'n_clusters': n_clusters})
            display_performance_metrics(scaled_df, cluster_labels)

    with tab4:
        st.header("Birch Clustering")
        if selected_algorithm == 'Birch':
            n_clusters = st.sidebar.slider("Number of Clusters", 2, 10, 3,
                                           key='birch_cluster')
            cluster_labels = perform_clustering(
                scaled_df, 'birch', {'n_clusters': n_clusters})
            display_performance_metrics(scaled_df, cluster_labels)

    with tab5:
        st.header("Mean Shift Clustering")
        if selected_algorithm == 'Mean Shift':
            bandwidth = st.sidebar.slider("Bandwidth", 0.1, 1.0, 0.5, 0.1,
                                          key='bandwidth')
            cluster_labels = perform_clustering(
                scaled_df, 'meanshift', {'bandwidth': bandwidth})
            display_performance_metrics(scaled_df, cluster_labels)

    # Allow users to input new data points (scaled with the training scaler so
    # the single row is not standardized in isolation, which would zero it out)
    new_data = input_new_data(scaled_df, scaler=scaler)
    if st.sidebar.button("Predict Cluster for New Data"):
        # These algorithms cannot label unseen points directly, so re-cluster
        # with the new point appended; its row (the last one) gets a real label.
        combined = pd.concat([scaled_df, new_data], ignore_index=True)
        if selected_algorithm == "K-Means":
            algo_key, tab, params = 'kmeans', tab1, {'n_clusters': n_clusters}
        elif selected_algorithm == "DBSCAN":
            algo_key, tab, params = 'dbscan', tab2, {'eps': eps,
                                                     'min_samples': min_samples}
        elif selected_algorithm == "Agglomerative Clustering":
            algo_key, tab, params = 'agglomerative', tab3, {'n_clusters': n_clusters}
        elif selected_algorithm == "Birch":
            algo_key, tab, params = 'birch', tab4, {'n_clusters': n_clusters}
        else:  # Mean Shift
            algo_key, tab, params = 'meanshift', tab5, {'bandwidth': bandwidth}
        with tab:
            labels = perform_clustering(combined, algo_key, params)
            if labels is not None:
                # The appended row is last, so its label is the prediction
                st.write(f"Predicted Cluster for {selected_algorithm}: {labels[-1]}")

    # Download results
    download_results(df)


if __name__ == "__main__":
    main()