Spaces:

QCDevs
/

Selector_GSoC

Sleeping

File size: 5,864 Bytes

bbcd4a0

# The Selector library provides a set of tools for selecting a
# subset of the dataset and computing diversity.
#
# Copyright (C) 2023 The QC-Devs Community
#
# This file is part of Selector.
#
# Selector is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 3
# of the License, or (at your option) any later version.
#
# Selector is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>
#
# --

import streamlit as st
import numpy as np
import pandas as pd
import json
import os

from sklearn.metrics import pairwise_distances

def set_page_config(page_title, page_icon):
    """Configure the Streamlit page title and icon.

    Parameters
    ----------
    page_title : str
        Title forwarded to ``st.set_page_config``.
    page_icon : str
        File name of the icon; it is resolved relative to the ``assets``
        directory located one level above this module's directory.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    # Joining everything in one call yields the same path as joining the
    # assets directory first and the icon name afterwards.
    icon_path = os.path.join(module_dir, "..", "assets", page_icon)

    st.set_page_config(page_title=page_title, page_icon=icon_path)

def display_sidebar_info(title, description, references):
    """Render an information panel and a references panel in the sidebar.

    Parameters
    ----------
    title : str
        Text shown as the sidebar header.
    description : str
        Text shown in the first info box.
    references : str
        Text shown in the info box below the "References" title.
    """
    sidebar = st.sidebar

    # Description section.
    sidebar.header(title)
    sidebar.info(description)

    # References section.
    sidebar.title("References")
    sidebar.info(references)

# Load data from matrix file
def load_matrix(matrix_file):
    """Load a matrix from an uploaded file and preview it in the app.

    Supported extensions are ``.csv``, ``.xlsx``, ``.npz`` and ``.npy``. For
    tabular files a checkbox asks whether the first row is a header; for
    ``.npz`` archives a select box chooses which stored array to use.

    Parameters
    ----------
    matrix_file : file-like
        Object exposing a ``name`` attribute and readable by pandas / NumPy
        (presumably a Streamlit ``UploadedFile`` -- confirm against caller).

    Returns
    -------
    pandas.DataFrame or numpy.ndarray or None
        A DataFrame for ``.csv`` / ``.xlsx``, an array for ``.npz`` / ``.npy``,
        or ``None`` when the format is unsupported or loading fails (an error
        message is shown in the app in that case).
    """
    try:
        file_name = matrix_file.name

        if file_name.endswith((".csv", ".xlsx")):
            # Pass the callback itself: calling it here would clear the stored
            # results on every rerun and register no callback at all.
            has_header = st.checkbox("Does the file have a header?", key="header_option",
                                     on_change=clear_results)
            st.warning("Warning: This will affect the final output if not specified correctly.")

            # header=0 uses the first row as column names; None treats every
            # row as data.
            header = 0 if has_header else None
            if file_name.endswith(".xlsx"):
                # Excel workbooks cannot be parsed by read_csv.
                matrix = pd.read_excel(matrix_file, header=header)
            else:
                matrix = pd.read_csv(matrix_file, header=header)
            st.write("Matrix shape:", matrix.shape)
            st.write(matrix.values)

        elif file_name.endswith(".npz"):
            matrix_data = np.load(matrix_file)
            array_names = matrix_data.files  # Select the array in the .npz file
            selected_array = st.selectbox("Select the array to use", array_names)
            matrix = matrix_data[selected_array]
            st.write("Matrix shape:", matrix.shape)
            st.write(matrix)

        elif file_name.endswith(".npy"):
            matrix = np.load(matrix_file)
            st.write("Matrix shape:", matrix.shape)
            st.write(matrix)

        else:
            # Report the real problem instead of failing later on an
            # unassigned local variable.
            st.error(
                "An error occurred while loading matrix file: "
                f"unsupported file format '{file_name}'"
            )
            return None

        return matrix
    except Exception as e:
        # UI boundary: surface any loading problem to the user instead of
        # crashing the app.
        st.error(f'An error occurred while loading matrix file: {e}')
        return None

def load_labels(label_file):
    """Load cluster labels from an uploaded ``.csv`` or ``.xlsx`` file.

    A checkbox asks whether the first row is a header. The loaded table is
    flattened into a one-dimensional array and previewed in the app.

    Parameters
    ----------
    label_file : file-like
        Object exposing a ``name`` attribute and readable by pandas
        (presumably a Streamlit ``UploadedFile`` -- confirm against caller).

    Returns
    -------
    numpy.ndarray or None
        Flattened label values, or ``None`` when the format is unsupported or
        loading fails (an error message is shown in the app in that case).
    """
    try:
        file_name = label_file.name

        if not file_name.endswith((".csv", ".xlsx")):
            # Report the real problem instead of failing later on an
            # unassigned local variable.
            st.error(
                "An error occurred while loading cluster label file: "
                f"unsupported file format '{file_name}'"
            )
            return None

        # Pass the callback itself: calling it here would clear the stored
        # results on every rerun and register no callback at all.
        has_header = st.checkbox("Does the file have a header?", key="label_header_option",
                                 on_change=clear_results)
        st.warning("Warning: This will affect the final output if not specified correctly.")

        # header=0 uses the first row as column names; None treats every row
        # as data.
        header = 0 if has_header else None
        # Excel workbooks cannot be parsed by read_csv.
        reader = pd.read_excel if file_name.endswith(".xlsx") else pd.read_csv
        labels = reader(label_file, header=header).values.flatten()

        st.write("Cluster labels shape:", labels.shape)
        st.write(labels)
        return labels
    except Exception as e:
        # UI boundary: surface any loading problem to the user instead of
        # crashing the app.
        st.error(f'An error occurred while loading cluster label file: {e}')
        return None

def run_algorithm(selector, matrix, num_points, labels):
    """Run a selection algorithm and store the result in the session state.

    Parameters
    ----------
    selector : object
        Object exposing ``select(X, size=..., labels=...)`` that returns an
        iterable of integer row indices.
    matrix : pandas.DataFrame or array-like
        For a DataFrame, the first column supplies the element names and the
        numeric columns supply the data (NOTE(review): a numeric first column
        ends up in both -- confirm this is intended). Any other input is
        converted with ``np.asarray`` and the row positions are used as
        element names.
    num_points : int
        Number of points to select, passed to the selector as ``size``.
    labels : array-like or None
        Optional cluster labels; forwarded to the selector only when given.

    Returns
    -------
    list of tuple or None
        ``(element_name, index)`` pairs for the selected rows, also stored in
        ``st.session_state['selected_ids']``; ``None`` if the run failed (an
        error message is shown in the app in that case).
    """
    try:
        if isinstance(matrix, pd.DataFrame):
            # Separate the first column (element names) and the numeric data
            element_names = matrix.iloc[:, 0].values  # Assuming the first column contains the names
            numeric_matrix = matrix.select_dtypes(include=[np.number]).values
        else:
            # Arrays loaded from .npy / .npz files have no name column, so
            # fall back to the row position as the element name.
            numeric_matrix = np.asarray(matrix)
            element_names = list(range(len(numeric_matrix)))

        if labels is not None:
            selected_ids = selector.select(numeric_matrix, size=num_points, labels=labels)
        else:
            selected_ids = selector.select(numeric_matrix, size=num_points)

        # Store plain Python ints so the indices serialize cleanly downstream.
        selected_ids = [(element_names[i], int(i)) for i in selected_ids]
        st.session_state['selected_ids'] = selected_ids
        return selected_ids
    except Exception as e:
        # UI boundary: ValueError and every other failure are reported the
        # same way, so a single handler is enough.
        st.error(f"An error occurred while running the algorithm: {e}")
    return None

def _to_builtin(value):
    """Convert a NumPy scalar to the equivalent Python object; pass others through."""
    return value.item() if isinstance(value, np.generic) else value


def export_results(selected_ids):
    """Offer the selected elements for download as CSV or JSON.

    Parameters
    ----------
    selected_ids : sequence of tuple
        ``(element, index)`` pairs. Both export formats write the first item
        under "Element" and the second under "Index".
    """
    export_format = st.selectbox("Select export format", ["CSV", "JSON"], key="export_format")

    if export_format == "CSV":
        csv_data = pd.DataFrame(selected_ids, columns=["Element", "Index"])
        csv = csv_data.to_csv(index=False).encode('utf-8')
        st.download_button(
            label="Download as CSV",
            data=csv,
            file_name='selected_indices.csv',
            mime='text/csv',
        )
    else:
        # Unpack in (element, index) order so the JSON fields match the CSV
        # columns, and strip NumPy scalar types that json.dumps rejects.
        records = [
            {"Element": _to_builtin(elem), "Index": _to_builtin(idx)}
            for elem, idx in selected_ids
        ]
        json_data = json.dumps(records)
        st.download_button(
            label="Download as JSON",
            data=json_data,
            file_name='selected_indices.json',
            mime='application/json',
        )

# Function to clear selected indices from session state
def clear_results():
    """Drop the stored selection and selector from the session state, if present."""
    for key in ('selected_ids', 'selector'):
        if key in st.session_state:
            del st.session_state[key]