# pipeline taken from https://huggingface.co/spaces/ml-jku/mhnfs/blob/main/src/data_preprocessing/create_descriptors.py

"""
This file contains the data preprocessing for Tox21.
As an input it takes a list of SMILES and it outputs a nested dictionary with
SMILES and target names as keys.
"""

import os
import argparse

import numpy as np

from src.data import create_descriptors, get_tox21_split
from src.utils import (
    TASKS,
    HF_TOKEN,
    write_pickle,
    create_dir,
)

# Every command-line option as (flag, type, default, help text), listed in the
# order in which it is registered and therefore shown by ``--help``.
_CLI_OPTIONS = (
    (
        "--data_folder",
        str,
        "data/",
        "Folder containing the tox21_compoundData.csv file.",
    ),
    (
        "--save_folder",
        str,
        "data/",
        "Folder to which preprocessed the data CSV and NPZ files should be saved.",
    ),
    (
        "--cv_fold",
        int,
        4,
        "Select fold used as validation set.",
    ),
    (
        "--feature_selection",
        int,
        1,
        "True (=1) to use feature selection.",
    ),
    (
        "--feature_selection_path",
        str,
        "feat_selection.npz",
        "Filename for saving feature selections.",
    ),
    (
        "--min_var",
        float,
        0.05,
        "Minimum variance threshold for selecting features.",
    ),
    (
        "--max_corr",
        float,
        0.95,
        "Maximum correlation threshold for selecting features.",
    ),
    (
        "--ecdfs_path",
        str,
        "ecdfs.pkl",
        "Filename to save ECDFs.",
    ),
    (
        "--ecfps_radius",
        int,
        3,
        "Radius used for creating ECFPs.",
    ),
    (
        "--ecfps_folds",
        int,
        8192,
        "Folds used for creating ECFPs.",
    ),
)


def _build_parser():
    """Return the argument parser with every option in ``_CLI_OPTIONS``."""
    cli = argparse.ArgumentParser(
        description="Data preprocessing script for the Tox21 dataset"
    )
    for flag, value_type, default_value, help_text in _CLI_OPTIONS:
        cli.add_argument(
            flag,
            type=value_type,
            default=default_value,
            help=help_text,
        )
    return cli


# Module-level parser used by the script entry point.
parser = _build_parser()


def main(args):
    """Preprocess the Tox21 train/validation splits for use with TabPFN.

    Steps:
        1. Load the Tox21 split for ``args.cv_fold`` via ``get_tox21_split``.
        2. Create descriptors for the train split, also obtaining the feature
           selection and the ECDFs, and save both to disk.
        3. Create descriptors for the validation split, reusing the feature
           selection and ECDFs obtained from the train split.
        4. For each split, stack the label columns of all ``TASKS`` and save
           them together with the features to
           ``tox21_{split}_cv{cv_fold}.npz`` inside ``args.save_folder``.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``cv_fold``, ``ecfps_radius``, ``ecfps_folds``, ``min_var``,
            ``max_corr``, ``feature_selection_path``, ``ecdfs_path`` and
            ``save_folder``.
    """
    # NOTE(review): ``args.feature_selection`` and ``args.data_folder`` are
    # parsed by the CLI but are not read anywhere in this function.
    ds = get_tox21_split(HF_TOKEN, cvfold=args.cv_fold)

    feature_creation_kwargs = {
        "radius": args.ecfps_radius,
        "fpsize": args.ecfps_folds,
        "min_var": args.min_var,
        "max_corr": args.max_corr,
    }

    # "train" must come first: the validation branch reuses the feature
    # selection and ECDFs produced while processing the train split.
    for split in ("train", "validation"):

        print(f"Preprocess {split} molecules")

        ds_split = ds[split]
        smiles = list(ds_split["smiles"])

        if split == "train":
            output = create_descriptors(
                smiles,
                return_feature_selection=True,
                return_ecdfs=True,
                **feature_creation_kwargs,
            )
            features = output.pop("features")
            feature_selection = output.pop("feature_selection")
            ecdfs = output.pop("ecdfs")

            np.savez(
                args.feature_selection_path,
                ecfps_selec=feature_selection["ecfps_selec"],
                tox_selec=feature_selection["tox_selec"],
            )
            print(f"Saved feature selection under {args.feature_selection_path}")

            write_pickle(args.ecdfs_path, ecdfs)
            print(f"Saved ECDFs under {args.ecdfs_path}")

        else:
            features = create_descriptors(
                smiles,
                ecdfs=ecdfs,
                feature_selection=feature_selection,
                **feature_creation_kwargs,
            )["features"]

        # One column per task, in the order given by TASKS.
        labels = np.stack(
            [ds_split[task].to_numpy() for task in TASKS],
            axis=1,
        )

        # Name the file after the fold actually used, so that runs with a
        # different --cv_fold neither mislabel nor overwrite each other.
        save_path = os.path.join(
            args.save_folder, f"tox21_{split}_cv{args.cv_fold}.npz"
        )
        with open(save_path, "wb") as f:
            np.savez(
                f,
                labels=labels,
                **features,
            )
        print(f"Saved preprocessed {split} split under {save_path}")

    print("Preprocessing finished successfully")


if __name__ == "__main__":
    cli_args = parser.parse_args()
    out_dir = cli_args.save_folder

    # The ECDF and feature-selection filenames are joined onto the save folder.
    cli_args.ecdfs_path = os.path.join(out_dir, cli_args.ecdfs_path)
    cli_args.feature_selection_path = os.path.join(
        out_dir, cli_args.feature_selection_path
    )

    # Prepare the output locations before any file is written.
    # NOTE(review): presumably `is_file=True` makes create_dir create the
    # parent directory of the given file path — confirm in src.utils.
    create_dir(out_dir)
    for artifact_path in (
        cli_args.ecdfs_path,
        cli_args.feature_selection_path,
    ):
        create_dir(artifact_path, is_file=True)

    main(cli_args)