import importlib.resources as pkg_resources
from pathlib import Path
from typing import Tuple

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import polyatomic_complexes


# Datasets shipped as package data:
# dataset key -> (resource package, CSV file name, target column).
_PACKAGED_DATASETS = {
    "esol": (
        "polyatomic_complexes.dataset.esol",
        "ESOL.csv",
        "measured log solubility in mols per litre",
    ),
    "freesolv": (
        "polyatomic_complexes.dataset.free_solv",
        "FreeSolv.csv",
        "expt",
    ),
    "lipophil": (
        "polyatomic_complexes.dataset.lipophilicity",
        "Lipophilicity.csv",
        "exp",
    ),
}

# Datasets read from the ``benchmark_csv`` directory located two levels above
# this file: dataset key -> (CSV file name, target column).
_BENCHMARK_DATASETS = {
    "boilingpoint": ("boiling_point.csv", "boiling_point_K"),
    "qm9": ("qm9_subset.csv", "cv"),
    "ic50": ("ic_50_subset.csv", "pIC50"),
    "bindingdb": ("bindingdb.csv", "pIC50"),
}


def _read_dataset_frame(key: str) -> Tuple[pd.DataFrame, str]:
    """Read the CSV registered under *key* and return it with its target column name.

    *key* must already be lower-cased and present in one of the two registries.
    """
    if key in _PACKAGED_DATASETS:
        package, filename, target_col = _PACKAGED_DATASETS[key]
        resource = pkg_resources.files(package) / filename
        # as_file() yields a real filesystem path even when the package is not
        # installed as plain files on disk (e.g. imported from an archive).
        with pkg_resources.as_file(resource) as csv_path:
            return pd.read_csv(csv_path), target_col

    filename, target_col = _BENCHMARK_DATASETS[key]
    csv_path = Path(__file__).parent.parent / "benchmark_csv" / filename
    return pd.read_csv(csv_path), target_col


def load_dataset(name: str) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Load a named dataset and split it into train and test arrays.

    The dataset name is matched case-insensitively. Supported names are
    ``esol``, ``freesolv``, ``lipophil``, ``boilingpoint``, ``qm9``, ``ic50``
    and ``bindingdb``. Rows with a missing value in the ``smiles`` column or in
    the dataset's target column are dropped before splitting.

    Args:
        name: Dataset name (any letter case).

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as NumPy arrays, where the ``X``
        arrays hold the ``smiles`` column and the ``y`` arrays hold the target
        column. The split uses ``test_size=0.2`` and ``random_state=42``, so it
        is reproducible across calls.

    Raises:
        ValueError: If *name* is not one of the supported datasets.
    """
    key = name.lower()
    if key not in _PACKAGED_DATASETS and key not in _BENCHMARK_DATASETS:
        raise ValueError(f"Unknown dataset: {name}")

    df, target_col = _read_dataset_frame(key)

    # Rebind instead of mutating in place; the result is the same frame content.
    df = df.dropna(subset=["smiles", target_col])

    X_train, X_test, y_train, y_test = train_test_split(
        df["smiles"], df[target_col], test_size=0.2, random_state=42
    )
    return (
        X_train.to_numpy(),
        X_test.to_numpy(),
        y_train.to_numpy(),
        y_test.to_numpy(),
    )
|