| import numpy as np |
| import joblib |
| from sklearn.preprocessing import KBinsDiscretizer |
| from sklearn.feature_selection import VarianceThreshold |
| from rdkit import Chem |
| import pandas as pd |
| from rdkit.Chem import Descriptors |
| from tqdm import tqdm |
|
|
# Fraction consumed by NanFilter.fit to derive its NaN-count cutoff: a column
# is dropped when its NaN count exceeds int((1 - MAX_NA) * n_rows).
# NOTE(review): with 0.2 this keeps columns that are up to ~80% NaN, which may
# be the inverse of what the name suggests -- confirm the intended semantics.
| MAX_NA = 0.2 |
|
|
class NanFilter(object):
    """Column filter that discards features dominated by NaN values.

    ``fit`` records which columns to keep; ``transform`` applies that
    selection to any matrix with the same column layout.
    """

    def __init__(self):
        self._name = "nan_filter"

    def fit(self, X):
        """Decide which columns of ``X`` survive the NaN filter.

        A column is kept when its NaN count is at most
        ``int((1 - MAX_NA) * n_rows)``. The surviving column positions are
        stored, in ascending order, as a list of ints in ``self.col_idxs``.

        Args:
            X: 2-D numeric array (rows are samples, columns are features).
        """
        nan_cutoff = int((1 - MAX_NA) * X.shape[0])
        # Count NaNs for every column in one pass instead of looping.
        nan_per_column = np.isnan(X).sum(axis=0)
        self.col_idxs = np.flatnonzero(nan_per_column <= nan_cutoff).tolist()

    def transform(self, X):
        """Return ``X`` restricted to the columns selected by ``fit``."""
        kept = self.col_idxs
        return X[:, kept]

    def save(self, file_name):
        """Serialize this filter to ``file_name`` with joblib."""
        joblib.dump(self, file_name)

    def load(self, file_name):
        """Return the object stored at ``file_name`` (``self`` is not modified)."""
        return joblib.load(file_name)
|
|
|
|
class Imputer(object):
    """Per-column median imputer for NaN entries.

    ``fit`` learns one replacement value per column (the median of the
    non-NaN entries); ``transform`` writes those values into the NaN
    positions of a copy of its input.
    """

    def __init__(self):
        self._name = "imputer"
        # Replacement used for columns that have no non-NaN entry at fit time.
        self._fallback = 0

    def fit(self, X):
        """Learn the replacement value of every column of ``X``.

        Args:
            X: 2-D numeric array. Columns consisting only of NaN get the
                fallback value instead of a median.

        The values are stored in ``self.impute_values`` (one per column).
        """
        medians = []
        for j in range(X.shape[1]):
            column = X[:, j]
            observed = column[~np.isnan(column)]
            medians.append(np.median(observed) if observed.size else self._fallback)
        self.impute_values = np.array(medians)

    def transform(self, X):
        """Return a copy of ``X`` with NaNs replaced by the fitted values.

        The input array is left untouched (the previous implementation
        overwrote it in place) and the result keeps the dtype of ``X``.

        Args:
            X: 2-D numeric array with the column layout seen in ``fit``.

        Returns:
            A new array of the same shape and dtype without the NaNs that
            fall in fitted columns.
        """
        filled = np.array(X)  # np.array copies by default
        rows, cols = np.nonzero(np.isnan(filled))
        # Each NaN cell receives the value learned for its own column.
        filled[rows, cols] = self.impute_values[cols]
        return filled

    def save(self, file_name):
        """Serialize this imputer to ``file_name`` with joblib."""
        joblib.dump(self, file_name)

    def load(self, file_name):
        """Return the object stored at ``file_name`` (``self`` is not modified)."""
        return joblib.load(file_name)
|
|
|
|
class VarianceFilter(object):
    """Thin wrapper around scikit-learn's ``VarianceThreshold``.

    Uses the selector with its default settings and additionally exposes
    the positions of the retained columns as ``col_idxs``.
    """

    def __init__(self):
        self._name = "variance_filter"

    def fit(self, X):
        """Fit the selector on ``X`` and record the retained column indices.

        Args:
            X: 2-D numeric array accepted by ``VarianceThreshold.fit``.

        Sets ``self.sel`` (the fitted selector) and ``self.col_idxs`` (integer
        array of retained column positions, relative to the columns of ``X``).
        """
        self.sel = VarianceThreshold()
        self.sel.fit(X)
        # Ask the selector directly for the kept positions rather than
        # pushing a synthetic row of indices through transform().
        self.col_idxs = self.sel.get_support(indices=True)

    def transform(self, X):
        """Return ``X`` restricted to the columns retained by ``fit``."""
        return self.sel.transform(X)

    def save(self, file_name):
        """Serialize this filter to ``file_name`` with joblib."""
        joblib.dump(self, file_name)

    def load(self, file_name):
        """Return the object stored at ``file_name`` (``self`` is not modified)."""
        return joblib.load(file_name)
|
|
|
|
def physchem_featurizer(self=None, smiles_list=None):
    """Compute every RDKit descriptor in ``Descriptors._descList`` per SMILES.

    The function historically carries an unused leading ``self`` parameter
    although it lives at module level. Both calling conventions are accepted:
    ``physchem_featurizer(smiles_list)`` and
    ``physchem_featurizer(anything, smiles_list)``.

    Args:
        self: Ignored placeholder kept for backward compatibility. When
            ``smiles_list`` is omitted, this slot is taken as the SMILES.
        smiles_list: Iterable of SMILES strings.

    Returns:
        Float array of shape ``(n_smiles, n_descriptors)``. A SMILES that
        RDKit cannot parse yields a row of NaN, so the output stays aligned
        with the input; an empty input yields a ``(0, n_descriptors)`` array.

    Raises:
        TypeError: If no SMILES iterable was supplied at all.
    """
    if smiles_list is None:
        # Single-argument call: the SMILES landed in the first slot.
        smiles_list, self = self, None
    if smiles_list is None:
        raise TypeError(
            "physchem_featurizer() missing required argument: 'smiles_list'"
        )
    calculators = [calc_fn for _, calc_fn in Descriptors._descList]
    rows = []
    for smiles in tqdm(smiles_list):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles signals a parse failure with None.
            rows.append([np.nan] * len(calculators))
        else:
            rows.append([calc_fn(mol) for calc_fn in calculators])
    # reshape keeps the result two-dimensional even when there are no rows.
    return np.array(rows, dtype=float).reshape(-1, len(calculators))
|
|
|
|
def physchem_featurizer_as_dataframe(self=None, smiles_list=None):
    """Compute the RDKit descriptors per SMILES as a labelled DataFrame.

    The function historically carries an unused leading ``self`` parameter
    although it lives at module level. Both calling conventions are accepted:
    ``physchem_featurizer_as_dataframe(smiles_list)`` and
    ``physchem_featurizer_as_dataframe(anything, smiles_list)``.

    Args:
        self: Ignored placeholder kept for backward compatibility. When
            ``smiles_list`` is omitted, this slot is taken as the SMILES.
        smiles_list: Iterable of SMILES strings.

    Returns:
        ``pandas.DataFrame`` with one row per SMILES and one column per entry
        of ``Descriptors._descList`` (column labels are the descriptor names).
        A SMILES that RDKit cannot parse yields a row of NaN; an empty input
        yields a frame with zero rows and all descriptor columns.

    Raises:
        TypeError: If no SMILES iterable was supplied at all.
    """
    if smiles_list is None:
        # Single-argument call: the SMILES landed in the first slot.
        smiles_list, self = self, None
    if smiles_list is None:
        raise TypeError(
            "physchem_featurizer_as_dataframe() missing required argument: "
            "'smiles_list'"
        )
    names = [name for name, _ in Descriptors._descList]
    calculators = [calc_fn for _, calc_fn in Descriptors._descList]
    rows = []
    for smiles in tqdm(smiles_list):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles signals a parse failure with None.
            rows.append([np.nan] * len(calculators))
        else:
            rows.append([calc_fn(mol) for calc_fn in calculators])
    # reshape keeps the matrix two-dimensional even when there are no rows,
    # which the DataFrame constructor needs to match the column labels.
    values = np.array(rows, dtype=float).reshape(-1, len(calculators))
    return pd.DataFrame(values, columns=names)
|
|
|
|
class PhyschemDescriptor(object):
    """RDKit physchem descriptor pipeline.

    Steps, in order: NaN-column filter, median imputation, variance filter
    and, when ``discretize`` is true, ordinal quantile binning into 5 bins.
    """

    def __init__(self, discretize=True):
        """Create the (unfitted) pipeline steps.

        Args:
            discretize: Whether ``fit``/``transform`` apply the quantile
                discretizer as the final step.
        """
        self.nan_filter = NanFilter()
        self.imputer = Imputer()
        self.variance_filter = VarianceFilter()
        self.discretizer = KBinsDiscretizer(
            n_bins=5, encode="ordinal", strategy="quantile"
        )
        self.discretize = discretize

    def _featurize(self, smiles):
        """Return the raw descriptor matrix as float32 with non-finite -> NaN."""
        # The featurizer is a module-level function that still declares a
        # leading ``self`` parameter, so it is passed explicitly.
        raw = physchem_featurizer(self, smiles)
        # Some descriptor values can exceed the float32 range and become
        # +/-inf in the cast; silence the overflow warning and treat them as
        # missing so the scikit-learn steps do not reject the matrix.
        with np.errstate(over="ignore"):
            X = np.array(raw, dtype=np.float32)
        return np.where(np.isfinite(X), X, np.float32("nan"))

    def fit(self, smiles):
        """Fit every pipeline step on the descriptors of ``smiles``.

        Args:
            smiles: Iterable of SMILES strings.
        """
        X = self._featurize(smiles)
        self.nan_filter.fit(X)
        X = self.nan_filter.transform(X)
        self.imputer.fit(X)
        X = self.imputer.transform(X)
        self.variance_filter.fit(X)
        X = self.variance_filter.transform(X)
        if self.discretize:
            self.discretizer.fit(X)

    def transform(self, smiles):
        """Featurize ``smiles`` with the fitted pipeline.

        Args:
            smiles: Iterable of SMILES strings.

        Returns:
            Integer array of bin indices when ``discretize`` is true;
            otherwise the float32 matrix of filtered, imputed descriptors
            (the flag used to be ignored by this class).
        """
        X = self._featurize(smiles)
        X = self.nan_filter.transform(X)
        X = self.imputer.transform(X)
        X = self.variance_filter.transform(X)
        if not self.discretize:
            return X
        return np.array(self.discretizer.transform(X), dtype=int)
|
|
|
|
class PhyschemDescriptorWithFeatures(object):
    """RDKit physchem descriptor pipeline that also tracks feature names.

    Steps, in order: NaN-column filter, median imputation, variance filter
    and, when ``discretize`` is true, ordinal quantile binning into 5 bins.
    After ``fit``, ``feature_names`` lists the descriptor name of every
    output column.
    """

    def __init__(self, discretize=True):
        """Create the (unfitted) pipeline steps.

        Args:
            discretize: Whether ``fit``/``transform`` apply the quantile
                discretizer as the final step.
        """
        self.nan_filter = NanFilter()
        self.imputer = Imputer()
        self.variance_filter = VarianceFilter()
        self.discretizer = KBinsDiscretizer(
            n_bins=5, encode="ordinal", strategy="quantile"
        )
        self.discretize = discretize

    def _featurize(self, smiles):
        """Return ``(X, names)``: float32 matrix (non-finite -> NaN) and labels."""
        # The featurizer is a module-level function that still declares a
        # leading ``self`` parameter, so it is passed explicitly.
        df = physchem_featurizer_as_dataframe(self, smiles)
        # Some descriptor values can exceed the float32 range and become
        # +/-inf in the cast; silence the overflow warning and treat them as
        # missing so the scikit-learn steps do not reject the matrix.
        with np.errstate(over="ignore"):
            X = np.array(df, dtype=np.float32)
        X = np.where(np.isfinite(X), X, np.float32("nan"))
        return X, list(df.columns)

    def fit(self, smiles):
        """Fit every pipeline step and record the surviving feature names.

        Args:
            smiles: Iterable of SMILES strings.
        """
        X, all_names = self._featurize(smiles)
        self.nan_filter.fit(X)
        X = self.nan_filter.transform(X)
        self.imputer.fit(X)
        X = self.imputer.transform(X)
        self.variance_filter.fit(X)
        X = self.variance_filter.transform(X)
        if self.discretize:
            self.discretizer.fit(X)
        # The variance filter was fitted on the NaN-filtered matrix, so its
        # indices are relative to the columns kept by the NaN filter, not to
        # the full descriptor list. Map through both selections in order.
        after_nan_filter = [all_names[i] for i in self.nan_filter.col_idxs]
        self.feature_names = [
            after_nan_filter[i] for i in self.variance_filter.col_idxs
        ]

    def transform(self, smiles):
        """Featurize ``smiles`` with the fitted pipeline.

        Args:
            smiles: Iterable of SMILES strings.

        Returns:
            Integer array of bin indices when ``discretize`` is true;
            otherwise the float32 matrix of filtered, imputed descriptors
            (casting those continuous values to int would truncate them).
        """
        X, _ = self._featurize(smiles)
        X = self.nan_filter.transform(X)
        X = self.imputer.transform(X)
        X = self.variance_filter.transform(X)
        if not self.discretize:
            return X
        return np.array(self.discretizer.transform(X), dtype=int)
|
|
|
|