antoniaebner commited on
Commit
7c1c2c8
·
1 Parent(s): 6df6d5b

add preprocessing

Browse files
Files changed (3) hide show
  1. preprocess.py +180 -0
  2. src/{preprocess.py → data.py} +32 -166
  3. src/utils.py +10 -0
preprocess.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # pipeline taken from https://huggingface.co/spaces/ml-jku/mhnfs/blob/main/src/data_preprocessing/create_descriptors.py
2
+
3
+ """
4
+ This file includes the data processing for Tox21.
5
+ As an input it takes a list of SMILES and it outputs a nested dictionary with
6
+ SMILES and target names as keys.
7
+ """
8
+
9
+ import os
10
+ import argparse
11
+
12
+ import numpy as np
13
+
14
+ from src.data import create_descriptors, get_tox21_split
15
+ from src.utils import (
16
+ TASKS,
17
+ HF_TOKEN,
18
+ write_pickle,
19
+ create_dir,
20
+ )
21
+
22
def _build_parser():
    """Build the command-line parser of the Tox21 preprocessing script.

    Every option is declared once in a (flag, type, default, help) table and
    registered in table order, so ``--help`` lists them in that same order.
    """
    cli = argparse.ArgumentParser(
        description="Data preprocessing script for the Tox21 dataset"
    )

    options = (
        (
            "--data_folder",
            str,
            "data/",
            "Folder containing the tox21_compoundData.csv file.",
        ),
        (
            "--save_folder",
            str,
            "data/",
            "Folder to which preprocessed the data CSV and NPZ files should be saved.",
        ),
        ("--cv_fold", int, 4, "Select fold used as validation set."),
        ("--feature_selection", int, 1, "True (=1) to use feature selection."),
        (
            "--feature_selection_path",
            str,
            "feat_selection.npz",
            "Filename for saving feature selections.",
        ),
        (
            "--min_var",
            float,
            0.05,
            "Minimum variance threshold for selecting features.",
        ),
        (
            "--max_corr",
            float,
            0.95,
            "Maximum correlation threshold for selecting features.",
        ),
        ("--ecdfs_path", str, "ecdfs.pkl", "Filename to save ECDFs."),
        ("--ecfps_radius", int, 3, "Radius used for creating ECFPs."),
        ("--ecfps_folds", int, 8192, "Folds used for creating ECFPs."),
    )

    for flag, value_type, default, help_text in options:
        cli.add_argument(flag, type=value_type, default=default, help=help_text)

    return cli


# Module-level parser, used by the ``__main__`` entry point below.
parser = _build_parser()
96
+
97
def main(args):
    """Preprocess the Tox21 train/validation splits to use for TabPFN.

    Steps performed:
      1. Load the train/validation split through ``get_tox21_split``, using
         ``args.cv_fold`` as the validation fold.
      2. Create descriptors for the train split and save the resulting
         feature selection (NPZ) and ECDFs (pickle).
      3. Create descriptors for the validation split, reusing the feature
         selection and ECDFs obtained from the train split.
      4. Stack the labels of all ``TASKS`` column-wise and save them together
         with the features as ``tox21_{split}_cv{cv_fold}.npz``.

    Args:
        args: Parsed command-line namespace. Reads ``cv_fold``,
            ``ecfps_radius``, ``ecfps_folds``, ``min_var``, ``max_corr``,
            ``feature_selection_path``, ``ecdfs_path`` and ``save_folder``.
    """
    ds = get_tox21_split(HF_TOKEN, cvfold=args.cv_fold)

    feature_creation_kwargs = {
        "radius": args.ecfps_radius,
        "fpsize": args.ecfps_folds,
        "min_var": args.min_var,
        "max_corr": args.max_corr,
    }

    # "train" must come first: the validation branch reuses the feature
    # selection and ECDFs produced while processing the train split.
    splits = ["train", "validation"]
    for split in splits:
        print(f"Preprocess {split} molecules")

        ds_split = ds[split]
        smiles = list(ds_split["smiles"])

        if split == "train":
            output = create_descriptors(
                smiles,
                return_feature_selection=True,
                return_ecdfs=True,
                **feature_creation_kwargs,
            )
            features = output.pop("features")
            feature_selection = output.pop("feature_selection")
            ecdfs = output.pop("ecdfs")

            np.savez(
                args.feature_selection_path,
                ecfps_selec=feature_selection["ecfps_selec"],
                tox_selec=feature_selection["tox_selec"],
            )
            print(f"Saved feature selection under {args.feature_selection_path}")

            write_pickle(args.ecdfs_path, ecdfs)
            print(f"Saved ECDFs under {args.ecdfs_path}")

        else:
            features = create_descriptors(
                smiles,
                ecdfs=ecdfs,
                feature_selection=feature_selection,
                **feature_creation_kwargs,
            )["features"]

        # One column per task, in the order given by TASKS.
        labels = np.stack([ds_split[task].to_numpy() for task in TASKS], axis=1)

        # Name the file after the fold that was actually used; with the
        # default fold (4) this yields the previous "..._cv4.npz" name.
        save_path = os.path.join(
            args.save_folder, f"tox21_{split}_cv{args.cv_fold}.npz"
        )
        with open(save_path, "wb") as f:
            np.savez(
                f,
                labels=labels,
                **features,
            )
        print(f"Saved preprocessed {split} split under {save_path}")

    print("Preprocessing finished successfully")
166
+
167
+
168
if __name__ == "__main__":
    args = parser.parse_args()

    # Both artifact files are given as file names relative to the save folder.
    artifact_attrs = ("ecdfs_path", "feature_selection_path")
    for attr in artifact_attrs:
        setattr(args, attr, os.path.join(args.save_folder, getattr(args, attr)))

    # Make sure the output folder and the artifacts' parent folders exist
    # before anything is written.
    create_dir(args.save_folder)
    for attr in artifact_attrs:
        create_dir(getattr(args, attr), is_file=True)

    main(args)
src/{preprocess.py → data.py} RENAMED
@@ -6,97 +6,23 @@ As an input it takes a list of SMILES and it outputs a nested dictionary with
6
  SMILES and target names as keys.
7
  """
8
 
9
- import os
10
- import argparse
11
  import json
12
 
13
  import numpy as np
14
  import pandas as pd
15
 
 
16
  from sklearn.feature_selection import VarianceThreshold
17
  from statsmodels.distributions.empirical_distribution import ECDF
18
- from datasets import load_dataset
19
 
20
  from rdkit import Chem, DataStructs
21
  from rdkit.Chem import Descriptors, rdFingerprintGenerator, MACCSkeys
22
  from rdkit.Chem.rdchem import Mol
23
 
24
- from utils import (
25
- TASKS,
26
- HF_TOKEN,
27
  USED_200_DESCR,
 
28
  Standardizer,
29
- write_pickle,
30
- )
31
-
32
- parser = argparse.ArgumentParser(
33
- description="Data preprocessing script for the Tox21 dataset"
34
- )
35
-
36
- parser.add_argument(
37
- "--data_folder",
38
- type=str,
39
- default="data/",
40
- )
41
-
42
- parser.add_argument(
43
- "--save_folder",
44
- type=str,
45
- default="data/",
46
- )
47
-
48
- parser.add_argument(
49
- "--use_hf",
50
- type=int,
51
- default=0,
52
- )
53
-
54
- parser.add_argument(
55
- "--path_ecdfs",
56
- type=str,
57
- default="ecdfs.pkl",
58
- )
59
-
60
- parser.add_argument(
61
- "--path_feat_selec",
62
- type=str,
63
- default="feat_selection.npz",
64
- )
65
-
66
- parser.add_argument(
67
- "--tox_smarts_filepath",
68
- type=str,
69
- default="tox_smarts.json",
70
- )
71
-
72
- parser.add_argument(
73
- "--feature_selection",
74
- type=int,
75
- default=1,
76
- )
77
-
78
- parser.add_argument(
79
- "--min_var",
80
- type=float,
81
- default=0.05,
82
- )
83
-
84
- parser.add_argument(
85
- "--max_corr",
86
- type=float,
87
- default=0.95,
88
- )
89
-
90
- parser.add_argument(
91
- "--ecfps_radius",
92
- type=int,
93
- default=3,
94
- )
95
-
96
- parser.add_argument(
97
- "--ecfps_folds",
98
- type=int,
99
- default=8192,
100
  )
101
 
102
 
@@ -128,7 +54,7 @@ def create_cleaned_mol_objects(smiles: list[str]) -> tuple[list[Mol], np.ndarray
128
  return mols, np.array(clean_mol_mask)
129
 
130
 
131
- def create_ecfp_fps(mols: list[Mol], radius=None, fpsize=None) -> np.ndarray:
132
  """This function ECFP fingerprints for a list of molecules.
133
 
134
  Args:
@@ -139,13 +65,10 @@ def create_ecfp_fps(mols: list[Mol], radius=None, fpsize=None) -> np.ndarray:
139
  """
140
  ecfps = list()
141
 
142
- kwargs = {}
143
- if not fpsize is None:
144
- kwargs["fpSize"] = fpsize
145
- if not radius is None:
146
- kwargs["radius"] = radius
147
  for mol in mols:
148
- gen = rdFingerprintGenerator.GetMorganGenerator(countSimulation=True, **kwargs)
 
 
149
  fp_sparse_vec = gen.GetCountFingerprint(mol)
150
 
151
  fp = np.zeros((0,), np.int8)
@@ -283,15 +206,16 @@ def create_descriptors(
283
  feature_selection=None,
284
  return_ecdfs=False,
285
  return_feature_selection=False,
 
286
  ):
287
  # Create cleanded rdkit mol objects
288
  mols, clean_mol_mask = create_cleaned_mol_objects(smiles)
289
  print("Cleaned molecules")
290
 
291
- tox_patterns = get_tox_patterns(args.tox_smarts_filepath)
292
 
293
  # Create fingerprints and descriptors
294
- ecfps = create_ecfp_fps(mols, radius=args.ecfps_radius, fpsize=args.ecfps_folds)
295
  # expand using mol_mask
296
  ecfps = fill(ecfps, ~clean_mol_mask)
297
  print("Created ECFP fingerprints")
@@ -303,8 +227,8 @@ def create_descriptors(
303
  # Create and save feature selection for ecfps and tox
304
  if feature_selection is None:
305
  print("Create Feature selection")
306
- ecfps_selec = get_feature_selection(ecfps, args.min_var, args.max_corr)
307
- tox_selec = get_feature_selection(tox, args.min_var, args.max_corr)
308
  feature_selection = {"ecfps_selec": ecfps_selec, "tox_selec": tox_selec}
309
 
310
  else:
@@ -351,7 +275,7 @@ def create_descriptors(
351
 
352
 
353
  def get_feature_selection(
354
- raw_features: np.ndarray, min_var=0.01, max_corr=0.95
355
  ) -> np.ndarray:
356
  # select features with at least min_var variation
357
  var_thresh = VarianceThreshold(threshold=min_var)
@@ -372,86 +296,28 @@ def get_feature_selection(
372
  return feature_selection
373
 
374
 
375
- def main(args):
376
- """Preprocessing train/val data to use for TabPFN.
377
 
378
- 1. Download Tox21 train/val data from HF & CVfolds used in DeepTox
379
- 2. Combine datasets & re-split data. New validation split is CVfold=4
380
- 3. Preprocess dataset splits
381
- """
382
- splits = ["train", "validation", "test"] # TODO: remove test
383
- if args.use_hf:
384
- ds = load_dataset("tschouis/tox21", token=HF_TOKEN)
385
 
386
- else:
387
- ds = {}
388
- for split in splits:
389
- if split == "train":
390
- ds[split] = pd.read_csv(
391
- os.path.join(args.data_folder, f"tox21_{split}_cv4.csv")
392
- )
393
- else:
394
- ds[split] = pd.read_csv(
395
- os.path.join(args.data_folder, f"tox21_{split}_cv4.csv")
396
- )
397
-
398
- for split in splits:
399
 
400
- print(f"Preprocess {split} molecules")
401
- smiles = list(ds[split]["smiles"])
402
 
403
- if split == "train":
404
- output = create_descriptors(
405
- smiles, return_feature_selection=True, return_ecdfs=True
406
- )
407
- features = output.pop("features")
408
- feature_selection = output.pop("feature_selection")
409
- ecdfs = output.pop("ecdfs")
410
 
411
- np.savez(
412
- args.path_feat_selec,
413
- ecfps_selec=feature_selection["ecfps_selec"],
414
- tox_selec=feature_selection["tox_selec"],
415
- )
416
- print(f"Saved feature selection under {args.path_feat_selec}")
417
 
418
- write_pickle(args.path_ecdfs, ecdfs)
419
- print(f"Saved ECDFs under {args.path_ecdfs}")
420
-
421
- else:
422
- features = create_descriptors(
423
- smiles, ecdfs=ecdfs, feature_selection=feature_selection
424
- )["features"]
425
-
426
- labels = []
427
- for task in TASKS:
428
- datasplit = ds[split].to_pandas() if args.use_hf else ds[split]
429
- labels.append(datasplit[task].to_numpy())
430
- labels = np.stack(labels, axis=1)
431
-
432
- save_path = os.path.join(args.save_folder, f"tox21_{split}_cv4.npz")
433
- with open(save_path, "wb") as f:
434
- np.savez(
435
- f,
436
- labels=labels,
437
- **features,
438
- )
439
- print(f"Saved preprocessed {split} split under {save_path}")
440
-
441
- print("Preprocessing finished successfully")
442
-
443
-
444
- if __name__ == "__main__":
445
- args = parser.parse_args()
446
-
447
- if not os.path.exists(args.save_folder):
448
- os.makedirs(args.save_folder)
449
-
450
- args.path_ecdfs = os.path.join(args.save_folder, args.path_ecdfs)
451
- args.path_feat_selec = os.path.join(args.save_folder, args.path_feat_selec)
452
- args.tox_smarts_filepath = os.path.join(args.data_folder, args.tox_smarts_filepath)
453
-
454
- if not os.path.exists(os.path.dirname(args.path_ecdfs)):
455
- os.makedirs(os.path.dirname(args.path_ecdfs))
456
-
457
- main(args)
 
6
  SMILES and target names as keys.
7
  """
8
 
 
 
9
  import json
10
 
11
  import numpy as np
12
  import pandas as pd
13
 
14
+ from datasets import load_dataset
15
  from sklearn.feature_selection import VarianceThreshold
16
  from statsmodels.distributions.empirical_distribution import ECDF
 
17
 
18
  from rdkit import Chem, DataStructs
19
  from rdkit.Chem import Descriptors, rdFingerprintGenerator, MACCSkeys
20
  from rdkit.Chem.rdchem import Mol
21
 
22
+ from .utils import (
 
 
23
  USED_200_DESCR,
24
+ TOX_SMARTS_PATH,
25
  Standardizer,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  )
27
 
28
 
 
54
  return mols, np.array(clean_mol_mask)
55
 
56
 
57
+ def create_ecfp_fps(mols: list[Mol], radius=3, fpsize=2048, **kwargs) -> np.ndarray:
58
  """This function ECFP fingerprints for a list of molecules.
59
 
60
  Args:
 
65
  """
66
  ecfps = list()
67
 
 
 
 
 
 
68
  for mol in mols:
69
+ gen = rdFingerprintGenerator.GetMorganGenerator(
70
+ countSimulation=True, fpSize=fpsize, radius=radius
71
+ )
72
  fp_sparse_vec = gen.GetCountFingerprint(mol)
73
 
74
  fp = np.zeros((0,), np.int8)
 
206
  feature_selection=None,
207
  return_ecdfs=False,
208
  return_feature_selection=False,
209
+ **kwargs,
210
  ):
211
  # Create cleaned rdkit mol objects
212
  mols, clean_mol_mask = create_cleaned_mol_objects(smiles)
213
  print("Cleaned molecules")
214
 
215
+ tox_patterns = get_tox_patterns(TOX_SMARTS_PATH)
216
 
217
  # Create fingerprints and descriptors
218
+ ecfps = create_ecfp_fps(mols, **kwargs)
219
  # expand using mol_mask
220
  ecfps = fill(ecfps, ~clean_mol_mask)
221
  print("Created ECFP fingerprints")
 
227
  # Create and save feature selection for ecfps and tox
228
  if feature_selection is None:
229
  print("Create Feature selection")
230
+ ecfps_selec = get_feature_selection(ecfps, **kwargs)
231
+ tox_selec = get_feature_selection(tox, **kwargs)
232
  feature_selection = {"ecfps_selec": ecfps_selec, "tox_selec": tox_selec}
233
 
234
  else:
 
275
 
276
 
277
  def get_feature_selection(
278
+ raw_features: np.ndarray, min_var=0.01, max_corr=0.95, **kwargs
279
  ) -> np.ndarray:
280
  # select features with at least min_var variation
281
  var_thresh = VarianceThreshold(threshold=min_var)
 
296
  return feature_selection
297
 
298
 
299
def get_tox21_split(token, cvfold=None):
    """Load the Tox21 dataset from the Hugging Face Hub and return train/validation frames.

    Args:
        token: Access token passed to ``load_dataset``.
        cvfold: If ``None``, the dataset's own train/validation split is
            returned unchanged. Otherwise both splits are concatenated and
            re-split so that rows whose ``CVfold`` equals ``cvfold`` form the
            validation set and all other rows form the training set.

    Returns:
        dict with the keys ``"train"`` and ``"validation"``, each holding a
        pandas DataFrame. When re-splitting, both frames get a fresh index.
    """
    ds = load_dataset("tschouis/tox21", token=token)

    train_df = ds["train"].to_pandas()
    val_df = ds["validation"].to_pandas()

    if cvfold is None:
        return {"train": train_df, "validation": val_df}

    combined_df = pd.concat([train_df, val_df], ignore_index=True)

    # create new splits
    # The fold id is cast to float before the comparison (presumably the
    # CVfold column is stored as float; confirm against the dataset).
    cvfold = float(cvfold)
    in_val_fold = combined_df["CVfold"] == cvfold
    train_df = combined_df[~in_val_fold]
    val_df = combined_df[in_val_fold]

    # exclude train mols that occur in the validation split
    val_inchikeys = set(val_df["inchikey"])
    train_df = train_df[~train_df["inchikey"].isin(val_inchikeys)]

    return {
        "train": train_df.reset_index(drop=True),
        "validation": val_df.reset_index(drop=True),
    }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/utils.py CHANGED
@@ -28,6 +28,8 @@ TASKS = [
28
  "SR-p53",
29
  ]
30
 
 
 
31
  KNOWN_DESCR = ["ecfps", "rdkit_descr_quantiles", "maccs", "tox"]
32
 
33
  USED_200_DESCR = [
@@ -441,3 +443,11 @@ def load_pickle(path: str):
441
  def write_pickle(path: str, obj: object):
442
  with open(path, "wb") as file:
443
  pickle.dump(obj, file)
 
 
 
 
 
 
 
 
 
28
  "SR-p53",
29
  ]
30
 
31
+ TOX_SMARTS_PATH = "data/tox_smarts.json"
32
+
33
  KNOWN_DESCR = ["ecfps", "rdkit_descr_quantiles", "maccs", "tox"]
34
 
35
  USED_200_DESCR = [
 
443
  def write_pickle(path: str, obj: object):
444
  with open(path, "wb") as file:
445
  pickle.dump(obj, file)
446
+
447
+
448
def create_dir(path, is_file=False):
    """Create a directory, or the parent directories of a file path.

    Args:
        path: Directory to create, or a file path when ``is_file`` is True.
        is_file: If True, ``path`` is treated as a file and only its parent
            directories are created; the file itself is never created.

    Nothing happens when the target already exists, or when ``path`` is a
    bare file name whose parent is the current directory.
    """
    to_create = os.path.dirname(path) if is_file else path

    # A bare file name has an empty dirname; os.makedirs("") would raise
    # FileNotFoundError, and the current directory exists already.
    if not to_create:
        return

    if not os.path.exists(to_create):
        # exist_ok guards against another process creating the directory
        # between the existence check and this call.
        os.makedirs(to_create, exist_ok=True)