yzhouchen001 committed on
Commit
0b51da1
·
1 Parent(s): e0cc56a

magma loader

Browse files
mvp/data/datasets.py CHANGED
@@ -154,15 +154,13 @@ class MassSpecDataset_PeakFormulas(JESTR1_MassSpecDataset):
154
  print("Data path: ", self.pth)
155
  self.metadata = pd.read_csv(self.pth, sep="\t")
156
 
157
- # Used for training on consensus spectra
158
- # with open(self.pth, 'rb') as f:
159
- # self.metadata = pickle.load(f)
160
- # self.metadata['identifier'] = self.metadata['smiles'].tolist()
161
-
162
  # load subformulas
163
  all_spec_ids = self.metadata['identifier'].tolist()
164
  subformulaLoader = data_utils.Subformula_Loader(spectra_view=spectra_view, dir_path=subformula_dir_pth)
165
- id_to_spec = subformulaLoader(all_spec_ids)
 
 
 
166
 
167
  # create subformula spectra if no subformula is available
168
  tmp_ids = [spec_id for spec_id in all_spec_ids if spec_id not in id_to_spec]
 
154
  print("Data path: ", self.pth)
155
  self.metadata = pd.read_csv(self.pth, sep="\t")
156
 
 
 
 
 
 
157
  # load subformulas
158
  all_spec_ids = self.metadata['identifier'].tolist()
159
  subformulaLoader = data_utils.Subformula_Loader(spectra_view=spectra_view, dir_path=subformula_dir_pth)
160
+
161
+ form_list = self.metadata['formula'].tolist()
162
+ prec_mz_list = self.metadata['precursor_mz'].tolist()
163
+ id_to_spec = subformulaLoader(all_spec_ids, form_list, prec_mz_list)
164
 
165
  # create subformula spectra if no subformula is available
166
  tmp_ids = [spec_id for spec_id in all_spec_ids if spec_id not in id_to_spec]
mvp/models/contrastive.py CHANGED
@@ -270,6 +270,7 @@ class ContrastiveModel(RetrievalMassSpecGymModel):
270
  def get_checkpoint_monitors(self) -> T.List[dict]:
271
  monitors = [
272
  {"monitor": f"{Stage.TRAIN.to_pref()}loss", "mode": "min", "early_stopping": False}, # monitor train loss
 
273
  ]
274
  return monitors
275
 
 
270
  def get_checkpoint_monitors(self) -> T.List[dict]:
271
  monitors = [
272
  {"monitor": f"{Stage.TRAIN.to_pref()}loss", "mode": "min", "early_stopping": False}, # monitor train loss
273
+ {"monitor": f"{Stage.VAL.to_pref()}loss", "mode": "min", "early_stopping": False}, # monitor val loss
274
  ]
275
  return monitors
276
 
mvp/params_formSpec.yaml CHANGED
@@ -1,13 +1,13 @@
1
  # Experiment setup
2
  job_key: ''
3
- run_name: 'filip_quick_test'
4
  run_details: ""
5
  project_name: ''
6
  wandb_entity_name: 'mass-spec-ml'
7
  no_wandb: True
8
  seed: 0
9
  debug: False
10
- checkpoint_pth: #'../pretrained_models/msgym_formSpec.ckpt'
11
 
12
  # Training setup
13
  max_epochs: 2000
@@ -19,10 +19,10 @@ val_check_interval: 1.0
19
  # Data paths
20
  candidates_pth: /r/hassounlab/spectra_data/msgym/molecules/MassSpecGym_retrieval_candidates_mass.json # "../data/MassSpecGym/data/molecules/MassSpecGym_retrieval_candidates_formula.json"
21
  dataset_pth: /r/hassounlab/spectra_data/msgym/MassSpecGym.tsv #/data/yzhouc01/spectra_data/combined_msgym_nist23_multiplex.tsv # /r/hassounlab/spectra_data/msgym/MassSpecGym.tsv # "../data/MassSpecGym/data/sample_data.tsv"
22
- subformula_dir_pth: /data/yzhouc01/MVP/data/MassSpecGym/data/subformulae_default #/data/yzhouc01/spectra_data/subformulae #"../data/MassSpecGym/data/subformulae_default"
23
  split_pth:
24
- fp_dir_pth: '../data/MassSpecGym/data/morganfp_r5_1024.pickle'
25
- cons_spec_dir_pth: "../data/MassSpecGym/data/sample_consensus_formSpec.pkl"
26
  NL_spec_dir_pth: ""
27
  partial_checkpoint: ""
28
 
 
1
  # Experiment setup
2
  job_key: ''
3
+ run_name: 'sirius_labels'
4
  run_details: ""
5
  project_name: ''
6
  wandb_entity_name: 'mass-spec-ml'
7
  no_wandb: True
8
  seed: 0
9
  debug: False
10
+ checkpoint_pth:
11
 
12
  # Training setup
13
  max_epochs: 2000
 
19
  # Data paths
20
  candidates_pth: /r/hassounlab/spectra_data/msgym/molecules/MassSpecGym_retrieval_candidates_mass.json # "../data/MassSpecGym/data/molecules/MassSpecGym_retrieval_candidates_formula.json"
21
  dataset_pth: /r/hassounlab/spectra_data/msgym/MassSpecGym.tsv #/data/yzhouc01/spectra_data/combined_msgym_nist23_multiplex.tsv # /r/hassounlab/spectra_data/msgym/MassSpecGym.tsv # "../data/MassSpecGym/data/sample_data.tsv"
22
+ subformula_dir_pth: /r/hassounlab/msgym_sirius # /r/hassounlab/msgym_sirius # /data/yzhouc01/MVP/data/MassSpecGym/data/subformulae_default #/data/yzhouc01/spectra_data/subformulae #"../data/MassSpecGym/data/subformulae_default"
23
  split_pth:
24
+ fp_dir_pth:
25
+ cons_spec_dir_pth:
26
  NL_spec_dir_pth: ""
27
  partial_checkpoint: ""
28
 
mvp/params_tmp.yaml ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Experiment setup
2
+ job_key: ''
3
+ run_name: 'filipContrastive'
4
+ run_details: ""
5
+ project_name: ''
6
+ wandb_entity_name: 'mass-spec-ml'
7
+ no_wandb: True
8
+ seed: 0
9
+ debug: False
10
+ checkpoint_pth:
11
+
12
+ # Training setup
13
+ max_epochs: 2000
14
+ accelerator: 'gpu'
15
+ devices: [1]
16
+ log_every_n_steps: 250
17
+ val_check_interval: 1.0
18
+
19
+ # Data paths
20
+ candidates_pth: /data/yzhouc01/cancer/candidates.json # "../data/MassSpecGym/data/molecules/MassSpecGym_retrieval_candidates_formula.json"
21
+ dataset_pth: /data/yzhouc01/cancer/data.tsv #/data/yzhouc01/spectra_data/combined_msgym_nist23_multiplex.tsv # /r/hassounlab/spectra_data/msgym/MassSpecGym.tsv # "../data/MassSpecGym/data/sample_data.tsv"
22
+ subformula_dir_pth: /data/yzhouc01/cancer/subformulae # /r/hassounlab/msgym_sirius # /data/yzhouc01/MVP/data/MassSpecGym/data/subformulae_default #/data/yzhouc01/spectra_data/subformulae #"../data/MassSpecGym/data/subformulae_default"
23
+ split_pth:
24
+ fp_dir_pth:
25
+ cons_spec_dir_pth:
26
+ NL_spec_dir_pth: ""
27
+ partial_checkpoint: ""
28
+
29
+ # General hyperparameters
30
+ batch_size: 64
31
+ lr: 5.0e-05
32
+ weight_decay: 0
33
+ contr_temp: 0.05
34
+ early_stopping_patience: 300
35
+ loss_strategy: 'static'
36
+ num_workers: 50
37
+
38
+
39
+ ############################## Data transforms ##############################
40
+ # - Spectra
41
+ spectra_view: SpecFormula #SpecMzIntTokens #SpecFormula
42
+ # 1. Binner
43
+ max_mz: 1000
44
+ bin_width: 1
45
+ mask_peak_ratio: 0.00
46
+
47
+ # 2. SpecFormula
48
+ element_list: ['H', 'C', 'O', 'N', 'P', 'S', 'Cl', 'F', 'Br', 'I', 'B', 'As', 'Si', 'Se']
49
+ add_intensities: True
50
+ mask_precursor: False
51
+
52
+ # - Molecule
53
+ molecule_view: "MolGraph"
54
+ atom_feature: 'full'
55
+ bond_feature: 'full'
56
+
57
+
58
+ ############################## Views ##############################
59
+ # contrastive
60
+ use_contr: False
61
+ contr_wt: 1
62
+ contr_wt_update: {}
63
+
64
+ # consensus spectra
65
+ use_cons_spec: False
66
+ cons_spec_wt: 3
67
+ cons_spec_wt_update: {}
68
+ cons_loss_type: 'l2' # cosine, l2
69
+
70
+ # fp prediction/usage
71
+ pred_fp: False
72
+ use_fp: False
73
+ fp_loss_type: 'cosine' #cosine, bce
74
+ fp_wt: 3
75
+ fp_wt_update: {}
76
+ fp_size: 1024
77
+ fp_radius: 5
78
+ fp_dropout: 0.4
79
+
80
+ # candidates
81
+ aug_cands: False
82
+ aug_cands_wt: 0.1
83
+ aug_cands_update: {}
84
+ aug_cands_size: 3
85
+
86
+ # neutral loss
87
+ use_NL: False
88
+
89
+
90
+ ############################## Task and model ##############################
91
+ task: 'retrieval'
92
+ spec_enc: Transformer_Formula # Transformer_MzInt #Transformer_Formula
93
+ mol_enc: "GNN"
94
+ model: filipContrastive # "MultiviewContrastive"
95
+ contr_views: [['spec_enc', 'mol_enc']] #[['spec_enc', 'mol_enc'], ['spec_enc', 'NL_spec_enc'], ['mol_enc', 'NL_spec_enc']] #[['spec_enc', 'mol_enc'], ['mol_enc', 'cons_spec_enc'], ['cons_spec_enc', 'spec_enc'], ['fp_enc', 'mol_enc'], ['fp_enc', 'spec_enc'], ['fp_enc', 'cons_spec_enc']]
96
+ log_only_loss_at_stages: []
97
+ df_test_path: ""
98
+
99
+ # - Spectra encoder
100
+ final_embedding_dim: 512
101
+ fc_dropout: 0.4
102
+
103
+ # - Spectra Token encoder
104
+ hidden_dims: [64, 128]
105
+ peak_dropout: 0.2
106
+
107
+ # - Formula-based spec encoders
108
+ formula_dropout: 0.2
109
+ formula_dims: [64, 128, 256]
110
+ cross_attn_heads: 2
111
+ use_cls: False
112
+
113
+ # -- GAT params
114
+ attn_heads: [12,12,12]
115
+
116
+ # - Molecule encoder (GNN)
117
+ gnn_channels: [64,128,256]
118
+ gnn_type: "gcn"
119
+ num_gnn_layers: 3
120
+ gnn_hidden_dim: 512
121
+ gnn_dropout: 0.3
mvp/run.sh CHANGED
@@ -1,12 +1,3 @@
1
- # 1. preprocess data (subformula labels should be obtained through MIST)
2
- # python data_preprocess.py --spec_type formSpec --dataset_pth ../data/sample/data.tsv --candidates_pth ../data/sample/candidates_mass.json --subformula_dir_pth ../data/sample/subformulae_default/ --output_dir ../data/sample/
3
-
4
- # 2. train model on msgym
5
- # python train.py --param_pth params_formSpec.yaml
6
-
7
- # 3. test model on msgym
8
- # python train.py --param_pth params_binnedSpec.yaml
9
-
10
- # python train.py
11
- python test.py
12
  python test.py --candidates_pth /r/hassounlab/spectra_data/msgym/molecules/MassSpecGym_retrieval_candidates_formula.json
 
1
+ python train.py
2
+ python test.py
 
 
 
 
 
 
 
 
 
3
  python test.py --candidates_pth /r/hassounlab/spectra_data/msgym/molecules/MassSpecGym_retrieval_candidates_formula.json
mvp/test.py CHANGED
@@ -2,7 +2,7 @@ import argparse
2
  import datetime
3
  import sys
4
  sys.path.insert(0, "/data/yzhouc01/MassSpecGym")
5
- sys.path.insert(0, "/data/yzhouc01/MVP")
6
 
7
  from rdkit import RDLogger
8
  import pytorch_lightning as pl
 
2
  import datetime
3
  import sys
4
  sys.path.insert(0, "/data/yzhouc01/MassSpecGym")
5
+ sys.path.insert(0, "/data/yzhouc01/FILIP-MS")
6
 
7
  from rdkit import RDLogger
8
  import pytorch_lightning as pl
mvp/train.py CHANGED
@@ -33,7 +33,7 @@ def main(params):
33
 
34
  # Init paths to data files
35
  if params['debug']:
36
- params['dataset_pth'] = "../data/sample/data.tsv"
37
  params['candidates_pth'] =None
38
  params['split_pth']=None
39
 
@@ -80,7 +80,7 @@ def main(params):
80
  filename=f'{{epoch}}-{{{monitor_name}:.2f}}',
81
  # filename='{epoch}-{val_loss:.2f}-{train_loss:.2f}',
82
  auto_insert_metric_name=True,
83
- save_last=(i == 0)
84
  )
85
  callbacks.append(checkpoint)
86
  if monitor.get('early_stopping', False):
 
33
 
34
  # Init paths to data files
35
  if params['debug']:
36
+ params['dataset_pth'] = "/data/yzhouc01/MVP/data/sample/data.tsv"
37
  params['candidates_pth'] =None
38
  params['split_pth']=None
39
 
 
80
  filename=f'{{epoch}}-{{{monitor_name}:.2f}}',
81
  # filename='{epoch}-{val_loss:.2f}-{train_loss:.2f}',
82
  auto_insert_metric_name=True,
83
+ # save_last=(i == 0)
84
  )
85
  callbacks.append(checkpoint)
86
  if monitor.get('early_stopping', False):
mvp/utils/data.py CHANGED
@@ -9,11 +9,18 @@ import mvp.data.datasets as jestr_datasets
9
  import typing as T
10
  from mvp.definitions import MSGYM_FORMULA_VECTOR_NORM, MSGYM_STANDARD_MH
11
  import matchms
 
12
 
13
  class Subformula_Loader:
14
- def __init__(self, spectra_view, dir_path) -> None:
 
 
 
 
15
 
16
  self.dir_path = dir_path
 
 
17
  if spectra_view == 'SpecFormula':
18
  self.load = self.load_subformula_data
19
  elif spectra_view == "SpecFormulaMz":
@@ -21,37 +28,90 @@ class Subformula_Loader:
21
  else:
22
  raise Exception("Spectra view is not supported.")
23
 
24
- def __call__(self, ids):
25
  id_to_form_spec = {}
26
- for id in ids:
27
- data = self.load(id)
28
- if data:
29
  id_to_form_spec[id] = data
30
-
31
  return id_to_form_spec
32
-
33
- def load_subformula_data(self, spec_id: str):
34
- '''MIST subformula format:https://github.com/samgoldman97/mist/blob/main_v2/src/mist/utils/spectra_utils.py
35
  '''
36
  try:
37
- file = os.path.join(self.dir_path, spec_id+".json")
38
- with open(file) as f:
39
- data = json.load(f)
40
  mzs = np.array(data['output_tbl']['mz'])
41
  formulas = np.array(data['output_tbl']['formula'])
42
  intensities = np.array(data['output_tbl']['ms2_inten'])
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  # sort by mzs
45
  ind = mzs.argsort()
46
  mzs = mzs[ind]
47
  formulas = formulas[ind]
48
  intensities = intensities[ind]
49
  return {'formulas': formulas, 'formula_mzs': mzs, 'formula_intensities': intensities}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  except:
51
  return None
52
 
53
  def load_subformula_dict(self, spec_id: str):
54
- '''MIST subformula format:https://github.com/samgoldman97/mist/blob/main_v2/src/mist/utils/spectra_utils.py
55
  '''
56
  try:
57
  file = os.path.join(self.dir_path, spec_id+".json")
@@ -64,7 +124,7 @@ class Subformula_Loader:
64
  mz_to_formulas = {mz:f for mz, f in zip(mzs, formulas)}
65
  for mz, f in zip(mzs, formulas):
66
  mz_to_formulas[mz] = f
67
-
68
  ind = mzs.argsort()
69
  mzs = mzs[ind]
70
  formulas = formulas[ind]
 
9
  import typing as T
10
  from mvp.definitions import MSGYM_FORMULA_VECTOR_NORM, MSGYM_STANDARD_MH
11
  import matchms
12
+ import tqdm
13
 
14
  class Subformula_Loader:
15
+ """
16
+ :param dir_path: path to the folder containing the per-spectrum subformula JSON files (MIST or SIRIUS format); the parser is selected by formula_source
17
+ :param use_prec_mz: if True, add the precursor m/z as a peak when no fragment carries the precursor formula; if False, remove the precursor peak when there is no fragment precursor peak
18
+ """
19
+ def __init__(self, spectra_view, dir_path, use_prec_mz=True, formula_source='default') -> None:
20
 
21
  self.dir_path = dir_path
22
+ self.use_prec_mz = use_prec_mz
23
+ self.formula_source = formula_source
24
  if spectra_view == 'SpecFormula':
25
  self.load = self.load_subformula_data
26
  elif spectra_view == "SpecFormulaMz":
 
28
  else:
29
  raise Exception("Spectra view is not supported.")
30
 
31
+ def __call__(self, ids, form_list, prec_mz_list):
32
  id_to_form_spec = {}
33
+ for id, curr_form, curr_prec_mz in tqdm.tqdm(zip(ids, form_list, prec_mz_list), total=len(ids)):
34
+ data = self.load(id, curr_form, curr_prec_mz)
35
+ if data is not None:
36
  id_to_form_spec[id] = data
37
+
38
  return id_to_form_spec
39
+
40
+ def load_mist_data(self, data, curr_form, curr_prec_mz):
41
+ '''MIST subformula format:https://github.com/samgoldman97/mist/blob/main_v2/src/mist/utils/spectra_utils.py
42
  '''
43
  try:
44
+ # file = os.path.join(self.dir_path, spec_id+".json")
45
+ # with open(file) as f:
46
+ # data = json.load(f)
47
  mzs = np.array(data['output_tbl']['mz'])
48
  formulas = np.array(data['output_tbl']['formula'])
49
  intensities = np.array(data['output_tbl']['ms2_inten'])
50
 
51
+ if curr_form not in formulas and self.use_prec_mz:
52
+ mzs = np.concatenate([mzs, [curr_prec_mz]])
53
+ formulas = np.concatenate([formulas, [curr_form]])
54
+ intensities = np.concatenate([intensities, [1.1]])
55
+ elif curr_form in formulas and self.use_prec_mz:
56
+ idx = np.where(formulas == curr_form)[0][0]
57
+ intensities[idx] = 1.1
58
+
59
+ # sort by mzs
60
+ ind = mzs.argsort()
61
+ mzs = mzs[ind]
62
+ formulas = formulas[ind]
63
+ intensities = intensities[ind]
64
+ return {'formulas': formulas, 'formula_mzs': mzs, 'formula_intensities': intensities}
65
+ except:
66
+ return None
67
+
68
+ def load_magma_data(self, data, curr_form, curr_prec_mz):
69
+
70
+ return None
71
+
72
+ def load_sirius_data(self, data):
73
+ try:
74
+
75
+ mzs = np.array([entry['mz'] for entry in data['fragments']])
76
+ formulas = np.array([entry['molecularFormula'] for entry in data['fragments']])
77
+ intensities = np.array([entry['relativeIntensity'] for entry in data['fragments'] ])
78
+
79
+ intensities[formulas == data['molecularFormula']] = 1.1
80
+
81
+ if not self.use_prec_mz:
82
+ not_append_prec_mz = np.array([len(entry['peaks']) != 0 for entry in data['fragments']])
83
+
84
+ mzs = mzs[not_append_prec_mz]
85
+ formulas = formulas[not_append_prec_mz]
86
+ intensities = intensities[not_append_prec_mz]
87
+
88
  # sort by mzs
89
  ind = mzs.argsort()
90
  mzs = mzs[ind]
91
  formulas = formulas[ind]
92
  intensities = intensities[ind]
93
  return {'formulas': formulas, 'formula_mzs': mzs, 'formula_intensities': intensities}
94
+ except:
95
+
96
+ return None
97
+
98
+ def load_subformula_data(self, spec_id: str, curr_form: str, curr_prec_mz: float):
99
+ try:
100
+ file = os.path.join(self.dir_path, spec_id+".json")
101
+ with open(file) as f:
102
+ data = json.load(f)
103
+ if self.formula_source == 'sirius':
104
+ return self.load_sirius_data(data)
105
+ elif self.formula_source == 'magma':
106
+ return self.load_magma_data(data, curr_form, curr_prec_mz)
107
+ else:
108
+ return self.load_mist_data(data, curr_form, curr_prec_mz)
109
+
110
  except:
111
  return None
112
 
113
  def load_subformula_dict(self, spec_id: str):
114
+ '''MIST subformula format:https://github.com/samgoldman97/mist/blob/main_v2/src/mist/utils/spectra_utils.py
115
  '''
116
  try:
117
  file = os.path.join(self.dir_path, spec_id+".json")
 
124
  mz_to_formulas = {mz:f for mz, f in zip(mzs, formulas)}
125
  for mz, f in zip(mzs, formulas):
126
  mz_to_formulas[mz] = f
127
+
128
  ind = mzs.argsort()
129
  mzs = mzs[ind]
130
  formulas = formulas[ind]