Spaces:
Sleeping
Sleeping
Commit
·
0b51da1
1
Parent(s):
e0cc56a
magma loader
Browse files- mvp/data/datasets.py +4 -6
- mvp/models/contrastive.py +1 -0
- mvp/params_formSpec.yaml +5 -5
- mvp/params_tmp.yaml +121 -0
- mvp/run.sh +2 -11
- mvp/test.py +1 -1
- mvp/train.py +2 -2
- mvp/utils/data.py +74 -14
mvp/data/datasets.py
CHANGED
|
@@ -154,15 +154,13 @@ class MassSpecDataset_PeakFormulas(JESTR1_MassSpecDataset):
|
|
| 154 |
print("Data path: ", self.pth)
|
| 155 |
self.metadata = pd.read_csv(self.pth, sep="\t")
|
| 156 |
|
| 157 |
-
# Used for training on consensus spectra
|
| 158 |
-
# with open(self.pth, 'rb') as f:
|
| 159 |
-
# self.metadata = pickle.load(f)
|
| 160 |
-
# self.metadata['identifier'] = self.metadata['smiles'].tolist()
|
| 161 |
-
|
| 162 |
# load subformulas
|
| 163 |
all_spec_ids = self.metadata['identifier'].tolist()
|
| 164 |
subformulaLoader = data_utils.Subformula_Loader(spectra_view=spectra_view, dir_path=subformula_dir_pth)
|
| 165 |
-
|
|
|
|
|
|
|
|
|
|
| 166 |
|
| 167 |
# create subformula spectra if no subformula is available
|
| 168 |
tmp_ids = [spec_id for spec_id in all_spec_ids if spec_id not in id_to_spec]
|
|
|
|
| 154 |
print("Data path: ", self.pth)
|
| 155 |
self.metadata = pd.read_csv(self.pth, sep="\t")
|
| 156 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
# load subformulas
|
| 158 |
all_spec_ids = self.metadata['identifier'].tolist()
|
| 159 |
subformulaLoader = data_utils.Subformula_Loader(spectra_view=spectra_view, dir_path=subformula_dir_pth)
|
| 160 |
+
|
| 161 |
+
form_list = self.metadata['formula'].tolist()
|
| 162 |
+
prec_mz_list = self.metadata['precursor_mz'].tolist()
|
| 163 |
+
id_to_spec = subformulaLoader(all_spec_ids, form_list, prec_mz_list)
|
| 164 |
|
| 165 |
# create subformula spectra if no subformula is available
|
| 166 |
tmp_ids = [spec_id for spec_id in all_spec_ids if spec_id not in id_to_spec]
|
mvp/models/contrastive.py
CHANGED
|
@@ -270,6 +270,7 @@ class ContrastiveModel(RetrievalMassSpecGymModel):
|
|
| 270 |
def get_checkpoint_monitors(self) -> T.List[dict]:
|
| 271 |
monitors = [
|
| 272 |
{"monitor": f"{Stage.TRAIN.to_pref()}loss", "mode": "min", "early_stopping": False}, # monitor train loss
|
|
|
|
| 273 |
]
|
| 274 |
return monitors
|
| 275 |
|
|
|
|
| 270 |
def get_checkpoint_monitors(self) -> T.List[dict]:
|
| 271 |
monitors = [
|
| 272 |
{"monitor": f"{Stage.TRAIN.to_pref()}loss", "mode": "min", "early_stopping": False}, # monitor train loss
|
| 273 |
+
{"monitor": f"{Stage.VAL.to_pref()}loss", "mode": "min", "early_stopping": False}, # monitor val loss
|
| 274 |
]
|
| 275 |
return monitors
|
| 276 |
|
mvp/params_formSpec.yaml
CHANGED
|
@@ -1,13 +1,13 @@
|
|
| 1 |
# Experiment setup
|
| 2 |
job_key: ''
|
| 3 |
-
run_name: '
|
| 4 |
run_details: ""
|
| 5 |
project_name: ''
|
| 6 |
wandb_entity_name: 'mass-spec-ml'
|
| 7 |
no_wandb: True
|
| 8 |
seed: 0
|
| 9 |
debug: False
|
| 10 |
-
checkpoint_pth:
|
| 11 |
|
| 12 |
# Training setup
|
| 13 |
max_epochs: 2000
|
|
@@ -19,10 +19,10 @@ val_check_interval: 1.0
|
|
| 19 |
# Data paths
|
| 20 |
candidates_pth: /r/hassounlab/spectra_data/msgym/molecules/MassSpecGym_retrieval_candidates_mass.json # "../data/MassSpecGym/data/molecules/MassSpecGym_retrieval_candidates_formula.json"
|
| 21 |
dataset_pth: /r/hassounlab/spectra_data/msgym/MassSpecGym.tsv #/data/yzhouc01/spectra_data/combined_msgym_nist23_multiplex.tsv # /r/hassounlab/spectra_data/msgym/MassSpecGym.tsv # "../data/MassSpecGym/data/sample_data.tsv"
|
| 22 |
-
subformula_dir_pth: /data/yzhouc01/MVP/data/MassSpecGym/data/subformulae_default #/data/yzhouc01/spectra_data/subformulae #"../data/MassSpecGym/data/subformulae_default"
|
| 23 |
split_pth:
|
| 24 |
-
fp_dir_pth:
|
| 25 |
-
cons_spec_dir_pth:
|
| 26 |
NL_spec_dir_pth: ""
|
| 27 |
partial_checkpoint: ""
|
| 28 |
|
|
|
|
| 1 |
# Experiment setup
|
| 2 |
job_key: ''
|
| 3 |
+
run_name: 'sirius_labels'
|
| 4 |
run_details: ""
|
| 5 |
project_name: ''
|
| 6 |
wandb_entity_name: 'mass-spec-ml'
|
| 7 |
no_wandb: True
|
| 8 |
seed: 0
|
| 9 |
debug: False
|
| 10 |
+
checkpoint_pth:
|
| 11 |
|
| 12 |
# Training setup
|
| 13 |
max_epochs: 2000
|
|
|
|
| 19 |
# Data paths
|
| 20 |
candidates_pth: /r/hassounlab/spectra_data/msgym/molecules/MassSpecGym_retrieval_candidates_mass.json # "../data/MassSpecGym/data/molecules/MassSpecGym_retrieval_candidates_formula.json"
|
| 21 |
dataset_pth: /r/hassounlab/spectra_data/msgym/MassSpecGym.tsv #/data/yzhouc01/spectra_data/combined_msgym_nist23_multiplex.tsv # /r/hassounlab/spectra_data/msgym/MassSpecGym.tsv # "../data/MassSpecGym/data/sample_data.tsv"
|
| 22 |
+
subformula_dir_pth: /r/hassounlab/msgym_sirius # /r/hassounlab/msgym_sirius # /data/yzhouc01/MVP/data/MassSpecGym/data/subformulae_default #/data/yzhouc01/spectra_data/subformulae #"../data/MassSpecGym/data/subformulae_default"
|
| 23 |
split_pth:
|
| 24 |
+
fp_dir_pth:
|
| 25 |
+
cons_spec_dir_pth:
|
| 26 |
NL_spec_dir_pth: ""
|
| 27 |
partial_checkpoint: ""
|
| 28 |
|
mvp/params_tmp.yaml
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Experiment setup
|
| 2 |
+
job_key: ''
|
| 3 |
+
run_name: 'filipContrastive'
|
| 4 |
+
run_details: ""
|
| 5 |
+
project_name: ''
|
| 6 |
+
wandb_entity_name: 'mass-spec-ml'
|
| 7 |
+
no_wandb: True
|
| 8 |
+
seed: 0
|
| 9 |
+
debug: False
|
| 10 |
+
checkpoint_pth:
|
| 11 |
+
|
| 12 |
+
# Training setup
|
| 13 |
+
max_epochs: 2000
|
| 14 |
+
accelerator: 'gpu'
|
| 15 |
+
devices: [1]
|
| 16 |
+
log_every_n_steps: 250
|
| 17 |
+
val_check_interval: 1.0
|
| 18 |
+
|
| 19 |
+
# Data paths
|
| 20 |
+
candidates_pth: /data/yzhouc01/cancer/candidates.json # "../data/MassSpecGym/data/molecules/MassSpecGym_retrieval_candidates_formula.json"
|
| 21 |
+
dataset_pth: /data/yzhouc01/cancer/data.tsv #/data/yzhouc01/spectra_data/combined_msgym_nist23_multiplex.tsv # /r/hassounlab/spectra_data/msgym/MassSpecGym.tsv # "../data/MassSpecGym/data/sample_data.tsv"
|
| 22 |
+
subformula_dir_pth: /data/yzhouc01/cancer/subformulae # /r/hassounlab/msgym_sirius # /data/yzhouc01/MVP/data/MassSpecGym/data/subformulae_default #/data/yzhouc01/spectra_data/subformulae #"../data/MassSpecGym/data/subformulae_default"
|
| 23 |
+
split_pth:
|
| 24 |
+
fp_dir_pth:
|
| 25 |
+
cons_spec_dir_pth:
|
| 26 |
+
NL_spec_dir_pth: ""
|
| 27 |
+
partial_checkpoint: ""
|
| 28 |
+
|
| 29 |
+
# General hyperparameters
|
| 30 |
+
batch_size: 64
|
| 31 |
+
lr: 5.0e-05
|
| 32 |
+
weight_decay: 0
|
| 33 |
+
contr_temp: 0.05
|
| 34 |
+
early_stopping_patience: 300
|
| 35 |
+
loss_strategy: 'static'
|
| 36 |
+
num_workers: 50
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
############################## Data transforms ##############################
|
| 40 |
+
# - Spectra
|
| 41 |
+
spectra_view: SpecFormula #SpecMzIntTokens #SpecFormula
|
| 42 |
+
# 1. Binner
|
| 43 |
+
max_mz: 1000
|
| 44 |
+
bin_width: 1
|
| 45 |
+
mask_peak_ratio: 0.00
|
| 46 |
+
|
| 47 |
+
# 2. SpecFormula
|
| 48 |
+
element_list: ['H', 'C', 'O', 'N', 'P', 'S', 'Cl', 'F', 'Br', 'I', 'B', 'As', 'Si', 'Se']
|
| 49 |
+
add_intensities: True
|
| 50 |
+
mask_precursor: False
|
| 51 |
+
|
| 52 |
+
# - Molecule
|
| 53 |
+
molecule_view: "MolGraph"
|
| 54 |
+
atom_feature: 'full'
|
| 55 |
+
bond_feature: 'full'
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
############################## Views ##############################
|
| 59 |
+
# contrastive
|
| 60 |
+
use_contr: False
|
| 61 |
+
contr_wt: 1
|
| 62 |
+
contr_wt_update: {}
|
| 63 |
+
|
| 64 |
+
# consensus spectra
|
| 65 |
+
use_cons_spec: False
|
| 66 |
+
cons_spec_wt: 3
|
| 67 |
+
cons_spec_wt_update: {}
|
| 68 |
+
cons_loss_type: 'l2' # cosine, l2
|
| 69 |
+
|
| 70 |
+
# fp prediction/usage
|
| 71 |
+
pred_fp: False
|
| 72 |
+
use_fp: False
|
| 73 |
+
fp_loss_type: 'cosine' #cosine, bce
|
| 74 |
+
fp_wt: 3
|
| 75 |
+
fp_wt_update: {}
|
| 76 |
+
fp_size: 1024
|
| 77 |
+
fp_radius: 5
|
| 78 |
+
fp_dropout: 0.4
|
| 79 |
+
|
| 80 |
+
# candidates
|
| 81 |
+
aug_cands: False
|
| 82 |
+
aug_cands_wt: 0.1
|
| 83 |
+
aug_cands_update: {}
|
| 84 |
+
aug_cands_size: 3
|
| 85 |
+
|
| 86 |
+
# neutral loss
|
| 87 |
+
use_NL: False
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
############################## Task and model ##############################
|
| 91 |
+
task: 'retrieval'
|
| 92 |
+
spec_enc: Transformer_Formula # Transformer_MzInt #Transformer_Formula
|
| 93 |
+
mol_enc: "GNN"
|
| 94 |
+
model: filipContrastive # "MultiviewContrastive"
|
| 95 |
+
contr_views: [['spec_enc', 'mol_enc']] #[['spec_enc', 'mol_enc'], ['spec_enc', 'NL_spec_enc'], ['mol_enc', 'NL_spec_enc']] #[['spec_enc', 'mol_enc'], ['mol_enc', 'cons_spec_enc'], ['cons_spec_enc', 'spec_enc'], ['fp_enc', 'mol_enc'], ['fp_enc', 'spec_enc'], ['fp_enc', 'cons_spec_enc']]
|
| 96 |
+
log_only_loss_at_stages: []
|
| 97 |
+
df_test_path: ""
|
| 98 |
+
|
| 99 |
+
# - Spectra encoder
|
| 100 |
+
final_embedding_dim: 512
|
| 101 |
+
fc_dropout: 0.4
|
| 102 |
+
|
| 103 |
+
# - Spectra Token encoder
|
| 104 |
+
hidden_dims: [64, 128]
|
| 105 |
+
peak_dropout: 0.2
|
| 106 |
+
|
| 107 |
+
# - Formula-based spec encoders
|
| 108 |
+
formula_dropout: 0.2
|
| 109 |
+
formula_dims: [64, 128, 256]
|
| 110 |
+
cross_attn_heads: 2
|
| 111 |
+
use_cls: False
|
| 112 |
+
|
| 113 |
+
# -- GAT params
|
| 114 |
+
attn_heads: [12,12,12]
|
| 115 |
+
|
| 116 |
+
# - Molecule encoder (GNN)
|
| 117 |
+
gnn_channels: [64,128,256]
|
| 118 |
+
gnn_type: "gcn"
|
| 119 |
+
num_gnn_layers: 3
|
| 120 |
+
gnn_hidden_dim: 512
|
| 121 |
+
gnn_dropout: 0.3
|
mvp/run.sh
CHANGED
|
@@ -1,12 +1,3 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
# 2. train model on msgym
|
| 5 |
-
# python train.py --param_pth params_formSpec.yaml
|
| 6 |
-
|
| 7 |
-
# 3. test model on msgym
|
| 8 |
-
# python train.py --param_pth params_binnedSpec.yaml
|
| 9 |
-
|
| 10 |
-
# python train.py
|
| 11 |
-
python test.py
|
| 12 |
python test.py --candidates_pth /r/hassounlab/spectra_data/msgym/molecules/MassSpecGym_retrieval_candidates_formula.json
|
|
|
|
| 1 |
+
python train.py
|
| 2 |
+
python test.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
python test.py --candidates_pth /r/hassounlab/spectra_data/msgym/molecules/MassSpecGym_retrieval_candidates_formula.json
|
mvp/test.py
CHANGED
|
@@ -2,7 +2,7 @@ import argparse
|
|
| 2 |
import datetime
|
| 3 |
import sys
|
| 4 |
sys.path.insert(0, "/data/yzhouc01/MassSpecGym")
|
| 5 |
-
sys.path.insert(0, "/data/yzhouc01/
|
| 6 |
|
| 7 |
from rdkit import RDLogger
|
| 8 |
import pytorch_lightning as pl
|
|
|
|
| 2 |
import datetime
|
| 3 |
import sys
|
| 4 |
sys.path.insert(0, "/data/yzhouc01/MassSpecGym")
|
| 5 |
+
sys.path.insert(0, "/data/yzhouc01/FILIP-MS")
|
| 6 |
|
| 7 |
from rdkit import RDLogger
|
| 8 |
import pytorch_lightning as pl
|
mvp/train.py
CHANGED
|
@@ -33,7 +33,7 @@ def main(params):
|
|
| 33 |
|
| 34 |
# Init paths to data files
|
| 35 |
if params['debug']:
|
| 36 |
-
params['dataset_pth'] = "
|
| 37 |
params['candidates_pth'] =None
|
| 38 |
params['split_pth']=None
|
| 39 |
|
|
@@ -80,7 +80,7 @@ def main(params):
|
|
| 80 |
filename=f'{{epoch}}-{{{monitor_name}:.2f}}',
|
| 81 |
# filename='{epoch}-{val_loss:.2f}-{train_loss:.2f}',
|
| 82 |
auto_insert_metric_name=True,
|
| 83 |
-
save_last=(i == 0)
|
| 84 |
)
|
| 85 |
callbacks.append(checkpoint)
|
| 86 |
if monitor.get('early_stopping', False):
|
|
|
|
| 33 |
|
| 34 |
# Init paths to data files
|
| 35 |
if params['debug']:
|
| 36 |
+
params['dataset_pth'] = "/data/yzhouc01/MVP/data/sample/data.tsv"
|
| 37 |
params['candidates_pth'] =None
|
| 38 |
params['split_pth']=None
|
| 39 |
|
|
|
|
| 80 |
filename=f'{{epoch}}-{{{monitor_name}:.2f}}',
|
| 81 |
# filename='{epoch}-{val_loss:.2f}-{train_loss:.2f}',
|
| 82 |
auto_insert_metric_name=True,
|
| 83 |
+
# save_last=(i == 0)
|
| 84 |
)
|
| 85 |
callbacks.append(checkpoint)
|
| 86 |
if monitor.get('early_stopping', False):
|
mvp/utils/data.py
CHANGED
|
@@ -9,11 +9,18 @@ import mvp.data.datasets as jestr_datasets
|
|
| 9 |
import typing as T
|
| 10 |
from mvp.definitions import MSGYM_FORMULA_VECTOR_NORM, MSGYM_STANDARD_MH
|
| 11 |
import matchms
|
|
|
|
| 12 |
|
| 13 |
class Subformula_Loader:
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
self.dir_path = dir_path
|
|
|
|
|
|
|
| 17 |
if spectra_view == 'SpecFormula':
|
| 18 |
self.load = self.load_subformula_data
|
| 19 |
elif spectra_view == "SpecFormulaMz":
|
|
@@ -21,37 +28,90 @@ class Subformula_Loader:
|
|
| 21 |
else:
|
| 22 |
raise Exception("Spectra view is not supported.")
|
| 23 |
|
| 24 |
-
def __call__(self, ids):
|
| 25 |
id_to_form_spec = {}
|
| 26 |
-
for id in ids:
|
| 27 |
-
data = self.load(id)
|
| 28 |
-
if data:
|
| 29 |
id_to_form_spec[id] = data
|
| 30 |
-
|
| 31 |
return id_to_form_spec
|
| 32 |
-
|
| 33 |
-
def
|
| 34 |
-
'''MIST subformula format:https://github.com/samgoldman97/mist/blob/main_v2/src/mist/utils/spectra_utils.py
|
| 35 |
'''
|
| 36 |
try:
|
| 37 |
-
file = os.path.join(self.dir_path, spec_id+".json")
|
| 38 |
-
with open(file) as f:
|
| 39 |
-
|
| 40 |
mzs = np.array(data['output_tbl']['mz'])
|
| 41 |
formulas = np.array(data['output_tbl']['formula'])
|
| 42 |
intensities = np.array(data['output_tbl']['ms2_inten'])
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
# sort by mzs
|
| 45 |
ind = mzs.argsort()
|
| 46 |
mzs = mzs[ind]
|
| 47 |
formulas = formulas[ind]
|
| 48 |
intensities = intensities[ind]
|
| 49 |
return {'formulas': formulas, 'formula_mzs': mzs, 'formula_intensities': intensities}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
except:
|
| 51 |
return None
|
| 52 |
|
| 53 |
def load_subformula_dict(self, spec_id: str):
|
| 54 |
-
'''MIST subformula format:https://github.com/samgoldman97/mist/blob/main_v2/src/mist/utils/spectra_utils.py
|
| 55 |
'''
|
| 56 |
try:
|
| 57 |
file = os.path.join(self.dir_path, spec_id+".json")
|
|
@@ -64,7 +124,7 @@ class Subformula_Loader:
|
|
| 64 |
mz_to_formulas = {mz:f for mz, f in zip(mzs, formulas)}
|
| 65 |
for mz, f in zip(mzs, formulas):
|
| 66 |
mz_to_formulas[mz] = f
|
| 67 |
-
|
| 68 |
ind = mzs.argsort()
|
| 69 |
mzs = mzs[ind]
|
| 70 |
formulas = formulas[ind]
|
|
|
|
| 9 |
import typing as T
|
| 10 |
from mvp.definitions import MSGYM_FORMULA_VECTOR_NORM, MSGYM_STANDARD_MH
|
| 11 |
import matchms
|
| 12 |
+
import tqdm
|
| 13 |
|
| 14 |
class Subformula_Loader:
|
| 15 |
+
"""
|
| 16 |
+
:param dir_path: path to folder containing either MIST or SIRIUS formulas, automatically parses the file type as needed
|
| 17 |
+
:param use_prec_mz: add precursor m/z when fragment precursor peak is not present or remove precursor peak when there is no fragment precursor peak
|
| 18 |
+
"""
|
| 19 |
+
def __init__(self, spectra_view, dir_path, use_prec_mz=True, formula_source='default') -> None:
|
| 20 |
|
| 21 |
self.dir_path = dir_path
|
| 22 |
+
self.use_prec_mz = use_prec_mz
|
| 23 |
+
self.formula_source = formula_source
|
| 24 |
if spectra_view == 'SpecFormula':
|
| 25 |
self.load = self.load_subformula_data
|
| 26 |
elif spectra_view == "SpecFormulaMz":
|
|
|
|
| 28 |
else:
|
| 29 |
raise Exception("Spectra view is not supported.")
|
| 30 |
|
| 31 |
+
def __call__(self, ids, form_list, prec_mz_list):
|
| 32 |
id_to_form_spec = {}
|
| 33 |
+
for id, curr_form, curr_prec_mz in tqdm.tqdm(zip(ids, form_list, prec_mz_list), total=len(ids)):
|
| 34 |
+
data = self.load(id, curr_form, curr_prec_mz)
|
| 35 |
+
if data is not None:
|
| 36 |
id_to_form_spec[id] = data
|
| 37 |
+
|
| 38 |
return id_to_form_spec
|
| 39 |
+
|
| 40 |
+
def load_mist_data(self, data, curr_form, curr_prec_mz):
|
| 41 |
+
'''MIST subformula format:https://github.com/samgoldman97/mist/blob/main_v2/src/mist/utils/spectra_utils.py
|
| 42 |
'''
|
| 43 |
try:
|
| 44 |
+
# file = os.path.join(self.dir_path, spec_id+".json")
|
| 45 |
+
# with open(file) as f:
|
| 46 |
+
# data = json.load(f)
|
| 47 |
mzs = np.array(data['output_tbl']['mz'])
|
| 48 |
formulas = np.array(data['output_tbl']['formula'])
|
| 49 |
intensities = np.array(data['output_tbl']['ms2_inten'])
|
| 50 |
|
| 51 |
+
if curr_form not in formulas and self.use_prec_mz:
|
| 52 |
+
mzs = np.concatenate([mzs, [curr_prec_mz]])
|
| 53 |
+
formulas = np.concatenate([formulas, [curr_form]])
|
| 54 |
+
intensities = np.concatenate([intensities, [1.1]])
|
| 55 |
+
elif curr_form in formulas and self.use_prec_mz:
|
| 56 |
+
idx = np.where(formulas == curr_form)[0][0]
|
| 57 |
+
intensities[idx] = 1.1
|
| 58 |
+
|
| 59 |
+
# sort by mzs
|
| 60 |
+
ind = mzs.argsort()
|
| 61 |
+
mzs = mzs[ind]
|
| 62 |
+
formulas = formulas[ind]
|
| 63 |
+
intensities = intensities[ind]
|
| 64 |
+
return {'formulas': formulas, 'formula_mzs': mzs, 'formula_intensities': intensities}
|
| 65 |
+
except:
|
| 66 |
+
return None
|
| 67 |
+
|
| 68 |
+
def load_magma_data(self, data, curr_form, curr_prec_mz):
|
| 69 |
+
|
| 70 |
+
return None
|
| 71 |
+
|
| 72 |
+
def load_sirius_data(self, data):
|
| 73 |
+
try:
|
| 74 |
+
|
| 75 |
+
mzs = np.array([entry['mz'] for entry in data['fragments']])
|
| 76 |
+
formulas = np.array([entry['molecularFormula'] for entry in data['fragments']])
|
| 77 |
+
intensities = np.array([entry['relativeIntensity'] for entry in data['fragments'] ])
|
| 78 |
+
|
| 79 |
+
intensities[formulas == data['molecularFormula']] = 1.1
|
| 80 |
+
|
| 81 |
+
if not self.use_prec_mz:
|
| 82 |
+
not_append_prec_mz = np.array([len(entry['peaks']) != 0 for entry in data['fragments']])
|
| 83 |
+
|
| 84 |
+
mzs = mzs[not_append_prec_mz]
|
| 85 |
+
formulas = formulas[not_append_prec_mz]
|
| 86 |
+
intensities = intensities[not_append_prec_mz]
|
| 87 |
+
|
| 88 |
# sort by mzs
|
| 89 |
ind = mzs.argsort()
|
| 90 |
mzs = mzs[ind]
|
| 91 |
formulas = formulas[ind]
|
| 92 |
intensities = intensities[ind]
|
| 93 |
return {'formulas': formulas, 'formula_mzs': mzs, 'formula_intensities': intensities}
|
| 94 |
+
except:
|
| 95 |
+
|
| 96 |
+
return None
|
| 97 |
+
|
| 98 |
+
def load_subformula_data(self, spec_id: str, curr_form: str, curr_prec_mz: float):
|
| 99 |
+
try:
|
| 100 |
+
file = os.path.join(self.dir_path, spec_id+".json")
|
| 101 |
+
with open(file) as f:
|
| 102 |
+
data = json.load(f)
|
| 103 |
+
if self.formula_source == 'sirius':
|
| 104 |
+
return self.load_sirius_data(data)
|
| 105 |
+
elif self.formula_source == 'magma':
|
| 106 |
+
return self.load_magma_data(data, curr_form, curr_prec_mz)
|
| 107 |
+
else:
|
| 108 |
+
return self.load_mist_data(data, curr_form, curr_prec_mz)
|
| 109 |
+
|
| 110 |
except:
|
| 111 |
return None
|
| 112 |
|
| 113 |
def load_subformula_dict(self, spec_id: str):
|
| 114 |
+
'''MIST subformula format:https://github.com/samgoldman97/mist/blob/main_v2/src/mist/utils/spectra_utils.py
|
| 115 |
'''
|
| 116 |
try:
|
| 117 |
file = os.path.join(self.dir_path, spec_id+".json")
|
|
|
|
| 124 |
mz_to_formulas = {mz:f for mz, f in zip(mzs, formulas)}
|
| 125 |
for mz, f in zip(mzs, formulas):
|
| 126 |
mz_to_formulas[mz] = f
|
| 127 |
+
|
| 128 |
ind = mzs.argsort()
|
| 129 |
mzs = mzs[ind]
|
| 130 |
formulas = formulas[ind]
|