| |
| """ |
| Created on Wed Sep 7 14:55:39 2022 |
| |
| @author: DELL |
| """ |
|
|
|
|
| import numpy as np |
| import pandas as pd |
| from tqdm import tqdm |
|
|
| from matchms.Spectrum import Spectrum |
| from matchms.similarity import CosineGreedy |
|
|
| from rdkit import Chem, DataStructs |
| from rdkit.Chem import AllChem |
| from rdkit.Chem import rdFMCS |
|
|
|
|
def disable_rdkit_logging():
    """
    Quiet RDKit's diagnostic output.

    Raises the RDKit Python logger threshold to ERROR and then switches off
    the 'rdApp.error' log channel.
    """
    # Imported locally so the RDKit logging modules are only touched when
    # this function is actually called.
    from rdkit import RDLogger
    from rdkit import rdBase

    RDLogger.logger().setLevel(RDLogger.ERROR)
    rdBase.DisableLog('rdApp.error')
|
|
# Silence RDKit output once, as a side effect of importing this module.
disable_rdkit_logging()


def get_fp(x):
    """Return the Morgan fingerprint bit vector (radius 2) of RDKit molecule x."""
    return AllChem.GetMorganFingerprintAsBitVect(x, radius=2)


def get_sim(x, y):
    """Return the Dice similarity between two RDKit fingerprints x and y."""
    return DataStructs.DiceSimilarity(x, y)
|
|
|
|
def get_tagged_atoms_from_mol(mol):
    '''Takes an RDKit molecule and returns the atom-map numbers of its
    tagged atoms.

    An atom counts as tagged when it carries the 'molAtomMapNumber'
    property. Only the numbers are returned (as ints, in the order the atoms
    are iterated by ``mol.GetAtoms()``); the atom objects themselves are not
    part of the result.

    Arguments:
        mol: molecule-like object exposing ``GetAtoms()``.
    Returns:
        list of int, one entry per tagged atom; empty when none is tagged.
    '''
    return [
        int(atom.GetProp('molAtomMapNumber'))
        for atom in mol.GetAtoms()
        if atom.HasProp('molAtomMapNumber')
    ]
|
|
|
|
def calc_frag_mass(frag):
    '''Takes an RDKit fragment and returns the exact (monoisotopic) mass of
    the fragment.

    The mass is summed over the atoms returned by ``frag.GetAtoms()``, so
    only atoms explicitly present in the fragment graph contribute.

    Arguments:
        frag: RDKit molecule (or fragment) to weigh.
    Returns:
        Sum of the per-atom masses; 0 for a fragment without atoms.
    '''
    periodic_table = Chem.GetPeriodicTable()
    mass = 0
    for atom in frag.GetAtoms():
        if atom.GetIsotope():
            # An explicit isotope label is set: GetMass() then reports the
            # mass of that specific isotope.
            mass += atom.GetMass()
        else:
            # Without an isotope label GetMass() would give the average
            # atomic weight, which is not an exact mass; use the most
            # common isotope instead.
            mass += periodic_table.GetMostCommonIsotopeMass(atom.GetAtomicNum())
    return mass
| |
|
|
def _canonical_mol_with_hs(smiles):
    """Parse SMILES via a canonical-SMILES round trip and add explicit Hs; None if unparsable."""
    try:
        # MolFromSmiles returns None for invalid input, which makes
        # MolToSmiles raise; both cases mean "cannot be parsed".
        canonical = Chem.MolToSmiles(Chem.MolFromSmiles(smiles))
    except Exception:
        return None
    return Chem.AddHs(Chem.MolFromSmiles(canonical))


def _residue_fragments(mol, core):
    """Delete the core substructure from mol and return the leftover pieces as a list of molecules."""
    residue = AllChem.DeleteSubstructs(mol, core)
    try:
        return list(Chem.GetMolFrags(residue, asMols=True))
    except Exception:
        # Splitting failed: fall back to treating the residue as one piece.
        return [residue]


def _fragment_smiles(frag):
    """Return the SMILES of frag with hydrogens removed, or the string 'None' on failure."""
    try:
        return Chem.MolToSmiles(Chem.RemoveHs(frag))
    except Exception:
        return 'None'


def calc_possible_spectrum_loss(smiles_1, smiles_2):
    """
    Calculate mass difference between related fragments of two compounds.

    Both molecules are compared through their maximum common substructure
    (MCS). The MCS is deleted from each molecule and every leftover fragment
    of smiles_1 is paired with every leftover fragment of smiles_2.

    Arguments:
        smiles_1, smiles_2: str, two different smiles of compounds.
    Returns:
        DataFrame with one row per fragment pair, or None (see below).
            transform, '<fragment of smiles_1>>><fragment of smiles_2>' as
                SMILES; a side without leftover fragments (or whose SMILES
                cannot be generated) is written as 'None'.
            loss, mass of the smiles_2 fragment minus mass of the smiles_1
                fragment, as computed by calc_frag_mass (0 is used for a
                side without leftover fragments).
        None is returned when
            - either SMILES cannot be parsed,
            - the fingerprint similarity (get_sim of get_fp) is below 0.3,
            - the MCS has 5 atoms or fewer (hydrogens are explicit, so they
              count), or
            - neither molecule has atoms outside the MCS.
    Example:
        smiles_1 = 'COc1cc(O)c2c(c1)OC(c1ccc(O)cc1)CC2=O'
        smiles_2 = 'CC1OC(OCC2OC(Oc3cc(O)c4c(c3)OC(c3ccc(O)cc3)CC4=O)C(O)C(O)C2O)C(O)C(O)C1O'
        calc_possible_spectrum_loss(smiles_1, smiles_2)
    """
    mol1 = _canonical_mol_with_hs(smiles_1)
    if mol1 is None:
        return None
    mol2 = _canonical_mol_with_hs(smiles_2)
    if mol2 is None:
        return None

    # Skip the costly MCS search for clearly unrelated structures.
    if get_sim(get_fp(mol1), get_fp(mol2)) < 0.3:
        return None

    mcs = rdFMCS.FindMCS([mol1, mol2], bondCompare=rdFMCS.BondCompare.CompareOrderExact,
                         matchValences=True, ringMatchesRingOnly=True)
    if mcs.numAtoms <= 5:
        return None

    # Compile the MCS query once and reuse it for both molecules.
    core = Chem.MolFromSmarts(mcs.smartsString)
    frags_1 = _residue_fragments(mol1, core)
    frags_2 = _residue_fragments(mol2, core)
    if not frags_1 and not frags_2:
        return None

    # A side with nothing left outside the MCS is represented by a single
    # placeholder entry: mass 0 and the literal SMILES 'None'.
    mass_1 = np.array([calc_frag_mass(m) for m in frags_1]) if frags_1 else np.array([0])
    mass_2 = np.array([calc_frag_mass(m) for m in frags_2]) if frags_2 else np.array([0])
    names_1 = [_fragment_smiles(m) for m in frags_1] or ['None']
    names_2 = [_fragment_smiles(m) for m in frags_2] or ['None']

    mass_diffs, mol_transform = [], []
    for m1, name_1 in zip(mass_1, names_1):
        for m2, name_2 in zip(mass_2, names_2):
            mol_transform.append('{}>>{}'.format(name_1, name_2))
            mass_diffs.append(m2 - m1)

    return pd.DataFrame({'transform': mol_transform, 'loss': mass_diffs})
| |
| |
def calc_aligned_similarity(smiles_1, smiles_2, spectrum_1, spectrum_2, mz_tol=0.05, similarity_function=CosineGreedy()):
    """
    Calculate the similarity between two spectrums after aligning the second
    one to the first using structure-derived mass shifts.

    Each peak of spectrum_2 (intensity >= 0.01) is handled as follows:
      1. If a peak of spectrum_1 lies within mz_tol, the peak is kept as is,
         unless its m/z exceeds the largest common-structure mass (see
         below) by more than 2.006, in which case it is dropped.
      2. Otherwise the candidate mass differences returned by
         calc_possible_spectrum_loss are tried in order; the first one that
         moves the peak to within mz_tol of a spectrum_1 peak is applied.
      3. If nothing matches, the peak is kept unshifted.
    The re-assembled spectrum is then scored against spectrum_1 with
    similarity_function.

    Arguments:
        smiles_1, smiles_2: str, two different smiles of compounds.
        spectrum_1, spectrum_2: Two different spectrum of matchms.
        mz_tol: float, maximum m/z distance for two peaks to count as matched.
        similarity_function: object with a ``pair(spectrum, spectrum)``
            method whose result has a 'score' field (default CosineGreedy()).
    Returns:
        similarity: float, similarity between aligned spectrums
            (0 when either spectrum has no peaks).
        matching_data: DataFrame, fragment matching information
            (None when either spectrum has no peaks). Columns:
            'reference' - m/z of the spectrum_2 peak,
            'query'     - m/z of the spectrum_1 peak it was matched to,
            'loss'      - absolute difference between those two m/z values,
            'intensity' - intensity of the spectrum_2 peak.
    Example:
        smiles_1 = 'CCCC=C1C2=CC=CC=C2C(=O)O1'
        smiles_2 = 'CCCC=C1C2=C(C=CCC2)C(=O)O1'
        spectrum_1 = Spectrum(mz = np.array([91.1, 115.1, 117.1, 128.1, 129.1, 143.1, 145.1, 152.1, 153.1, 171.1, 189.1]),
                              intensities = np.array([0.12314933, 0.10446688, 0.16478671, 0.56083889, 0.11087135,
                                                      0.43528005, 0.1149675 , 0.10339803, 0.51058281, 0.999999, 0.88490263]),
                              metadata={"precursor_mz": 189.0909})
        spectrum_2 = Spectrum(mz = np.array([ 79.1, 93.1, 105.1, 117.1, 145.1, 173.1, 191.1]),
                              intensities = np.array([0.10704697, 0.10657389, 0.1382483 , 0.12679477, 0.16397634,
                                                      0.26150501, 0.999999]),
                              metadata={"precursor_mz": 191.1064})
        calc_aligned_similarity(smiles_1, smiles_2, spectrum_1, spectrum_2)
    """
    # Nothing to align: return before the expensive structure comparison.
    if (len(spectrum_1.mz) == 0) or (len(spectrum_2.mz) == 0):
        return 0, None

    loss = calc_possible_spectrum_loss(smiles_1, smiles_2)
    if loss is None:
        loss_1 = loss_2 = 0
        loss = pd.DataFrame({'transform': [], 'loss': []})
    else:
        # Negative differences are treated as mass lost on the spectrum_1
        # side, positive ones as mass lost on the spectrum_2 side.
        loss_1 = -sum(v for v in loss['loss'] if v < 0)
        loss_2 = sum(v for v in loss['loss'] if v > 0)
    loss_candidates = list(loss['loss'])

    # Upper bound for the mass of the shared structure: precursor m/z minus
    # the total loss on that side. A missing or unusable precursor_mz puts
    # no bound on that side.
    mcs1 = mcs2 = float('inf')
    try:
        mcs1 = spectrum_1.metadata['precursor_mz'] - loss_1
    except (KeyError, TypeError):
        pass
    try:
        mcs2 = spectrum_2.metadata['precursor_mz'] - loss_2
    except (KeyError, TypeError):
        pass
    max_common_mass = min(mcs1, mcs2)

    # NOTE(review): 2.006 is presumably head-room for isotope peaks above the
    # common-structure mass, and 0.01 a relative-intensity noise floor -
    # confirm both with the method description.
    mass_margin = 2.006
    min_intensity = 0.01

    x_mz = spectrum_1.mz
    y_mz_new, y_intensities_new = [], []
    matching_data = []
    for y_mz_, y_intensity in zip(spectrum_2.mz, spectrum_2.intensities):
        if y_intensity < min_intensity:
            continue

        # Case 1: the peak already coincides with a spectrum_1 peak.
        direct_diffs = np.abs(y_mz_ - x_mz)
        nearest = np.argmin(direct_diffs)
        if direct_diffs[nearest] <= mz_tol:
            if y_mz_ > max_common_mass + mass_margin:
                # Heavier than the shared structure allows: drop the peak.
                continue
            matched_mz = x_mz[nearest]
            y_mz_new.append(y_mz_)
            y_intensities_new.append(y_intensity)
            matching_data.append([y_mz_, matched_mz, abs(y_mz_ - matched_mz), y_intensity])
            continue

        # Case 2: try each candidate mass shift, first hit wins.
        aligned_mz = y_mz_
        for loss_ in loss_candidates:
            shifted_diffs = np.abs(y_mz_ - loss_ - x_mz)
            nearest = np.argmin(shifted_diffs)
            if shifted_diffs[nearest] <= mz_tol:
                matched_mz = x_mz[nearest]
                aligned_mz = y_mz_ - loss_
                matching_data.append([y_mz_, matched_mz, abs(y_mz_ - matched_mz), y_intensity])
                break

        # Case 3 (no hit) keeps the original m/z via the aligned_mz default.
        y_mz_new.append(aligned_mz)
        y_intensities_new.append(y_intensity)

    y_mz_new = np.array(y_mz_new)
    y_intensities_new = np.array(y_intensities_new)

    # Shifting can break the ascending m/z order, so re-sort before
    # building the aligned spectrum.
    order = np.argsort(y_mz_new)
    spectrum_2_aligned = Spectrum(mz=y_mz_new[order],
                                  intensities=y_intensities_new[order],
                                  metadata=spectrum_2.metadata)

    similarity = float(similarity_function.pair(spectrum_1, spectrum_2_aligned)['score'])
    matching_data = pd.DataFrame(matching_data, columns=['reference', 'query', 'loss', 'intensity'])
    return similarity, matching_data
|
|