DeepMS2 / scripts /Example.py
kairongLi's picture
Upload 44 files
fb5f46a
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 7 08:49:29 2023
@author: DELL
"""
import umap
import hnswlib
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matchms.filtering as msfilters
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from matchms.importing import load_from_mgf
from gensim.models import Word2Vec
from rdkit import Chem
from rdkit.Chem import DataStructs, AllChem, Draw
from spec2vec import SpectrumDocument
from spec2vec.vector_operations import calc_vector
from core.identification import identify_unknown, match_spectrum
def plot_spectrum_comparison(s1, s2, mzrange, loss=False):
    """Draw a mirror plot of two spectra on the current matplotlib axes.

    ``s1`` is drawn upwards in red (label ``'query'``) and ``s2`` downwards
    in blue (label ``'neighbor'``). Each trace is divided by its own maximum
    intensity, so the tallest stick of either trace has length 1.

    Parameters
    ----------
    s1, s2
        Spectrum objects exposing ``mz`` and ``intensities``. When ``loss``
        is true they are first passed through ``msfilters.add_losses`` and
        their ``losses`` attribute is plotted instead.
    mzrange
        Forwarded unchanged to ``plt.xlim`` (the callers in this file pass a
        ``(low, high)`` tuple).
    loss : bool, optional
        Plot the ``losses`` of both spectra instead of their peaks.
        Defaults to ``False``.

    Notes
    -----
    A trace that is ``None`` or has no intensities is skipped instead of
    raising from ``np.max`` on an empty array / attribute access on ``None``.
    The zero line, y-label and x-limits are applied in every case.
    """
    if loss:
        s1 = msfilters.add_losses(s1)
        s2 = msfilters.add_losses(s2)
        # NOTE(review): ``losses`` is presumably None when no losses could be
        # derived (e.g. missing precursor m/z) -- confirm against matchms.
        query_trace, neighbor_trace = s1.losses, s2.losses
    else:
        query_trace, neighbor_trace = s1, s2

    # (trace, direction, colour, legend label): query up, neighbor down.
    traces = (
        (query_trace, 1, 'r', 'query'),
        (neighbor_trace, -1, 'b', 'neighbor'),
    )
    for trace, direction, color, label in traces:
        if trace is None or len(trace.intensities) == 0:
            continue  # nothing to draw for this side of the mirror plot
        heights = direction * trace.intensities / np.max(trace.intensities)
        plt.vlines(trace.mz, ymin=0, ymax=heights, color=color, lw=1, label=label)

    plt.axhline(y=0, color='black', lw=1)
    plt.ylabel('abundance')
    plt.xlim(mzrange)
# Structure database that both examples below hand to identify_unknown().
database = pd.read_csv('data/DeepMassStructureDB-v1.0.csv')

# Materialise every spectrum of the MGF file so the examples can pick one by
# position. list(...) replaces the identity comprehension [s for s in ...].
# NOTE(review): absolute, machine-specific path -- adjust before running on
# another machine.
spectrums = list(load_from_mgf("D:/DeepMASS2_Data_Processing/Example/CASMI/all_casmi.mgf"))
# Example 1: spectrum 255, searched against the "negative" reference files.
s = spectrums[255]
print(s.metadata)

# Word2Vec model and the 300-dimensional L2 hnswlib index used for the search.
model = Word2Vec.load("model/Ms2Vec_allGNPSnegative.hdf5")
p = hnswlib.Index(space='l2', dim=300)
p.load_index('data/references_index_negative_spec2vec.bin')

# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open('data/references_spectrums_negative.pickle', 'rb') as file:
    references = pickle.load(file)
references = np.array(references)
precursors = np.array([ref.get('precursor_mz') for ref in references])

# Results of both identification routes; the metadata dicts are kept as
# module-level names so they can be inspected interactively.
s_metadata = s.metadata
s_matchms = match_spectrum(s, precursors, references)
s_matchms_metadata = s_matchms.metadata
s_deepmass = identify_unknown(s, p, model, references, database)
s_deepmass_metadata = s_deepmass.metadata


def get_mol_fingerprint(x):
    """Return the radius-2 Morgan bit-vector fingerprint of SMILES ``x``."""
    return AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(x), radius=2)


def get_mol_similarity(x, y):
    """Return ``DataStructs.FingerprintSimilarity`` of fingerprints ``x``, ``y``."""
    return DataStructs.FingerprintSimilarity(x, y)


def calc_ms2vec_vector(x):
    """Return the ``calc_vector`` embedding of spectrum ``x`` under ``model``."""
    return calc_vector(model, SpectrumDocument(x, n_decimals=2))


deepmass_candidate = s_deepmass_metadata['annotation']['CanonicalSMILES']
deepmass_reference = s_deepmass_metadata['reference']

# One fingerprint row per candidate SMILES / per reference spectrum.
deepmass_candidate_vector = np.array(
    [list(get_mol_fingerprint(smiles)) for smiles in deepmass_candidate]
)
deepmass_reference_vector = np.array(
    [list(get_mol_fingerprint(ref.get('smiles'))) for ref in deepmass_reference]
)

# Candidates first, references after: rows [0, a) are candidates and rows
# [a, a + b) are references in both X and the 2-D embedding X_r.
X = np.vstack((deepmass_candidate_vector, deepmass_reference_vector))
# Despite its name, ``pca`` is a UMAP reducer (name kept for compatibility).
pca = umap.UMAP(n_components=2)
X_r = pca.fit_transform(scale(X))

Draw.MolToFile(Chem.MolFromSmiles(s.get('smiles')), 'temp/temp1.png')

a, b = len(deepmass_candidate_vector), len(deepmass_reference_vector)

# Figure 1: embedding of candidate and reference structures.
plt.figure(dpi=300, figsize=(3.5, 3.5))
plt.scatter(X_r[1:a, 0], X_r[1:a, 1], color='green', alpha=0.5, label='Candidates')
plt.scatter(X_r[a:a + 10, 0], X_r[a:a + 10, 1], color='blue', alpha=0.5, label='Top 10 Neighbors')
# NOTE(review): row 0 (the first candidate) is labelled as the true
# annotation -- assumes the top-ranked candidate is the correct one.
plt.scatter(X_r[0, 0], X_r[0, 1], color='red', alpha=0.8, label='True Annotation')
# NOTE(review): hard-coded row 55 is highlighted without a legend entry;
# which structure it refers to depends on ``a`` -- confirm.
plt.scatter(X_r[55, 0], X_r[55, 1], color='orange', alpha=0.5)
plt.xlabel('Dim 1')
plt.ylabel('Dim 2')
# NOTE(review): fixed limits on a UMAP embedding created without a
# random_state; another run may place the points outside this window.
plt.xlim(24, 41)
plt.ylim(-16, -9)
plt.legend(loc='upper right')
plt.show()

# Figure 2: mirror plots of the query against three of its references.
plt.figure(dpi=300, figsize=(3, 4.8))
for panel, reference_index in ((311, 9), (312, 0), (313, 3)):
    plt.subplot(panel)
    plot_spectrum_comparison(s, deepmass_reference[reference_index], (50, 250))
plt.subplots_adjust(hspace=0.5)
plt.xlabel('m/z')
plt.show()
# Example 2: spectrum 368, searched against the "positive" reference files.
s = spectrums[368]
print(s.metadata)

# Word2Vec model and the 300-dimensional L2 hnswlib index used for the search.
# (Loaded once; the original repeated this whole loading section twice.)
model = Word2Vec.load("model/Ms2Vec_allGNPSpositive.hdf5")
p = hnswlib.Index(space='l2', dim=300)
p.load_index('data/references_index_positive_spec2vec.bin')

# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open('data/references_spectrums_positive.pickle', 'rb') as file:
    references = pickle.load(file)
references = np.array(references)
precursors = np.array([ref.get('precursor_mz') for ref in references])

# Results of both identification routes; the metadata dicts are kept as
# module-level names so they can be inspected interactively.
s_metadata = s.metadata
s_matchms = match_spectrum(s, precursors, references)
s_matchms_metadata = s_matchms.metadata
s_deepmass = identify_unknown(s, p, model, references, database)
s_deepmass_metadata = s_deepmass.metadata


def get_mol_fingerprint(x):
    """Return the radius-2 Morgan bit-vector fingerprint of SMILES ``x``."""
    return AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(x), radius=2)


def get_mol_similarity(x, y):
    """Return ``DataStructs.FingerprintSimilarity`` of fingerprints ``x``, ``y``."""
    return DataStructs.FingerprintSimilarity(x, y)


def calc_ms2vec_vector(x):
    """Return the ``calc_vector`` embedding of spectrum ``x`` under ``model``."""
    return calc_vector(model, SpectrumDocument(x, n_decimals=2))


deepmass_candidate = s_deepmass_metadata['annotation']['CanonicalSMILES']
deepmass_reference = s_deepmass_metadata['reference']

# One fingerprint row per candidate SMILES / per reference spectrum.
deepmass_candidate_vector = np.array(
    [list(get_mol_fingerprint(smiles)) for smiles in deepmass_candidate]
)
deepmass_reference_vector = np.array(
    [list(get_mol_fingerprint(ref.get('smiles'))) for ref in deepmass_reference]
)

# Candidates first, references after: rows [0, a) are candidates and rows
# [a, a + b) are references in both X and the 2-D embedding X_r.
X = np.vstack((deepmass_candidate_vector, deepmass_reference_vector))
# Despite its name, ``pca`` is a UMAP reducer (name kept for compatibility).
pca = umap.UMAP(n_components=2)
X_r = pca.fit_transform(scale(X))

Draw.MolToFile(Chem.MolFromSmiles(s.get('smiles')), 'temp/temp.png')

a, b = len(deepmass_candidate_vector), len(deepmass_reference_vector)

# Figure 1: embedding of candidate and reference structures.
plt.figure(dpi=300, figsize=(3.5, 3.5))
plt.scatter(X_r[1:a, 0], X_r[1:a, 1], color='green', alpha=0.5, label='Candidates')
# NOTE(review): row 0 (the first candidate) is labelled as the true
# annotation -- assumes the top-ranked candidate is the correct one.
plt.scatter(X_r[0, 0], X_r[0, 1], color='red', alpha=0.8, label='True Annotation')
plt.scatter(X_r[a:a + 10, 0], X_r[a:a + 10, 1], color='blue', alpha=0.5, label='Top 10 Neighbors')
# The tenth reference row is drawn again in orange, without a legend entry.
plt.scatter(X_r[a + 9:a + 10, 0], X_r[a + 9:a + 10, 1], color='orange', alpha=0.5)
plt.xlabel('Dim 1')
plt.ylabel('Dim 2')
plt.legend()

# Figure 2: mirror plots of the query against three of its references; the
# first panel compares losses, the other two compare peaks.
plt.figure(dpi=300, figsize=(3, 4.8))
plt.subplot(311)
plot_spectrum_comparison(s, deepmass_reference[0], (0, 150), loss=True)
for panel, reference_index in ((312, 1), (313, 9)):
    plt.subplot(panel)
    plot_spectrum_comparison(s, deepmass_reference[reference_index], (100, 350))
plt.subplots_adjust(hspace=0.5)
plt.xlabel('m/z')
# A single show() at the end displays both figures of this example.
plt.show()