Spaces:

kairongLi
/

DeepMS2

Runtime error

App Files Files Community

DeepMS2 / scripts /Example.py

kairongLi

Upload 44 files

fb5f46a over 2 years ago

raw

history blame contribute delete

7.28 kB

	# -*- coding: utf-8 -*-
	"""
	Created on Thu Sep 7 08:49:29 2023

	@author: DELL
	"""


	import umap
	import hnswlib
	import pickle
	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt
	import matchms.filtering as msfilters
	from sklearn.preprocessing import scale
	from sklearn.decomposition import PCA
	from matchms.importing import load_from_mgf
	from gensim.models import Word2Vec
	from rdkit import Chem
	from rdkit.Chem import DataStructs, AllChem, Draw
	from spec2vec import SpectrumDocument
	from spec2vec.vector_operations import calc_vector

	from core.identification import identify_unknown, match_spectrum


	def plot_spectrum_comparison(s1, s2, mzrange, loss=False):
	    """Draw two spectra as a mirrored stick plot on the current matplotlib axes.

	    ``s1`` is drawn upwards in red (label 'query') and ``s2`` downwards in
	    blue (label 'neigbor'). Each one is divided by its own maximum
	    intensity before plotting.

	    Parameters
	    ----------
	    s1, s2
	        Spectrum objects exposing ``mz`` and ``intensities``.
	    mzrange
	        Passed unchanged to ``plt.xlim``.
	    loss
	        When true, both spectra go through ``msfilters.add_losses`` and
	        their ``losses`` are plotted instead of their peaks.
	    """
	    if loss:
	        s1 = msfilters.add_losses(s1)
	        s2 = msfilters.add_losses(s2)
	        upper, lower = s1.losses, s2.losses
	    else:
	        upper, lower = s1, s2

	    # Query sticks point up from the baseline.
	    upper_height = upper.intensities / np.max(upper.intensities)
	    plt.vlines(upper.mz, ymin=0, ymax=upper_height, color='r', lw=1, label='query')

	    # Neighbour sticks are negated so they point down from the baseline.
	    lower_height = -lower.intensities / np.max(lower.intensities)
	    plt.vlines(lower.mz, ymin=0, ymax=lower_height, color='b', lw=1, label='neigbor')

	    plt.axhline(y=0, color='black', lw=1)
	    plt.ylabel('abundance')
	    plt.xlim(mzrange)


	# Inputs shared by both examples: the structure database table and the
	# query spectra read from an MGF file.
	database = pd.read_csv('data/DeepMassStructureDB-v1.0.csv')
	# NOTE(review): absolute, machine-specific path; the script only runs
	# where this file exists.
	spectrums = list(load_from_mgf("D:/DeepMASS2_Data_Processing/Example/CASMI/all_casmi.mgf"))

	# Example 1
	s = spectrums[255]
	print(s.metadata)

	# Model, nearest-neighbour index and reference spectra, all taken from the
	# files tagged "negative".
	model = Word2Vec.load("model/Ms2Vec_allGNPSnegative.hdf5")
	p = hnswlib.Index(space='l2', dim=300)
	p.load_index('data/references_index_negative_spec2vec.bin')
	# NOTE: unpickling can execute arbitrary code; only load a trusted file.
	with open('data/references_spectrums_negative.pickle', 'rb') as file:
	    references = np.array(pickle.load(file))
	precursors = np.array([ref.get('precursor_mz') for ref in references])

	# Run both identification routes on the query and keep their metadata.
	s_metadata = s.metadata
	s_matchms = match_spectrum(s, precursors, references)
	s_matchms_metadata = s_matchms.metadata
	s_deepmass = identify_unknown(s, p, model, references, database)
	s_deepmass_metadata = s_deepmass.metadata


	def get_mol_fingerprint(x):
	    """Return the radius-2 Morgan bit-vector fingerprint for SMILES string ``x``."""
	    return AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(x), radius=2)


	def get_mol_similarity(x, y):
	    """Return ``DataStructs.FingerprintSimilarity`` of fingerprints ``x`` and ``y``."""
	    return DataStructs.FingerprintSimilarity(x, y)


	def calc_ms2vec_vector(x):
	    """Embed spectrum ``x`` with the module-level ``model`` (2-decimal documents)."""
	    return calc_vector(model, SpectrumDocument(x, n_decimals=2))


	deepmass_candidate = s_deepmass_metadata['annotation']['CanonicalSMILES']
	deepmass_reference = s_deepmass_metadata['reference']

	# One fingerprint row per candidate SMILES.
	deepmass_candidate_vector = np.array(
	    [list(get_mol_fingerprint(smiles)) for smiles in deepmass_candidate]
	)

	# One fingerprint row per reference spectrum, from its 'smiles' metadata.
	deepmass_reference_vector = np.array(
	    [list(get_mol_fingerprint(ref.get('smiles'))) for ref in deepmass_reference]
	)

	# Candidates are stacked first and references after them, then the scaled
	# matrix is projected to 2-D. Despite its name, `pca` holds a UMAP reducer.
	X = np.vstack((deepmass_candidate_vector, deepmass_reference_vector))
	pca = umap.UMAP(n_components=2)
	X_r = pca.fit_transform(scale(X))

	# Render the query's own structure (from its 'smiles' metadata) to an image.
	query_mol = Chem.MolFromSmiles(s.get('smiles'))
	Draw.MolToFile(query_mol, 'temp/temp1.png')

	# Rows [0, a) of X_r are candidates; rows from a onwards are references.
	a = len(deepmass_candidate_vector)
	b = len(deepmass_reference_vector)
	candidate_xy = X_r[1:a]
	neighbor_xy = X_r[a:a + 10]

	plt.figure(dpi=300, figsize=(3.5, 3.5))
	plt.scatter(candidate_xy[:, 0], candidate_xy[:, 1], color='green', alpha=0.5, label='Candidates')
	plt.scatter(neighbor_xy[:, 0], neighbor_xy[:, 1], color='blue', alpha=0.5, label='Top 10 Neigbors')
	# Row 0 is drawn separately and labelled as the true annotation.
	plt.scatter(X_r[0, 0], X_r[0, 1], color='red', alpha=0.8, label='True Annotation')
	# NOTE(review): row 55 is a hard-coded index with no label; confirm which
	# point it is meant to highlight and that X_r always has at least 56 rows.
	plt.scatter(X_r[55, 0], X_r[55, 1], color='orange', alpha=0.5)
	plt.xlabel('Dim 1')
	plt.ylabel('Dim 2')
	# NOTE(review): fixed axis limits; the UMAP reducer is created without a
	# random_state, so presumably these only suit one particular run. Confirm.
	plt.xlim(24, 41)
	plt.ylim(-16, -9)
	plt.legend(loc='upper right')
	plt.show()


	plt.figure(dpi=300, figsize=(3, 4.8))
	# Three stacked mirror plots: the query against references 9, 0 and 3,
	# all on the same m/z window.
	for panel, ref_idx in ((311, 9), (312, 0), (313, 3)):
	    plt.subplot(panel)
	    plot_spectrum_comparison(s, deepmass_reference[ref_idx], (50, 250))
	plt.subplots_adjust(hspace=0.5)
	# Issued after the last subplot, so the label lands on the bottom panel.
	plt.xlabel('m/z')
	# plt.legend(loc='upper center', bbox_to_anchor=(1.2, 4))
	plt.show()


	# Example 2
	s = spectrums[368]
	print(s.metadata)

	# Model, nearest-neighbour index and reference spectra, all taken from the
	# files tagged "positive". Each resource is loaded exactly once; the
	# script previously repeated this whole section verbatim, which only
	# re-read the same files.
	model = Word2Vec.load("model/Ms2Vec_allGNPSpositive.hdf5")
	p = hnswlib.Index(space='l2', dim=300)
	p.load_index('data/references_index_positive_spec2vec.bin')
	# NOTE: unpickling can execute arbitrary code; only load a trusted file.
	with open('data/references_spectrums_positive.pickle', 'rb') as file:
	    references = pickle.load(file)
	references = np.array(references)
	precursors = np.array([ref.get('precursor_mz') for ref in references])

	# Run both identification routes on the query and keep their metadata.
	s_metadata = s.metadata
	s_matchms = match_spectrum(s, precursors, references)
	s_matchms_metadata = s_matchms.metadata
	s_deepmass = identify_unknown(s, p, model, references, database)
	s_deepmass_metadata = s_deepmass.metadata


	def get_mol_fingerprint(x):
	    """Return the radius-2 Morgan bit-vector fingerprint for SMILES string ``x``."""
	    mol = Chem.MolFromSmiles(x)
	    return AllChem.GetMorganFingerprintAsBitVect(mol, radius=2)


	def get_mol_similarity(x, y):
	    """Return ``DataStructs.FingerprintSimilarity`` of fingerprints ``x`` and ``y``."""
	    return DataStructs.FingerprintSimilarity(x, y)


	def calc_ms2vec_vector(x):
	    """Embed spectrum ``x`` with the module-level ``model`` (2-decimal documents)."""
	    document = SpectrumDocument(x, n_decimals=2)
	    return calc_vector(model, document)


	deepmass_candidate = s_deepmass_metadata['annotation']['CanonicalSMILES']
	deepmass_reference = s_deepmass_metadata['reference']

	# Fingerprint matrices: one row per candidate SMILES, then one row per
	# reference spectrum (using its 'smiles' metadata).
	deepmass_candidate_vector = np.array(
	    [list(get_mol_fingerprint(smiles)) for smiles in deepmass_candidate]
	)
	deepmass_reference_vector = np.array(
	    [list(get_mol_fingerprint(ref.get('smiles'))) for ref in deepmass_reference]
	)

	# Candidates first, references after; scale and project to 2-D.
	# Despite its name, `pca` holds a UMAP reducer.
	X = np.vstack((deepmass_candidate_vector, deepmass_reference_vector))
	pca = umap.UMAP(n_components=2)
	X_r = pca.fit_transform(scale(X))

	# Render the query's own structure (from its 'smiles' metadata) to an image.
	query_mol = Chem.MolFromSmiles(s.get('smiles'))
	Draw.MolToFile(query_mol, 'temp/temp.png')

	# Rows [0, a) of X_r are candidates; rows from a onwards are references.
	a = len(deepmass_candidate_vector)
	b = len(deepmass_reference_vector)
	candidate_xy = X_r[1:a]
	neighbor_xy = X_r[a:a + 10]
	highlight_xy = X_r[a + 9:a + 10]

	plt.figure(dpi=300, figsize=(3.5, 3.5))
	plt.scatter(candidate_xy[:, 0], candidate_xy[:, 1], color='green', alpha=0.5, label='Candidates')
	# Row 0 is drawn separately and labelled as the true annotation.
	plt.scatter(X_r[0, 0], X_r[0, 1], color='red', alpha=0.8, label='True Annotation')
	plt.scatter(neighbor_xy[:, 0], neighbor_xy[:, 1], color='blue', alpha=0.5, label='Top 10 Neigbors')
	# The tenth reference row is drawn again in orange, without a label.
	plt.scatter(highlight_xy[:, 0], highlight_xy[:, 1], color='orange', alpha=0.5)
	plt.xlabel('Dim 1')
	plt.ylabel('Dim 2')
	# No plt.show() here: the figure stays open until a later show call.
	plt.legend()


	plt.figure(dpi=300, figsize=(3, 4.8))
	# Three stacked mirror plots of the query against references 0, 1 and 9.
	# Only the first panel compares losses, and it uses a different m/z window.
	panel_specs = (
	    (311, 0, (0, 150), True),
	    (312, 1, (100, 350), False),
	    (313, 9, (100, 350), False),
	)
	for panel, ref_idx, window, use_loss in panel_specs:
	    plt.subplot(panel)
	    plot_spectrum_comparison(s, deepmass_reference[ref_idx], window, loss=use_loss)
	plt.subplots_adjust(hspace=0.5)
	# Issued after the last subplot, so the label lands on the bottom panel.
	plt.xlabel('m/z')
	# plt.legend(loc='upper center', bbox_to_anchor=(1.2, 4))
	plt.show()