Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,494 +1,494 @@
|
|
| 1 |
-
# -*- coding: utf-8 -*-
|
| 2 |
-
"""
|
| 3 |
-
Created on Thu Oct 10 16:46:36 2024
|
| 4 |
-
|
| 5 |
-
@author: ZNDX002
|
| 6 |
-
"""
|
| 7 |
-
|
| 8 |
-
from __future__ import annotations
|
| 9 |
-
import sqlite3
|
| 10 |
-
import gradio as gr
|
| 11 |
-
from infer import ModelInference
|
| 12 |
-
from model import ModelCLR
|
| 13 |
-
from matchms.importing import load_from_mgf,load_from_msp
|
| 14 |
-
import matchms.filtering as msfilters
|
| 15 |
-
import numpy as np
|
| 16 |
-
from rdkit.Chem import Draw
|
| 17 |
-
from rdkit import Chem
|
| 18 |
-
import torch
|
| 19 |
-
import yaml
|
| 20 |
-
import pickle
|
| 21 |
-
import subprocess
|
| 22 |
-
import pandas as pd
|
| 23 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
| 24 |
-
import tempfile
|
| 25 |
-
import shutil
|
| 26 |
-
import os
|
| 27 |
-
import matplotlib.pyplot as plt
|
| 28 |
-
import gradio as gr
|
| 29 |
-
from typing import Iterable
|
| 30 |
-
from gradio.themes.base import Base
|
| 31 |
-
from gradio.themes.utils import colors, fonts, sizes
|
| 32 |
-
import time
|
| 33 |
-
|
| 34 |
-
class Seafoam(Base):
    """Gradio theme with an emerald/blue palette and gradient primary buttons.

    All constructor arguments are keyword-only and are forwarded unchanged to
    ``Base.__init__``; component-level overrides are applied afterwards with
    ``Base.set``.
    """

    def __init__(
        self,
        *,
        primary_hue: colors.Color | str = colors.emerald,
        secondary_hue: colors.Color | str = colors.blue,
        neutral_hue: colors.Color | str = colors.blue,
        spacing_size: sizes.Size | str = sizes.spacing_md,
        radius_size: sizes.Size | str = sizes.radius_md,
        text_size: sizes.Size | str = sizes.text_lg,
        font: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("Quicksand"),
            "ui-sans-serif",
            "sans-serif",
        ),
        font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("IBM Plex Mono"),
            "ui-monospace",
            "monospace",
        ),
    ):
        # Palette, sizing and fonts are handled entirely by the base theme.
        base_options = {
            "primary_hue": primary_hue,
            "secondary_hue": secondary_hue,
            "neutral_hue": neutral_hue,
            "spacing_size": spacing_size,
            "radius_size": radius_size,
            "text_size": text_size,
            "font": font,
            "font_mono": font_mono,
        }
        super().__init__(**base_options)

        # Component-level overrides. The light-mode body background is not
        # overridden here; only the dark-mode variant is set.
        overrides = {
            "body_background_fill_dark": "repeating-linear-gradient(45deg, *primary_800, *primary_800 10px, *primary_900 10px, *primary_900 20px)",
            "button_primary_background_fill": "linear-gradient(90deg, *primary_300, *secondary_400)",
            "button_primary_background_fill_hover": "linear-gradient(90deg, *primary_200, *secondary_300)",
            "button_primary_text_color": "white",
            "button_primary_background_fill_dark": "linear-gradient(90deg, *primary_600, *secondary_800)",
            "slider_color": "*secondary_300",
            "slider_color_dark": "*secondary_600",
            "block_title_text_weight": "600",
            "block_border_width": "3px",
            "block_shadow": "*shadow_drop_lg",
            "button_large_padding": "17px",
            "body_text_color": "#000000",
        }
        super().set(**overrides)
|
| 84 |
-
|
| 85 |
-
# Theme instance handed to gr.Blocks(theme=seafoam) when the UI is built.
seafoam = Seafoam()


# Define the CSS styles (injected into the page through gr.HTML(custom_css)).
# NOTE(review): .file-upload-height also sets `display: none`; confirm that
# hiding any element carrying this class is intended.
custom_css = """
<style>
.file-upload-height {
height:320px !important;
display: none;
}
.file-upload-height2 {
height:190px !important;
}
.gallery-height {
height: 380px !important;
}
#custom_plot {
height: 360px !important;
}
#custom_plot2 {
height: 480px !important;
}
</style>

"""
|
| 110 |
-
def spectrum_processing(s):
    """Run the spectrum clean-up filters and return the filtered spectrum.

    The spectrum is passed through ``msfilters.normalize_intensities`` and the
    result through ``msfilters.select_by_mz`` limited to the 0-1500 m/z window.
    The value returned by the last filter is handed back unchanged.
    """
    normalized = msfilters.normalize_intensities(s)
    return msfilters.select_by_mz(normalized, mz_from=0, mz_to=1500)
|
| 116 |
-
|
| 117 |
-
def draw_mass_spectrum(peak_data_path):
    """Render the first spectrum of an MSP file as a red stick plot.

    Args:
        peak_data_path: Uploaded-file object exposing the path of an MSP file
            as ``.name``.

    Returns:
        A ``matplotlib.figure.Figure`` holding the plot.

    Raises:
        gr.Error: If no file was supplied or no spectrum can be read from it.
    """
    # Local import: build the figure without pyplot so that no figure stays
    # registered in pyplot's global manager after the request (the previous
    # plt.figure() call leaked one figure per click in this long-running app).
    from matplotlib.figure import Figure

    if peak_data_path is None:
        raise gr.Error("Please upload an MS/MS spectrum file in .msp format first.")

    # Parse the uploaded peak data; only the first spectrum is plotted.
    spectra = list(load_from_msp(peak_data_path.name))
    ms2 = spectrum_processing(spectra[0]) if spectra else None
    if ms2 is None:
        raise gr.Error("No usable spectrum could be read from the uploaded .msp file.")

    mz_values = np.array(ms2.mz)
    intensities = np.array(ms2.intensities)

    fig = Figure(figsize=(8.5, 5))
    ax = fig.subplots()
    # Draw each peak as a vertical stick in data coordinates. The y axis is
    # pinned to [0, 1], the range the original axes-fraction drawing used.
    ax.vlines(mz_values, 0, intensities, colors='red')
    ax.set_ylim(0, 1)
    ax.set_xlabel("m/z")
    ax.set_ylabel("Intensity")
    ax.set_title("Mass Spectrum")
    return fig
|
| 133 |
-
|
| 134 |
-
conn = sqlite3.connect("/csu_ms2_db.db", check_same_thread=False)
|
| 135 |
-
device='cpu'
|
| 136 |
-
pretrain_model_path_low,pretrain_model_path_median,pretrain_model_path_high='
|
| 137 |
-
config_path = "
|
| 138 |
-
# Model hyper-parameters shared by the three energy-specific encoders.
# The file is opened in a ``with`` block so the handle is closed right away.
with open(config_path, "r") as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)


def _load_clr_model(checkpoint_path):
    """Build a ModelCLR from ``config`` and load ``checkpoint_path`` in eval mode."""
    model = ModelCLR(**config["model_config"]).to(device)
    # map_location keeps checkpoints that were saved on a GPU loadable on
    # the device this app runs on.
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    model.eval()
    return model


model_low = _load_clr_model(pretrain_model_path_low)
model_median = _load_clr_model(pretrain_model_path_median)
model_high = _load_clr_model(pretrain_model_path_high)
|
| 153 |
-
|
| 154 |
-
def generate_file(file_obj):
    """Copy an uploaded file into the working directory and return the new path.

    Args:
        file_obj: Object exposing the source file path as ``.name``.

    Returns:
        Path of the copy, i.e. ``os.path.join(tmpdir, basename(file_obj.name))``.

    The module-level ``tmpdir`` is used when it is set. It is never assigned
    elsewhere in this file, so a fresh temporary directory is created (and
    remembered in ``tmpdir``) on first use instead of raising ``NameError``.
    """
    global tmpdir
    if globals().get("tmpdir") is None:
        tmpdir = tempfile.mkdtemp()
    # shutil.copy returns the path of the file it created inside the directory.
    return shutil.copy(file_obj.name, tmpdir)
|
| 160 |
-
|
| 161 |
-
def MS2Embedding(spectra):
    """Embed one spectrum with the low-, median- and high-energy models.

    Args:
        spectra: A single spectrum object exposing ``.mz`` and ``.intensities``.

    Returns:
        A tuple ``(low, median, high)`` of 1-D NumPy arrays, each the
        L2-normalised projection produced by the corresponding model.
    """
    max_peaks = 300

    mz = np.around(np.asarray(spectra.mz), decimals=4)
    intensities = np.asarray(spectra.intensities)
    if len(mz) > max_peaks:
        # Keep only the last 300 entries, as the original code did.
        # NOTE(review): presumably these are the highest-m/z peaks; confirm
        # that keeping the most intense peaks was not intended.
        mz = mz[-max_peaks:]
        intensities = intensities[-max_peaks:]
        peak_count = max_peaks
    else:
        # Zero-pad shorter spectra to the fixed length of 300.
        peak_count = len(mz)
        mz = np.pad(mz, (0, max_peaks - len(mz)), mode='constant', constant_values=0)
        intensities = np.pad(intensities, (0, max_peaks - len(intensities)), mode='constant', constant_values=0)

    # Add the leading batch axis of size one.
    spec_mzs = torch.tensor(mz[np.newaxis, :]).float()
    spec_intens = torch.tensor(intensities[np.newaxis, :]).float()
    num_peaks = torch.LongTensor([peak_count])

    embeddings = []
    # Pure inference: no autograd graph is needed for any of the three models.
    with torch.no_grad():
        for model in (model_low, model_median, model_high):
            encoded, mask = model.ms_encoder(spec_mzs, spec_intens, num_peaks)
            pooled = model.spec_esa(encoded, mask)
            projected = model.spec_proj(pooled)
            projected = projected / projected.norm(dim=-1, keepdim=True)
            embeddings.append(np.array(projected.tolist()[0]))
    return tuple(embeddings)
|
| 189 |
-
|
| 190 |
-
def calculate_cosine_similarity(vector1, vector2):
    """Return the cosine similarity of two vectors as a single value.

    Each vector is reshaped to a one-row matrix, the pair is passed to
    ``cosine_similarity``, and the only entry of the result is returned.
    """
    row_a = vector1.reshape(1, -1)
    row_b = vector2.reshape(1, -1)
    similarity_matrix = cosine_similarity(row_a, row_b)
    return similarity_matrix[0][0]
|
| 192 |
-
|
| 193 |
-
# Library label (as offered to the user) -> SQLite table name.
_LIBRARY_TABLES = {
    "CSU_MS2_DB": 'ConSSDB',
    "BloodExp: blood exposome database": 'BloodexpDB',
    "ChEBI: products of nature or synthetic products database": 'ChebiDB',
    "ChemFOnt: Biochemical database including primary metabolites, secondary metabolites, natural products, etc": 'ChemfontDB',
    "ContaminantDB: Contaminant data from different online references and databases on contaminants": 'ContaminantdbDB',
    "DrugBank: drug biochemical and pharmacological information database": 'DrugbankDB',
    "ECMDB: database of small molecule metabolites found in or produced by Escherichia coli": 'EcmdbDB',
    "Exposome-Explorer: biomarkers of exposure to environmental risk factors for diseases": 'ExposomeDB',
    "Foodb: food constituent database": 'FoodbDB',
    "HMDB: human metabolome database": 'HmdbDB',
    "KEGG: a collection of small molecules, biopolymers, and other chemical substances": 'KeggDB',
    "KNApSAcK: integrated metabolite-plant species database": 'KnapsackDB',
    "MCDB: small molecule metabolites found in cow milk": 'MilkDB',
    "MiMeDB: taxonomic, microbiological, and body-site location data on most known human microbes": 'MimedbDB',
    "NANPDB: database of natural products isolated from native organisms of Northern Africa": 'NanpdbDB',
    "NPAtlas: natural products atlas database": 'NpatlasDB',
    "Phenol-Explorer: Polyphenols": 'PhenolDB',
    "PMHub: plant metabolite database": 'PmhubDB',
    "PMN: plant metabolite database": 'PmnDB',
    "SMPDB: small molecule pathway database": 'SmpdbDB',
    "STOFF-IDENT: database of water relevant substances": 'StoffDB',
    "T3DB: toxic exposome database": 'T3dbDB',
    "TCMSP: traditional chinese medicine systems pharmacology database": 'TcmspDB',
    "YMDB: yeast metabolome database": 'YmdbDB',
}


def _collision_energy_weights(collision_energy):
    """Return the (low, median, high) weights for an energy, or None for NaN."""
    if collision_energy <= 15:
        return 0.4, 0.3, 0.3
    if collision_energy <= 25:
        return 0.3, 0.4, 0.3
    if collision_energy > 25:
        return 0.2, 0.3, 0.5
    return None


def retrieve_similarity_scores( table_name, target_mass,collision_energy, ms2_embedding_low, ms2_embedding_median, ms2_embedding_high):
    """Score the library entries inside the precursor-mass window; return the top 10.

    Args:
        table_name: Library label from ``_LIBRARY_TABLES`` or one of the table
            names it maps to.
        target_mass: Precursor ion mass; 1.008 is subtracted before matching
            against ``MonoisotopicMass``.
        collision_energy: Selects the weights used to blend the three scores
            (<=15, <=25, >25).
        ms2_embedding_low, ms2_embedding_median, ms2_embedding_high: Query
            embeddings as torch tensors.

    Returns:
        Up to ten ``((smiles,), weighted_score)`` pairs, best first. The list
        is empty when nothing matches or ``collision_energy`` is NaN.

    Raises:
        ValueError: If ``table_name`` is not a known library.
    """
    # Whitelist the table name: it is interpolated into the SQL text below,
    # so an arbitrary string must never reach the query.
    if table_name in _LIBRARY_TABLES:
        table_name = _LIBRARY_TABLES[table_name]
    elif table_name not in _LIBRARY_TABLES.values():
        raise ValueError(f"Unknown structure library: {table_name!r}")

    weights = _collision_energy_weights(collision_energy)

    # NOTE(review): 1.008 looks like the mass removed for the [M+H]+ adduct;
    # confirm whether the proton mass (1.00728) was intended.
    target_mass = target_mass-1.008
    tolerance = target_mass * 20 / 1000000  # 20 ppm window

    # One query returns the candidates together with their embeddings
    # (previously one additional query was issued per candidate SMILES).
    query = f"""
    SELECT SMILES, low_energy_embedding, median_energy_embedding, high_energy_embedding
    FROM {table_name}
    WHERE MonoisotopicMass >= ? - ? AND MonoisotopicMass <= ? + ?
    """
    cur = conn.cursor()
    try:
        cur.execute(query, (target_mass, tolerance, target_mass, tolerance))
        rows = cur.fetchall()
    finally:
        # Close the cursor even when the query fails.
        cur.close()

    if weights is None:
        return []
    w_low, w_median, w_high = weights

    weighted_similarity_scores = []
    for smiles, low_blob, median_blob, high_blob in rows:
        # NOTE(security): the embeddings are stored pickled; pickle.loads is
        # only safe because the database file ships with the app. Never
        # point this at an untrusted database.
        low_db, median_db, high_db = (
            torch.tensor(np.array(pickle.loads(blob), dtype=np.float64)).float()
            for blob in (low_blob, median_blob, high_blob)
        )
        # Dot products of the query and stored embeddings.
        low_similarity = (ms2_embedding_low @ low_db.t()).item()
        median_similarity = (ms2_embedding_median @ median_db.t()).item()
        high_similarity = (ms2_embedding_high @ high_db.t()).item()
        weighted_similarity = w_low * low_similarity + w_median * median_similarity + w_high * high_similarity
        # Keep the 1-tuple row shape that callers index as entry[0][0].
        weighted_similarity_scores.append(((smiles,), weighted_similarity))

    # Sort by weighted similarity, best first, and keep the first ten.
    weighted_similarity_scores.sort(key=lambda item: item[1], reverse=True)
    return weighted_similarity_scores[:10]
|
| 298 |
-
|
| 299 |
-
def get_topK_result(library,ms_feature, smiles_feature, topK):
    """Return the ``topK`` library entries with the highest dot-product score.

    Args:
        library: Sequence of candidates, index-aligned with the rows of
            ``smiles_feature``.
        ms_feature: 1-D query embedding tensor.
        smiles_feature: 2-D tensor with one candidate embedding per row.
        topK: Number of hits wanted. It is truncated to an integer and capped
            at ``len(library)``.

    Returns:
        ``(indices, scores, candidates)`` as plain lists, best hit first. All
        three are empty when ``topK`` is zero or negative or the library is
        empty.
    """
    # topk() needs an integer k within [0, len(library)].
    topK = min(int(topK), len(library))
    if topK <= 0:
        return [], [], []
    with torch.no_grad():
        similarity = (ms_feature.unsqueeze(0) @ smiles_feature.t()).cpu()
        scores_, indices_ = similarity.topk(topK, dim=1, largest=True, sorted=True)
    # Row 0 is the only row because the query was unsqueezed to a batch of one.
    indices = indices_.tolist()[0]
    scores = scores_.tolist()[0]
    candidates = [library[i] for i in indices]
    return indices, scores, candidates
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
def rank_lib(database_name,spectrum_path,instrument_type,adduct,parent_Mass,collision_energy):
    """Rank candidates from a built-in library against an uploaded spectrum.

    Args:
        database_name: Library label passed to ``retrieve_similarity_scores``.
        spectrum_path: Uploaded-file object exposing the MSP path as ``.name``.
        instrument_type: Not used by this function.
        adduct: Not used by this function.
        parent_Mass: Precursor ion mass; converted with ``float``.
        collision_energy: Collision energy; converted with ``float``.

    Returns:
        A list of ``(image, caption)`` pairs for the gallery, best hit first.
        ``image`` is the string ``'NAN'`` when a structure cannot be drawn.

    Raises:
        gr.Error: On a missing file, an unreadable spectrum or non-numeric
            mass / collision energy.
    """
    if spectrum_path is None:
        raise gr.Error("Please upload an MS/MS spectrum file in .msp format.")
    try:
        collision_energy = float(collision_energy)
        parent_Mass = float(parent_Mass)
    except (TypeError, ValueError) as err:
        raise gr.Error("Parent ion mass and collision energy must be numbers.") from err

    spectra = list(load_from_msp(spectrum_path.name))
    ms2 = spectrum_processing(spectra[0]) if spectra else None
    if ms2 is None:
        raise gr.Error("No usable spectrum could be read from the uploaded .msp file.")

    ms2_embedding_low, ms2_embedding_median, ms2_embedding_high = (
        torch.tensor(embedding).float() for embedding in MS2Embedding(ms2)
    )
    # Treat a missing result as "no hits" instead of failing on iteration.
    top_10_smiles = retrieve_similarity_scores(
        database_name, parent_Mass, collision_energy,
        ms2_embedding_low, ms2_embedding_median, ms2_embedding_high,
    ) or []

    bw_draw_options = Draw.MolDrawOptions()
    bw_draw_options.useBWAtomPalette()

    image_descrips = []
    for entry in top_10_smiles:
        # Each entry is ((smiles,), score).
        smi, score = entry[0][0], entry[1]
        try:
            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                raise ValueError(f"unparsable SMILES: {smi!r}")
            image = Draw.MolToImage(mol, options=bw_draw_options)
        except Exception:
            # Best effort: keep the hit in the list with the placeholder the
            # original code used when drawing fails.
            image = 'NAN'
        image_descrips.append((image, 'SMILES: '+smi+' ' + 'Score: '+str(score)))
    return image_descrips
|
| 340 |
-
|
| 341 |
-
def rank_user_lib(candidate_file,spectrum_path,instrument_type,adduct,collision_energy):
    """Rank user-supplied candidate structures against an uploaded spectrum.

    Args:
        candidate_file: Uploaded-file object (``.name`` is the path) of a CSV
            with a ``SMILES`` column.
        spectrum_path: Uploaded-file object exposing the MSP path as ``.name``.
        instrument_type: Not used by this function.
        adduct: Not used by this function.
        collision_energy: Collision energy; converted with ``float``. It
            selects the weights used to blend the three model scores.

    Returns:
        A list of up to ten ``(image, caption)`` pairs, best hit first.
        ``image`` is the string ``'NAN'`` when a structure cannot be drawn.

    Raises:
        gr.Error: On missing uploads, a CSV without usable ``SMILES`` values,
            an unreadable spectrum or an invalid collision energy.
    """
    if candidate_file is None or spectrum_path is None:
        raise gr.Error("Please upload both the candidate .csv file and the .msp spectrum file.")
    try:
        collision_energy = float(collision_energy)
    except (TypeError, ValueError) as err:
        raise gr.Error("Collision energy must be a number.") from err

    if collision_energy <= 15:
        w_low, w_median, w_high = 0.4, 0.3, 0.3
    elif collision_energy <= 25:
        w_low, w_median, w_high = 0.3, 0.4, 0.3
    elif collision_energy > 25:
        w_low, w_median, w_high = 0.2, 0.3, 0.5
    else:
        # NaN matches none of the ranges; this used to end in UnboundLocalError.
        raise gr.Error("Collision energy must be a finite number.")

    candidate = pd.read_csv(candidate_file.name)
    if 'SMILES' not in candidate.columns:
        raise gr.Error("The candidate file must contain a 'SMILES' column.")
    # Drop blank cells so every candidate is a string.
    candidate_smiles = candidate['SMILES'].dropna().astype(str).tolist()
    if not candidate_smiles:
        raise gr.Error("The candidate file contains no SMILES.")

    spectra = list(load_from_msp(spectrum_path.name))
    ms2 = spectrum_processing(spectra[0]) if spectra else None
    if ms2 is None:
        raise gr.Error("No usable spectrum could be read from the uploaded .msp file.")
    ms2_embeddings = [torch.tensor(embedding).float() for embedding in MS2Embedding(ms2)]

    # Encode the candidates in batches of 64 SMILES.
    contexts = [candidate_smiles[i:i + 64] for i in range(0, len(candidate_smiles), 64)]
    checkpoints = (pretrain_model_path_low, pretrain_model_path_median, pretrain_model_path_high)
    similarities = []
    # NOTE(review): the three inference models are rebuilt on every request;
    # consider constructing them once at start-up if ModelInference allows it.
    for checkpoint, ms2_embedding in zip(checkpoints, ms2_embeddings):
        model_inference = ModelInference(config_path=config_path,
                                         pretrain_model_path=checkpoint,
                                         device="cpu")
        with torch.no_grad():
            encoded = torch.cat([model_inference.smiles_encode(batch).cpu() for batch in contexts], 0)
            similarities.append((ms2_embedding @ encoded.t()).numpy())
    low_similarity, median_similarity, high_similarity = similarities

    weighted_similarity = w_low * low_similarity + w_median * median_similarity + w_high * high_similarity
    top_10_smiles = sorted(
        zip(candidate_smiles, weighted_similarity),
        key=lambda item: item[1],
        reverse=True,
    )[:10]

    bw_draw_options = Draw.MolDrawOptions()
    bw_draw_options.useBWAtomPalette()

    image_descrips = []
    for smi, score in top_10_smiles:
        try:
            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                raise ValueError(f"unparsable SMILES: {smi!r}")
            image = Draw.MolToImage(mol, options=bw_draw_options)
        except Exception:
            # Best effort: keep the hit in the list with the placeholder the
            # original code used when drawing fails.
            image = 'NAN'
        image_descrips.append((image, 'SMILES: '+smi+' ' + 'Score: '+str(score)))
    return image_descrips
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
with gr.Blocks(theme=seafoam) as demo:
|
| 399 |
-
gr.HTML(custom_css)
|
| 400 |
-
gr.Markdown('<div style="font-size:40px; font-weight:bold;">🔍 Cross-Modal MS2 Retrieval Demo </div>')
|
| 401 |
-
gr.Markdown('''
|
| 402 |
-
<div style=" font-weight:bold;">
|
| 403 |
-
<span style="font-size:40px;"> Based on CSU-MS2 model </span>
|
| 404 |
-
</div>
|
| 405 |
-
''')
|
| 406 |
-
|
| 407 |
-
#gr.Markdown('<div style="font-size:20px;">You can select from a list of collected databases or upload your structural file by Clicking the button. The MS/MS spectrum to be identified can be uploaded in MSP format. You also need to set the necessary experimental parameters for the input spectrum, including accurate precursor mass and collision energy. Finally, by clicking the "Cross-Modal Retrieval" button, you can retrieve candidates from the selected database or your own structural file and access identification results that include compound structures, scores, and rankings.</div>')
|
| 408 |
-
with gr.Tab(label="📶 Struture library", elem_id='custom_tab'):
|
| 409 |
-
#Blocks特有组件,设置所有子组件按垂直排列
|
| 410 |
-
with gr.Row():
|
| 411 |
-
with gr.Column():
|
| 412 |
-
peak_data = gr.File(file_count="single", label="Upload MS/MS spectrum file in .msp format", elem_classes=".file-upload-height")
|
| 413 |
-
draw_button = gr.Button("Draw Mass Spectrum")
|
| 414 |
-
with gr.Column():
|
| 415 |
-
spectrum_output = gr.Plot(label="Mass Spectrum",elem_id="custom_plot")
|
| 416 |
-
with gr.Column():
|
| 417 |
-
dataset = gr.Dropdown(["CSU_MS2_DB",
|
| 418 |
-
"BloodExp: blood exposome database",
|
| 419 |
-
"ChEBI: products of nature or synthetic products database",
|
| 420 |
-
"ChemFOnt: Biochemical database including primary metabolites, secondary metabolites, natural products, etc",
|
| 421 |
-
"ContaminantDB: Contaminant data from different online references and databases on contaminants",
|
| 422 |
-
"DrugBank: drug biochemical and pharmacological information database",
|
| 423 |
-
"ECMDB: database of small molecule metabolites found in or produced by Escherichia coli",
|
| 424 |
-
"Exposome-Explorer: biomarkers of exposure to environmental risk factors for diseases",
|
| 425 |
-
"Foodb: food constituent database",
|
| 426 |
-
"HMDB: human metabolome database",
|
| 427 |
-
"KEGG: a collection of small molecules, biopolymers, and other chemical substances",
|
| 428 |
-
"KNApSAcK: integrated metabolite-plant species database",
|
| 429 |
-
"MCDB: small molecule metabolites found in cow milk",
|
| 430 |
-
"MiMeDB: taxonomic, microbiological, and body-site location data on most known human microbes",
|
| 431 |
-
"NANPDB: database of natural products isolated from native organisms of Northern Africa",
|
| 432 |
-
"NPAtlas: natural products atlas database",
|
| 433 |
-
"Phenol-Explorer: Polyphenols",
|
| 434 |
-
"PMHub: plant metabolite database",
|
| 435 |
-
"PMN: plant metabolite database",
|
| 436 |
-
"SMPDB: small molecule pathway database",
|
| 437 |
-
"STOFF-IDENT: database of water relevant substances",
|
| 438 |
-
"T3DB: toxic exposome database",
|
| 439 |
-
"TCMSP: traditional chinese medicine systems pharmacology database",
|
| 440 |
-
"YMDB: yeast metabolome database"], label="Choose a structure library")
|
| 441 |
-
#gr.CheckboxGroup(choices=["HMDB", "Lipidmaps", "CHEMBL"], label="Choose a structure library"),
|
| 442 |
-
with gr.Row():
|
| 443 |
-
instru=gr.Dropdown(["HCD"], label="Instrument Type")
|
| 444 |
-
ionmode=gr.Dropdown(["[M+H]+"], label="Adduct Type")
|
| 445 |
-
par_ion_mass=gr.Textbox(label="Parent Ion Mass",placeholder="e.g., 180.00")
|
| 446 |
-
collision_e=gr.Textbox(label="collision energy", placeholder="e.g., 40")
|
| 447 |
-
with gr.Column():
|
| 448 |
-
lib_button = gr.Button("Cross-Modal Retrieval")
|
| 449 |
-
lib_output = gr.Gallery(height='auto',columns=4,elem_classes="gallery-height",label='Cross-modal retrieval results')
|
| 450 |
-
#lib_output = gr.Dataframe(type="pandas")
|
| 451 |
-
with gr.Tab("📁 Upload structure file"):
|
| 452 |
-
with gr.Row():
|
| 453 |
-
with gr.Column():
|
| 454 |
-
# The label must name the column exactly as rank_user_lib reads it (candidate['SMILES']).
use_dataset= gr.File(file_count="single", label="Upload the candidate structure file in csv format, columns=['SMILES']",elem_classes="file-upload-height2")
|
| 455 |
-
user_peak_data=gr.File(file_count="single", label="Upload MS/MS spectrum file in .msp format", elem_classes="file-upload-height2")
|
| 456 |
-
user_draw_button = gr.Button("Draw Mass Spectrum")
|
| 457 |
-
with gr.Column():
|
| 458 |
-
user_spectrum_output = gr.Plot(label="Mass Spectrum",elem_id="custom_plot2")
|
| 459 |
-
with gr.Row():
|
| 460 |
-
user_instru=gr.Dropdown(["HCD"], label="Instrument Type")
|
| 461 |
-
user_ionmode=gr.Dropdown(["[M+H]+"], label="Adduct Type")
|
| 462 |
-
user_collision_e=gr.Textbox(label="collision energy", placeholder="e.g., 40")
|
| 463 |
-
with gr.Column():
|
| 464 |
-
user_button = gr.Button("Cross-Modal Retrieval")
|
| 465 |
-
user_output = gr.Gallery(height='auto',columns=4,elem_classes="gallery-height",label='Cross-modal retrieval results')
|
| 466 |
-
draw_button.click(draw_mass_spectrum, inputs=[peak_data], outputs=[spectrum_output])
|
| 467 |
-
user_draw_button.click(draw_mass_spectrum, inputs=[user_peak_data], outputs=[user_spectrum_output])
|
| 468 |
-
lib_button.click(rank_lib, inputs=[dataset,peak_data,instru,ionmode,par_ion_mass,collision_e], outputs=lib_output)
|
| 469 |
-
user_button.click(rank_user_lib, inputs=[use_dataset,user_peak_data,user_instru,user_ionmode,user_collision_e], outputs=user_output)
|
| 470 |
-
demo.launch(share=False)
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Created on Thu Oct 10 16:46:36 2024
|
| 4 |
+
|
| 5 |
+
@author: ZNDX002
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
import sqlite3
|
| 10 |
+
import gradio as gr
|
| 11 |
+
from infer import ModelInference
|
| 12 |
+
from model import ModelCLR
|
| 13 |
+
from matchms.importing import load_from_mgf,load_from_msp
|
| 14 |
+
import matchms.filtering as msfilters
|
| 15 |
+
import numpy as np
|
| 16 |
+
from rdkit.Chem import Draw
|
| 17 |
+
from rdkit import Chem
|
| 18 |
+
import torch
|
| 19 |
+
import yaml
|
| 20 |
+
import pickle
|
| 21 |
+
import subprocess
|
| 22 |
+
import pandas as pd
|
| 23 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 24 |
+
import tempfile
|
| 25 |
+
import shutil
|
| 26 |
+
import os
|
| 27 |
+
import matplotlib.pyplot as plt
|
| 28 |
+
import gradio as gr
|
| 29 |
+
from typing import Iterable
|
| 30 |
+
from gradio.themes.base import Base
|
| 31 |
+
from gradio.themes.utils import colors, fonts, sizes
|
| 32 |
+
import time
|
| 33 |
+
|
| 34 |
+
class Seafoam(Base):
    """Gradio theme with an emerald/blue palette and gradient primary buttons.

    All constructor arguments are keyword-only and are forwarded unchanged to
    ``Base.__init__``; component-level overrides are applied afterwards with
    ``Base.set``.
    """

    def __init__(
        self,
        *,
        primary_hue: colors.Color | str = colors.emerald,
        secondary_hue: colors.Color | str = colors.blue,
        neutral_hue: colors.Color | str = colors.blue,
        spacing_size: sizes.Size | str = sizes.spacing_md,
        radius_size: sizes.Size | str = sizes.radius_md,
        text_size: sizes.Size | str = sizes.text_lg,
        font: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("Quicksand"),
            "ui-sans-serif",
            "sans-serif",
        ),
        font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("IBM Plex Mono"),
            "ui-monospace",
            "monospace",
        ),
    ):
        # Palette, sizing and fonts are handled entirely by the base theme.
        base_options = {
            "primary_hue": primary_hue,
            "secondary_hue": secondary_hue,
            "neutral_hue": neutral_hue,
            "spacing_size": spacing_size,
            "radius_size": radius_size,
            "text_size": text_size,
            "font": font,
            "font_mono": font_mono,
        }
        super().__init__(**base_options)

        # Component-level overrides. The light-mode body background is not
        # overridden here; only the dark-mode variant is set.
        overrides = {
            "body_background_fill_dark": "repeating-linear-gradient(45deg, *primary_800, *primary_800 10px, *primary_900 10px, *primary_900 20px)",
            "button_primary_background_fill": "linear-gradient(90deg, *primary_300, *secondary_400)",
            "button_primary_background_fill_hover": "linear-gradient(90deg, *primary_200, *secondary_300)",
            "button_primary_text_color": "white",
            "button_primary_background_fill_dark": "linear-gradient(90deg, *primary_600, *secondary_800)",
            "slider_color": "*secondary_300",
            "slider_color_dark": "*secondary_600",
            "block_title_text_weight": "600",
            "block_border_width": "3px",
            "block_shadow": "*shadow_drop_lg",
            "button_large_padding": "17px",
            "body_text_color": "#000000",
        }
        super().set(**overrides)
|
| 84 |
+
|
| 85 |
+
# Module-level instance of the Seafoam theme defined above.
seafoam = Seafoam()


# Define the CSS styles.
# NOTE(review): .file-upload-height also sets `display: none`; confirm that
# hiding any element carrying this class is intended.
custom_css = """
<style>
.file-upload-height {
height:320px !important;
display: none;
}
.file-upload-height2 {
height:190px !important;
}
.gallery-height {
height: 380px !important;
}
#custom_plot {
height: 360px !important;
}
#custom_plot2 {
height: 480px !important;
}
</style>

"""
|
| 110 |
+
def spectrum_processing(s):
    """Run the spectrum clean-up filters and return the filtered spectrum.

    The spectrum is passed through ``msfilters.normalize_intensities`` and the
    result through ``msfilters.select_by_mz`` limited to the 0-1500 m/z window.
    The value returned by the last filter is handed back unchanged.
    """
    normalized = msfilters.normalize_intensities(s)
    return msfilters.select_by_mz(normalized, mz_from=0, mz_to=1500)
|
| 116 |
+
|
| 117 |
+
def draw_mass_spectrum(peak_data_path):
    """Render the first spectrum of an MSP file as a red stick plot.

    Args:
        peak_data_path: Uploaded-file object exposing the path of an MSP file
            as ``.name``.

    Returns:
        A ``matplotlib.figure.Figure`` holding the plot.

    Raises:
        gr.Error: If no file was supplied or no spectrum can be read from it.
    """
    # Local import: build the figure without pyplot so that no figure stays
    # registered in pyplot's global manager after the request (the previous
    # plt.figure() call leaked one figure per click in this long-running app).
    from matplotlib.figure import Figure

    if peak_data_path is None:
        raise gr.Error("Please upload an MS/MS spectrum file in .msp format first.")

    # Parse the uploaded peak data; only the first spectrum is plotted.
    spectra = list(load_from_msp(peak_data_path.name))
    ms2 = spectrum_processing(spectra[0]) if spectra else None
    if ms2 is None:
        raise gr.Error("No usable spectrum could be read from the uploaded .msp file.")

    mz_values = np.array(ms2.mz)
    intensities = np.array(ms2.intensities)

    fig = Figure(figsize=(8.5, 5))
    ax = fig.subplots()
    # Draw each peak as a vertical stick in data coordinates. The y axis is
    # pinned to [0, 1], the range the original axes-fraction drawing used.
    ax.vlines(mz_values, 0, intensities, colors='red')
    ax.set_ylim(0, 1)
    ax.set_xlabel("m/z")
    ax.set_ylabel("Intensity")
    ax.set_title("Mass Spectrum")
    return fig
|
| 133 |
+
|
| 134 |
+
# Single SQLite connection for the whole module; check_same_thread=False
# lets threads other than the creating one use it.
conn = sqlite3.connect("/csu_ms2_db.db", check_same_thread=False)
device='cpu'
pretrain_model_path_low,pretrain_model_path_median,pretrain_model_path_high='model/low_energy/checkpoints/model.pth','model/median_energy/checkpoints/model.pth','model/high_energy/checkpoints/model.pth'
config_path = "model/low_energy/checkpoints/config.yaml"

# Model hyper-parameters shared by the three energy-specific encoders.
# The file is opened in a ``with`` block so the handle is closed right away.
with open(config_path, "r") as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)


def _load_clr_model(checkpoint_path):
    """Build a ModelCLR from ``config`` and load ``checkpoint_path`` in eval mode."""
    model = ModelCLR(**config["model_config"]).to(device)
    # map_location keeps checkpoints that were saved on a GPU loadable on
    # the device this app runs on.
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    model.eval()
    return model


model_low = _load_clr_model(pretrain_model_path_low)
model_median = _load_clr_model(pretrain_model_path_median)
model_high = _load_clr_model(pretrain_model_path_high)
|
| 153 |
+
|
| 154 |
+
def generate_file(file_obj):
    """Copy an uploaded file into ``tmpdir`` and return the copy's path.

    Args:
        file_obj: Object whose ``.name`` attribute is the source file path.

    Returns:
        ``tmpdir`` joined with the base name of the source file. ``tmpdir``
        is a module-level name that must be defined before this is called.
    """
    source_path = file_obj.name
    shutil.copy(source_path, tmpdir)
    return os.path.join(tmpdir, os.path.basename(source_path))
|
| 160 |
+
|
| 161 |
+
def MS2Embedding(spectra):
    """Embed one spectrum with the low-, median- and high-energy models.

    The peak list is brought to a fixed length of 300: longer lists keep
    their last 300 entries, shorter ones are zero-padded at the end. m/z
    values are rounded to 4 decimals before encoding.

    Args:
        spectra: A single spectrum object exposing ``mz`` and ``intensities``
            array attributes.

    Returns:
        Tuple ``(low, median, high)`` of 1-D NumPy arrays, each the
        unit-normalised embedding produced by the corresponding model.
    """
    max_peaks = 300

    mz = np.around(np.asarray(spectra.mz), decimals=4)
    intens = np.asarray(spectra.intensities)

    if len(mz) > max_peaks:
        # Keep the trailing entries of the peak list, as the original did.
        mz = mz[-max_peaks:]
        intens = intens[-max_peaks:]
        peak_count = max_peaks
    else:
        peak_count = len(mz)
        mz = np.pad(mz, (0, max_peaks - len(mz)), mode='constant', constant_values=0)
        intens = np.pad(intens, (0, max_peaks - len(intens)), mode='constant', constant_values=0)

    # Batch of one spectrum. Built from a single ndarray rather than a list
    # of ndarrays, which torch converts slowly.
    spec_mzs = torch.tensor(mz[np.newaxis, :]).float()
    spec_intens = torch.tensor(intens[np.newaxis, :]).float()
    num_peaks = torch.LongTensor([peak_count])

    embeddings = []
    # Pure inference: skip autograd bookkeeping for the three forward passes.
    with torch.no_grad():
        for model in (model_low, model_median, model_high):
            encoded, mask = model.ms_encoder(spec_mzs, spec_intens, num_peaks)
            encoded = model.spec_esa(encoded, mask)
            projected = model.spec_proj(encoded)
            projected = projected / projected.norm(dim=-1, keepdim=True)
            embeddings.append(np.array(projected.tolist()[0]))
    return tuple(embeddings)
|
| 189 |
+
|
| 190 |
+
def calculate_cosine_similarity(vector1, vector2):
    """Return the cosine similarity of two vectors as a single value.

    Both inputs are reshaped to one-row matrices before being handed to
    ``cosine_similarity``; the only entry of the resulting matrix is returned.
    """
    row_a = vector1.reshape(1, -1)
    row_b = vector2.reshape(1, -1)
    similarity_matrix = cosine_similarity(row_a, row_b)
    return similarity_matrix[0][0]
|
| 192 |
+
|
| 193 |
+
# Structure-library label (as shown in the UI dropdown) -> SQLite table name.
_LIBRARY_TABLES = {
    "CSU_MS2_DB": 'ConSSDB',
    "BloodExp: blood exposome database": 'BloodexpDB',
    "ChEBI: products of nature or synthetic products database": 'ChebiDB',
    "ChemFOnt: Biochemical database including primary metabolites, secondary metabolites, natural products, etc": 'ChemfontDB',
    "ContaminantDB: Contaminant data from different online references and databases on contaminants": 'ContaminantdbDB',
    "DrugBank: drug biochemical and pharmacological information database": 'DrugbankDB',
    "ECMDB: database of small molecule metabolites found in or produced by Escherichia coli": 'EcmdbDB',
    "Exposome-Explorer: biomarkers of exposure to environmental risk factors for diseases": 'ExposomeDB',
    "Foodb: food constituent database": 'FoodbDB',
    "HMDB: human metabolome database": 'HmdbDB',
    "KEGG: a collection of small molecules, biopolymers, and other chemical substances": 'KeggDB',
    "KNApSAcK: integrated metabolite-plant species database": 'KnapsackDB',
    "MCDB: small molecule metabolites found in cow milk": 'MilkDB',
    "MiMeDB: taxonomic, microbiological, and body-site location data on most known human microbes": 'MimedbDB',
    "NANPDB: database of natural products isolated from native organisms of Northern Africa": 'NanpdbDB',
    "NPAtlas: natural products atlas database": 'NpatlasDB',
    "Phenol-Explorer: Polyphenols": 'PhenolDB',
    "PMHub: plant metabolite database": 'PmhubDB',
    "PMN: plant metabolite database": 'PmnDB',
    "SMPDB: small molecule pathway database": 'SmpdbDB',
    "STOFF-IDENT: database of water relevant substances": 'StoffDB',
    "T3DB: toxic exposome database": 'T3dbDB',
    "TCMSP: traditional chinese medicine systems pharmacology database": 'TcmspDB',
    "YMDB: yeast metabolome database": 'YmdbDB',
}
_KNOWN_TABLES = frozenset(_LIBRARY_TABLES.values())


def _load_embedding(blob):
    """Unpickle one stored embedding and return it as a float32 tensor."""
    # NOTE(security): pickle.loads can execute arbitrary code; this is only
    # acceptable while the database file itself is trusted.
    return torch.tensor(np.array(pickle.loads(blob), dtype=np.float64)).float()


def retrieve_similarity_scores( table_name, target_mass,collision_energy, ms2_embedding_low, ms2_embedding_median, ms2_embedding_high):
    """Rank library structures within 20 ppm of the target mass.

    Args:
        table_name: Structure-library label from the UI dropdown, or a raw
            table name that is one of the known library tables.
        target_mass: Parent ion mass; 1.008 is subtracted before matching
            against ``MonoisotopicMass``.
        collision_energy: Chooses how the three similarities are weighted
            (<= 15, <= 25, or > 25).
        ms2_embedding_low: Query embedding tensor from the low-energy model.
        ms2_embedding_median: Query embedding tensor from the median-energy
            model.
        ms2_embedding_high: Query embedding tensor from the high-energy model.

    Returns:
        Up to 10 ``((smiles,), weighted_similarity)`` pairs, best first. The
        first element stays a one-item tuple, matching the row shape the
        original returned. An empty list is returned when nothing matches or
        when ``collision_energy`` falls in none of the ranges (e.g. NaN).

    Raises:
        ValueError: If ``table_name`` is not a known structure library.
    """
    table_name = _LIBRARY_TABLES.get(table_name, table_name)
    # The table name is interpolated into the SQL text below (identifiers
    # cannot be bound as parameters), so only whitelisted names may pass.
    if table_name not in _KNOWN_TABLES:
        raise ValueError(f"Unknown structure library: {table_name!r}")

    # Weights for the (low, median, high) similarities.
    if collision_energy <= 15:
        weights = (0.4, 0.3, 0.3)
    elif collision_energy <= 25:
        weights = (0.3, 0.4, 0.3)
    elif collision_energy > 25:
        weights = (0.2, 0.3, 0.5)
    else:
        return []

    # NOTE(review): 1.008 presumably removes the proton of the [M+H]+ adduct;
    # the proton mass is 1.00728, so confirm the constant is intentional.
    target_mass = target_mass - 1.008
    tolerance = target_mass * 20 / 1000000  # 20 ppm window

    # One query returns the candidates together with their embeddings instead
    # of re-querying the table once per SMILES.
    query = f"""
        SELECT SMILES, low_energy_embedding, median_energy_embedding, high_energy_embedding
        FROM {table_name}
        WHERE MonoisotopicMass >= ? - ? AND MonoisotopicMass <= ? + ?
    """
    cur = conn.cursor()
    try:
        cur.execute(query, (target_mass, tolerance, target_mass, tolerance))
        rows = cur.fetchall()
    finally:
        # Release the cursor even if the query fails.
        cur.close()

    weight_low, weight_median, weight_high = weights
    weighted_similarity_scores = []
    for smiles, low_blob, median_blob, high_blob in rows:
        # Dot product of the query embedding with the stored embedding.
        low_similarity = (ms2_embedding_low @ _load_embedding(low_blob).t()).item()
        median_similarity = (ms2_embedding_median @ _load_embedding(median_blob).t()).item()
        high_similarity = (ms2_embedding_high @ _load_embedding(high_blob).t()).item()
        weighted_similarity = (weight_low * low_similarity
                               + weight_median * median_similarity
                               + weight_high * high_similarity)
        weighted_similarity_scores.append(((smiles,), weighted_similarity))

    # Sort by weighted similarity, highest first, and keep the top 10.
    weighted_similarity_scores.sort(key=lambda item: item[1], reverse=True)
    return weighted_similarity_scores[:10]
|
| 298 |
+
|
| 299 |
+
def get_topK_result(library,ms_feature, smiles_feature, topK):
    """Return the ``topK`` library entries most similar to one spectrum.

    Args:
        library: Sequence of candidates, indexed like the rows of
            ``smiles_feature``.
        ms_feature: Spectrum embedding; a leading batch axis is added here.
        smiles_feature: Matrix of candidate embeddings, one row per entry.
        topK: Number of hits wanted; capped at ``len(library)``.

    Returns:
        ``(indices, scores, candidates)`` as plain lists, best match first.
    """
    k = min(topK, len(library))
    with torch.no_grad():
        similarity = (ms_feature.unsqueeze(0) @ smiles_feature.t()).cpu()
        best_scores, best_indices = similarity.topk(k, dim=1, largest=True, sorted=True)
        indices = best_indices.tolist()[0]
        scores = best_scores.tolist()[0]
        candidates = [library[position] for position in indices]
    return indices, scores, candidates
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
def rank_lib(database_name,spectrum_path,instrument_type,adduct,parent_Mass,collision_energy):
    """Retrieve and draw the best-matching structures from a built-in library.

    Args:
        database_name: Structure-library label chosen in the UI.
        spectrum_path: Uploaded MSP file object; its ``.name`` is the path.
        instrument_type: Unused here; accepted to match the UI inputs.
        adduct: Unused here; accepted to match the UI inputs.
        parent_Mass: Parent ion mass, as text or number.
        collision_energy: Collision energy, as text or number.

    Returns:
        List of ``(image, caption)`` pairs for the results gallery, best match
        first. Structures that cannot be drawn are left out.

    Raises:
        gr.Error: If no spectrum file was given, the file holds no spectrum,
            or the numeric fields cannot be parsed.
    """
    if spectrum_path is None:
        raise gr.Error("Please upload an MS/MS spectrum file in .msp format.")
    try:
        collision_energy = float(collision_energy)
        parent_Mass = float(parent_Mass)
    except (TypeError, ValueError):
        raise gr.Error("Parent ion mass and collision energy must be numbers.")

    spectra = list(load_from_msp(spectrum_path.name))
    if not spectra:
        raise gr.Error("No spectrum found in the uploaded MSP file.")
    ms2 = spectrum_processing(spectra[0])

    ms2_embedding_low, ms2_embedding_median, ms2_embedding_high = (
        torch.tensor(embedding).float() for embedding in MS2Embedding(ms2)
    )
    top_10_smiles = retrieve_similarity_scores(database_name, parent_Mass, collision_energy,
                                               ms2_embedding_low, ms2_embedding_median, ms2_embedding_high)
    # The retrieval step may hand back None or an empty list; both mean
    # there is nothing to show.
    if not top_10_smiles:
        return []

    bw_draw_options = Draw.MolDrawOptions()
    bw_draw_options.useBWAtomPalette()

    image_descrips = []
    for hit in top_10_smiles:
        smi = hit[0][0]  # each hit is ((smiles,), score)
        score = hit[1]
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # Unparseable SMILES: skip it rather than pass a placeholder
            # string to the gallery as if it were an image.
            continue
        try:
            image = Draw.MolToImage(mol, options=bw_draw_options)
        except Exception:
            # Best-effort rendering: one undrawable molecule must not hide
            # the remaining results.
            continue
        image_descrips.append((image, 'SMILES: ' + smi + ' ' + 'Score: ' + str(score)))
    return image_descrips
|
| 340 |
+
|
| 341 |
+
def rank_user_lib(candidate_file,spectrum_path,instrument_type,adduct,collision_energy):
    """Rank user-supplied candidate structures against an uploaded spectrum.

    Args:
        candidate_file: Uploaded CSV file object (``.name`` is the path) with
            a ``SMILES`` column.
        spectrum_path: Uploaded MSP file object; its ``.name`` is the path.
        instrument_type: Unused here; accepted to match the UI inputs.
        adduct: Unused here; accepted to match the UI inputs.
        collision_energy: Collision energy, as text or number; chooses how
            the three similarities are weighted (<= 15, <= 25, or > 25).

    Returns:
        List of ``(image, caption)`` pairs for the 10 best candidates, best
        first. Structures that cannot be drawn are left out.

    Raises:
        gr.Error: If an upload is missing, the collision energy is not a
            number, the CSV lacks a ``SMILES`` column, or the MSP file holds
            no spectrum.
    """
    if candidate_file is None or spectrum_path is None:
        raise gr.Error("Please upload both the candidate structure file and the spectrum file.")
    try:
        collision_energy = float(collision_energy)
    except (TypeError, ValueError):
        raise gr.Error("Collision energy must be a number.")

    # Weights for the (low, median, high) similarities.
    if collision_energy <= 15:
        weights = (0.4, 0.3, 0.3)
    elif collision_energy <= 25:
        weights = (0.3, 0.4, 0.3)
    elif collision_energy > 25:
        weights = (0.2, 0.3, 0.5)
    else:
        # NaN matches none of the ranges; previously this left the weighted
        # score undefined and crashed further down.
        raise gr.Error("Collision energy must be a number.")

    candidate = pd.read_csv(candidate_file.name)
    if 'SMILES' not in candidate.columns:
        raise gr.Error("The candidate file must contain a 'SMILES' column.")
    # Empty cells are read as NaN and cannot be encoded or captioned.
    candidate_smiles = [str(smi) for smi in candidate['SMILES'].dropna()]
    if not candidate_smiles:
        return []

    spectra = list(load_from_msp(spectrum_path.name))
    if not spectra:
        raise gr.Error("No spectrum found in the uploaded MSP file.")
    ms2 = spectrum_processing(spectra[0])
    query_embeddings = [torch.tensor(embedding).float() for embedding in MS2Embedding(ms2)]

    # Encode the candidates in batches of 64.
    contexts = [candidate_smiles[start:start + 64] for start in range(0, len(candidate_smiles), 64)]

    checkpoints = (pretrain_model_path_low, pretrain_model_path_median, pretrain_model_path_high)
    similarities = []
    # NOTE(review): the three inference models are rebuilt on every call;
    # consider caching them if ModelInference is safe to reuse.
    for checkpoint, query_embedding in zip(checkpoints, query_embeddings):
        model_inference = ModelInference(config_path=config_path,
                                         pretrain_model_path=checkpoint,
                                         device="cpu")
        encoded = torch.cat([model_inference.smiles_encode(batch).cpu() for batch in contexts], 0)
        similarities.append((query_embedding @ encoded.t()).numpy())

    low_similarity, median_similarity, high_similarity = similarities
    weighted_similarity = (weights[0] * low_similarity
                           + weights[1] * median_similarity
                           + weights[2] * high_similarity)

    weighted_similarity_scores = list(zip(candidate_smiles, weighted_similarity))
    weighted_similarity_scores.sort(key=lambda item: item[1], reverse=True)
    top_10_smiles = weighted_similarity_scores[:10]

    bw_draw_options = Draw.MolDrawOptions()
    bw_draw_options.useBWAtomPalette()

    image_descrips = []
    for smi, score in top_10_smiles:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # Unparseable SMILES: skip it rather than pass a placeholder
            # string to the gallery as if it were an image.
            continue
        try:
            image = Draw.MolToImage(mol, options=bw_draw_options)
        except Exception:
            # Best-effort rendering: one undrawable molecule must not hide
            # the remaining results.
            continue
        image_descrips.append((image, 'SMILES: ' + smi + ' ' + 'Score: ' + str(score)))
    return image_descrips
|
| 396 |
+
|
| 397 |
+
|
| 398 |
+
# Gradio UI: one tab searches the built-in structure libraries, the other
# ranks candidates from a user-uploaded CSV file.
# NOTE(review): the nesting of the layout containers below was reconstructed
# from a flattened view of this file; confirm it against the intended layout.
with gr.Blocks(theme=seafoam) as demo:
    gr.HTML(custom_css)
    gr.Markdown('<div style="font-size:40px; font-weight:bold;">🔍 Cross-Modal MS2 Retrieval Demo </div>')
    gr.Markdown('''
    <div style=" font-weight:bold;">
    <span style="font-size:40px;"> Based on CSU-MS2 model </span>
    </div>
    ''')

    #gr.Markdown('<div style="font-size:20px;">You can select from a list of collected databases or upload your structural file by Clicking the button. The MS/MS spectrum to be identified can be uploaded in MSP format. You also need to set the necessary experimental parameters for the input spectrum, including accurate precursor mass and collision energy. Finally, by clicking the "Cross-Modal Retrieval" button, you can retrieve candidates from the selected database or your own structural file and access identification results that include compound structures, scores, and rankings.</div>')
    with gr.Tab(label="📶 Struture library", elem_id='custom_tab'):
        # Layout containers specific to Blocks: they control how the child
        # components are arranged.
        with gr.Row():
            with gr.Column():
                # elem_classes takes bare class names; the leading "." that
                # was here is selector syntax and is not part of the name.
                peak_data = gr.File(file_count="single", label="Upload MS/MS spectrum file in .msp format", elem_classes="file-upload-height")
                draw_button = gr.Button("Draw Mass Spectrum")
            with gr.Column():
                spectrum_output = gr.Plot(label="Mass Spectrum",elem_id="custom_plot")
        with gr.Column():
            dataset = gr.Dropdown(["CSU_MS2_DB",
                                   "BloodExp: blood exposome database",
                                   "ChEBI: products of nature or synthetic products database",
                                   "ChemFOnt: Biochemical database including primary metabolites, secondary metabolites, natural products, etc",
                                   "ContaminantDB: Contaminant data from different online references and databases on contaminants",
                                   "DrugBank: drug biochemical and pharmacological information database",
                                   "ECMDB: database of small molecule metabolites found in or produced by Escherichia coli",
                                   "Exposome-Explorer: biomarkers of exposure to environmental risk factors for diseases",
                                   "Foodb: food constituent database",
                                   "HMDB: human metabolome database",
                                   "KEGG: a collection of small molecules, biopolymers, and other chemical substances",
                                   "KNApSAcK: integrated metabolite-plant species database",
                                   "MCDB: small molecule metabolites found in cow milk",
                                   "MiMeDB: taxonomic, microbiological, and body-site location data on most known human microbes",
                                   "NANPDB: database of natural products isolated from native organisms of Northern Africa",
                                   "NPAtlas: natural products atlas database",
                                   "Phenol-Explorer: Polyphenols",
                                   "PMHub: plant metabolite database",
                                   "PMN: plant metabolite database",
                                   "SMPDB: small molecule pathway database",
                                   "STOFF-IDENT: database of water relevant substances",
                                   "T3DB: toxic exposome database",
                                   "TCMSP: traditional chinese medicine systems pharmacology database",
                                   "YMDB: yeast metabolome database"], label="Choose a structure library")
            #gr.CheckboxGroup(choices=["HMDB", "Lipidmaps", "CHEMBL"], label="Choose a structure library"),
            with gr.Row():
                instru=gr.Dropdown(["HCD"], label="Instrument Type")
                ionmode=gr.Dropdown(["[M+H]+"], label="Adduct Type")
                par_ion_mass=gr.Textbox(label="Parent Ion Mass",placeholder="e.g., 180.00")
                collision_e=gr.Textbox(label="collision energy", placeholder="e.g., 40")
        with gr.Column():
            lib_button = gr.Button("Cross-Modal Retrieval")
            lib_output = gr.Gallery(height='auto',columns=4,elem_classes="gallery-height",label='Cross-modal retrieval results')
            #lib_output = gr.Dataframe(type="pandas")
    with gr.Tab("📁 Upload structure file"):
        with gr.Row():
            with gr.Column():
                # The label names the column that rank_user_lib actually
                # reads ('SMILES'); it was previously misspelled.
                use_dataset= gr.File(file_count="single", label="Upload the candidate structure file in csv format, columns=['SMILES']",elem_classes="file-upload-height2")
                user_peak_data=gr.File(file_count="single", label="Upload MS/MS spectrum file in .msp format", elem_classes="file-upload-height2")
                user_draw_button = gr.Button("Draw Mass Spectrum")
            with gr.Column():
                user_spectrum_output = gr.Plot(label="Mass Spectrum",elem_id="custom_plot2")
        with gr.Row():
            user_instru=gr.Dropdown(["HCD"], label="Instrument Type")
            user_ionmode=gr.Dropdown(["[M+H]+"], label="Adduct Type")
            user_collision_e=gr.Textbox(label="collision energy", placeholder="e.g., 40")
        with gr.Column():
            user_button = gr.Button("Cross-Modal Retrieval")
            user_output = gr.Gallery(height='auto',columns=4,elem_classes="gallery-height",label='Cross-modal retrieval results')

    # Wire the buttons to their handlers.
    draw_button.click(draw_mass_spectrum, inputs=[peak_data], outputs=[spectrum_output])
    user_draw_button.click(draw_mass_spectrum, inputs=[user_peak_data], outputs=[user_spectrum_output])
    lib_button.click(rank_lib, inputs=[dataset,peak_data,instru,ionmode,par_ion_mass,collision_e], outputs=lib_output)
    user_button.click(rank_user_lib, inputs=[use_dataset,user_peak_data,user_instru,user_ionmode,user_collision_e], outputs=user_output)
demo.launch(share=False)
|
| 471 |
+
|
| 472 |
+
|
| 473 |
+
|
| 474 |
+
|
| 475 |
+
|
| 476 |
+
|
| 477 |
+
|
| 478 |
+
|
| 479 |
+
|
| 480 |
+
|
| 481 |
+
|
| 482 |
+
|
| 483 |
+
|
| 484 |
+
|
| 485 |
+
|
| 486 |
+
|
| 487 |
+
|
| 488 |
+
|
| 489 |
+
|
| 490 |
+
|
| 491 |
+
|
| 492 |
+
|
| 493 |
+
|
| 494 |
+
|