Spaces:
Sleeping
Sleeping
add new files
Browse files- dataset.py +122 -0
- fragment_processor.py +50 -0
- generate.py +3 -3
- main.py +148 -16
- sascorer.py +168 -0
dataset.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# coding=utf-8
|
| 2 |
+
|
| 3 |
+
"""
|
| 4 |
+
Implementation of a SMILES dataset.
|
| 5 |
+
"""
|
| 6 |
+
import pandas as pd
|
| 7 |
+
|
| 8 |
+
import torch
|
| 9 |
+
import torch.utils.data as tud
|
| 10 |
+
from torch.autograd import Variable
|
| 11 |
+
|
| 12 |
+
import configuration.config_default as cfgd
|
| 13 |
+
from models.transformer.module.subsequent_mask import subsequent_mask
|
| 14 |
+
|
| 15 |
+
class Dataset(tud.Dataset):
    """Custom PyTorch Dataset over a dataframe of matched molecular pairs.

    The historical schema this loader was written for includes:
    Source_Mol_ID, Target_Mol_ID, Source_Mol, Target_Mol,
    Source_Mol_LogD, Target_Mol_LogD, Delta_LogD,
    Source_Mol_Solubility, Target_Mol_Solubility, Delta_Solubility,
    Source_Mol_Clint, Target_Mol_Clint, Delta_Clint,
    Transformation, Core

    NOTE(review): __getitem__ actually reads columns constantSMILES,
    fromVarSMILES, main_cls, minor_cls, Delta_Value (and toVarSMILES in
    training mode) — confirm which schema the callers supply.
    """

    def __init__(self, data, vocabulary, tokenizer, prediction_mode=False):
        """
        :param data: dataframe read from training, validation or test file
        :param vocabulary: used to encode source/target tokens
        :param tokenizer: used to tokenize source/target smiles
        :param prediction_mode: if use target smiles or not (training or test)
        """
        self._vocabulary = vocabulary
        self._tokenizer = tokenizer
        self._data = data
        self._prediction_mode = prediction_mode

    def __getitem__(self, i):
        """Tokenize and encode the source (and, when training, target) SMILES.

        :param i: row index into the dataframe
        :return: (source_tensor, target_tensor, row) in training mode,
                 (source_tensor, row) in prediction mode
        """
        row = self._data.iloc[i]
        sourceConstant = row['constantSMILES']
        sourceVariable = row['fromVarSMILES']
        main_cls = row['main_cls']
        minor_cls = row['minor_cls']
        value = row['Delta_Value']

        # Source sequence layout: variable SMILES tokens first, then the
        # major class token (e.g. activity), the minor class token (e.g. Ki),
        # the delta-value token, and finally the constant SMILES tokens.
        source_tokens = []
        source_tokens.extend(self._tokenizer.tokenize(sourceVariable))
        source_tokens.append(main_cls)
        source_tokens.append(minor_cls)
        source_tokens.append(value)
        source_tokens.extend(self._tokenizer.tokenize(sourceConstant))
        source_encoded = self._vocabulary.encode(source_tokens)

        # Encode the target SMILES only for training/validation samples.
        if not self._prediction_mode:
            target_smi = row['toVarSMILES']
            target_tokens = self._tokenizer.tokenize(target_smi)
            target_encoded = self._vocabulary.encode(target_tokens)

            return torch.tensor(source_encoded, dtype=torch.long), torch.tensor(
                target_encoded, dtype=torch.long), row
        else:
            return torch.tensor(source_encoded, dtype=torch.long), row

    def __len__(self):
        return len(self._data)

    @classmethod
    def collate_fn(cls, data_all):
        """Pad a batch of __getitem__ items and build attention masks.

        :param data_all: list of tuples produced by __getitem__
        :return: (padded_source, source_lengths, padded_target, src_mask,
                  trg_mask, max_target_length, dataframe_of_rows); the three
                  target-related entries are None in prediction mode.
        """
        # Sort by source length, longest first.
        data_all.sort(key=lambda x: len(x[0]), reverse=True)
        # Prediction-mode items have 2 elements; training items have 3.
        is_prediction_mode = len(data_all[0]) == 2
        if is_prediction_mode:
            source_encoded, data = zip(*data_all)
            data = pd.DataFrame(data)
        else:
            source_encoded, target_encoded, data = zip(*data_all)
            data = pd.DataFrame(data)

        # Zero-pad source sequences to the longest in the batch.
        max_length_source = max(seq.size(0) for seq in source_encoded)
        collated_arr_source = torch.zeros(len(source_encoded), max_length_source, dtype=torch.long)
        for i, seq in enumerate(source_encoded):
            collated_arr_source[i, :seq.size(0)] = seq
        # Length of each source sequence.
        source_length = torch.tensor([seq.size(0) for seq in source_encoded])
        # Mask padding positions (index 0 is the pad token).
        src_mask = (collated_arr_source != 0).unsqueeze(-2)

        # Target sequences (training mode only).
        if not is_prediction_mode:
            max_length_target = max(seq.size(0) for seq in target_encoded)
            collated_arr_target = torch.zeros(len(target_encoded), max_length_target, dtype=torch.long)
            for i, seq in enumerate(target_encoded):
                collated_arr_target[i, :seq.size(0)] = seq

            # Combine the padding mask with the causal (subsequent) mask.
            trg_mask = (collated_arr_target != 0).unsqueeze(-2)
            trg_mask = trg_mask & Variable(subsequent_mask(collated_arr_target.size(-1)).type_as(trg_mask))
            trg_mask = trg_mask[:, :-1, :-1]  # save start token, skip end token
        else:
            trg_mask = None
            max_length_target = None
            collated_arr_target = None

        return collated_arr_source, source_length, collated_arr_target, src_mask, trg_mask, max_length_target, data
fragment_processor.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from mmpdblib.fragment_io import read_fragment_records
|
| 5 |
+
from rdkit import Chem
|
| 6 |
+
|
| 7 |
+
def fragmentize_molecule(smiles_string, max_ratio=0.5):
    """Fragment a molecule with the external ``mmpdb`` tool.

    Writes the SMILES to a temporary input file, runs ``mmpdb fragment`` on
    it, and keeps every fragment whose variable part has fewer heavy atoms
    than ``max_ratio`` times the whole molecule.

    :param smiles_string: SMILES of the molecule to fragment
    :param max_ratio: max heavy-atom ratio of variable part vs. whole molecule
    :return: DataFrame with one row per accepted fragment
    :raises subprocess.CalledProcessError: if ``mmpdb fragment`` fails
    """
    import subprocess

    # Temporary file names expected/produced by mmpdb.
    input_file = "temp_input.smi"
    output_file = "temp_output.fragments"

    try:
        # Write the SMILES (plus a dummy title) into the input file.
        with open(input_file, "w") as f:
            f.write(smiles_string + "\t" + "Molecule" + "\n")

        # Run mmpdb with an argument list (no shell): avoids shell injection
        # via the arguments and raises if mmpdb exits with an error, instead
        # of silently ignoring the status as os.system() did.
        subprocess.run(["mmpdb", "fragment", input_file, "-o", output_file], check=True)

        # Read the fragment records and keep only sufficiently small
        # variable parts.
        fragment_reader = read_fragment_records(output_file)
        fragment_list = []
        for record in fragment_reader:
            for frag in record.fragments:
                if count_heavy_atoms(frag.variable_smiles) < count_heavy_atoms(record.normalized_smiles) * max_ratio:
                    fragment_list.append({
                        'variable_smiles': frag.variable_smiles,
                        'constant_smiles': frag.constant_smiles,
                        'record_id': record.id,
                        'normalized_smiles': record.normalized_smiles,
                        'attachment_order': frag.attachment_order
                    })
    finally:
        # Always clean up the temporary files, even if fragmentation failed.
        for path in (input_file, output_file):
            if os.path.exists(path):
                os.remove(path)

    return pd.DataFrame(fragment_list)
| 41 |
+
def count_heavy_atoms(smiles):
    """Return the heavy-atom count of *smiles* (0 when RDKit cannot parse it)."""
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return 0
    return mol.GetNumHeavyAtoms()
|
| 45 |
+
|
| 46 |
+
# 示例调用
|
| 47 |
+
# smiles = "O=C1CCCC2=C1C1(CCS(=O)(=O)C1)N=C(Nc1nc3ccccc3o1)N2"
|
| 48 |
+
# fragment_df = fragmentize_molecule(smiles)
|
| 49 |
+
|
| 50 |
+
# print(fragment_df)
|
generate.py
CHANGED
|
@@ -76,9 +76,9 @@ class GenerateRunner():
|
|
| 76 |
return dataloader
|
| 77 |
|
| 78 |
def generate(self, opt):
|
| 79 |
-
if not self.overwrite and self.exist_flag:
|
| 80 |
-
|
| 81 |
-
|
| 82 |
# set device
|
| 83 |
#device = ut.allocate_gpu()
|
| 84 |
# torch.cuda.set_device(1)
|
|
|
|
| 76 |
return dataloader
|
| 77 |
|
| 78 |
def generate(self, opt):
|
| 79 |
+
# if not self.overwrite and self.exist_flag:
|
| 80 |
+
# print('GENERATED MOL EXIST, SKIP GENERATING!')
|
| 81 |
+
# return
|
| 82 |
# set device
|
| 83 |
#device = ut.allocate_gpu()
|
| 84 |
# torch.cuda.set_device(1)
|
main.py
CHANGED
|
@@ -1,24 +1,156 @@
|
|
| 1 |
-
from fastapi import FastAPI
|
| 2 |
-
from fastapi.responses import JSONResponse, HTMLResponse
|
| 3 |
from pydantic import BaseModel
|
| 4 |
-
import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
app = FastAPI()
|
| 6 |
|
| 7 |
-
class InputData(BaseModel):
|
| 8 |
-
user_input: str
|
| 9 |
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
|
|
|
| 16 |
|
| 17 |
-
@app.post("/submit")
|
| 18 |
-
async def submit_input(input_data: InputData):
|
| 19 |
-
# 处理用户输入并返回响应
|
| 20 |
-
print("input coming")
|
| 21 |
-
print(input_data)
|
| 22 |
-
return JSONResponse(content={"message": input_data.user_input})
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, HTTPException, Query
|
|
|
|
| 2 |
from pydantic import BaseModel
|
| 3 |
+
import subprocess
|
| 4 |
+
from typing import List
|
| 5 |
+
from fragment_processor import fragmentize_molecule
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
import pandas as pd
|
| 9 |
+
from rdkit import Chem
|
| 10 |
+
from rdkit.Chem import Descriptors, QED
|
| 11 |
+
from generate import GenerateRunner
|
| 12 |
+
from dataset import Dataset
|
| 13 |
+
import sascorer
|
| 14 |
app = FastAPI()
|
| 15 |
|
|
|
|
|
|
|
| 16 |
|
| 17 |
+
class Fragment(BaseModel):
    """One mmpdb fragment of a molecule, as returned by the /fragmentize/ endpoint."""

    variable_smiles: str      # SMILES of the variable (exchangeable) part
    constant_smiles: str      # SMILES of the constant (kept) part
    record_id: str            # id of the originating fragment record
    normalized_smiles: str    # normalized SMILES of the whole molecule
    attachment_order: int     # attachment-point ordering reported by mmpdb
|
| 24 |
+
class FragmentResponse(BaseModel):
    """Response body of /fragmentize/: all accepted fragments of one molecule."""

    fragments: List[Fragment]
+
|
| 28 |
+
|
| 29 |
+
class GenerateRequest(BaseModel):
    """Request body for /generate."""

    constSmiles: str   # constant-part SMILES (kept scaffold)
    varSmiles: str     # source variable-part SMILES
    mainCls: str       # major property class token (e.g. activity)
    minorCls: str      # minor property class token (e.g. Ki)
    deltaValue: str    # desired property-change token
    num: int           # number of molecules to sample
|
| 37 |
+
class MoleculeOutput(BaseModel):
    """One generated molecule with its computed descriptors."""

    smile: str     # generated SMILES
    molwt: float   # molecular weight
    tpsa: float    # topological polar surface area
    slogp: float   # calculated logP (Crippen)
    sa: float      # synthetic accessibility score (1-10)
    qed: float     # quantitative estimate of drug-likeness
| 45 |
+
class Options:
    """Lightweight attribute bag: exposes its keyword arguments as attributes."""

    def __init__(self, **entries):
        for key, val in entries.items():
            setattr(self, key, val)
| 49 |
+
def calculate_descriptors(smiles):
    """Compute basic drug-likeness descriptors for a SMILES string.

    :param smiles: molecule SMILES
    :return: dict with keys molwt, tpsa, slogp, sa, qed,
             or None if the SMILES cannot be parsed
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # The original version also computed molwt / tpsa into an unused local
    # (with a zero-division warning print); that dead code was removed.
    return {
        "molwt": Descriptors.MolWt(mol),
        "tpsa": Descriptors.TPSA(mol),
        "slogp": Descriptors.MolLogP(mol),
        "sa": sascorer.calculateScore(mol),
        "qed": QED.qed(mol),
    }
| 69 |
+
def run_generate_runner(const_smiles, var_smiles, main_cls, minor_cls, delta_value, num_samples):
    """Sample molecules with the transformer generator and score each one.

    :param const_smiles: constant-part SMILES
    :param var_smiles: source variable-part SMILES
    :param main_cls: major property class token (e.g. activity)
    :param minor_cls: minor property class token (e.g. Ki)
    :param delta_value: desired property-change token
    :param num_samples: number of molecules to sample (also the batch size)
    :return: list of dicts: smile plus molwt/tpsa/slogp/sa/qed descriptors
    """
    import os

    # BUG FIX: the original used the literal string '$(pwd)/...' — shell
    # command substitution, which Python never expands, so the model/vocab
    # paths were wrong. Build real absolute paths instead.
    cwd = os.getcwd()
    opt = Options(
        model_choice='transformer',
        model_path=os.path.join(cwd, 'raw_pretrain_frag', 'checkpoint'),
        vocab_path=cwd,
        epoch=20,
        batch_size=num_samples,
    )

    runner = GenerateRunner(opt)

    # Single-row dataframe describing the requested transformation.
    test_data = pd.DataFrame([{
        "constantSMILES": const_smiles,
        "fromVarSMILES": var_smiles,
        "main_cls": main_cls,
        "minor_cls": minor_cls,
        "Delta_Value": delta_value,
    }])
    dataset = Dataset(test_data, vocabulary=runner.vocab, tokenizer=runner.tokenizer, prediction_mode=True)

    # Generate SMILES batch by batch (one batch here, since there is one row).
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=num_samples, shuffle=False, collate_fn=Dataset.collate_fn)
    result = []

    for batch in dataloader:
        src, source_length, _, src_mask, _, _, df = batch
        # Move every model input to the runner's device.
        src = src.to(runner.device)
        src_mask = src_mask.to(runner.device)
        source_length = source_length.to(runner.device)
        smiles_list = runner.sample(
            model_choice="transformer",
            model=runner.model,
            src=src,
            src_mask=src_mask,
            source_length=source_length,
            decode_type="multinomial",
            num_samples=num_samples
        )

        # Score every generated SMILES; unparseable ones are skipped.
        for smiles_group in smiles_list:  # smiles_group is a sub-list
            for smile in smiles_group:
                descriptors = calculate_descriptors(smile)
                if descriptors:
                    result.append({
                        "smile": smile,
                        "molwt": descriptors['molwt'],
                        "tpsa": descriptors['tpsa'],
                        "slogp": descriptors['slogp'],
                        "sa": descriptors['sa'],
                        "qed": descriptors['qed']
                    })

    return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
|
| 137 |
+
@app.get("/fragmentize/", response_model=FragmentResponse)
async def fragmentize(smiles: str = Query(..., description="SMILES string of the molecule")):
    """Fragment the given SMILES with mmpdb and return the fragment list.

    Returns HTTP 500 with the underlying error message on any failure.
    """
    try:
        fragment_df = fragmentize_molecule(smiles)
        # One dict per fragment row; pydantic validates them into Fragment models.
        fragments = fragment_df.to_dict(orient="records")
        return FragmentResponse(fragments=fragments)
    except Exception as e:
        # Detail message is intentionally in Chinese ("an error occurred").
        raise HTTPException(status_code=500, detail=f"发生错误: {str(e)}")
| 146 |
+
@app.post("/generate", response_model=List[MoleculeOutput])
async def generate_molecules(request: GenerateRequest):
    """Generate molecules for the requested transformation and score them.

    Delegates to run_generate_runner; returns HTTP 500 on any failure.
    """
    try:
        # Run the SMILES generation pipeline.
        result = run_generate_runner(request.constSmiles, request.varSmiles, request.mainCls, request.minorCls, request.deltaValue, request.num)
        return result
    except Exception as e:
        # NOTE(review): only the message is printed — no stack trace; consider
        # logger.exception() if the traceback is actually needed.
        error_message = f"Error occurred: {str(e)}"
        print(error_message)  # printed to console; a logging handler would be better
        raise HTTPException(status_code=500, detail=f"Error occurred: {str(e)}")
|
sascorer.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#
|
| 2 |
+
# calculation of synthetic accessibility score as described in:
|
| 3 |
+
#
|
| 4 |
+
# Estimation of Synthetic Accessibility Score of Drug-like Molecules based on Molecular Complexity and Fragment Contributions
|
| 5 |
+
# Peter Ertl and Ansgar Schuffenhauer
|
| 6 |
+
# Journal of Cheminformatics 1:8 (2009)
|
| 7 |
+
# http://www.jcheminf.com/content/1/1/8
|
| 8 |
+
#
|
| 9 |
+
# several small modifications to the original paper are included
|
| 10 |
+
# particularly slightly different formula for marocyclic penalty
|
| 11 |
+
# and taking into account also molecule symmetry (fingerprint density)
|
| 12 |
+
#
|
| 13 |
+
# for a set of 10k diverse molecules the agreement between the original method
|
| 14 |
+
# as implemented in PipelinePilot and this implementation is r2 = 0.97
|
| 15 |
+
#
|
| 16 |
+
# peter ertl & greg landrum, september 2013
|
| 17 |
+
#
|
| 18 |
+
|
| 19 |
+
import math
|
| 20 |
+
import os.path as op
|
| 21 |
+
import pickle
|
| 22 |
+
from collections import defaultdict
|
| 23 |
+
|
| 24 |
+
from rdkit import Chem
|
| 25 |
+
from rdkit.Chem import rdMolDescriptors
|
| 26 |
+
|
| 27 |
+
_fscores = None
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def readFragmentScores(name='fpscores'):
    """Load the fragment-score table into the module-level ``_fscores`` dict.

    :param name: base name of the gzipped pickle (without ``.pkl.gz``); the
        default 'fpscores' is resolved relative to this module's directory.
    """
    import gzip
    global _fscores
    # generate the full path filename:
    if name == "fpscores":
        name = op.join(op.dirname(__file__), name)
    # Close the gzip handle deterministically; the original leaked it via
    # pickle.load(gzip.open(...)).
    with gzip.open('%s.pkl.gz' % name) as f:
        data = pickle.load(f)
    outDict = {}
    for i in data:
        # i[0] is the score; i[1:] are the fragment bit ids that share it.
        for j in range(1, len(i)):
            outDict[i[j]] = float(i[0])
    _fscores = outDict
|
| 43 |
+
|
| 44 |
+
def numBridgeheadsAndSpiro(mol, ri=None):
    """Return (bridgehead_count, spiro_count) for *mol*.

    *ri* is accepted for API compatibility but unused.
    """
    return (rdMolDescriptors.CalcNumBridgeheadAtoms(mol),
            rdMolDescriptors.CalcNumSpiroAtoms(mol))
|
| 49 |
+
def calculateScore(m):
    """Return the synthetic-accessibility (SA) score of an RDKit molecule.

    Scores are scaled to [1, 10]; lower means easier to synthesize.
    Loads the fragment score table on first use (Ertl & Schuffenhauer,
    J. Cheminf. 1:8, 2009, with the standard RDKit contrib modifications).
    """
    if _fscores is None:
        readFragmentScores()

    # fragment score: count-weighted average of per-bit contributions of the
    # Morgan fingerprint (2 is the *radius* of the circular fingerprint).
    fp = rdMolDescriptors.GetMorganFingerprint(m, 2)
    fps = fp.GetNonzeroElements()
    score1 = 0.
    nf = 0
    for bitId, v in fps.items():
        nf += v
        score1 += _fscores.get(bitId, -4) * v

    if nf == 0:
        score1 = -4  # avoid division by zero; fall back to the unknown-fragment default
    else:
        score1 /= nf

    # features score: size, stereo centres, spiro/bridgehead atoms and
    # macrocycles all contribute penalties.
    nAtoms = m.GetNumAtoms()
    nChiralCenters = len(Chem.FindMolChiralCenters(m, includeUnassigned=True))
    ri = m.GetRingInfo()
    nBridgeheads, nSpiro = numBridgeheadsAndSpiro(m, ri)
    nMacrocycles = 0
    for x in ri.AtomRings():
        if len(x) > 8:
            nMacrocycles += 1

    sizePenalty = nAtoms**1.005 - nAtoms
    stereoPenalty = math.log10(nChiralCenters + 1)
    spiroPenalty = math.log10(nSpiro + 1)
    bridgePenalty = math.log10(nBridgeheads + 1)
    macrocyclePenalty = 0.
    if nMacrocycles > 0:
        macrocyclePenalty = math.log10(2)

    score2 = 0. - sizePenalty - stereoPenalty - spiroPenalty - bridgePenalty - macrocyclePenalty

    # correction for the fingerprint density (molecular symmetry)
    if len(fps) == 0:
        score3 = 0  # avoid division by zero
    else:
        score3 = math.log(float(nAtoms) / len(fps)) * .5

    sascore = score1 + score2 + score3

    # Transform the "raw" value onto a 1..10 scale.
    # (renamed from min/max, which shadowed the builtins)
    raw_min = -4.0
    raw_max = 2.5
    sascore = 11. - (sascore - raw_min + 1) / (raw_max - raw_min) * 9.
    # smooth the high end of the scale
    if sascore > 8.:
        sascore = 8. + math.log(sascore + 1. - 9.)
    if sascore > 10.:
        sascore = 10.0
    elif sascore < 1.:
        sascore = 1.0

    return sascore
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def processMols(mols):
    """Print a tab-separated table of SMILES, name and SA score per molecule."""
    print('smiles\tName\tsa_score')
    for m in mols:
        # Skip entries RDKit failed to parse.
        if m is None:
            continue

        score = calculateScore(m)
        print(Chem.MolToSmiles(m) + "\t" + m.GetProp('_Name') + "\t%3f" % score)
|
| 122 |
+
|
| 123 |
+
if __name__ == '__main__':
    import sys
    import time

    # Time the score-table load and the per-molecule scoring separately.
    t1 = time.time()
    readFragmentScores("fpscores")
    t2 = time.time()

    # First CLI argument: a SMILES file readable by RDKit's supplier.
    suppl = Chem.SmilesMolSupplier(sys.argv[1])
    t3 = time.time()
    processMols(suppl)
    t4 = time.time()

    print('Reading took %.2f seconds. Calculating took %.2f seconds' % ((t2 - t1), (t4 - t3)),
          file=sys.stderr)
|
| 139 |
+
#
|
| 140 |
+
# Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc.
|
| 141 |
+
# All rights reserved.
|
| 142 |
+
#
|
| 143 |
+
# Redistribution and use in source and binary forms, with or without
|
| 144 |
+
# modification, are permitted provided that the following conditions are
|
| 145 |
+
# met:
|
| 146 |
+
#
|
| 147 |
+
# * Redistributions of source code must retain the above copyright
|
| 148 |
+
# notice, this list of conditions and the following disclaimer.
|
| 149 |
+
# * Redistributions in binary form must reproduce the above
|
| 150 |
+
# copyright notice, this list of conditions and the following
|
| 151 |
+
# disclaimer in the documentation and/or other materials provided
|
| 152 |
+
# with the distribution.
|
| 153 |
+
# * Neither the name of Novartis Institutes for BioMedical Research Inc.
|
| 154 |
+
# nor the names of its contributors may be used to endorse or promote
|
| 155 |
+
# products derived from this software without specific prior written permission.
|
| 156 |
+
#
|
| 157 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
| 158 |
+
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
| 159 |
+
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
| 160 |
+
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
| 161 |
+
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
| 162 |
+
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
| 163 |
+
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
| 164 |
+
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
| 165 |
+
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
| 166 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 167 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| 168 |
+
#
|