Spaces:
Sleeping
Sleeping
add new files
Browse files- dataset.py +122 -0
- fragment_processor.py +50 -0
- generate.py +3 -3
- main.py +148 -16
- sascorer.py +168 -0
dataset.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# coding=utf-8
|
| 2 |
+
|
| 3 |
+
"""
|
| 4 |
+
Implementation of a SMILES dataset.
|
| 5 |
+
"""
|
| 6 |
+
import pandas as pd
|
| 7 |
+
|
| 8 |
+
import torch
|
| 9 |
+
import torch.utils.data as tud
|
| 10 |
+
from torch.autograd import Variable
|
| 11 |
+
|
| 12 |
+
import configuration.config_default as cfgd
|
| 13 |
+
from models.transformer.module.subsequent_mask import subsequent_mask
|
| 14 |
+
|
| 15 |
+
class Dataset(tud.Dataset):
    """Custom PyTorch Dataset over a dataframe of matched molecular pairs.

    The historical schema this loader was written for includes:
    Source_Mol_ID, Target_Mol_ID, Source_Mol, Target_Mol,
    Source_Mol_LogD, Target_Mol_LogD, Delta_LogD,
    Source_Mol_Solubility, Target_Mol_Solubility, Delta_Solubility,
    Source_Mol_Clint, Target_Mol_Clint, Delta_Clint,
    Transformation, Core

    NOTE(review): __getitem__ actually reads columns constantSMILES,
    fromVarSMILES, main_cls, minor_cls, Delta_Value (and toVarSMILES in
    training mode) — confirm which schema the callers supply.
    """

    def __init__(self, data, vocabulary, tokenizer, prediction_mode=False):
        """
        :param data: dataframe read from training, validation or test file
        :param vocabulary: used to encode source/target tokens
        :param tokenizer: used to tokenize source/target smiles
        :param prediction_mode: if use target smiles or not (training or test)
        """
        self._vocabulary = vocabulary
        self._tokenizer = tokenizer
        self._data = data
        self._prediction_mode = prediction_mode

    def __getitem__(self, i):
        """Tokenize and encode the source (and, when training, target) SMILES.

        :param i: row index into the dataframe
        :return: (source_tensor, target_tensor, row) in training mode,
                 (source_tensor, row) in prediction mode
        """
        row = self._data.iloc[i]
        sourceConstant = row['constantSMILES']
        sourceVariable = row['fromVarSMILES']
        main_cls = row['main_cls']
        minor_cls = row['minor_cls']
        value = row['Delta_Value']

        # Source sequence layout: variable SMILES tokens first, then the
        # major class token (e.g. activity), the minor class token (e.g. Ki),
        # the delta-value token, and finally the constant SMILES tokens.
        source_tokens = []
        source_tokens.extend(self._tokenizer.tokenize(sourceVariable))
        source_tokens.append(main_cls)
        source_tokens.append(minor_cls)
        source_tokens.append(value)
        source_tokens.extend(self._tokenizer.tokenize(sourceConstant))
        source_encoded = self._vocabulary.encode(source_tokens)

        # Encode the target SMILES only for training/validation samples.
        if not self._prediction_mode:
            target_smi = row['toVarSMILES']
            target_tokens = self._tokenizer.tokenize(target_smi)
            target_encoded = self._vocabulary.encode(target_tokens)

            return torch.tensor(source_encoded, dtype=torch.long), torch.tensor(
                target_encoded, dtype=torch.long), row
        else:
            return torch.tensor(source_encoded, dtype=torch.long), row

    def __len__(self):
        return len(self._data)

    @classmethod
    def collate_fn(cls, data_all):
        """Pad a batch of __getitem__ items and build attention masks.

        :param data_all: list of tuples produced by __getitem__
        :return: (padded_source, source_lengths, padded_target, src_mask,
                  trg_mask, max_target_length, dataframe_of_rows); the three
                  target-related entries are None in prediction mode.
        """
        # Sort by source length, longest first.
        data_all.sort(key=lambda x: len(x[0]), reverse=True)
        # Prediction-mode items have 2 elements; training items have 3.
        is_prediction_mode = len(data_all[0]) == 2
        if is_prediction_mode:
            source_encoded, data = zip(*data_all)
            data = pd.DataFrame(data)
        else:
            source_encoded, target_encoded, data = zip(*data_all)
            data = pd.DataFrame(data)

        # Zero-pad source sequences to the longest in the batch.
        max_length_source = max(seq.size(0) for seq in source_encoded)
        collated_arr_source = torch.zeros(len(source_encoded), max_length_source, dtype=torch.long)
        for i, seq in enumerate(source_encoded):
            collated_arr_source[i, :seq.size(0)] = seq
        # Length of each source sequence.
        source_length = torch.tensor([seq.size(0) for seq in source_encoded])
        # Mask padding positions (index 0 is the pad token).
        src_mask = (collated_arr_source != 0).unsqueeze(-2)

        # Target sequences (training mode only).
        if not is_prediction_mode:
            max_length_target = max(seq.size(0) for seq in target_encoded)
            collated_arr_target = torch.zeros(len(target_encoded), max_length_target, dtype=torch.long)
            for i, seq in enumerate(target_encoded):
                collated_arr_target[i, :seq.size(0)] = seq

            # Combine the padding mask with the causal (subsequent) mask.
            trg_mask = (collated_arr_target != 0).unsqueeze(-2)
            trg_mask = trg_mask & Variable(subsequent_mask(collated_arr_target.size(-1)).type_as(trg_mask))
            trg_mask = trg_mask[:, :-1, :-1]  # save start token, skip end token
        else:
            trg_mask = None
            max_length_target = None
            collated_arr_target = None

        return collated_arr_source, source_length, collated_arr_target, src_mask, trg_mask, max_length_target, data
fragment_processor.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from mmpdblib.fragment_io import read_fragment_records
|
| 5 |
+
from rdkit import Chem
|
| 6 |
+
|
| 7 |
+
def fragmentize_molecule(smiles_string, max_ratio=0.5):
    """Fragment a molecule with the external ``mmpdb`` tool.

    Writes the SMILES to a temporary input file, runs ``mmpdb fragment`` on
    it, and keeps every fragment whose variable part has fewer heavy atoms
    than ``max_ratio`` times the whole molecule.

    :param smiles_string: SMILES of the molecule to fragment
    :param max_ratio: max heavy-atom ratio of variable part vs. whole molecule
    :return: DataFrame with one row per accepted fragment
    :raises subprocess.CalledProcessError: if ``mmpdb fragment`` fails
    """
    import subprocess

    # Temporary file names expected/produced by mmpdb.
    input_file = "temp_input.smi"
    output_file = "temp_output.fragments"

    try:
        # Write the SMILES (plus a dummy title) into the input file.
        with open(input_file, "w") as f:
            f.write(smiles_string + "\t" + "Molecule" + "\n")

        # Run mmpdb with an argument list (no shell): avoids shell injection
        # via the arguments and raises if mmpdb exits with an error, instead
        # of silently ignoring the status as os.system() did.
        subprocess.run(["mmpdb", "fragment", input_file, "-o", output_file], check=True)

        # Read the fragment records and keep only sufficiently small
        # variable parts.
        fragment_reader = read_fragment_records(output_file)
        fragment_list = []
        for record in fragment_reader:
            for frag in record.fragments:
                if count_heavy_atoms(frag.variable_smiles) < count_heavy_atoms(record.normalized_smiles) * max_ratio:
                    fragment_list.append({
                        'variable_smiles': frag.variable_smiles,
                        'constant_smiles': frag.constant_smiles,
                        'record_id': record.id,
                        'normalized_smiles': record.normalized_smiles,
                        'attachment_order': frag.attachment_order
                    })
    finally:
        # Always clean up the temporary files, even if fragmentation failed.
        for path in (input_file, output_file):
            if os.path.exists(path):
                os.remove(path)

    return pd.DataFrame(fragment_list)
| 41 |
+
def count_heavy_atoms(smiles):
    """Return the heavy-atom count of *smiles* (0 when RDKit cannot parse it)."""
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return 0
    return mol.GetNumHeavyAtoms()
|
| 45 |
+
|
| 46 |
+
# 示例调用
|
| 47 |
+
# smiles = "O=C1CCCC2=C1C1(CCS(=O)(=O)C1)N=C(Nc1nc3ccccc3o1)N2"
|
| 48 |
+
# fragment_df = fragmentize_molecule(smiles)
|
| 49 |
+
|
| 50 |
+
# print(fragment_df)
|
generate.py
CHANGED
|
@@ -76,9 +76,9 @@ class GenerateRunner():
|
|
| 76 |
return dataloader
|
| 77 |
|
| 78 |
def generate(self, opt):
|
| 79 |
-
if not self.overwrite and self.exist_flag:
|
| 80 |
-
|
| 81 |
-
|
| 82 |
# set device
|
| 83 |
#device = ut.allocate_gpu()
|
| 84 |
# torch.cuda.set_device(1)
|
|
|
|
| 76 |
return dataloader
|
| 77 |
|
| 78 |
def generate(self, opt):
|
| 79 |
+
# if not self.overwrite and self.exist_flag:
|
| 80 |
+
# print('GENERATED MOL EXIST, SKIP GENERATING!')
|
| 81 |
+
# return
|
| 82 |
# set device
|
| 83 |
#device = ut.allocate_gpu()
|
| 84 |
# torch.cuda.set_device(1)
|
main.py
CHANGED
|
@@ -1,24 +1,156 @@
|
|
| 1 |
-
from fastapi import FastAPI
|
| 2 |
-
from fastapi.responses import JSONResponse, HTMLResponse
|
| 3 |
from pydantic import BaseModel
|
| 4 |
-
import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
app = FastAPI()
|
| 6 |
|
| 7 |
-
class InputData(BaseModel):
|
| 8 |
-
user_input: str
|
| 9 |
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
|
|
|
| 16 |
|
| 17 |
-
@app.post("/submit")
|
| 18 |
-
async def submit_input(input_data: InputData):
|
| 19 |
-
# 处理用户输入并返回响应
|
| 20 |
-
print("input coming")
|
| 21 |
-
print(input_data)
|
| 22 |
-
return JSONResponse(content={"message": input_data.user_input})
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, HTTPException, Query
|
|
|
|
| 2 |
from pydantic import BaseModel
|
| 3 |
+
import subprocess
|
| 4 |
+
from typing import List
|
| 5 |
+
from fragment_processor import fragmentize_molecule
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
import pandas as pd
|
| 9 |
+
from rdkit import Chem
|
| 10 |
+
from rdkit.Chem import Descriptors, QED
|
| 11 |
+
from generate import GenerateRunner
|
| 12 |
+
from dataset import Dataset
|
| 13 |
+
import sascorer
|
| 14 |
app = FastAPI()
|
| 15 |
|
|
|
|
|
|
|
| 16 |
|
| 17 |
+
class Fragment(BaseModel):
    """One mmpdb fragment of a molecule, as returned by the /fragmentize/ endpoint."""

    variable_smiles: str      # SMILES of the variable (exchangeable) part
    constant_smiles: str      # SMILES of the constant (kept) part
    record_id: str            # id of the originating fragment record
    normalized_smiles: str    # normalized SMILES of the whole molecule
    attachment_order: int     # attachment-point ordering reported by mmpdb
|
| 24 |
+
class FragmentResponse(BaseModel):
    """Response body of /fragmentize/: all accepted fragments of one molecule."""

    fragments: List[Fragment]
+
|
| 28 |
+
|
| 29 |
+
class GenerateRequest(BaseModel):
    """Request body for /generate."""

    constSmiles: str   # constant-part SMILES (kept scaffold)
    varSmiles: str     # source variable-part SMILES
    mainCls: str       # major property class token (e.g. activity)
    minorCls: str      # minor property class token (e.g. Ki)
    deltaValue: str    # desired property-change token
    num: int           # number of molecules to sample
|
| 37 |
+
class MoleculeOutput(BaseModel):
    """One generated molecule with its computed descriptors."""

    smile: str     # generated SMILES
    molwt: float   # molecular weight
    tpsa: float    # topological polar surface area
    slogp: float   # calculated logP (Crippen)
    sa: float      # synthetic accessibility score (1-10)
    qed: float     # quantitative estimate of drug-likeness
| 45 |
+
class Options:
    """Lightweight attribute bag: exposes its keyword arguments as attributes."""

    def __init__(self, **entries):
        for key, val in entries.items():
            setattr(self, key, val)
| 49 |
+
def calculate_descriptors(smiles):
    """Compute basic drug-likeness descriptors for a SMILES string.

    :param smiles: molecule SMILES
    :return: dict with keys molwt, tpsa, slogp, sa, qed,
             or None if the SMILES cannot be parsed
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # The original version also computed molwt / tpsa into an unused local
    # (with a zero-division warning print); that dead code was removed.
    return {
        "molwt": Descriptors.MolWt(mol),
        "tpsa": Descriptors.TPSA(mol),
        "slogp": Descriptors.MolLogP(mol),
        "sa": sascorer.calculateScore(mol),
        "qed": QED.qed(mol),
    }
| 69 |
+
def run_generate_runner(const_smiles, var_smiles, main_cls, minor_cls, delta_value, num_samples):
    """Sample molecules with the transformer generator and score each one.

    :param const_smiles: constant-part SMILES
    :param var_smiles: source variable-part SMILES
    :param main_cls: major property class token (e.g. activity)
    :param minor_cls: minor property class token (e.g. Ki)
    :param delta_value: desired property-change token
    :param num_samples: number of molecules to sample (also the batch size)
    :return: list of dicts: smile plus molwt/tpsa/slogp/sa/qed descriptors
    """
    import os

    # BUG FIX: the original used the literal string '$(pwd)/...' — shell
    # command substitution, which Python never expands, so the model/vocab
    # paths were wrong. Build real absolute paths instead.
    cwd = os.getcwd()
    opt = Options(
        model_choice='transformer',
        model_path=os.path.join(cwd, 'raw_pretrain_frag', 'checkpoint'),
        vocab_path=cwd,
        epoch=20,
        batch_size=num_samples,
    )

    runner = GenerateRunner(opt)

    # Single-row dataframe describing the requested transformation.
    test_data = pd.DataFrame([{
        "constantSMILES": const_smiles,
        "fromVarSMILES": var_smiles,
        "main_cls": main_cls,
        "minor_cls": minor_cls,
        "Delta_Value": delta_value,
    }])
    dataset = Dataset(test_data, vocabulary=runner.vocab, tokenizer=runner.tokenizer, prediction_mode=True)

    # Generate SMILES batch by batch (one batch here, since there is one row).
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=num_samples, shuffle=False, collate_fn=Dataset.collate_fn)
    result = []

    for batch in dataloader:
        src, source_length, _, src_mask, _, _, df = batch
        # Move every model input to the runner's device.
        src = src.to(runner.device)
        src_mask = src_mask.to(runner.device)
        source_length = source_length.to(runner.device)
        smiles_list = runner.sample(
            model_choice="transformer",
            model=runner.model,
            src=src,
            src_mask=src_mask,
            source_length=source_length,
            decode_type="multinomial",
            num_samples=num_samples
        )

        # Score every generated SMILES; unparseable ones are skipped.
        for smiles_group in smiles_list:  # smiles_group is a sub-list
            for smile in smiles_group:
                descriptors = calculate_descriptors(smile)
                if descriptors:
                    result.append({
                        "smile": smile,
                        "molwt": descriptors['molwt'],
                        "tpsa": descriptors['tpsa'],
                        "slogp": descriptors['slogp'],
                        "sa": descriptors['sa'],
                        "qed": descriptors['qed']
                    })

    return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
|
| 137 |
+
@app.get("/fragmentize/", response_model=FragmentResponse)
async def fragmentize(smiles: str = Query(..., description="SMILES string of the molecule")):
    """Fragment the given SMILES with mmpdb and return the fragment list.

    Returns HTTP 500 with the underlying error message on any failure.
    """
    try:
        fragment_df = fragmentize_molecule(smiles)
        # One dict per fragment row; pydantic validates them into Fragment models.
        fragments = fragment_df.to_dict(orient="records")
        return FragmentResponse(fragments=fragments)
    except Exception as e:
        # Detail message is intentionally in Chinese ("an error occurred").
        raise HTTPException(status_code=500, detail=f"发生错误: {str(e)}")
| 146 |
+
@app.post("/generate", response_model=List[MoleculeOutput])
async def generate_molecules(request: GenerateRequest):
    """Generate molecules for the requested transformation and score them.

    Delegates to run_generate_runner; returns HTTP 500 on any failure.
    """
    try:
        # Run the SMILES generation pipeline.
        result = run_generate_runner(request.constSmiles, request.varSmiles, request.mainCls, request.minorCls, request.deltaValue, request.num)
        return result
    except Exception as e:
        # NOTE(review): only the message is printed — no stack trace; consider
        # logger.exception() if the traceback is actually needed.
        error_message = f"Error occurred: {str(e)}"
        print(error_message)  # printed to console; a logging handler would be better
        raise HTTPException(status_code=500, detail=f"Error occurred: {str(e)}")
|
sascorer.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#
|
| 2 |
+
# calculation of synthetic accessibility score as described in:
|
| 3 |
+
#
|
| 4 |
+
# Estimation of Synthetic Accessibility Score of Drug-like Molecules based on Molecular Complexity and Fragment Contributions
|
| 5 |
+
# Peter Ertl and Ansgar Schuffenhauer
|
| 6 |
+
# Journal of Cheminformatics 1:8 (2009)
|
| 7 |
+
# http://www.jcheminf.com/content/1/1/8
|
| 8 |
+
#
|
| 9 |
+
# several small modifications to the original paper are included
|
| 10 |
+
# particularly slightly different formula for marocyclic penalty
|
| 11 |
+
# and taking into account also molecule symmetry (fingerprint density)
|
| 12 |
+
#
|
| 13 |
+
# for a set of 10k diverse molecules the agreement between the original method
|
| 14 |
+
# as implemented in PipelinePilot and this implementation is r2 = 0.97
|
| 15 |
+
#
|
| 16 |
+
# peter ertl & greg landrum, september 2013
|
| 17 |
+
#
|
| 18 |
+
|
| 19 |
+
import math
|
| 20 |
+
import os.path as op
|
| 21 |
+
import pickle
|
| 22 |
+
from collections import defaultdict
|
| 23 |
+
|
| 24 |
+
from rdkit import Chem
|
| 25 |
+
from rdkit.Chem import rdMolDescriptors
|
| 26 |
+
|
| 27 |
+
_fscores = None
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def readFragmentScores(name='fpscores'):
    """Load the fragment-score table into the module-level ``_fscores`` dict.

    :param name: base name of the gzipped pickle (without ``.pkl.gz``); the
        default 'fpscores' is resolved relative to this module's directory.
    """
    import gzip
    global _fscores
    # generate the full path filename:
    if name == "fpscores":
        name = op.join(op.dirname(__file__), name)
    # Close the gzip handle deterministically; the original leaked it via
    # pickle.load(gzip.open(...)).
    with gzip.open('%s.pkl.gz' % name) as f:
        data = pickle.load(f)
    outDict = {}
    for i in data:
        # i[0] is the score; i[1:] are the fragment bit ids that share it.
        for j in range(1, len(i)):
            outDict[i[j]] = float(i[0])
    _fscores = outDict
|
| 43 |
+
|
| 44 |
+
def numBridgeheadsAndSpiro(mol, ri=None):
    """Return (bridgehead_count, spiro_count) for *mol*.

    *ri* is accepted for API compatibility but unused.
    """
    return (rdMolDescriptors.CalcNumBridgeheadAtoms(mol),
            rdMolDescriptors.CalcNumSpiroAtoms(mol))
|
| 49 |
+
def calculateScore(m):
    """Return the synthetic-accessibility (SA) score of an RDKit molecule.

    Scores are scaled to [1, 10]; lower means easier to synthesize.
    Loads the fragment score table on first use (Ertl & Schuffenhauer,
    J. Cheminf. 1:8, 2009, with the standard RDKit contrib modifications).
    """
    if _fscores is None:
        readFragmentScores()

    # fragment score: count-weighted average of per-bit contributions of the
    # Morgan fingerprint (2 is the *radius* of the circular fingerprint).
    fp = rdMolDescriptors.GetMorganFingerprint(m, 2)
    fps = fp.GetNonzeroElements()
    score1 = 0.
    nf = 0
    for bitId, v in fps.items():
        nf += v
        score1 += _fscores.get(bitId, -4) * v

    if nf == 0:
        score1 = -4  # avoid division by zero; fall back to the unknown-fragment default
    else:
        score1 /= nf

    # features score: size, stereo centres, spiro/bridgehead atoms and
    # macrocycles all contribute penalties.
    nAtoms = m.GetNumAtoms()
    nChiralCenters = len(Chem.FindMolChiralCenters(m, includeUnassigned=True))
    ri = m.GetRingInfo()
    nBridgeheads, nSpiro = numBridgeheadsAndSpiro(m, ri)
    nMacrocycles = 0
    for x in ri.AtomRings():
        if len(x) > 8:
            nMacrocycles += 1

    sizePenalty = nAtoms**1.005 - nAtoms
    stereoPenalty = math.log10(nChiralCenters + 1)
    spiroPenalty = math.log10(nSpiro + 1)
    bridgePenalty = math.log10(nBridgeheads + 1)
    macrocyclePenalty = 0.
    if nMacrocycles > 0:
        macrocyclePenalty = math.log10(2)

    score2 = 0. - sizePenalty - stereoPenalty - spiroPenalty - bridgePenalty - macrocyclePenalty

    # correction for the fingerprint density (molecular symmetry)
    if len(fps) == 0:
        score3 = 0  # avoid division by zero
    else:
        score3 = math.log(float(nAtoms) / len(fps)) * .5

    sascore = score1 + score2 + score3

    # Transform the "raw" value onto a 1..10 scale.
    # (renamed from min/max, which shadowed the builtins)
    raw_min = -4.0
    raw_max = 2.5
    sascore = 11. - (sascore - raw_min + 1) / (raw_max - raw_min) * 9.
    # smooth the high end of the scale
    if sascore > 8.:
        sascore = 8. + math.log(sascore + 1. - 9.)
    if sascore > 10.:
        sascore = 10.0
    elif sascore < 1.:
        sascore = 1.0

    return sascore
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def processMols(mols):
    """Print a tab-separated table of SMILES, name and SA score per molecule."""
    print('smiles\tName\tsa_score')
    for m in mols:
        # Skip entries RDKit failed to parse.
        if m is None:
            continue

        score = calculateScore(m)
        print(Chem.MolToSmiles(m) + "\t" + m.GetProp('_Name') + "\t%3f" % score)
|
| 122 |
+
|
| 123 |
+
if __name__ == '__main__':
    import sys
    import time

    # Time the score-table load and the per-molecule scoring separately.
    t1 = time.time()
    readFragmentScores("fpscores")
    t2 = time.time()

    # First CLI argument: a SMILES file readable by RDKit's supplier.
    suppl = Chem.SmilesMolSupplier(sys.argv[1])
    t3 = time.time()
    processMols(suppl)
    t4 = time.time()

    print('Reading took %.2f seconds. Calculating took %.2f seconds' % ((t2 - t1), (t4 - t3)),
          file=sys.stderr)
|
| 139 |
+
#
|
| 140 |
+
# Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc.
|
| 141 |
+
# All rights reserved.
|
| 142 |
+
#
|
| 143 |
+
# Redistribution and use in source and binary forms, with or without
|
| 144 |
+
# modification, are permitted provided that the following conditions are
|
| 145 |
+
# met:
|
| 146 |
+
#
|
| 147 |
+
# * Redistributions of source code must retain the above copyright
|
| 148 |
+
# notice, this list of conditions and the following disclaimer.
|
| 149 |
+
# * Redistributions in binary form must reproduce the above
|
| 150 |
+
# copyright notice, this list of conditions and the following
|
| 151 |
+
# disclaimer in the documentation and/or other materials provided
|
| 152 |
+
# with the distribution.
|
| 153 |
+
# * Neither the name of Novartis Institutes for BioMedical Research Inc.
|
| 154 |
+
# nor the names of its contributors may be used to endorse or promote
|
| 155 |
+
# products derived from this software without specific prior written permission.
|
| 156 |
+
#
|
| 157 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
| 158 |
+
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
| 159 |
+
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
| 160 |
+
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
| 161 |
+
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
| 162 |
+
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
| 163 |
+
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
| 164 |
+
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
| 165 |
+
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
| 166 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 167 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| 168 |
+
#
|