# LLM-fastAPI / fragment_processor.py
# Provenance: commit 2df9869 ("add new files"), 1.67 kB.
import os
import subprocess
import tempfile
from pathlib import Path

import pandas as pd
from mmpdblib.fragment_io import read_fragment_records
from rdkit import Chem
def fragmentize_molecule(smiles_string, max_ratio=0.5):
    """Fragment a molecule with ``mmpdb fragment`` and return its small fragments.

    The SMILES string is written to a scratch ``.smi`` file, the ``mmpdb
    fragment`` command-line tool is run on it, and the resulting records are
    read back with ``read_fragment_records``. A fragmentation is kept only
    when the heavy-atom count of its variable part is strictly less than
    ``max_ratio`` times the heavy-atom count of the record's normalized
    SMILES.

    Args:
        smiles_string: SMILES of the molecule to fragment.
        max_ratio: Exclusive upper bound on the ratio between the variable
            part's heavy-atom count and the whole molecule's heavy-atom count.

    Returns:
        A ``pandas.DataFrame`` with one row per kept fragmentation and the
        columns ``variable_smiles``, ``constant_smiles``, ``record_id``,
        ``normalized_smiles`` and ``attachment_order``. When nothing
        qualifies the frame is empty but still carries these columns.

    Raises:
        subprocess.CalledProcessError: If ``mmpdb fragment`` exits with a
            non-zero status.
        FileNotFoundError: If the ``mmpdb`` executable cannot be found.
    """
    columns = [
        'variable_smiles',
        'constant_smiles',
        'record_id',
        'normalized_smiles',
        'attachment_order',
    ]
    fragment_list = []

    # Each call gets its own scratch directory: fixed file names in the
    # working directory would collide between concurrent calls, and the
    # context manager removes the files even when an exception escapes.
    with tempfile.TemporaryDirectory() as scratch_dir:
        input_file = Path(scratch_dir) / "temp_input.smi"
        output_file = Path(scratch_dir) / "temp_output.fragments"

        # One tab-separated line: the SMILES followed by a fixed identifier.
        input_file.write_text(smiles_string + "\t" + "Molecule" + "\n")

        # Argument list (no shell) so paths with spaces are passed intact;
        # check=True surfaces a failed run instead of silently continuing.
        subprocess.run(
            ["mmpdb", "fragment", str(input_file), "-o", str(output_file)],
            check=True,
        )

        fragment_reader = read_fragment_records(str(output_file))
        try:
            for record in fragment_reader:
                # The threshold depends only on the record, so compute it once
                # rather than once per fragmentation.
                size_limit = count_heavy_atoms(record.normalized_smiles) * max_ratio
                for frag in record.fragments:
                    if count_heavy_atoms(frag.variable_smiles) < size_limit:
                        fragment_list.append({
                            'variable_smiles': frag.variable_smiles,
                            'constant_smiles': frag.constant_smiles,
                            'record_id': record.id,
                            'normalized_smiles': record.normalized_smiles,
                            'attachment_order': frag.attachment_order,
                        })
        finally:
            # Release the reader's file handle, if it exposes one, before the
            # scratch directory is deleted.
            close_reader = getattr(fragment_reader, "close", None)
            if callable(close_reader):
                close_reader()

    return pd.DataFrame(fragment_list, columns=columns)
def count_heavy_atoms(smiles):
    """Return the heavy-atom count RDKit reports for a SMILES string.

    The string is parsed with ``Chem.MolFromSmiles``; when that yields a
    falsy result (for example ``None`` for an unparsable string) the count
    is reported as 0.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if not parsed:
        return 0
    return parsed.GetNumHeavyAtoms()
# Example usage:
# smiles = "O=C1CCCC2=C1C1(CCS(=O)(=O)C1)N=C(Nc1nc3ccccc3o1)N2"
# fragment_df = fragmentize_molecule(smiles)
# print(fragment_df)