import os
import subprocess
import tempfile
from pathlib import Path

import pandas as pd
from mmpdblib.fragment_io import read_fragment_records
from rdkit import Chem

# Columns (and their order) of the DataFrame returned by fragmentize_molecule.
# Declared once so an empty result still has a consistent schema.
_FRAGMENT_COLUMNS = [
    "variable_smiles",
    "constant_smiles",
    "record_id",
    "normalized_smiles",
    "attachment_order",
]


def fragmentize_molecule(smiles_string: str, max_ratio: float = 0.5) -> pd.DataFrame:
    """Fragment one molecule with the ``mmpdb fragment`` command-line tool.

    The SMILES is written to a temporary ``.smi`` file, ``mmpdb fragment`` is
    run on it, and the resulting fragment records are read back with
    ``read_fragment_records``. A fragmentation is kept only when the heavy-atom
    count of its variable part is strictly less than ``max_ratio`` times the
    heavy-atom count of the record's normalized SMILES.

    Args:
        smiles_string: SMILES of the molecule to fragment. Surrounding
            whitespace is stripped so it cannot break the one-line record
            written to the input file.
        max_ratio: Upper bound (exclusive) on the ratio of variable-part heavy
            atoms to whole-molecule heavy atoms.

    Returns:
        A DataFrame with one row per retained fragmentation and the columns
        ``variable_smiles``, ``constant_smiles``, ``record_id``,
        ``normalized_smiles`` and ``attachment_order``. The columns are present
        even when no fragmentation is retained.

    Raises:
        FileNotFoundError: If the ``mmpdb`` executable cannot be found.
        subprocess.CalledProcessError: If ``mmpdb fragment`` exits with a
            non-zero status.
    """
    # A private temporary directory avoids name clashes between concurrent
    # calls and guarantees cleanup even if a later step raises.
    with tempfile.TemporaryDirectory(prefix="mmpdb_fragment_") as workdir:
        input_path = Path(workdir) / "input.smi"
        # The ".fragments" suffix is kept from the original output file name.
        output_path = Path(workdir) / "output.fragments"

        # One tab-separated record: SMILES, then the molecule identifier.
        input_path.write_text(f"{smiles_string.strip()}\tMolecule\n")

        # Argument list (no shell) and check=True so that a failing mmpdb run
        # is reported here instead of as a confusing read error later on.
        subprocess.run(
            [
                "mmpdb",
                "fragment",
                os.fspath(input_path),
                "-o",
                os.fspath(output_path),
            ],
            check=True,
        )

        # Collect everything while the temporary directory still exists.
        fragment_rows = []
        for record in read_fragment_records(os.fspath(output_path)):
            # The size limit depends only on the record, so compute it once
            # rather than re-parsing the parent SMILES for every fragment.
            size_limit = count_heavy_atoms(record.normalized_smiles) * max_ratio
            for frag in record.fragments:
                if count_heavy_atoms(frag.variable_smiles) < size_limit:
                    fragment_rows.append(
                        {
                            "variable_smiles": frag.variable_smiles,
                            "constant_smiles": frag.constant_smiles,
                            "record_id": record.id,
                            "normalized_smiles": record.normalized_smiles,
                            "attachment_order": frag.attachment_order,
                        }
                    )

    return pd.DataFrame(fragment_rows, columns=_FRAGMENT_COLUMNS)


def count_heavy_atoms(smiles: str) -> int:
    """Return the heavy-atom count RDKit reports for *smiles*.

    Returns 0 when RDKit cannot parse the SMILES (``MolFromSmiles`` gives
    ``None``).
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return 0
    return mol.GetNumHeavyAtoms()


# Example usage:
# smiles = "O=C1CCCC2=C1C1(CCS(=O)(=O)C1)N=C(Nc1nc3ccccc3o1)N2"
# fragment_df = fragmentize_molecule(smiles)
# print(fragment_df)