Songyou committed on
Commit
52471b4
·
verified ·
1 Parent(s): 14e32f4

Update fragment_processor.py

Browse files
Files changed (1) hide show
  1. fragment_processor.py +96 -43
fragment_processor.py CHANGED
@@ -1,50 +1,103 @@
1
- import os
2
  import pandas as pd
 
3
  from pathlib import Path
4
  from mmpdblib.fragment_io import read_fragment_records
5
  from rdkit import Chem
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
- def fragmentize_molecule(smiles_string, max_ratio=0.5):
8
- # 创建临时文件名
 
 
 
 
 
 
 
 
 
9
  input_file = "temp_input.smi"
10
  output_file = "temp_output.fragments"
11
-
12
- # 将SMILES字符串写入临时输入文件
13
- with open(input_file, "w") as f:
14
- f.write(smiles_string + "\t" + "Molecule" + "\n")
15
-
16
- # 使用mmpdb工具进行分子碎片化
17
- os.system(f"mmpdb fragment {input_file} -o {output_file}")
18
-
19
- # 读取并处理碎片
20
- fragment_reader = read_fragment_records(output_file)
21
- fragment_list = []
22
-
23
- for record in fragment_reader:
24
- for frag in record.fragments:
25
- if count_heavy_atoms(frag.variable_smiles) < count_heavy_atoms(record.normalized_smiles) * max_ratio:
26
- fragment_list.append({
27
- 'variable_smiles': frag.variable_smiles,
28
- 'constant_smiles': frag.constant_smiles,
29
- 'record_id': record.id,
30
- 'normalized_smiles': record.normalized_smiles,
31
- 'attachment_order': frag.attachment_order
32
- })
33
-
34
- # 删除临时文件
35
- os.remove(input_file)
36
- os.remove(output_file)
37
-
38
- # 返回碎片列表
39
- return pd.DataFrame(fragment_list)
40
-
41
- def count_heavy_atoms(smiles):
42
- # 使用RDKit计算重原子数
43
- mol = Chem.MolFromSmiles(smiles)
44
- return mol.GetNumHeavyAtoms() if mol else 0
45
-
46
- # 示例调用
47
- # smiles = "O=C1CCCC2=C1C1(CCS(=O)(=O)C1)N=C(Nc1nc3ccccc3o1)N2"
48
- # fragment_df = fragmentize_molecule(smiles)
49
-
50
- # print(fragment_df)
 
 
 
 
1
  import pandas as pd
2
+ import os
3
  from pathlib import Path
4
  from mmpdblib.fragment_io import read_fragment_records
5
  from rdkit import Chem
6
class Index_Dummy:
    """Number the dummy atoms (``*``) in the fragment SMILES columns of a DataFrame.

    The wrapped DataFrame must provide the columns ``variable_smiles``,
    ``constant_smiles`` and ``attachment_order``. Variable and constant
    parts are labelled by different rules (see the two ``index_*`` methods).
    """

    def __init__(self, df):
        # The DataFrame is stored by reference; add_index() modifies it in place.
        self.df = df

    def index_constant(self, constSmi, attachmentOrder):
        """Return ``constSmi`` with each ``*`` replaced by ``[*:k]``.

        For the i-th ``*`` (0-based, left to right) the label is
        ``int(attachmentOrder[i]) + 1``.

        Args:
            constSmi: SMILES text of the constant part.
            attachmentOrder: Indexable sequence (string or list) with one
                int-convertible entry per ``*`` in ``constSmi``.

        Raises:
            IndexError: If ``constSmi`` contains more ``*`` characters than
                ``attachmentOrder`` has entries.
        """
        pieces = []
        dummy_pos = 0
        for char in constSmi:
            if char == '*':
                char = f"[*:{int(attachmentOrder[dummy_pos]) + 1}]"
                dummy_pos += 1
            pieces.append(char)
        # Join once instead of growing a string character by character.
        return "".join(pieces)

    def index_var(self, varSmi):
        """Return ``varSmi`` with its ``*`` characters replaced by ``[*:1]``, ``[*:2]``, ...

        Labels are assigned in left-to-right order of appearance.
        """
        pieces = []
        dummy_count = 0
        for char in varSmi:
            if char == '*':
                dummy_count += 1
                char = f"[*:{dummy_count}]"
            pieces.append(char)
        return "".join(pieces)

    def add_index(self):
        """Label the dummy atoms of every row in place and return the DataFrame.

        New column values are computed positionally and assigned as whole
        columns. Writing row by row with ``.loc[label, ...]`` would overwrite
        every row sharing an index label, which corrupts DataFrames whose
        index is not unique.
        """
        df = self.df
        labelled_var = [self.index_var(smi) for smi in df['variable_smiles']]
        labelled_const = [
            self.index_constant(smi, order)
            for smi, order in zip(df['constant_smiles'], df['attachment_order'])
        ]
        # Assign only after both lists are built, so a failure leaves df untouched.
        df['variable_smiles'] = labelled_var
        df['constant_smiles'] = labelled_const
        return df
40
+
41
+
42
def count_heavy_atoms(smi):
    """Return the number of atoms in ``smi`` whose atomic number is above 1.

    Returns 0 when ``Chem.MolFromSmiles`` does not yield a usable molecule.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol:
        # Tally atoms one by one instead of materialising a temporary list.
        return sum(1 for atom in mol.GetAtoms() if atom.GetAtomicNum() > 1)
    return 0
48
 
49
+
50
def fragmentize_molecule(smiles_string: str, max_ratio: float = 0.5) -> pd.DataFrame:
    """Fragment a single molecule with the ``mmpdb`` command-line tool.

    Steps:
      1. Write the SMILES string (with the title "Molecule") to a scratch
         ``.smi`` file.
      2. Run ``mmpdb fragment`` on that file.
      3. Read the fragment records and keep the fragments whose variable
         part has fewer heavy atoms than ``max_ratio`` times the heavy-atom
         count of the record's normalized SMILES.
      4. Number the dummy atoms of the kept fragments via ``Index_Dummy``.

    Args:
        smiles_string: SMILES of the molecule to fragment.
        max_ratio: Upper bound (exclusive) on the variable part's share of
            heavy atoms.

    Returns:
        DataFrame with the columns ``variable_smiles``, ``constant_smiles``,
        ``record_id``, ``normalized_smiles`` and ``attachment_order``.

    Raises:
        RuntimeError: If the ``mmpdb`` command cannot be started or exits
            with a non-zero status.
        ValueError: If no fragment passes the size filter.
    """
    # Imported here so the block does not depend on edits to the file header.
    import subprocess
    import tempfile

    # A private scratch directory per call: concurrent calls no longer share
    # (and delete) each other's files, nothing in the working directory is
    # overwritten, and cleanup happens even when an exception is raised.
    with tempfile.TemporaryDirectory(prefix="mmpdb_fragment_") as tmp_dir:
        input_file = str(Path(tmp_dir) / "temp_input.smi")
        output_file = str(Path(tmp_dir) / "temp_output.fragments")

        with open(input_file, "w", encoding="utf-8") as f:
            f.write(smiles_string + "\t" + "Molecule" + "\n")

        # Argument list instead of a shell string: no quoting problems when
        # the scratch path contains spaces or shell metacharacters.
        try:
            completed = subprocess.run(
                ["mmpdb", "fragment", input_file, "-o", output_file],
                check=False,
            )
        except OSError as err:
            raise RuntimeError("mmpdb fragment 命令执行失败,请确保 mmpdb 工具安装并配置正确。") from err
        if completed.returncode != 0:
            raise RuntimeError("mmpdb fragment 命令执行失败,请确保 mmpdb 工具安装并配置正确。")

        frag_list = []
        for record in read_fragment_records(output_file):
            # Progress output retained from the previous implementation.
            print(f"Processing record: {record.id}, {record.normalized_smiles}")
            # Computed at most once per record, and only when the record
            # actually has fragments to compare against it.
            size_limit = None
            for frag in record.fragments:
                if size_limit is None:
                    size_limit = count_heavy_atoms(record.normalized_smiles) * max_ratio
                if count_heavy_atoms(frag.variable_smiles) < size_limit:
                    frag_list.append({
                        'variable_smiles': frag.variable_smiles,
                        'constant_smiles': frag.constant_smiles,
                        'record_id': record.id,
                        'normalized_smiles': record.normalized_smiles,
                        'attachment_order': frag.attachment_order,
                    })

    if not frag_list:
        raise ValueError("未找到满足筛选条件的碎片。")

    # Build the DataFrame and number the dummy atoms.
    df_frag = pd.DataFrame(frag_list)
    return Index_Dummy(df_frag).add_index()