ym59 committed on
Commit
2d38666
·
verified ·
1 Parent(s): 414b96e

Upload src/features/ligand.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. src/features/ligand.py +261 -0
src/features/ligand.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/features/ligand.py
2
+ #
3
+ # Ligand feature extraction — pure RDKit, zero ML models at inference.
4
+ # All operations: O(N_atoms) or O(N_atoms²) at worst → microseconds/mol.
5
+ #
6
+ # Feature blocks:
7
+ #
8
+ # BINARY FINGERPRINTS (presence/absence of substructure)
9
+ # ──────────────────────────────────────────────────────
10
+ # ecfp2 1024d Morgan r=1 — ultra-local atom neighbourhoods
11
+ # ecfp 1024d Morgan r=2 — standard local topology (ECFP4)
12
+ # ecfp6 1024d Morgan r=3 — extended neighbourhoods
13
+ # fcfp 1024d Functional class r=2 — pharmacophoric identity
14
+ # maccs 167d 166 SMARTS pharmacophore keys
15
+ # atom_pair 2048d All-pairs graph distance (global topology)
16
+ # torsion 2048d 4-atom rotatable bond paths (conformational)
17
+ # avalon 512d Avalon — completely different algorithm (Scitegic)
18
+ # rdkit_pat 2048d RDKit layered — ring + aromaticity + bond order
19
+ #
20
+ # COUNT FINGERPRINTS (how many times each substructure appears)
21
+ # ─────────────────────────────────────────────────────────────
22
+ # ecfp_count 1024d Morgan r=2 counts — 3 benzenes != 1 benzene
23
+ # ecfp6_count 1024d Morgan r=3 counts
24
+ #
25
+ # DENSE CONTINUOUS
26
+ # ────────────────
27
+ # estate 79d EState sum indices — electrotopological signal
28
+ # phys 217d RDKit full descriptor suite (RobustScaler normalised)
29
+ #
30
+ # Inference timing (HF Spaces free tier, 2 vCPU):
31
+ # Per SMILES: ~3-5 ms total (all fingerprints + descriptors)
32
+ # 1M compounds: ~50-80 min on single CPU core
33
+ # No GPU, no transformer, no external calls.
34
+
35
+ import numpy as np
36
+ from rdkit import Chem, DataStructs
37
+ from rdkit.Chem import AllChem, Descriptors, MACCSkeys, rdMolDescriptors
38
+ from rdkit.Chem.EState import Fingerprinter as EStateFP
39
+ from rdkit import RDLogger
40
+ from sklearn.preprocessing import RobustScaler
41
+
42
# Silence RDKit's own logging for the whole process.
RDLogger.DisableLog('rdApp.*')

# Descriptor table consumed by smiles_to_features(), which iterates it as
# (name, function) pairs and calls each function on the molecule.
_DESC_LIST = Descriptors._descList

# Avalon is an optional RDKit component. When it cannot be imported the
# module still loads and the 'avalon' feature block is filled with zeros.
try:
    from rdkit.Avalon.pyAvalonTools import GetAvalonFP as _GetAvalonFP
    _AVALON_OK = True
except ImportError:
    _AVALON_OK = False
    print(" WARNING: rdkit.Avalon not available — avalon features will be zeros. "
          "Reinstall RDKit with Avalon support if needed.")
52
+
53
+
54
def _fp_to_array(fp, size):
    """Copy an RDKit fingerprint vector into a fresh float32 array of length `size`."""
    arr = np.zeros(size, dtype=np.float32)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr


def smiles_to_features(smiles: str):
    """
    Convert a SMILES string to the full ligand feature dict.

    Args:
        smiles: SMILES string of a single molecule.

    Returns:
        None if `smiles` is not a string, is blank, or cannot be parsed by
        RDKit. Otherwise a dict of 1-D float32 arrays:

            'ecfp2', 'ecfp', 'ecfp6'     1024  binary Morgan, radius 1 / 2 / 3
            'fcfp'                       1024  binary Morgan r=2, useFeatures=True
            'maccs'                       167  MACCS keys
            'atom_pair', 'torsion'       2048  hashed atom-pair / topological torsion
            'avalon'                      512  Avalon (zeros if unavailable or failing)
            'rdkit_pat'                  2048  Chem.RDKFingerprint (zeros on failure)
            'ecfp_count', 'ecfp6_count'  1024  hashed Morgan counts, radius 2 / 3
            'estate'                           EState sum indices (79 zeros on failure)
            'phys'                             one value per entry of _DESC_LIST
    """
    # Reject non-string / blank input up front so the "None if invalid"
    # contract holds; RDKit raises on non-str arguments (e.g. None or NaN
    # coming from a dataframe column) instead of returning None.
    if not isinstance(smiles, str) or not smiles.strip():
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # ── Binary Morgan fingerprints ──────────────────────────────────────
    def _bin(radius, nbits=1024):
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nbits)
        return _fp_to_array(fp, nbits)

    ecfp2 = _bin(1)
    ecfp = _bin(2)  # ECFP4
    ecfp6 = _bin(3)

    fcfp = _fp_to_array(
        AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024, useFeatures=True),
        1024,
    )

    # ── Morgan COUNT fingerprints ───────────────────────────────────────
    # Counts how many times each substructure hashes to each bit, so a
    # molecule with 3 chloro-phenyl groups differs from one with 1.
    def _cnt(radius, nbits=1024):
        fp = AllChem.GetHashedMorganFingerprint(mol, radius, nBits=nbits)
        return _fp_to_array(fp, nbits)

    ecfp_count = _cnt(2)
    ecfp6_count = _cnt(3)

    # ── Avalon fingerprint (512d) ───────────────────────────────────────
    # Best-effort: any failure (or a build without Avalon) yields zeros so
    # the output layout stays fixed.
    avalon = np.zeros(512, dtype=np.float32)
    if _AVALON_OK:
        try:
            avalon = _fp_to_array(_GetAvalonFP(mol, nBits=512), 512)
        except Exception:
            avalon = np.zeros(512, dtype=np.float32)

    # ── RDKit fingerprint (2048d) ───────────────────────────────────────
    # NOTE: the key is 'rdkit_pat', but this calls Chem.RDKFingerprint,
    # not Chem.PatternFingerprint or Chem.LayeredFingerprint. The call is
    # kept as-is so features stay compatible with existing models.
    try:
        rdkit_pat = _fp_to_array(Chem.RDKFingerprint(mol, fpSize=2048), 2048)
    except Exception:
        rdkit_pat = np.zeros(2048, dtype=np.float32)

    # ── MACCS keys (167d) ───────────────────────────────────────────────
    maccs = _fp_to_array(MACCSkeys.GenMACCSKeys(mol), 167)

    # ── AtomPair binary (2048d) ─────────────────────────────────────────
    atom_pair = _fp_to_array(
        rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(mol, nBits=2048),
        2048,
    )

    # ── Topological Torsion binary (2048d) ──────────────────────────────
    torsion = _fp_to_array(
        rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(mol, nBits=2048),
        2048,
    )

    # ── EState sum indices (dense continuous) ───────────────────────────
    try:
        _, sum_e = EStateFP.FingerprintMol(mol)
        estate = np.array(sum_e, dtype=np.float64)
        # Sanitise in float64 first, then clip and narrow to float32.
        estate = np.nan_to_num(estate, nan=0.0, posinf=0.0, neginf=0.0)
        estate = np.clip(estate, -1e6, 1e6).astype(np.float32)
    except Exception:
        estate = np.zeros(79, dtype=np.float32)

    # ── RDKit physicochemical descriptors ───────────────────────────────
    # A descriptor that raises, or returns a non-finite / huge value,
    # contributes 0.0 so the vector always has len(_DESC_LIST) entries.
    phys = []
    for _, func in _DESC_LIST:
        try:
            v = float(func(mol))
            phys.append(v if (np.isfinite(v) and abs(v) < 1e15) else 0.0)
        except Exception:
            phys.append(0.0)

    return {
        'ecfp2': ecfp2,
        'ecfp': ecfp,
        'ecfp6': ecfp6,
        'fcfp': fcfp,
        'maccs': maccs,
        'atom_pair': atom_pair,
        'torsion': torsion,
        'avalon': avalon,
        'rdkit_pat': rdkit_pat,
        'ecfp_count': ecfp_count,
        'ecfp6_count': ecfp6_count,
        'estate': estate,
        'phys': np.array(phys, dtype=np.float32),
    }
164
+
165
+
166
def extract_ligand_features(smiles_list: list, scaler=None, fit_scaler: bool = False):
    """
    Extract ligand features for a list of SMILES strings.

    Args:
        smiles_list: list of SMILES strings
        scaler:      fitted RobustScaler (required if fit_scaler=False)
        fit_scaler:  if True, fit a new RobustScaler on the continuous
                     features (phys + estate) and use it

    Returns:
        feats:     dict of 2-D numpy arrays (one row per valid molecule),
                   one entry per feature type
        valid_idx: indices into smiles_list of the SMILES that were featurised
        scaler:    the RobustScaler that was applied

    Raises:
        ValueError: if fit_scaler is False and no scaler is given, or if no
                    SMILES in smiles_list could be featurised.

    Note: Binary fingerprints are passed through unscaled.
          Count fingerprints are log1p-transformed.
          'phys' and 'estate' are scaled together by one RobustScaler
          (phys columns first, then estate columns).
    """
    # Fail fast: without this check the error only surfaces after all
    # molecules are featurised, as an AttributeError on None.transform.
    if not fit_scaler and scaler is None:
        raise ValueError(
            "extract_ligand_features: a fitted scaler is required when fit_scaler=False"
        )

    binary_keys = ('ecfp2', 'ecfp', 'ecfp6', 'fcfp', 'maccs',
                   'atom_pair', 'torsion', 'avalon', 'rdkit_pat')
    count_keys = ('ecfp_count', 'ecfp6_count')
    dense_keys = ('estate', 'phys')

    # One accumulator list per feature block, keyed like the per-molecule dict.
    columns = {key: [] for key in binary_keys + count_keys + dense_keys}
    valid_idx = []

    for i, smi in enumerate(smiles_list):
        r = smiles_to_features(smi)
        if r is None:
            continue
        for key, rows in columns.items():
            rows.append(r[key])
        valid_idx.append(i)

    n_fail = len(smiles_list) - len(valid_idx)
    if n_fail:
        print(f" Ligand: {n_fail} SMILES failed to parse — dropped")

    # With zero rows the stacked arrays are 1-D and the axis=1 concatenate
    # below fails with an opaque axis error; report the real cause instead.
    if not valid_idx:
        raise ValueError("extract_ligand_features: no valid SMILES to featurise")

    # Continuous: clean then scale together
    phys_arr = np.nan_to_num(
        np.array(columns['phys'], dtype=np.float64),
        nan=0.0, posinf=0.0, neginf=0.0
    ).astype(np.float32)
    estate_arr = np.array(columns['estate'], dtype=np.float32)

    continuous = np.concatenate([phys_arr, estate_arr], axis=1)
    if fit_scaler:
        scaler = RobustScaler()
        scaler.fit(continuous)
    continuous_scaled = scaler.transform(continuous)
    n_phys = phys_arr.shape[1]
    phys_scaled = continuous_scaled[:, :n_phys]
    estate_scaled = continuous_scaled[:, n_phys:]

    feats = {
        key: np.array(columns[key], dtype=np.float32) for key in binary_keys
    }
    # Count FPs: log1p compresses large counts while keeping their ordering.
    for key in count_keys:
        feats[key] = np.log1p(np.array(columns[key], dtype=np.float32))
    feats['estate'] = estate_scaled
    feats['phys'] = phys_scaled

    total_dim = sum(v.shape[1] for v in feats.values())
    print(f" Ligand: {len(valid_idx)} molecules | {total_dim}d total")
    print(f" Binary: ecfp2={feats['ecfp2'].shape[1]} ecfp={feats['ecfp'].shape[1]} "
          f"ecfp6={feats['ecfp6'].shape[1]} fcfp={feats['fcfp'].shape[1]} "
          f"maccs={feats['maccs'].shape[1]} ap={feats['atom_pair'].shape[1]} "
          f"tors={feats['torsion'].shape[1]} avalon={feats['avalon'].shape[1]} "
          f"rdkit_pat={feats['rdkit_pat'].shape[1]}")
    print(f" Counts: ecfp_cnt={feats['ecfp_count'].shape[1]} "
          f"ecfp6_cnt={feats['ecfp6_count'].shape[1]}")
    print(f" Dense: estate={feats['estate'].shape[1]} "
          f"phys={feats['phys'].shape[1]}")

    return feats, valid_idx, scaler