Upload src/features/ligand.py with huggingface_hub
Browse files- src/features/ligand.py +261 -0
src/features/ligand.py
ADDED
|
@@ -0,0 +1,261 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# src/features/ligand.py
|
| 2 |
+
#
|
| 3 |
+
# Ligand feature extraction — pure RDKit, zero ML models at inference.
|
| 4 |
+
# All operations: O(N_atoms) or O(N_atoms²) at worst — microseconds/mol.
|
| 5 |
+
#
|
| 6 |
+
# Feature blocks:
|
| 7 |
+
#
|
| 8 |
+
# BINARY FINGERPRINTS (presence/absence of substructure)
|
| 9 |
+
# ─────────────────────────────────────────────────────
|
| 10 |
+
# ecfp2 1024d Morgan r=1 — ultra-local atom neighbourhoods
|
| 11 |
+
# ecfp 1024d Morgan r=2 — standard local topology (ECFP4)
|
| 12 |
+
# ecfp6 1024d Morgan r=3 — extended neighbourhoods
|
| 13 |
+
# fcfp 1024d Functional class r=2 — pharmacophoric identity
|
| 14 |
+
# maccs 167d 166 SMARTS pharmacophore keys
|
| 15 |
+
# atom_pair 2048d All-pairs graph distance (global topology)
|
| 16 |
+
# torsion 2048d 4-atom rotatable bond paths (conformational)
|
| 17 |
+
# avalon 512d Avalon — completely different algorithm (Scitegic)
|
| 18 |
+
# rdkit_pat 2048d RDKit layered — ring + aromaticity + bond order
|
| 19 |
+
#
|
| 20 |
+
# COUNT FINGERPRINTS (how many times each substructure appears)
|
| 21 |
+
# ─────────────────────────────────────────────────────────────
|
| 22 |
+
# ecfp_count 1024d Morgan r=2 counts — 3 benzenes != 1 benzene
|
| 23 |
+
# ecfp6_count 1024d Morgan r=3 counts
|
| 24 |
+
#
|
| 25 |
+
# DENSE CONTINUOUS
|
| 26 |
+
# ────────────────
|
| 27 |
+
# estate 79d EState sum indices — electrotopological signal
|
| 28 |
+
# phys 217d RDKit full descriptor suite (RobustScaler normalised)
|
| 29 |
+
#
|
| 30 |
+
# Inference timing (HF Spaces free tier, 2 vCPU):
|
| 31 |
+
# Per SMILES: ~3-5 ms total (all fingerprints + descriptors)
|
| 32 |
+
# 1M compounds: ~50-80 min on single CPU core
|
| 33 |
+
# No GPU, no transformer, no external calls.
|
| 34 |
+
|
| 35 |
+
import numpy as np
|
| 36 |
+
from rdkit import Chem, DataStructs
|
| 37 |
+
from rdkit.Chem import AllChem, Descriptors, MACCSkeys, rdMolDescriptors
|
| 38 |
+
from rdkit.Chem.EState import Fingerprinter as EStateFP
|
| 39 |
+
from rdkit import RDLogger
|
| 40 |
+
from sklearn.preprocessing import RobustScaler
|
| 41 |
+
|
| 42 |
+
RDLogger.DisableLog('rdApp.*')
|
| 43 |
+
_DESC_LIST = Descriptors._descList
|
| 44 |
+
|
| 45 |
+
# Optional dependency probe: the Avalon fingerprint lives in an RDKit
# component that may be missing from a given build. Record availability in
# _AVALON_OK so callers can fall back to an all-zero block instead of failing.
try:
    from rdkit.Avalon.pyAvalonTools import GetAvalonFP as _GetAvalonFP
    _AVALON_OK = True
except ImportError:
    # Keep the name bound on the fallback path so that any accidental use
    # fails with a clear "NoneType is not callable" rather than a NameError.
    _GetAvalonFP = None
    _AVALON_OK = False
    print(" WARNING: rdkit.Avalon not available — avalon features will be zeros. "
          "Reinstall RDKit with Avalon support if needed.")
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def smiles_to_features(smiles: str):
    """
    Convert a SMILES string to the full ligand feature dict.

    Args:
        smiles: SMILES string describing a single molecule.

    Returns:
        None if the input is not a string, is empty / whitespace-only, or
        cannot be parsed by RDKit. Otherwise a dict of 1-D float32 arrays:

            ecfp2, ecfp, ecfp6   Morgan bit vectors, radius 1 / 2 / 3 (1024 bits)
            fcfp                 feature-based Morgan bit vector, radius 2 (1024)
            maccs                MACCS keys (167)
            atom_pair            hashed atom-pair bit vector (2048)
            torsion              hashed topological-torsion bit vector (2048)
            avalon               Avalon fingerprint (512); zeros if unavailable
            rdkit_pat            Chem.RDKFingerprint (2048); zeros on failure
            ecfp_count           hashed Morgan counts, radius 2 (1024)
            ecfp6_count          hashed Morgan counts, radius 3 (1024)
            estate               EState sum indices (zeros(79) on failure)
            phys                 one value per entry of _DESC_LIST

        All values are raw (unscaled); scaling / log1p happens in
        extract_ligand_features.
    """
    # Non-string input (None, NaN from a dataframe column, ...) would make
    # RDKit raise an argument error, and an empty string parses to an empty
    # molecule whose features are all meaningless zeros. Treat both as invalid.
    if not isinstance(smiles, str) or not smiles.strip():
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    def _as_array(fp, size):
        # Copy an RDKit fingerprint (bit vector or sparse count vector) into
        # a float32 numpy array of the requested length.
        out = np.zeros(size, dtype=np.float32)
        DataStructs.ConvertToNumpyArray(fp, out)
        return out

    def _zeros_on_error(make_fp, size):
        # Best-effort block: any failure in an optional fingerprint yields an
        # all-zero block rather than dropping the whole molecule.
        try:
            return _as_array(make_fp(), size)
        except Exception:
            return np.zeros(size, dtype=np.float32)

    # -- Binary Morgan fingerprints ------------------------------------
    def _bin(radius, nbits=1024):
        return _as_array(
            AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nbits),
            nbits,
        )

    ecfp2 = _bin(1)
    ecfp = _bin(2)  # ECFP4
    ecfp6 = _bin(3)

    # Feature-class (FCFP-style) variant of the radius-2 Morgan fingerprint.
    fcfp = _as_array(
        AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024, useFeatures=True),
        1024,
    )

    # -- Morgan COUNT fingerprints -------------------------------------
    # Stores how many times each environment hashes to a bin, so repeated
    # substructures are distinguishable from a single occurrence.
    def _cnt(radius, nbits=1024):
        return _as_array(
            AllChem.GetHashedMorganFingerprint(mol, radius, nBits=nbits),
            nbits,
        )

    ecfp_count = _cnt(2)
    ecfp6_count = _cnt(3)

    # -- Avalon fingerprint (512d) -------------------------------------
    # Only attempted when the optional Avalon module imported successfully.
    if _AVALON_OK:
        avalon = _zeros_on_error(lambda: _GetAvalonFP(mol, nBits=512), 512)
    else:
        avalon = np.zeros(512, dtype=np.float32)

    # -- RDKit fingerprint (2048d) -------------------------------------
    # NOTE: despite the 'rdkit_pat' key, this is Chem.RDKFingerprint, not
    # Chem.PatternFingerprint / LayeredFingerprint. The key and the call are
    # kept as-is so downstream consumers see unchanged features.
    rdkit_pat = _zeros_on_error(lambda: Chem.RDKFingerprint(mol, fpSize=2048), 2048)

    # -- MACCS keys (167d) ---------------------------------------------
    maccs = _as_array(MACCSkeys.GenMACCSKeys(mol), 167)

    # -- AtomPair binary (2048d) ---------------------------------------
    atom_pair = _as_array(
        rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(mol, nBits=2048),
        2048,
    )

    # -- Topological Torsion binary (2048d) ----------------------------
    torsion = _as_array(
        rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(mol, nBits=2048),
        2048,
    )

    # -- EState sum indices (dense continuous) -------------------------
    # Sanitised in float64 first (NaN/inf -> 0, clipped to +/-1e6), then
    # narrowed to float32.
    try:
        _, sum_e = EStateFP.FingerprintMol(mol)
        estate = np.array(sum_e, dtype=np.float64)
        estate = np.nan_to_num(estate, nan=0.0, posinf=0.0, neginf=0.0)
        estate = np.clip(estate, -1e6, 1e6).astype(np.float32)
    except Exception:
        estate = np.zeros(79, dtype=np.float32)

    # -- RDKit physicochemical descriptors -----------------------------
    # One value per descriptor in _DESC_LIST. A descriptor that raises, or
    # returns a non-finite / absurdly large value (|v| >= 1e15), becomes 0.0
    # so the vector length stays constant across molecules.
    phys = []
    for _, func in _DESC_LIST:
        try:
            v = float(func(mol))
            phys.append(v if (np.isfinite(v) and abs(v) < 1e15) else 0.0)
        except Exception:
            phys.append(0.0)

    return {
        'ecfp2': ecfp2,
        'ecfp': ecfp,
        'ecfp6': ecfp6,
        'fcfp': fcfp,
        'maccs': maccs,
        'atom_pair': atom_pair,
        'torsion': torsion,
        'avalon': avalon,
        'rdkit_pat': rdkit_pat,
        'ecfp_count': ecfp_count,
        'ecfp6_count': ecfp6_count,
        'estate': estate,
        'phys': np.array(phys, dtype=np.float32),
    }
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def extract_ligand_features(smiles_list: list, scaler=None, fit_scaler: bool = False):
    """
    Extract ligand features for a list of SMILES strings.

    Args:
        smiles_list: list of SMILES strings
        scaler:      fitted RobustScaler (required if fit_scaler=False)
        fit_scaler:  if True, fit a new RobustScaler on the continuous
                     features (phys + estate) of this batch; any scaler
                     passed in is then ignored and replaced

    Returns:
        feats:     dict of 2-D numpy arrays (n_valid rows), one per feature
                   type, in the order ecfp2, ecfp, ecfp6, fcfp, maccs,
                   atom_pair, torsion, avalon, rdkit_pat, ecfp_count,
                   ecfp6_count, estate, phys
        valid_idx: indices into smiles_list of successfully parsed SMILES
        scaler:    the RobustScaler that was used (newly fitted or passed in)

    Raises:
        ValueError: if fit_scaler is False and no scaler is supplied, or if
                    no SMILES in smiles_list could be converted to features.

    Note: Binary fingerprints are passed through unscaled.
          Count fingerprints are log1p-transformed.
          phys and estate are scaled jointly by the RobustScaler
          (phys columns first, then estate).
    """
    # Fail fast, before any featurisation work, instead of dying later with
    # "'NoneType' object has no attribute 'transform'".
    if not fit_scaler and scaler is None:
        raise ValueError(
            "extract_ligand_features: a fitted scaler is required when "
            "fit_scaler=False"
        )

    # Output key order is preserved exactly: callers may iterate the dict.
    binary_keys = ('ecfp2', 'ecfp', 'ecfp6', 'fcfp', 'maccs',
                   'atom_pair', 'torsion', 'avalon', 'rdkit_pat')
    count_keys = ('ecfp_count', 'ecfp6_count')
    dense_keys = ('estate', 'phys')
    all_keys = binary_keys + count_keys + dense_keys

    columns = {key: [] for key in all_keys}
    valid_idx = []

    for i, smi in enumerate(smiles_list):
        r = smiles_to_features(smi)
        if r is None:
            continue
        for key in all_keys:
            columns[key].append(r[key])
        valid_idx.append(i)

    n_fail = len(smiles_list) - len(valid_idx)
    if n_fail:
        print(f" Ligand: {n_fail} SMILES failed to parse — dropped")

    # With zero rows the arrays below would be 1-D and the axis=1
    # concatenation would raise an opaque AxisError; report the real problem.
    if not valid_idx:
        raise ValueError(
            "extract_ligand_features: no valid SMILES to featurise "
            f"({len(smiles_list)} given)"
        )

    # Continuous: clean then scale together
    phys_arr = np.nan_to_num(
        np.array(columns['phys'], dtype=np.float64),
        nan=0.0, posinf=0.0, neginf=0.0
    ).astype(np.float32)
    estate_arr = np.array(columns['estate'], dtype=np.float32)
    n_phys = phys_arr.shape[1]

    continuous = np.concatenate([phys_arr, estate_arr], axis=1)
    if fit_scaler:
        scaler = RobustScaler()
        scaler.fit(continuous)
    continuous_scaled = scaler.transform(continuous)

    # Binary fingerprints: stacked as-is.
    feats = {key: np.array(columns[key], dtype=np.float32) for key in binary_keys}
    # Count FPs: log1p compresses large counts while keeping magnitude order.
    for key in count_keys:
        feats[key] = np.log1p(np.array(columns[key], dtype=np.float32))
    # Split the jointly scaled block back into its two parts.
    feats['estate'] = continuous_scaled[:, n_phys:]
    feats['phys'] = continuous_scaled[:, :n_phys]

    total_dim = sum(v.shape[1] for v in feats.values())
    print(f" Ligand: {len(valid_idx)} molecules | {total_dim}d total")
    print(f" Binary: ecfp2={feats['ecfp2'].shape[1]} ecfp={feats['ecfp'].shape[1]} "
          f"ecfp6={feats['ecfp6'].shape[1]} fcfp={feats['fcfp'].shape[1]} "
          f"maccs={feats['maccs'].shape[1]} ap={feats['atom_pair'].shape[1]} "
          f"tors={feats['torsion'].shape[1]} avalon={feats['avalon'].shape[1]} "
          f"rdkit_pat={feats['rdkit_pat'].shape[1]}")
    print(f" Counts: ecfp_cnt={feats['ecfp_count'].shape[1]} "
          f"ecfp6_cnt={feats['ecfp6_count'].shape[1]}")
    print(f" Dense: estate={feats['estate'].shape[1]} "
          f"phys={feats['phys'].shape[1]}")

    return feats, valid_idx, scaler
|