"""RDKit-based molecular feature extraction utilities. This module supports both per-drug and pair-level features for DDI modeling. """ from __future__ import annotations import logging from dataclasses import dataclass from typing import Any, Dict, List, Tuple import numpy as np from sklearn.preprocessing import StandardScaler try: from rdkit import DataStructs from rdkit import Chem from rdkit.Chem import AllChem, Descriptors except Exception: # pragma: no cover DataStructs = None # type: ignore Chem = None # type: ignore AllChem = None # type: ignore Descriptors = None # type: ignore logger = logging.getLogger("medcare_ddi.molfeat") @dataclass class MoleculeFeatureConfig: n_bits: int = 1024 radius: int = 2 normalize_descriptors: bool = True DESCRIPTOR_NAMES = [ 'MolWt', 'LogP', 'NumHDonors', 'NumHAcceptors', 'TPSA', 'NumAtoms', 'NumRings', 'NumRotatableBonds', ] PAIR_SIMILARITY_NAMES = [ 'tanimoto', 'dice', 'cosine', 'both_valid', 'any_invalid', ] def _safe_mol(smiles: str): if Chem is None: raise RuntimeError("RDKit not installed. Install rdkit-pypi or use conda.") try: mol = Chem.MolFromSmiles(smiles or '') return mol except Exception: return None def _desc_vector(mol) -> np.ndarray: if mol is None: return np.zeros((len(DESCRIPTOR_NAMES),), dtype=np.float32) return np.array( [ float(Descriptors.MolWt(mol)), float(Descriptors.MolLogP(mol)), float(Descriptors.NumHDonors(mol)), float(Descriptors.NumHAcceptors(mol)), float(Descriptors.TPSA(mol)), float(mol.GetNumAtoms()), float(Descriptors.RingCount(mol)), float(Descriptors.NumRotatableBonds(mol)), ], dtype=np.float32, ) def _fingerprint(mol, radius: int, n_bits: int) -> np.ndarray: if mol is None: return np.zeros((n_bits,), dtype=np.float32) bitvect = AllChem.GetMorganFingerprintAsBitVect(mol, radius, n_bits) arr = np.zeros((n_bits,), dtype=np.int8) DataStructs.ConvertToNumpyArray(bitvect, arr) return arr.astype(np.float32) def _tanimoto(mol_a, mol_b, radius: int, n_bits: int) -> float: if mol_a is None or mol_b is None: return 0.0 fp_a = AllChem.GetMorganFingerprintAsBitVect(mol_a, radius, n_bits) fp_b = AllChem.GetMorganFingerprintAsBitVect(mol_b, radius, n_bits) return float(DataStructs.TanimotoSimilarity(fp_a, fp_b)) def _pair_similarity_features(mol_a, mol_b, radius: int, n_bits: int) -> np.ndarray: if mol_a is None or mol_b is None: return np.array([0.0, 0.0, 0.0, 0.0, 1.0], dtype=np.float32) fp_a = AllChem.GetMorganFingerprintAsBitVect(mol_a, radius, n_bits) fp_b = AllChem.GetMorganFingerprintAsBitVect(mol_b, radius, n_bits) tanimoto = float(DataStructs.TanimotoSimilarity(fp_a, fp_b)) dice = float(DataStructs.DiceSimilarity(fp_a, fp_b)) cosine = float(DataStructs.CosineSimilarity(fp_a, fp_b)) return np.array([tanimoto, dice, cosine, 1.0, 0.0], dtype=np.float32) def smiles_to_features(smiles_list: List[str], n_bits: int = 1024, radius: int = 2) -> Tuple[np.ndarray, List[Dict[str, Any]]]: """Convert a list of SMILES to fingerprint vectors and descriptor metadata. Returns: X: np.ndarray shape (N, n_bits + len(DESCRIPTOR_NAMES)) meta: list[dict] with descriptors per molecule """ fps: List[np.ndarray] = [] descs: List[np.ndarray] = [] metas = [] for s in smiles_list: mol = _safe_mol(s) if mol is None: logger.warning(f"Invalid SMILES: {s}") fp = np.zeros((n_bits,), dtype=np.float32) dvec = np.zeros((len(DESCRIPTOR_NAMES),), dtype=np.float32) meta = {"valid": False} else: fp = _fingerprint(mol, radius=radius, n_bits=n_bits) dvec = _desc_vector(mol) meta = { "valid": True, **{k: float(v) for k, v in zip(DESCRIPTOR_NAMES, dvec.tolist())}, } fps.append(fp) descs.append(dvec) metas.append(meta) X_fp = np.vstack(fps) X_meta = np.vstack(descs) X = np.hstack([X_fp, X_meta]) return X, metas class MolecularFeatureExtractor: """Pair-level molecular feature extractor with descriptor normalization.""" def __init__(self, config: MoleculeFeatureConfig | None = None): self.config = config or MoleculeFeatureConfig() self.scaler = StandardScaler() self._is_fitted = False def fit(self, smiles_pairs: List[Tuple[str, str]]) -> None: desc_rows: List[np.ndarray] = [] for s_a, s_b in smiles_pairs: mol_a = _safe_mol(s_a) mol_b = _safe_mol(s_b) d_a = _desc_vector(mol_a) d_b = _desc_vector(mol_b) d_delta = np.abs(d_a - d_b) sim = _pair_similarity_features(mol_a, mol_b, self.config.radius, self.config.n_bits) desc_rows.append(np.concatenate([d_a, d_b, d_delta, sim], axis=0)) matrix = np.vstack(desc_rows) if desc_rows else np.zeros((0, len(DESCRIPTOR_NAMES) * 3 + len(PAIR_SIMILARITY_NAMES)), dtype=np.float32) if matrix.shape[0] > 0: self.scaler.fit(matrix) self._is_fitted = True def transform(self, smiles_pairs: List[Tuple[str, str]]) -> np.ndarray: rows: List[np.ndarray] = [] for s_a, s_b in smiles_pairs: mol_a = _safe_mol(s_a) mol_b = _safe_mol(s_b) fp_a = _fingerprint(mol_a, self.config.radius, self.config.n_bits) fp_b = _fingerprint(mol_b, self.config.radius, self.config.n_bits) fp_pair = np.abs(fp_a - fp_b) d_a = _desc_vector(mol_a) d_b = _desc_vector(mol_b) d_delta = np.abs(d_a - d_b) sim = _pair_similarity_features(mol_a, mol_b, self.config.radius, self.config.n_bits) desc = np.concatenate([d_a, d_b, d_delta, sim], axis=0) if self.config.normalize_descriptors and self._is_fitted: desc = self.scaler.transform(desc.reshape(1, -1)).reshape(-1).astype(np.float32) rows.append(np.concatenate([fp_pair, desc], axis=0).astype(np.float32)) return np.vstack(rows) if rows else np.zeros((0, self.config.n_bits + (len(DESCRIPTOR_NAMES) * 3 + len(PAIR_SIMILARITY_NAMES))), dtype=np.float32) def fit_transform(self, smiles_pairs: List[Tuple[str, str]]) -> np.ndarray: self.fit(smiles_pairs) return self.transform(smiles_pairs) if __name__ == '__main__': test_smiles = ['CC(=O)OC1=CC=CC=C1C(=O)O', 'C1=CC=CC=C1', 'INVALID_SMILES'] X, metas = smiles_to_features(test_smiles) print('X shape:', X.shape) print(metas)