Spaces:
Sleeping
Sleeping
| """Data preprocessing: SMILES -> Morgan fingerprints + RDKit descriptors.""" | |
| import logging | |
| import os | |
| import pickle | |
| # Suppress RDKit warnings | |
| logging.getLogger("rdkit").setLevel(logging.ERROR) | |
| os.environ["RDKIT_VERBOSE"] = "0" | |
| from pathlib import Path | |
| import numpy as np | |
| import pandas as pd | |
| from utils import RANDOM_STATE, get_data_path | |
# Names of the RDKit 2D descriptors used as features (20 entries).
# Names that the installed RDKit does not provide are filtered out at
# runtime by _get_descriptor_list(); the order here fixes the column order.
RDKIT_DESCRIPTOR_NAMES = [
    "MolWt", "MolLogP", "NumHDonors", "NumHAcceptors",
    "NumRotatableBonds", "NumValenceElectrons", "NumRadicalElectrons",
    "TPSA", "LabuteASA", "HeavyAtomMolWt", "ExactMolWt",
    "NumHeteroatoms", "FractionCSP3",
    "NumSaturatedRings", "NumAliphaticRings", "NumAromaticRings", "RingCount",
    "NumAmideBonds", "NumSaturatedHeterocycles", "NumSaturatedCarbocycles",
]
def _get_descriptor_list():
    """Return the entries of RDKIT_DESCRIPTOR_NAMES that rdkit.Chem.Descriptors exposes.

    The original order of RDKIT_DESCRIPTOR_NAMES is preserved.
    """
    from rdkit.Chem import Descriptors

    def is_available(name):
        # A descriptor counts as available when the module has an attribute of that name.
        return hasattr(Descriptors, name)

    return list(filter(is_available, RDKIT_DESCRIPTOR_NAMES))
| def _get_morgan_fingerprint(smiles: str, n_bits: int = 1024, radius: int = 2) -> np.ndarray | None: | |
| """Convert SMILES to Morgan fingerprint. Returns None if invalid.""" | |
| try: | |
| from rdkit import Chem | |
| from rdkit.Chem import AllChem | |
| mol = Chem.MolFromSmiles(smiles) | |
| if mol is None: | |
| return None | |
| fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits) | |
| return np.array(fp, dtype=np.float32) | |
| except Exception: | |
| return None | |
| def _get_rdkit_descriptors(smiles: str, desc_names: list) -> np.ndarray | None: | |
| """Compute RDKit 2D descriptors. Returns None if invalid.""" | |
| try: | |
| from rdkit import Chem | |
| from rdkit.Chem import Descriptors | |
| mol = Chem.MolFromSmiles(smiles) | |
| if mol is None: | |
| return None | |
| desc = [getattr(Descriptors, n)(mol) for n in desc_names] | |
| arr = np.array(desc, dtype=np.float32) | |
| if np.any(np.isnan(arr)) or np.any(np.isinf(arr)): | |
| return None | |
| return arr | |
| except Exception: | |
| return None | |
def preprocess(*, n_bits: int = 1024, radius: int = 2) -> tuple[np.ndarray, np.ndarray, list[str]]:
    """Build the feature matrix and label vector from the raw CSV.

    Reads the CSV at ``get_data_path()``, featurizes every non-blank
    ``Canonical_Smiles`` entry as a Morgan fingerprint followed by the
    available RDKit descriptors, and drops rows that cannot be featurized.

    Args:
        n_bits: Length of the Morgan fingerprint bit vector.
        radius: Morgan fingerprint radius.

    Returns:
        ``(X, y, feature_names)``: ``X`` is a float32 array of shape
        ``(n_valid, n_bits + n_descriptors)`` (two-dimensional even when no
        row is valid), ``y`` holds the ``Activity`` values of the kept rows,
        and ``feature_names`` labels the columns of ``X``.
    """
    df = pd.read_csv(get_data_path())
    df = df.dropna(subset=["Canonical_Smiles"])
    smiles = df["Canonical_Smiles"].astype(str).str.strip()
    smiles = smiles[smiles != ""]

    desc_names = _get_descriptor_list()
    fps_list = []
    descs_list = []
    valid_idx = []
    for idx, smi in smiles.items():
        fp = _get_morgan_fingerprint(smi, n_bits=n_bits, radius=radius)
        if fp is None:
            # The row is dropped either way, so skip the descriptor computation.
            continue
        desc = _get_rdkit_descriptors(smi, desc_names)
        if desc is None:
            continue
        fps_list.append(fp)
        descs_list.append(desc)
        valid_idx.append(idx)

    # Explicit reshape keeps both blocks 2-D when no row survived; without it
    # hstack would return a 1-D empty array instead of (0, n_features).
    n_rows = len(valid_idx)
    fps = np.asarray(fps_list, dtype=np.float32).reshape(n_rows, n_bits)
    descs = np.asarray(descs_list, dtype=np.float32).reshape(n_rows, len(desc_names))
    X = np.hstack([fps, descs])

    # valid_idx holds index labels of df, so the labels line up with the rows of X.
    y = df.loc[valid_idx, "Activity"].values
    feature_names = [f"morgan_{i}" for i in range(n_bits)] + desc_names
    return X, y, feature_names
def save_preprocessed(output_dir: Path) -> tuple[np.ndarray, np.ndarray, list[str]]:
    """Run preprocess() and persist its outputs under output_dir.

    Creates output_dir (including parents) if needed, then writes
    features.npy, feature_names.pkl (a pickled list) and labels.npy there.
    Returns the same (X, y, feature_names) tuple that preprocess() produced.
    """
    target = Path(output_dir)
    target.mkdir(parents=True, exist_ok=True)

    features, labels, feature_names = preprocess()

    np.save(target / "features.npy", features)
    with (target / "feature_names.pkl").open("wb") as handle:
        pickle.dump(feature_names, handle)
    np.save(target / "labels.npy", labels)

    return features, labels, feature_names