molecular / molecule /descriptors.py
ivanm151's picture
init
6796365
import functools
import os

import numpy as np
from rdkit import Chem
from rdkit.Chem import (
    Descriptors, rdMolDescriptors, Crippen, Lipinski, QED, AllChem,
    ChemicalFeatures
)
from rdkit.Chem import rdMolDescriptors as rdmd
from rdkit.ML.Descriptors import MoleculeDescriptors
@functools.lru_cache(maxsize=1)
def load_feature_factory():
    """Return the standard RDKit feature factory built from ``BaseFeatures.fdef``.

    The definition file is looked up under ``RDConfig.RDDataDir``. The factory
    is built on the first call and the same instance is handed back on every
    later call, so callers that process many molecules do not re-read and
    re-parse the definition file each time.

    Returns:
        The object produced by ``ChemicalFeatures.BuildFeatureFactory``.
    """
    # Imported lazily so the module can be imported without touching RDConfig.
    from rdkit import RDConfig

    fdef_path = os.path.join(RDConfig.RDDataDir, "BaseFeatures.fdef")
    return ChemicalFeatures.BuildFeatureFactory(fdef_path)
def compute_gasteiger_stats(mol):
    """Return summary statistics of the Gasteiger charges of ``mol``, or None.

    ``AllChem.ComputeGasteigerCharges`` is run on ``mol`` (which stores a
    ``_GasteigerCharge`` property on each atom), and the per-atom values are
    summarised.

    Args:
        mol: RDKit molecule whose atoms are iterated with ``GetAtoms()``.

    Returns:
        A dict with float entries ``"mean"``, ``"max"`` and ``"min"``, or
        ``None`` when the charges could not be computed, the molecule has no
        atoms, or any charge is not a finite number.
    """
    try:
        AllChem.ComputeGasteigerCharges(mol)
        charges = np.array(
            [atom.GetDoubleProp("_GasteigerCharge") for atom in mol.GetAtoms()],
            dtype=float,
        )
    except Exception:
        # Best-effort by design: any failure of the charge calculation is
        # reported to the caller as "no statistics available".
        return None

    # NaN/inf charges would silently poison the mean and make max()/min()
    # order-dependent, so treat them as a failed calculation instead.
    if charges.size == 0 or not np.all(np.isfinite(charges)):
        return None

    return {
        "mean": float(charges.mean()),
        "max": float(charges.max()),
        "min": float(charges.min()),
    }
def compute_morgan_fp(mol, radius=2, n_bits=2048):
    """Return a short summary of the Morgan bit-vector fingerprint of ``mol``.

    Args:
        mol: RDKit molecule to fingerprint.
        radius: Morgan radius passed to the fingerprint function.
        n_bits: Length of the bit vector passed to the fingerprint function.

    Returns:
        A dict with ``"n_bits"`` (the requested length) and ``"bits_on"``
        (the number of set bits in the fingerprint).
    """
    # NOTE(review): recent RDKit releases mark GetMorganFingerprintAsBitVect as
    # deprecated in favour of rdFingerprintGenerator - confirm against the
    # RDKit version this project pins before migrating.
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(
        mol, radius=radius, nBits=n_bits
    )
    return {
        "n_bits": n_bits,
        # Ask the bit vector directly instead of building an n_bits-long
        # string only to count its "1" characters.
        "bits_on": fingerprint.GetNumOnBits(),
    }
def compute_extra_descriptors(mol, n_show=10):
    """Compute the first ``n_show`` descriptors from RDKit's descriptor list.

    The descriptor names are taken, in order, from ``Descriptors._descList``
    and sliced with ``[:n_show]`` before the calculator is built, so only the
    descriptors that are actually returned get evaluated.

    Args:
        mol: RDKit molecule to describe.
        n_show: Slice bound applied to the descriptor name list.

    Returns:
        A dict with ``"names"`` (the selected descriptor names as reported by
        the calculator) and ``"values"`` (the calculator output for ``mol``).
    """
    selected_names = [name for name, _ in Descriptors._descList][:n_show]
    calculator = MoleculeDescriptors.MolecularDescriptorCalculator(selected_names)
    return {
        "names": calculator.GetDescriptorNames(),
        "values": calculator.CalcDescriptors(mol),
    }
def get_molecule_properties(smiles):
    """Return descriptors and properties of the molecule given as SMILES.

    The SMILES is parsed once. Graph-based quantities (ring counts,
    physico-chemical descriptors, pharmacophore features, QED, Morgan
    fingerprint, extra descriptors) are computed on the parsed molecule.
    A copy with explicit hydrogens is used only for the atom/bond counts and
    the Gasteiger charge statistics.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        A dict with the keys ``"smiles"``, ``"base"``, ``"physchem"``,
        ``"charges"`` (dict or None), ``"pharmacophore"``, ``"qed"`` (value or
        None), ``"lipinski"``, ``"fingerprint"`` and ``"extra_descriptors"``.

    Raises:
        ValueError: If RDKit cannot build a molecule from ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError("Не удалось создать молекулу из SMILES.")

    # Explicit hydrogens are kept on a separate copy. Feeding an H-expanded
    # graph to degree-based SMARTS definitions (rotatable bonds, feature
    # definitions) or to atom-environment fingerprints makes every H count as
    # an ordinary neighbour and can change those results.
    mol_with_hs = Chem.AddHs(mol)

    # --- Basic properties ---
    base = {
        # Atom and bond counts include explicit hydrogens.
        "n_atoms": mol_with_hs.GetNumAtoms(),
        "n_bonds": mol_with_hs.GetNumBonds(),
        "canonical_smiles": Chem.MolToSmiles(mol, canonical=True),
        "aromatic_rings": rdmd.CalcNumAromaticRings(mol),
        "aliphatic_rings": rdmd.CalcNumAliphaticRings(mol),
        "heterocycles": rdmd.CalcNumHeterocycles(mol),
    }

    # --- Physico-chemical descriptors ---
    physchem = {
        "mol_weight": Descriptors.MolWt(mol),
        "logp": Crippen.MolLogP(mol),
        "tpsa": Descriptors.TPSA(mol),
        "h_donors": Lipinski.NumHDonors(mol),
        "h_acceptors": Lipinski.NumHAcceptors(mol),
        "rotatable_bonds": Descriptors.NumRotatableBonds(mol),
        "fraction_csp3": rdMolDescriptors.CalcFractionCSP3(mol),
    }

    # Charge statistics cover every atom, hydrogens included.
    charges = compute_gasteiger_stats(mol_with_hs)

    # --- Pharmacophore features ---
    features = load_feature_factory().GetFeaturesForMol(mol)
    pharm = {
        "n_features": len(features),
        "types": sorted({feature.GetFamily() for feature in features}),
    }

    # QED is optional: a failure is reported as None rather than aborting.
    try:
        qed_val = QED.qed(mol)
    except Exception:
        qed_val = None

    # --- Lipinski rule checks ---
    # Each threshold is evaluated once; the overall verdict is derived from
    # the individual flags so the two can never disagree.
    lipinski = {
        "mw_ok": physchem["mol_weight"] <= 500,
        "logp_ok": physchem["logp"] <= 5,
        "h_donors_ok": physchem["h_donors"] <= 5,
        "h_acceptors_ok": physchem["h_acceptors"] <= 10,
    }
    lipinski["pass_"] = all(lipinski.values())

    return {
        "smiles": smiles,
        "base": base,
        "physchem": physchem,
        "charges": charges,
        "pharmacophore": pharm,
        "qed": qed_val,
        "lipinski": lipinski,
        "fingerprint": compute_morgan_fp(mol),
        "extra_descriptors": compute_extra_descriptors(mol),
    }
# Example output: print every section of the report for aspirin.
if __name__ == "__main__":
    report = get_molecule_properties("CC(=O)OC1=CC=CC=C1C(=O)O")
    for title, payload in report.items():
        # Section header in upper case, followed by the raw section value.
        print(f"\n=== {title.upper()} ===")
        print(payload)