# Preformu/utils/molecule_renderer.py
# Uploaded by Kevinshh ("Upload molecule_renderer.py", commit 8368021, verified)
"""
Molecular Structure Renderer.
This module provides molecular structure visualization using RDKit:
- 2D structure rendering
- Morgan fingerprint visualization
- Molecular property calculation
For web-based 3D visualization, we also support 3Dmol.js integration.
"""
import io
import base64
from typing import Optional, Dict, Any, Tuple, List
from dataclasses import dataclass
@dataclass
class MoleculeInfo:
    """Bundle of everything known about a single molecule.

    Only ``smiles`` is required. Every other field defaults to ``None`` and
    is meant to be filled in once the corresponding value has been computed
    or rendered.
    """

    # --- Identity -----------------------------------------------------------
    smiles: str
    name: Optional[str] = None

    # --- Computed descriptors -----------------------------------------------
    molecular_weight: Optional[float] = None
    logp: Optional[float] = None
    hbd: Optional[int] = None  # hydrogen-bond donor count
    hba: Optional[int] = None  # hydrogen-bond acceptor count
    tpsa: Optional[float] = None  # topological polar surface area
    rotatable_bonds: Optional[int] = None

    # --- Rendered artefacts -------------------------------------------------
    structure_2d_svg: Optional[str] = None
    structure_2d_png_base64: Optional[str] = None
    fingerprint_svg: Optional[str] = None
class MoleculeRenderer:
    """
    Renders molecular structures using RDKit.

    Provides:
    - 2D structure images (SVG and PNG)
    - Morgan fingerprint bit visualization
    - Basic property calculations
    - SMARTS-based functional group identification

    RDKit is imported lazily inside the methods that need it. When the
    availability check at construction time fails, every public method
    returns ``None`` (or an empty list / placeholder object) instead of
    raising.
    """

    # Labels that are stripped from the front of a SMILES string.
    _SMILES_PREFIXES = ('SMILES:', 'smiles:', 'SMILES=', 'smiles=')

    # Full-width / CJK punctuation mapped to the ASCII symbols SMILES uses.
    # Written as escapes so the table survives tools that normalize
    # full-width characters in source text.
    _FULLWIDTH_TO_ASCII = str.maketrans({
        '\uff08': '(',  # fullwidth left parenthesis
        '\uff09': ')',  # fullwidth right parenthesis
        '\u3010': '[',  # left black lenticular bracket
        '\u3011': ']',  # right black lenticular bracket
        '\uff3b': '[',  # fullwidth left square bracket
        '\uff3d': ']',  # fullwidth right square bracket
        '\uff1d': '=',  # fullwidth equals sign
        '\uff03': '#',  # fullwidth number sign
    })

    # SMARTS patterns for pharmaceutically relevant functional groups.
    # Insertion order is the order in which groups are reported.
    _FUNCTIONAL_GROUP_PATTERNS = {
        # Amines
        "primary_amine": {
            "smarts": "[NX3H2;!$([NX3H2]-C=O)]",
            "name_cn": "伯胺基团",
            "name_en": "Primary Amine",
            "property_type": "碱性",
            "reactions": ["美拉德反应(Maillard Reaction)", "氧化脱氨(Oxidative Deamination)", "席夫碱形成(Schiff Base)"],
        },
        "secondary_amine": {
            "smarts": "[NX3H1;!$([NX3H1]-C=O)]([#6])([#6])",
            "name_cn": "仲胺基团",
            "name_en": "Secondary Amine",
            "property_type": "碱性",
            "reactions": ["美拉德反应(Maillard Reaction)", "N-氧化(N-Oxidation)"],
        },
        "tertiary_amine": {
            "smarts": "[NX3H0;!$([NX3]-C=O)]([#6])([#6])([#6])",
            "name_cn": "叔胺基团",
            "name_en": "Tertiary Amine",
            "property_type": "碱性",
            "reactions": ["N-氧化(N-Oxidation)"],
        },
        # Thiols and thioethers
        "thiol": {
            "smarts": "[SH]",
            "name_cn": "巯基",
            "name_en": "Thiol",
            "property_type": "中性/弱酸性",
            "reactions": ["氧化成二硫键(Disulfide Formation)", "金属配位(Metal Coordination)"],
        },
        "thioether": {
            "smarts": "[#6][SX2][#6]",
            "name_cn": "硫醚基团",
            "name_en": "Thioether",
            "property_type": "中性",
            "reactions": ["氧化成亚砜(Sulfoxide Formation)", "氧化成砜(Sulfone Formation)"],
        },
        # Hydroxyl groups
        "phenol": {
            "smarts": "[OX2H][c]",
            "name_cn": "酚羟基",
            "name_en": "Phenolic Hydroxyl",
            "property_type": "弱酸性",
            "reactions": ["氧化(Oxidation)", "光氧化(Photooxidation)", "醌形成(Quinone Formation)"],
        },
        "alcohol": {
            "smarts": "[OX2H][CX4]",
            "name_cn": "醇羟基",
            "name_en": "Aliphatic Hydroxyl",
            "property_type": "中性",
            "reactions": ["脱水(Dehydration)", "酯化(Esterification)"],
        },
        # Carbonyl groups
        "aldehyde": {
            "smarts": "[CX3H1](=O)[#6]",
            "name_cn": "醛基",
            "name_en": "Aldehyde",
            "property_type": "中性/亲电",
            "reactions": ["美拉德反应(Maillard Reaction)", "氧化成羧酸(Oxidation to Carboxylic Acid)"],
        },
        "ketone": {
            "smarts": "[CX3](=O)([#6])[#6]",
            "name_cn": "酮基",
            "name_en": "Ketone",
            "property_type": "中性",
            "reactions": ["还原(Reduction)", "缩合反应(Condensation)"],
        },
        # Carboxylic acid and derivatives
        "carboxylic_acid": {
            "smarts": "[CX3](=O)[OX2H]",
            "name_cn": "羧基",
            "name_en": "Carboxylic Acid",
            "property_type": "酸性",
            "reactions": ["盐形成(Salt Formation)", "酰胺化(Amidation)"],
        },
        "ester": {
            "smarts": "[CX3](=O)[OX2][#6]",
            "name_cn": "酯基",
            "name_en": "Ester",
            "property_type": "中性",
            "reactions": ["水解(Hydrolysis)", "转酯化(Transesterification)"],
        },
        "amide": {
            "smarts": "[CX3](=O)[NX3]",
            "name_cn": "酰胺基",
            "name_en": "Amide",
            "property_type": "中性",
            "reactions": ["水解(Hydrolysis)"],
        },
        # NOTE(review): this pattern closes a ring over four atoms, so it
        # appears to match only four-membered (beta) lactones - confirm
        # whether larger lactone rings are meant to be detected as well.
        "lactone": {
            "smarts": "[#6]1~[#6]~[#6](=O)~[OX2]~1",
            "name_cn": "内酯环",
            "name_en": "Lactone",
            "property_type": "中性",
            "reactions": ["开环水解(Ring-opening Hydrolysis)"],
        },
        # Nitrogen heterocycles
        "pyridine": {
            "smarts": "c1ccncc1",
            "name_cn": "吡啶环",
            "name_en": "Pyridine",
            "property_type": "碱性",
            "reactions": ["N-氧化(N-Oxidation)", "质子化(Protonation)"],
        },
        "imidazole": {
            "smarts": "c1cnc[nH]1",
            "name_cn": "咪唑环",
            "name_en": "Imidazole",
            "property_type": "碱性/两性",
            "reactions": ["N-氧化(N-Oxidation)", "金属配位(Metal Coordination)"],
        },
        # Other important groups
        "nitrile": {
            "smarts": "[CX2]#N",
            "name_cn": "氰基",
            "name_en": "Nitrile",
            "property_type": "中性",
            "reactions": ["水解成酰胺/羧酸(Hydrolysis)"],
        },
        "allylic": {
            "smarts": "[CX4][CX3]=[CX3]",
            "name_cn": "烯丙位",
            "name_en": "Allylic Position",
            "property_type": "中性",
            "reactions": ["自氧化(Autoxidation)"],
        },
        "benzylic": {
            "smarts": "[CX4H2]c",
            "name_cn": "苄位",
            "name_en": "Benzylic Position",
            "property_type": "中性",
            "reactions": ["自氧化(Autoxidation)"],
        },
    }

    def __init__(self):
        """Initialize the renderer and check RDKit availability."""
        self._rdkit_available = self._check_rdkit()

    def _check_rdkit(self) -> bool:
        """Return True when RDKit (including its Draw module) can be imported.

        Prints an installation hint and returns False otherwise.
        """
        try:
            from rdkit import Chem  # noqa: F401 - imported only as a probe
            from rdkit.Chem import Draw  # noqa: F401 - imported only as a probe
        except ImportError:
            print("Warning: RDKit not installed. Molecular rendering disabled.")
            print("Install with: pip install rdkit")
            return False
        return True

    @property
    def is_available(self) -> bool:
        """Check if rendering is available."""
        return self._rdkit_available

    def parse_smiles(self, smiles: str) -> Optional[Any]:
        """
        Parse a SMILES string into an RDKit molecule with several fallbacks.

        The input is cleaned first (whitespace, ``SMILES:`` style prefixes,
        full-width punctuation, URL encoding). If the cleaned string does not
        parse directly, the following fallbacks are tried in order: manual
        sanitization, parsing with stereochemistry markers removed, and
        parsing only the longest ``.``-separated fragment.

        Args:
            smiles: SMILES notation

        Returns:
            RDKit Mol object, or None if RDKit is unavailable, the input is
            empty/blank, or no strategy produced a molecule
        """
        if not self._rdkit_available:
            return None
        if not smiles or not smiles.strip():
            return None

        from rdkit import Chem

        # '%' introduces both URL escapes and SMILES two-digit ring closures
        # (e.g. "%12"). URL-decoding a ring closure corrupts the string, so
        # the undecoded form is given the first chance: if it is already
        # valid SMILES it wins.
        raw_smiles = self._normalize_smiles(smiles, decode_url=False)
        clean_smiles = self._normalize_smiles(smiles)
        differs = raw_smiles != clean_smiles

        if differs and raw_smiles:
            mol = Chem.MolFromSmiles(raw_smiles)
            if mol is not None:
                return mol

        mol = self._parse_with_fallbacks(clean_smiles)
        if mol is None and differs:
            mol = self._parse_with_fallbacks(raw_smiles)
        return mol

    def _parse_with_fallbacks(self, clean_smiles: str) -> Optional[Any]:
        """Parse an already-normalized SMILES string, trying fallbacks in order.

        Must only be called when RDKit is available.

        Returns:
            RDKit Mol object or None if every strategy failed
        """
        from rdkit import Chem

        # Nothing left after normalization (e.g. the input was only a prefix).
        if not clean_smiles:
            return None

        # Standard parsing.
        mol = Chem.MolFromSmiles(clean_smiles)
        if mol is not None:
            return mol

        # Parse without sanitization, then sanitize manually.
        mol = Chem.MolFromSmiles(clean_smiles, sanitize=False)
        if mol is not None:
            try:
                Chem.SanitizeMol(mol)
                return mol
            except Exception:
                # Deliberate best effort: the unsanitized molecule is not
                # returned; fall through to the next strategy.
                pass

        # Retry with stereochemistry markers removed.
        mol = Chem.MolFromSmiles(self._strip_stereochemistry(clean_smiles))
        if mol is not None:
            return mol

        # For multi-component input (e.g. salts), keep the longest fragment.
        if '.' in clean_smiles:
            largest = max(clean_smiles.split('.'), key=len)
            if largest:
                mol = Chem.MolFromSmiles(largest)
                if mol is not None:
                    return mol

        return None

    def _normalize_smiles(self, smiles: str, *, decode_url: bool = True) -> str:
        """Normalize a SMILES string by cleaning common input artefacts.

        Steps, in order: optional URL-decoding, removal of spaces and line
        breaks, removal of a leading ``SMILES:``/``SMILES=`` style label, and
        replacement of full-width punctuation with its ASCII counterpart.

        Args:
            smiles: Raw SMILES text
            decode_url: When True (default), percent-escapes are decoded if
                the text contains ``%``. Pass False to keep ``%nn`` ring
                closure numbers intact.

        Returns:
            The cleaned SMILES string (possibly empty)
        """
        import urllib.parse

        if decode_url and '%' in smiles:
            smiles = urllib.parse.unquote(smiles)

        # Remove whitespace and newlines
        smiles = smiles.strip().replace('\n', '').replace('\r', '').replace(' ', '')

        # Remove common labels that might have been pasted along
        for prefix in self._SMILES_PREFIXES:
            if smiles.startswith(prefix):
                smiles = smiles[len(prefix):]

        # Map full-width / CJK punctuation to the ASCII SMILES symbols
        return smiles.translate(self._FULLWIDTH_TO_ASCII)

    def _strip_stereochemistry(self, smiles: str) -> str:
        """Strip stereochemistry markers from SMILES for fallback parsing.

        Removes ``@`` chirality markers and the ``/`` and ``\\`` directional
        bond symbols.
        """
        import re

        # Chirality markers (@, @@)
        smiles = re.sub(r'@+', '', smiles)
        # Directional (E/Z) bond markers: forward slash and single backslash
        smiles = re.sub(r'[/\\]', '', smiles)
        return smiles

    def render_2d_svg(
        self,
        smiles: str,
        width: int = 400,
        height: int = 300,
        highlight_atoms: Optional[List[int]] = None,
    ) -> Optional[str]:
        """
        Render 2D structure as SVG.

        Args:
            smiles: SMILES notation
            width: Image width
            height: Image height
            highlight_atoms: Optional list of atom indices to highlight

        Returns:
            SVG string, or None if RDKit is unavailable or the SMILES
            could not be parsed
        """
        if not self._rdkit_available:
            return None

        from rdkit.Chem.Draw import rdMolDraw2D

        mol = self.parse_smiles(smiles)
        if mol is None:
            return None

        drawer = rdMolDraw2D.MolDraw2DSVG(width, height)

        opts = drawer.drawOptions()
        opts.addStereoAnnotation = True
        opts.addAtomIndices = False

        if highlight_atoms:
            drawer.DrawMolecule(mol, highlightAtoms=highlight_atoms)
        else:
            drawer.DrawMolecule(mol)

        drawer.FinishDrawing()
        return drawer.GetDrawingText()

    def render_2d_png_base64(
        self,
        smiles: str,
        width: int = 400,
        height: int = 300,
    ) -> Optional[str]:
        """
        Render 2D structure as PNG and return base64 encoded string.

        Args:
            smiles: SMILES notation
            width: Image width
            height: Image height

        Returns:
            Base64 encoded PNG string, or None if RDKit is unavailable or
            the SMILES could not be parsed
        """
        if not self._rdkit_available:
            return None

        from rdkit.Chem import Draw

        mol = self.parse_smiles(smiles)
        if mol is None:
            return None

        img = Draw.MolToImage(mol, size=(width, height))

        with io.BytesIO() as buffer:
            img.save(buffer, format='PNG')
            return base64.b64encode(buffer.getvalue()).decode('utf-8')

    def get_data_uri(self, smiles: str, width: int = 400, height: int = 300) -> Optional[str]:
        """
        Get a data URI for embedding molecule image in HTML.

        Args:
            smiles: SMILES notation
            width: Image width
            height: Image height

        Returns:
            ``data:image/png;base64,...`` string, or None if rendering failed
        """
        png_base64 = self.render_2d_png_base64(smiles, width, height)
        if png_base64:
            return f"data:image/png;base64,{png_base64}"
        return None

    def calculate_morgan_fingerprint(
        self,
        smiles: str,
        radius: int = 2,
        n_bits: int = 2048,
    ) -> Optional[List[int]]:
        """
        Calculate Morgan fingerprint (circular fingerprint).

        Args:
            smiles: SMILES notation
            radius: Fingerprint radius
            n_bits: Number of bits

        Returns:
            List of on-bit indices, or None if RDKit is unavailable or the
            SMILES could not be parsed
        """
        if not self._rdkit_available:
            return None

        from rdkit.Chem import AllChem

        mol = self.parse_smiles(smiles)
        if mol is None:
            return None

        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        return list(fp.GetOnBits())

    def render_fingerprint_bits(
        self,
        smiles: str,
        radius: int = 2,
        highlight_bits: Optional[List[int]] = None,
        n_bits: int = 2048,
    ) -> Optional[str]:
        """
        Render Morgan fingerprint bit visualization as SVG.

        Highlights the center atom of every environment that sets one of the
        requested bits. Without ``highlight_bits`` the plain structure is
        rendered.

        Args:
            smiles: SMILES notation
            radius: Morgan fingerprint radius
            highlight_bits: Specific bits to highlight
            n_bits: Fingerprint length used to interpret ``highlight_bits``;
                should match the value the bit indices were computed with

        Returns:
            SVG string, or None if RDKit is unavailable or the SMILES
            could not be parsed
        """
        if not self._rdkit_available:
            return None

        from rdkit.Chem import AllChem

        mol = self.parse_smiles(smiles)
        if mol is None:
            return None

        # bit_info is filled in place: bit index -> entries whose first
        # element is the center atom index.
        bit_info = {}
        AllChem.GetMorganFingerprintAsBitVect(
            mol, radius, nBits=n_bits, bitInfo=bit_info
        )

        highlight_atoms = None
        if highlight_bits:
            centers = {
                entry[0]
                for bit in highlight_bits
                for entry in bit_info.get(bit, ())
            }
            highlight_atoms = sorted(centers)

        return self.render_2d_svg(smiles, highlight_atoms=highlight_atoms)

    def calculate_properties(self, smiles: str) -> Optional[Dict[str, Any]]:
        """
        Calculate basic molecular properties.

        Args:
            smiles: SMILES notation

        Returns:
            Dictionary of properties (float values rounded to 2 decimals),
            or None if RDKit is unavailable or the SMILES could not be parsed
        """
        if not self._rdkit_available:
            return None

        from rdkit.Chem import Descriptors, Lipinski

        mol = self.parse_smiles(smiles)
        if mol is None:
            return None

        return {
            "molecular_weight": round(Descriptors.MolWt(mol), 2),
            "logp": round(Descriptors.MolLogP(mol), 2),
            "hbd": Lipinski.NumHDonors(mol),
            "hba": Lipinski.NumHAcceptors(mol),
            "tpsa": round(Descriptors.TPSA(mol), 2),
            "rotatable_bonds": Lipinski.NumRotatableBonds(mol),
            "num_atoms": mol.GetNumAtoms(),
            "num_heavy_atoms": Lipinski.HeavyAtomCount(mol),
            "num_rings": Lipinski.RingCount(mol),
            "fraction_sp3": round(Lipinski.FractionCSP3(mol), 2),
        }

    def get_molecule_info(self, smiles: str, name: Optional[str] = None) -> "MoleculeInfo":
        """
        Get comprehensive molecule information including rendered images.

        Args:
            smiles: SMILES notation
            name: Optional molecule name

        Returns:
            MoleculeInfo object; only ``smiles`` and ``name`` are set when
            RDKit is unavailable
        """
        info = MoleculeInfo(smiles=smiles, name=name)

        if not self._rdkit_available:
            return info

        props = self.calculate_properties(smiles)
        if props:
            info.molecular_weight = props["molecular_weight"]
            info.logp = props["logp"]
            info.hbd = props["hbd"]
            info.hba = props["hba"]
            info.tpsa = props["tpsa"]
            info.rotatable_bonds = props["rotatable_bonds"]

        info.structure_2d_svg = self.render_2d_svg(smiles)
        info.structure_2d_png_base64 = self.render_2d_png_base64(smiles)

        return info

    def identify_functional_groups(self, smiles: str) -> List[Dict[str, Any]]:
        """
        Identify reactive functional groups in a molecule using SMARTS patterns.

        Every pattern in ``_FUNCTIONAL_GROUP_PATTERNS`` is matched against the
        parsed molecule; groups with at least one match are reported in table
        order.

        Args:
            smiles: SMILES notation

        Returns:
            List of dicts with keys ``id``, ``name_cn``, ``name_en``,
            ``property_type``, ``potential_reactions``, ``count`` and
            ``atom_indices``; empty if RDKit is unavailable or the SMILES
            could not be parsed
        """
        if not self._rdkit_available:
            return []

        from rdkit import Chem

        mol = self.parse_smiles(smiles)
        if mol is None:
            return []

        identified_groups = []
        for group_id, group_info in self._FUNCTIONAL_GROUP_PATTERNS.items():
            pattern = Chem.MolFromSmarts(group_info["smarts"])
            if pattern is None:
                continue

            matches = mol.GetSubstructMatches(pattern)
            if not matches:
                continue

            identified_groups.append({
                "id": group_id,
                "name_cn": group_info["name_cn"],
                "name_en": group_info["name_en"],
                "property_type": group_info["property_type"],
                # Copy so callers cannot mutate the shared pattern table.
                "potential_reactions": list(group_info["reactions"]),
                "count": len(matches),
                "atom_indices": [list(m) for m in matches],
            })

        return identified_groups

    def get_functional_groups_summary(self, smiles: str) -> str:
        """
        Get a formatted text summary of identified functional groups.

        Args:
            smiles: SMILES notation

        Returns:
            One entry per group joined with ``;``, or a fixed placeholder
            message when no group was identified
        """
        groups = self.identify_functional_groups(smiles)

        if not groups:
            return "未能识别到特征官能团,请人工确认分子结构"

        lines = []
        for g in groups:
            count_str = f"×{g['count']}" if g['count'] > 1 else ""
            lines.append(f"{g['name_cn']}{g['name_en']}{count_str} - {g['property_type']}")

        return ";".join(lines)
def get_3dmol_script(smiles: str, container_id: str = "mol3d") -> str:
    """
    Generate JavaScript for 3Dmol.js visualization.

    Returns a ``<script>`` element that creates a viewer inside the element
    with the given id and adds the SMILES string as a model. The page must
    already have 3Dmol.js loaded.

    Both ``smiles`` and ``container_id`` are embedded as escaped JavaScript
    string literals, so backslashes (SMILES ``\\`` bond markers), quotes and
    ``</script>`` sequences in the input cannot alter or terminate the script.

    Args:
        smiles: SMILES notation
        container_id: HTML container element ID

    Returns:
        HTML ``<script>`` block as a string
    """
    import json

    def _js_literal(value: str) -> str:
        # json.dumps produces a double-quoted literal with quotes,
        # backslashes and control characters escaped. '<', '>' and '&' are
        # escaped as well so the literal can never close the <script> element.
        return (
            json.dumps(value)
            .replace("<", "\\u003c")
            .replace(">", "\\u003e")
            .replace("&", "\\u0026")
        )

    smiles_js = _js_literal(smiles)
    container_js = _js_literal(container_id)

    # Note: This requires 3Dmol.js to be loaded in the page
    # and a valid SDF/MOL block. For simplicity, we use
    # the SMILES directly and let 3Dmol parse it.
    return f"""
<script>
(function() {{
let viewer = $3Dmol.createViewer(document.getElementById({container_js}), {{
backgroundColor: 'white'
}});
// Use PubChem to get 3D structure from SMILES
// Alternatively, generate conformer with RDKit
let smiles = {smiles_js};
// Add molecule from SMILES (requires 3Dmol.js SmilesParser)
viewer.addModel(smiles, 'smi');
viewer.setStyle({{}}, {{stick: {{}}}});
viewer.zoomTo();
viewer.render();
}})();
</script>
"""
def get_3dmol_html(
    smiles: str,
    width: int = 400,
    height: int = 300,
    container_id: str = "mol3d",
) -> str:
    """
    Generate complete HTML for 3Dmol.js visualization.

    The markup consists of a sized wrapper ``<div>`` (id
    ``"<container_id>-container"``), the viewer ``<div>`` (id
    ``container_id``), a ``<script>`` tag loading 3Dmol.js from 3dmol.org,
    and the viewer script produced by ``get_3dmol_script``.

    Args:
        smiles: SMILES notation
        width: Viewer width in pixels
        height: Viewer height in pixels
        container_id: ID of the viewer element; pass distinct values to
            place several viewers on one page. Defaults to ``"mol3d"``.

    Returns:
        Complete HTML string
    """
    import html

    # Escape for use inside a double-quoted HTML attribute; the script
    # generator receives the unescaped id, which is what the DOM will hold.
    attr_id = html.escape(container_id, quote=True)

    return f"""
<div id="{attr_id}-container" style="width: {width}px; height: {height}px; position: relative;">
<div id="{attr_id}" style="width: 100%; height: 100%;"></div>
</div>
<script src="https://3dmol.org/build/3Dmol-min.js"></script>
{get_3dmol_script(smiles, container_id)}
"""
# Module-level shared instance for easy import. Constructing it runs the
# RDKit availability check, so that check happens when this module is imported.
renderer = MoleculeRenderer()