"""
Molecular Structure Renderer.

This module provides molecular structure visualization using RDKit:
- 2D structure rendering
- Morgan fingerprint visualization
- Molecular property calculation

For web-based 3D visualization, we also support 3Dmol.js integration.
"""

import base64
import io
import re
import urllib.parse
from dataclasses import dataclass
from typing import Optional, Dict, Any, Tuple, List


# Prefixes that are sometimes pasted together with the SMILES string itself.
_SMILES_PREFIXES = ('SMILES:', 'smiles:', 'SMILES=', 'smiles=')

# Full-width / CJK look-alikes mapped to the ASCII characters SMILES uses.
# Written as escapes so the mapping survives Unicode normalisation of this file.
_FULLWIDTH_TO_ASCII = str.maketrans({
    "\uff08": "(",   # full-width left parenthesis
    "\uff09": ")",   # full-width right parenthesis
    "\u3010": "[",   # left black lenticular bracket
    "\u3011": "]",   # right black lenticular bracket
    "\uff1d": "=",   # full-width equals sign
    "\uff03": "#",   # full-width number sign
})

# SMARTS patterns for pharmaceutically relevant functional groups.
# name_cn / property_type / reactions are runtime strings returned to callers.
_FUNCTIONAL_GROUP_PATTERNS = {
    # Amines
    "primary_amine": {
        "smarts": "[NX3H2;!$([NX3H2]-C=O)]",
        "name_cn": "伯胺基团",
        "name_en": "Primary Amine",
        "property_type": "碱性",
        "reactions": ["美拉德反应(Maillard Reaction)", "氧化脱氨(Oxidative Deamination)", "席夫碱形成(Schiff Base)"],
    },
    "secondary_amine": {
        "smarts": "[NX3H1;!$([NX3H1]-C=O)]([#6])([#6])",
        "name_cn": "仲胺基团",
        "name_en": "Secondary Amine",
        "property_type": "碱性",
        "reactions": ["美拉德反应(Maillard Reaction)", "N-氧化(N-Oxidation)"],
    },
    "tertiary_amine": {
        "smarts": "[NX3H0;!$([NX3]-C=O)]([#6])([#6])([#6])",
        "name_cn": "叔胺基团",
        "name_en": "Tertiary Amine",
        "property_type": "碱性",
        "reactions": ["N-氧化(N-Oxidation)"],
    },
    # Thiols and Thioethers
    "thiol": {
        "smarts": "[SH]",
        "name_cn": "巯基",
        "name_en": "Thiol",
        "property_type": "中性/弱酸性",
        "reactions": ["氧化成二硫键(Disulfide Formation)", "金属配位(Metal Coordination)"],
    },
    "thioether": {
        "smarts": "[#6][SX2][#6]",
        "name_cn": "硫醚基团",
        "name_en": "Thioether",
        "property_type": "中性",
        "reactions": ["氧化成亚砜(Sulfoxide Formation)", "氧化成砜(Sulfone Formation)"],
    },
    # Hydroxyl groups
    "phenol": {
        "smarts": "[OX2H][c]",
        "name_cn": "酚羟基",
        "name_en": "Phenolic Hydroxyl",
        "property_type": "弱酸性",
        "reactions": ["氧化(Oxidation)", "光氧化(Photooxidation)", "醌形成(Quinone Formation)"],
    },
    "alcohol": {
        "smarts": "[OX2H][CX4]",
        "name_cn": "醇羟基",
        "name_en": "Aliphatic Hydroxyl",
        "property_type": "中性",
        "reactions": ["脱水(Dehydration)", "酯化(Esterification)"],
    },
    # Carbonyl groups
    "aldehyde": {
        "smarts": "[CX3H1](=O)[#6]",
        "name_cn": "醛基",
        "name_en": "Aldehyde",
        "property_type": "中性/亲电",
        "reactions": ["美拉德反应(Maillard Reaction)", "氧化成羧酸(Oxidation to Carboxylic Acid)"],
    },
    "ketone": {
        "smarts": "[CX3](=O)([#6])[#6]",
        "name_cn": "酮基",
        "name_en": "Ketone",
        "property_type": "中性",
        "reactions": ["还原(Reduction)", "缩合反应(Condensation)"],
    },
    # Carboxylic acid and derivatives
    "carboxylic_acid": {
        "smarts": "[CX3](=O)[OX2H]",
        "name_cn": "羧基",
        "name_en": "Carboxylic Acid",
        "property_type": "酸性",
        "reactions": ["盐形成(Salt Formation)", "酰胺化(Amidation)"],
    },
    "ester": {
        "smarts": "[CX3](=O)[OX2][#6]",
        "name_cn": "酯基",
        "name_en": "Ester",
        "property_type": "中性",
        "reactions": ["水解(Hydrolysis)", "转酯化(Transesterification)"],
    },
    "amide": {
        "smarts": "[CX3](=O)[NX3]",
        "name_cn": "酰胺基",
        "name_en": "Amide",
        "property_type": "中性",
        "reactions": ["水解(Hydrolysis)"],
    },
    "lactone": {
        "smarts": "[#6]1~[#6]~[#6](=O)~[OX2]~1",
        "name_cn": "内酯环",
        "name_en": "Lactone",
        "property_type": "中性",
        "reactions": ["开环水解(Ring-opening Hydrolysis)"],
    },
    # Nitrogen heterocycles
    "pyridine": {
        "smarts": "c1ccncc1",
        "name_cn": "吡啶环",
        "name_en": "Pyridine",
        "property_type": "碱性",
        "reactions": ["N-氧化(N-Oxidation)", "质子化(Protonation)"],
    },
    "imidazole": {
        "smarts": "c1cnc[nH]1",
        "name_cn": "咪唑环",
        "name_en": "Imidazole",
        "property_type": "碱性/两性",
        "reactions": ["N-氧化(N-Oxidation)", "金属配位(Metal Coordination)"],
    },
    # Other important groups
    "nitrile": {
        "smarts": "[CX2]#N",
        "name_cn": "氰基",
        "name_en": "Nitrile",
        "property_type": "中性",
        "reactions": ["水解成酰胺/羧酸(Hydrolysis)"],
    },
    "allylic": {
        "smarts": "[CX4][CX3]=[CX3]",
        "name_cn": "烯丙位",
        "name_en": "Allylic Position",
        "property_type": "中性",
        "reactions": ["自氧化(Autoxidation)"],
    },
    "benzylic": {
        "smarts": "[CX4H2]c",
        "name_cn": "苄位",
        "name_en": "Benzylic Position",
        "property_type": "中性",
        "reactions": ["自氧化(Autoxidation)"],
    },
}


@dataclass
class MoleculeInfo:
    """Container for molecular information."""

    smiles: str
    name: Optional[str] = None

    # Calculated properties
    molecular_weight: Optional[float] = None
    logp: Optional[float] = None
    hbd: Optional[int] = None  # H-bond donors
    hba: Optional[int] = None  # H-bond acceptors
    tpsa: Optional[float] = None  # Topological polar surface area
    rotatable_bonds: Optional[int] = None

    # Rendered images
    structure_2d_svg: Optional[str] = None
    structure_2d_png_base64: Optional[str] = None
    fingerprint_svg: Optional[str] = None


class MoleculeRenderer:
    """
    Renders molecular structures using RDKit.

    Provides:
    - 2D structure images (SVG and PNG)
    - Morgan fingerprint bit visualization
    - Basic property calculations

    Every public method degrades gracefully (returns None, an empty list, or a
    bare MoleculeInfo) when RDKit cannot be imported.
    """

    def __init__(self):
        """Initialize the renderer and check RDKit availability."""
        self._rdkit_available = self._check_rdkit()

    def _check_rdkit(self) -> bool:
        """Return True when RDKit (including its Draw module) can be imported."""
        try:
            from rdkit import Chem  # noqa: F401
            from rdkit.Chem import Draw  # noqa: F401
        except ImportError:
            print("Warning: RDKit not installed. Molecular rendering disabled.")
            print("Install with: pip install rdkit")
            return False
        return True

    @property
    def is_available(self) -> bool:
        """Check if rendering is available."""
        return self._rdkit_available

    def parse_smiles(self, smiles: str) -> Optional[Any]:
        """
        Parse a SMILES string into an RDKit molecule, with several fallbacks.

        Attempts, in order:
        1. the cleaned input without URL-decoding (only when decoding would
           change it), so ``%nn`` ring-closure labels are not corrupted
        2. the fully normalized input
        3. parsing without sanitization followed by manual sanitization
        4. the input with stereochemistry markers removed
        5. the longest dot-separated fragment (salts and mixtures)

        Args:
            smiles: SMILES notation

        Returns:
            RDKit Mol object, or None if RDKit is unavailable, the input is
            empty/blank, or every attempt fails
        """
        if not self._rdkit_available:
            return None
        if not smiles or not smiles.strip():
            return None

        from rdkit import Chem

        # Step 1: Clean and normalize SMILES
        clean_smiles = self._normalize_smiles(smiles)

        # "%" is both the URL-escape introducer and the SMILES prefix for
        # two-digit ring closures. Give the literal reading a chance first;
        # a genuinely URL-encoded string fails here and falls through.
        if '%' in smiles:
            literal_smiles = self._normalize_smiles(smiles, decode_url=False)
            if literal_smiles != clean_smiles:
                mol = Chem.MolFromSmiles(literal_smiles)
                if mol is not None:
                    return mol

        # Step 2: Try standard parsing
        mol = Chem.MolFromSmiles(clean_smiles)
        if mol is not None:
            return mol

        # Step 3: Parse without sanitization, then sanitize manually.
        # An unsanitizable molecule is NOT returned; we fall through instead.
        try:
            mol = Chem.MolFromSmiles(clean_smiles, sanitize=False)
            if mol is not None:
                Chem.SanitizeMol(mol)
                return mol
        except Exception:
            pass

        # Step 4: Try stripping stereochemistry for complex molecules
        try:
            mol = Chem.MolFromSmiles(self._strip_stereochemistry(clean_smiles))
            if mol is not None:
                return mol
        except Exception:
            pass

        # Step 5: For salt forms / mixtures, keep only the longest fragment
        if '.' in clean_smiles:
            largest = max(clean_smiles.split('.'), key=len)
            mol = Chem.MolFromSmiles(largest)
            if mol is not None:
                return mol

        return None

    def _normalize_smiles(self, smiles: str, *, decode_url: bool = True) -> str:
        """
        Normalize a SMILES string by cleaning common copy/paste artefacts.

        Args:
            smiles: Raw SMILES text
            decode_url: When True (default), percent-escapes are URL-decoded if
                the text contains ``%``. Pass False to keep ``%nn`` sequences
                untouched (they are valid SMILES ring-closure labels).

        Returns:
            The text with optional URL-decoding applied, all whitespace
            removed, known ``SMILES:``-style prefixes stripped, and full-width
            bracket/bond characters replaced by their ASCII equivalents
        """
        # Decode URL encoding if present
        if decode_url and '%' in smiles:
            try:
                smiles = urllib.parse.unquote(smiles)
            except (ValueError, UnicodeError):
                pass  # best effort: keep the undecoded text

        # Whitespace is never meaningful in SMILES; drop all of it
        # (spaces, tabs, newlines), not just the leading/trailing part.
        smiles = "".join(smiles.split())

        # Remove common prefixes that might be added
        for prefix in _SMILES_PREFIXES:
            if smiles.startswith(prefix):
                smiles = smiles[len(prefix):]

        # Fix common encoding issues (full-width look-alike characters)
        return smiles.translate(_FULLWIDTH_TO_ASCII)

    def _strip_stereochemistry(self, smiles: str) -> str:
        """
        Strip stereochemistry markers from SMILES for fallback parsing.

        Removes ``@`` chirality marks and the ``/`` and ``\\`` directional
        bond marks used for E/Z double-bond geometry.
        """
        # Remove @ symbols (tetrahedral stereochemistry markers)
        smiles = re.sub(r'@+', '', smiles)
        # Remove E/Z markers: a single forward slash OR a single backslash
        return re.sub(r'[/\\]', '', smiles)

    def render_2d_svg(
        self,
        smiles: str,
        width: int = 400,
        height: int = 300,
        highlight_atoms: Optional[List[int]] = None,
    ) -> Optional[str]:
        """
        Render 2D structure as SVG.

        Args:
            smiles: SMILES notation
            width: Image width
            height: Image height
            highlight_atoms: Optional list of atom indices to highlight

        Returns:
            SVG string or None if RDKit is unavailable or parsing failed
        """
        if not self._rdkit_available:
            return None

        from rdkit.Chem.Draw import rdMolDraw2D

        mol = self.parse_smiles(smiles)
        if mol is None:
            return None

        # Create drawer
        drawer = rdMolDraw2D.MolDraw2DSVG(width, height)

        # Set drawing options
        opts = drawer.drawOptions()
        opts.addStereoAnnotation = True
        opts.addAtomIndices = False

        # Draw molecule
        if highlight_atoms:
            drawer.DrawMolecule(mol, highlightAtoms=highlight_atoms)
        else:
            drawer.DrawMolecule(mol)
        drawer.FinishDrawing()

        return drawer.GetDrawingText()

    def render_2d_png_base64(
        self,
        smiles: str,
        width: int = 400,
        height: int = 300,
    ) -> Optional[str]:
        """
        Render 2D structure as PNG and return base64 encoded string.

        Args:
            smiles: SMILES notation
            width: Image width
            height: Image height

        Returns:
            Base64 encoded PNG string or None if failed
        """
        if not self._rdkit_available:
            return None

        from rdkit.Chem import Draw

        mol = self.parse_smiles(smiles)
        if mol is None:
            return None

        # Generate PNG image
        img = Draw.MolToImage(mol, size=(width, height))

        # Serialize to an in-memory buffer and convert to base64
        with io.BytesIO() as buffer:
            img.save(buffer, format='PNG')
            return base64.b64encode(buffer.getvalue()).decode('utf-8')

    def get_data_uri(self, smiles: str, width: int = 400, height: int = 300) -> Optional[str]:
        """
        Get a data URI for embedding molecule image in HTML.

        Args:
            smiles: SMILES notation
            width: Image width
            height: Image height

        Returns:
            Data URI string or None
        """
        png_base64 = self.render_2d_png_base64(smiles, width, height)
        if png_base64:
            return f"data:image/png;base64,{png_base64}"
        return None

    def calculate_morgan_fingerprint(
        self,
        smiles: str,
        radius: int = 2,
        n_bits: int = 2048,
    ) -> Optional[List[int]]:
        """
        Calculate Morgan fingerprint (circular fingerprint).

        Args:
            smiles: SMILES notation
            radius: Fingerprint radius
            n_bits: Number of bits

        Returns:
            List of on-bit indices or None
        """
        if not self._rdkit_available:
            return None

        from rdkit.Chem import AllChem

        mol = self.parse_smiles(smiles)
        if mol is None:
            return None

        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        return list(fp.GetOnBits())

    def render_fingerprint_bits(
        self,
        smiles: str,
        radius: int = 2,
        highlight_bits: Optional[List[int]] = None,
    ) -> Optional[str]:
        """
        Render Morgan fingerprint bit visualization as SVG.

        Highlights the center atom of every environment that sets one of the
        requested bits.

        Args:
            smiles: SMILES notation
            radius: Morgan fingerprint radius
            highlight_bits: Specific bits to highlight

        Returns:
            SVG string or None
        """
        if not self._rdkit_available:
            return None

        from rdkit.Chem import AllChem

        mol = self.parse_smiles(smiles)
        if mol is None:
            return None

        # Called for its side effect: bit_info is filled with
        # {bit: ((center_atom, radius), ...)}.
        bit_info = {}
        AllChem.GetMorganFingerprintAsBitVect(mol, radius, bitInfo=bit_info)

        # If specific bits requested, collect the center atoms for those
        highlight_atoms = None
        if highlight_bits:
            centers = {
                center_atom
                for bit in highlight_bits
                for center_atom, _ in bit_info.get(bit, ())
            }
            highlight_atoms = list(centers)

        # Render with highlights
        return self.render_2d_svg(smiles, highlight_atoms=highlight_atoms)

    def calculate_properties(self, smiles: str) -> Optional[Dict[str, Any]]:
        """
        Calculate basic molecular properties.

        Args:
            smiles: SMILES notation

        Returns:
            Dictionary of properties or None
        """
        if not self._rdkit_available:
            return None

        from rdkit.Chem import Descriptors, Lipinski

        mol = self.parse_smiles(smiles)
        if mol is None:
            return None

        return {
            "molecular_weight": round(Descriptors.MolWt(mol), 2),
            "logp": round(Descriptors.MolLogP(mol), 2),
            "hbd": Lipinski.NumHDonors(mol),
            "hba": Lipinski.NumHAcceptors(mol),
            "tpsa": round(Descriptors.TPSA(mol), 2),
            "rotatable_bonds": Lipinski.NumRotatableBonds(mol),
            "num_atoms": mol.GetNumAtoms(),
            "num_heavy_atoms": Lipinski.HeavyAtomCount(mol),
            "num_rings": Lipinski.RingCount(mol),
            "fraction_sp3": round(Lipinski.FractionCSP3(mol), 2),
        }

    def get_molecule_info(self, smiles: str, name: Optional[str] = None) -> MoleculeInfo:
        """
        Get comprehensive molecule information including rendered images.

        Args:
            smiles: SMILES notation
            name: Optional molecule name

        Returns:
            MoleculeInfo object with all available data; only ``smiles`` and
            ``name`` are set when RDKit is unavailable
        """
        info = MoleculeInfo(smiles=smiles, name=name)

        if not self._rdkit_available:
            return info

        # Calculate properties
        props = self.calculate_properties(smiles)
        if props:
            info.molecular_weight = props["molecular_weight"]
            info.logp = props["logp"]
            info.hbd = props["hbd"]
            info.hba = props["hba"]
            info.tpsa = props["tpsa"]
            info.rotatable_bonds = props["rotatable_bonds"]

        # Render images
        info.structure_2d_svg = self.render_2d_svg(smiles)
        info.structure_2d_png_base64 = self.render_2d_png_base64(smiles)

        return info

    def identify_functional_groups(self, smiles: str) -> List[Dict[str, Any]]:
        """
        Identify reactive functional groups in a molecule using SMARTS patterns.

        This is crucial for compatibility analysis as it identifies potential
        reactive sites in the API molecule.

        Args:
            smiles: SMILES notation

        Returns:
            List of identified functional groups with properties; empty when
            RDKit is unavailable, the SMILES cannot be parsed, or nothing
            matches
        """
        if not self._rdkit_available:
            return []

        from rdkit import Chem

        mol = self.parse_smiles(smiles)
        if mol is None:
            return []

        identified_groups = []
        for group_id, group_info in _FUNCTIONAL_GROUP_PATTERNS.items():
            pattern = Chem.MolFromSmarts(group_info["smarts"])
            if pattern is None:
                continue

            matches = mol.GetSubstructMatches(pattern)
            if not matches:
                continue

            identified_groups.append({
                "id": group_id,
                "name_cn": group_info["name_cn"],
                "name_en": group_info["name_en"],
                "property_type": group_info["property_type"],
                # Copy so callers cannot mutate the shared pattern table
                "potential_reactions": list(group_info["reactions"]),
                "count": len(matches),
                "atom_indices": [list(m) for m in matches],
            })

        return identified_groups

    def get_functional_groups_summary(self, smiles: str) -> str:
        """
        Get a formatted text summary of identified functional groups.

        Args:
            smiles: SMILES notation

        Returns:
            Formatted string for use in prompts
        """
        groups = self.identify_functional_groups(smiles)

        if not groups:
            return "未能识别到特征官能团,请人工确认分子结构"

        lines = []
        for g in groups:
            count_str = f"×{g['count']}" if g['count'] > 1 else ""
            lines.append(f"{g['name_cn']}({g['name_en']}){count_str} - {g['property_type']}")

        return ";".join(lines)


def get_3dmol_script(smiles: str, container_id: str = "mol3d") -> str:
    """
    Generate JavaScript for 3Dmol.js visualization.

    This returns a script that can be embedded in HTML to show
    an interactive 3D molecular viewer.

    Args:
        smiles: SMILES notation
        container_id: HTML container element ID

    Returns:
        JavaScript code string
    """
    # Note: This requires 3Dmol.js to be loaded in the page
    # and a valid SDF/MOL block. For simplicity, we use
    # the SMILES directly and let 3Dmol parse it.
    # NOTE(review): the template literal below is blank, so neither `smiles`
    # nor `container_id` is interpolated - confirm the script body is intact.
    return f""" """


# NOTE(review): the reviewed text of this file stops in the middle of
# get_3dmol_html(), right after the opening quotes of its return template, so
# the function cannot be reproduced as runnable code here. The visible
# fragment is kept verbatim below; restore the full function from the
# complete file.
#
# def get_3dmol_html(
#     smiles: str,
#     width: int = 400,
#     height: int = 300,
# ) -> str:
#     """
#     Generate complete HTML for 3Dmol.js visualization.
#
#     Args:
#         smiles: SMILES notation
#         width: Viewer width
#         height: Viewer height
#
#     Returns:
#         Complete HTML string
#     """
#     return f"""