"""
Molecular Structure Renderer.

This module provides molecular structure visualization using RDKit:
- 2D structure rendering
- Morgan fingerprint visualization
- Molecular property calculation

For web-based 3D visualization, 3Dmol.js integration is also supported.
"""
import io
import base64
from typing import Optional, Dict, Any, Tuple, List
from dataclasses import dataclass
@dataclass
class MoleculeInfo:
    """
    Plain record bundling a molecule's identity, descriptors and renderings.

    Only ``smiles`` is required. Every other field defaults to ``None`` and
    is expected to be filled in by whoever computes or renders it.
    """

    # Identity
    smiles: str
    name: Optional[str] = None

    # Calculated descriptors
    molecular_weight: Optional[float] = None
    logp: Optional[float] = None
    hbd: Optional[int] = None  # hydrogen-bond donor count
    hba: Optional[int] = None  # hydrogen-bond acceptor count
    tpsa: Optional[float] = None  # topological polar surface area
    rotatable_bonds: Optional[int] = None

    # Rendered depictions
    structure_2d_svg: Optional[str] = None
    structure_2d_png_base64: Optional[str] = None
    fingerprint_svg: Optional[str] = None
class MoleculeRenderer:
    """
    Render molecular structures and compute descriptors using RDKit.

    Provides:
    - 2D structure images (SVG and PNG)
    - Morgan fingerprint bit visualization
    - Basic property calculations
    - SMARTS-based functional group identification

    RDKit is imported lazily inside the methods that need it, so this class
    can be defined and instantiated without RDKit installed. In that case
    ``is_available`` is False and the public methods return ``None`` (or an
    empty list / unpopulated ``MoleculeInfo``) instead of raising.
    """

    # Labels (compared case-insensitively) that are stripped from the front
    # of a SMILES string during normalization.
    _SMILES_PREFIXES = ("smiles:", "smiles=")

    # Full-width / CJK look-alikes mapped back to the ASCII characters that
    # SMILES syntax requires. Written as escapes so the table survives
    # editors or pipelines that normalize Unicode.
    _CHAR_FIXES = str.maketrans({
        "\uff08": "(",  # full-width left parenthesis
        "\uff09": ")",  # full-width right parenthesis
        "\u3010": "[",  # left black lenticular bracket
        "\u3011": "]",  # right black lenticular bracket
        "\uff3b": "[",  # full-width left square bracket
        "\uff3d": "]",  # full-width right square bracket
        "\uff1d": "=",  # full-width equals sign
        "\uff03": "#",  # full-width number sign
    })

    # SMARTS patterns for pharmaceutically relevant functional groups.
    # Kept at class level so the table is built once, not on every call.
    _FUNCTIONAL_GROUP_PATTERNS = {
        # Amines
        "primary_amine": {
            "smarts": "[NX3H2;!$([NX3H2]-C=O)]",
            "name_cn": "伯胺基团",
            "name_en": "Primary Amine",
            "property_type": "碱性",
            "reactions": ["美拉德反应(Maillard Reaction)", "氧化脱氨(Oxidative Deamination)", "席夫碱形成(Schiff Base)"],
        },
        "secondary_amine": {
            "smarts": "[NX3H1;!$([NX3H1]-C=O)]([#6])([#6])",
            "name_cn": "仲胺基团",
            "name_en": "Secondary Amine",
            "property_type": "碱性",
            "reactions": ["美拉德反应(Maillard Reaction)", "N-氧化(N-Oxidation)"],
        },
        "tertiary_amine": {
            "smarts": "[NX3H0;!$([NX3]-C=O)]([#6])([#6])([#6])",
            "name_cn": "叔胺基团",
            "name_en": "Tertiary Amine",
            "property_type": "碱性",
            "reactions": ["N-氧化(N-Oxidation)"],
        },
        # Thiols and thioethers
        "thiol": {
            "smarts": "[SH]",
            "name_cn": "巯基",
            "name_en": "Thiol",
            "property_type": "中性/弱酸性",
            "reactions": ["氧化成二硫键(Disulfide Formation)", "金属配位(Metal Coordination)"],
        },
        "thioether": {
            "smarts": "[#6][SX2][#6]",
            "name_cn": "硫醚基团",
            "name_en": "Thioether",
            "property_type": "中性",
            "reactions": ["氧化成亚砜(Sulfoxide Formation)", "氧化成砜(Sulfone Formation)"],
        },
        # Hydroxyl groups
        "phenol": {
            "smarts": "[OX2H][c]",
            "name_cn": "酚羟基",
            "name_en": "Phenolic Hydroxyl",
            "property_type": "弱酸性",
            "reactions": ["氧化(Oxidation)", "光氧化(Photooxidation)", "醌形成(Quinone Formation)"],
        },
        "alcohol": {
            "smarts": "[OX2H][CX4]",
            "name_cn": "醇羟基",
            "name_en": "Aliphatic Hydroxyl",
            "property_type": "中性",
            "reactions": ["脱水(Dehydration)", "酯化(Esterification)"],
        },
        # Carbonyl groups
        "aldehyde": {
            "smarts": "[CX3H1](=O)[#6]",
            "name_cn": "醛基",
            "name_en": "Aldehyde",
            "property_type": "中性/亲电",
            "reactions": ["美拉德反应(Maillard Reaction)", "氧化成羧酸(Oxidation to Carboxylic Acid)"],
        },
        "ketone": {
            "smarts": "[CX3](=O)([#6])[#6]",
            "name_cn": "酮基",
            "name_en": "Ketone",
            "property_type": "中性",
            "reactions": ["还原(Reduction)", "缩合反应(Condensation)"],
        },
        # Carboxylic acid and derivatives
        "carboxylic_acid": {
            "smarts": "[CX3](=O)[OX2H]",
            "name_cn": "羧基",
            "name_en": "Carboxylic Acid",
            "property_type": "酸性",
            "reactions": ["盐形成(Salt Formation)", "酰胺化(Amidation)"],
        },
        "ester": {
            "smarts": "[CX3](=O)[OX2][#6]",
            "name_cn": "酯基",
            "name_en": "Ester",
            "property_type": "中性",
            "reactions": ["水解(Hydrolysis)", "转酯化(Transesterification)"],
        },
        "amide": {
            "smarts": "[CX3](=O)[NX3]",
            "name_cn": "酰胺基",
            "name_en": "Amide",
            "property_type": "中性",
            "reactions": ["水解(Hydrolysis)"],
        },
        "lactone": {
            # Carbonyl carbon joined to a two-connected oxygen through a
            # ring bond ('@'), i.e. a cyclic ester of any ring size. The
            # previous pattern spelled out a closed 4-atom ring and so only
            # matched beta-lactones.
            "smarts": "[#6X3;R](=[OX1])@[#8X2;R]",
            "name_cn": "内酯环",
            "name_en": "Lactone",
            "property_type": "中性",
            "reactions": ["开环水解(Ring-opening Hydrolysis)"],
        },
        # Nitrogen heterocycles
        "pyridine": {
            "smarts": "c1ccncc1",
            "name_cn": "吡啶环",
            "name_en": "Pyridine",
            "property_type": "碱性",
            "reactions": ["N-氧化(N-Oxidation)", "质子化(Protonation)"],
        },
        "imidazole": {
            "smarts": "c1cnc[nH]1",
            "name_cn": "咪唑环",
            "name_en": "Imidazole",
            "property_type": "碱性/两性",
            "reactions": ["N-氧化(N-Oxidation)", "金属配位(Metal Coordination)"],
        },
        # Other important groups
        "nitrile": {
            "smarts": "[CX2]#N",
            "name_cn": "氰基",
            "name_en": "Nitrile",
            "property_type": "中性",
            "reactions": ["水解成酰胺/羧酸(Hydrolysis)"],
        },
        "allylic": {
            "smarts": "[CX4][CX3]=[CX3]",
            "name_cn": "烯丙位",
            "name_en": "Allylic Position",
            "property_type": "中性",
            "reactions": ["自氧化(Autoxidation)"],
        },
        "benzylic": {
            "smarts": "[CX4H2]c",
            "name_cn": "苄位",
            "name_en": "Benzylic Position",
            "property_type": "中性",
            "reactions": ["自氧化(Autoxidation)"],
        },
    }

    def __init__(self):
        """Initialize the renderer and record whether RDKit can be imported."""
        self._rdkit_available = self._check_rdkit()

    def _check_rdkit(self) -> bool:
        """Return True if RDKit imports cleanly; otherwise print a hint and return False."""
        try:
            from rdkit import Chem  # noqa: F401
            from rdkit.Chem import Draw  # noqa: F401
        except ImportError:
            print("Warning: RDKit not installed. Molecular rendering disabled.")
            print("Install with: pip install rdkit")
            return False
        return True

    @property
    def is_available(self) -> bool:
        """Whether RDKit-backed rendering and calculation is available."""
        return self._rdkit_available

    # ------------------------------------------------------------------
    # SMILES parsing
    # ------------------------------------------------------------------

    @staticmethod
    def _mol_from_smiles(text: str) -> Optional[Any]:
        """Parse ``text`` with RDKit; return None for empty input or any failure."""
        # An empty string is rejected here so callers never receive an
        # atom-less molecule from a fallback path.
        if not text:
            return None
        from rdkit import Chem
        try:
            return Chem.MolFromSmiles(text)
        except Exception:
            return None

    def parse_smiles(self, smiles: str) -> Optional[Any]:
        """
        Parse a SMILES string into an RDKit molecule, with fallbacks.

        Attempts, in order:
        1. Standard parsing of the normalized string.
        2. Parsing without sanitization followed by explicit sanitization.
        3. Parsing with stereochemistry markers removed.
        4. For dot-separated inputs (salts/mixtures), the longest fragment.

        Args:
            smiles: SMILES notation. May carry whitespace, a ``SMILES:``
                label, URL encoding or full-width punctuation.

        Returns:
            RDKit Mol object, or None if RDKit is unavailable, the input is
            not a non-blank string, or every attempt fails.
        """
        if not self._rdkit_available:
            return None
        if not isinstance(smiles, str) or not smiles.strip():
            return None

        from rdkit import Chem

        clean_smiles = self._normalize_smiles(smiles)
        if not clean_smiles:
            # e.g. the input was only a "SMILES:" label.
            return None

        mol = self._mol_from_smiles(clean_smiles)
        if mol is not None:
            return mol

        # Sanitize as a separate step. An unsanitized molecule is
        # deliberately NOT returned: descriptor and drawing code downstream
        # expects a sanitized one.
        try:
            mol = Chem.MolFromSmiles(clean_smiles, sanitize=False)
            if mol is not None:
                Chem.SanitizeMol(mol)
                return mol
        except Exception:
            pass

        mol = self._mol_from_smiles(self._strip_stereochemistry(clean_smiles))
        if mol is not None:
            return mol

        if '.' in clean_smiles:
            fragments = [frag for frag in clean_smiles.split('.') if frag]
            if fragments:
                # Longest text fragment is used as a proxy for the parent
                # compound; counter-ions are usually short.
                return self._mol_from_smiles(max(fragments, key=len))

        return None

    def _normalize_smiles(self, smiles: str) -> str:
        """
        Clean common transport/typing artifacts out of a SMILES string.

        Steps: URL-decode (only when safe), drop all whitespace, strip
        leading ``SMILES:`` / ``SMILES=`` labels, and map full-width
        punctuation to ASCII.
        """
        import urllib.parse

        if '%' in smiles:
            decoded = urllib.parse.unquote(smiles)
            # '%' is also SMILES syntax for two-digit ring closures
            # ("%10", "%11", ...). Decoding those yields control characters
            # (or U+FFFD for invalid bytes); in that case the '%' was not
            # URL encoding, so the original text is kept.
            looks_corrupted = any(
                (ord(ch) < 0x20 or ord(ch) == 0x7F) and not ch.isspace()
                for ch in decoded
            ) or ('\ufffd' in decoded and '\ufffd' not in smiles)
            if not looks_corrupted:
                smiles = decoded

        # Whitespace is never meaningful in SMILES; this also removes tabs.
        smiles = "".join(smiles.split())

        # Strip leading labels, repeating in case several were stacked.
        stripped = True
        while stripped:
            stripped = False
            for prefix in self._SMILES_PREFIXES:
                if smiles[:len(prefix)].lower() == prefix:
                    smiles = smiles[len(prefix):]
                    stripped = True

        return smiles.translate(self._CHAR_FIXES)

    def _strip_stereochemistry(self, smiles: str) -> str:
        """Remove chirality (``@``) and double-bond direction (``/``, ``\\``) markers."""
        return smiles.translate({ord('@'): None, ord('/'): None, ord('\\'): None})

    # ------------------------------------------------------------------
    # Rendering
    # ------------------------------------------------------------------

    @staticmethod
    def _mol_to_svg(
        mol: Any,
        width: int,
        height: int,
        highlight_atoms: Optional[List[int]] = None,
    ) -> str:
        """Draw an already-parsed molecule to an SVG string."""
        from rdkit.Chem.Draw import rdMolDraw2D

        drawer = rdMolDraw2D.MolDraw2DSVG(width, height)
        opts = drawer.drawOptions()
        opts.addStereoAnnotation = True
        opts.addAtomIndices = False

        if highlight_atoms:
            drawer.DrawMolecule(mol, highlightAtoms=highlight_atoms)
        else:
            drawer.DrawMolecule(mol)
        drawer.FinishDrawing()
        return drawer.GetDrawingText()

    @staticmethod
    def _mol_to_png_base64(mol: Any, width: int, height: int) -> str:
        """Draw an already-parsed molecule to a base64-encoded PNG."""
        from rdkit.Chem import Draw

        img = Draw.MolToImage(mol, size=(width, height))
        buffer = io.BytesIO()
        img.save(buffer, format='PNG')
        return base64.b64encode(buffer.getvalue()).decode('utf-8')

    def render_2d_svg(
        self,
        smiles: str,
        width: int = 400,
        height: int = 300,
        highlight_atoms: Optional[List[int]] = None,
    ) -> Optional[str]:
        """
        Render 2D structure as SVG.

        Args:
            smiles: SMILES notation
            width: Image width
            height: Image height
            highlight_atoms: Optional list of atom indices to highlight

        Returns:
            SVG string, or None if RDKit is unavailable or parsing failed
        """
        mol = self.parse_smiles(smiles)
        if mol is None:
            return None
        return self._mol_to_svg(mol, width, height, highlight_atoms)

    def render_2d_png_base64(
        self,
        smiles: str,
        width: int = 400,
        height: int = 300,
    ) -> Optional[str]:
        """
        Render 2D structure as PNG and return it base64 encoded.

        Args:
            smiles: SMILES notation
            width: Image width
            height: Image height

        Returns:
            Base64 encoded PNG string, or None if RDKit is unavailable or
            parsing failed
        """
        mol = self.parse_smiles(smiles)
        if mol is None:
            return None
        return self._mol_to_png_base64(mol, width, height)

    def get_data_uri(self, smiles: str, width: int = 400, height: int = 300) -> Optional[str]:
        """
        Get a ``data:`` URI for embedding the molecule image in HTML.

        Args:
            smiles: SMILES notation
            width: Image width
            height: Image height

        Returns:
            Data URI string, or None if the PNG could not be rendered
        """
        png_base64 = self.render_2d_png_base64(smiles, width, height)
        if png_base64:
            return f"data:image/png;base64,{png_base64}"
        return None

    # ------------------------------------------------------------------
    # Fingerprints
    # ------------------------------------------------------------------

    def calculate_morgan_fingerprint(
        self,
        smiles: str,
        radius: int = 2,
        n_bits: int = 2048,
    ) -> Optional[List[int]]:
        """
        Calculate Morgan (circular) fingerprint.

        Args:
            smiles: SMILES notation
            radius: Fingerprint radius
            n_bits: Number of bits

        Returns:
            List of on-bit indices, or None if RDKit is unavailable or
            parsing failed
        """
        mol = self.parse_smiles(smiles)
        if mol is None:
            return None

        from rdkit.Chem import AllChem

        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        return list(fp.GetOnBits())

    def render_fingerprint_bits(
        self,
        smiles: str,
        radius: int = 2,
        highlight_bits: Optional[List[int]] = None,
        n_bits: int = 2048,
    ) -> Optional[str]:
        """
        Render Morgan fingerprint bit visualization as SVG.

        For each requested bit, the centre atom and every atom in the
        circular environment that set the bit are highlighted.

        Args:
            smiles: SMILES notation
            radius: Morgan fingerprint radius
            highlight_bits: Specific bits to highlight; when empty or None
                the plain structure is rendered
            n_bits: Fingerprint length. Must match the value used to obtain
                ``highlight_bits`` for the indices to line up.

        Returns:
            SVG string, or None if RDKit is unavailable or parsing failed
        """
        mol = self.parse_smiles(smiles)
        if mol is None:
            return None

        highlight_atoms = None
        if highlight_bits:
            from rdkit import Chem
            from rdkit.Chem import AllChem

            # bit -> tuple of (centre atom index, environment radius)
            bit_info = {}
            AllChem.GetMorganFingerprintAsBitVect(
                mol, radius, nBits=n_bits, bitInfo=bit_info
            )

            atoms = set()
            for bit in highlight_bits:
                for center_atom, env_radius in bit_info.get(bit, ()):
                    atoms.add(center_atom)
                    # The environment is reported as bond indices; collect
                    # both end atoms of each bond. Radius 0 has no bonds.
                    if env_radius > 0:
                        bond_ids = Chem.FindAtomEnvironmentOfRadiusN(
                            mol, env_radius, center_atom
                        )
                        for bond_id in bond_ids:
                            bond = mol.GetBondWithIdx(bond_id)
                            atoms.add(bond.GetBeginAtomIdx())
                            atoms.add(bond.GetEndAtomIdx())
            # Sorted for deterministic output.
            highlight_atoms = sorted(atoms)

        # Draw the molecule that the indices were computed on, instead of
        # re-parsing the SMILES.
        return self._mol_to_svg(mol, 400, 300, highlight_atoms)

    # ------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------

    @staticmethod
    def _mol_properties(mol: Any) -> Dict[str, Any]:
        """Compute the descriptor dictionary for an already-parsed molecule."""
        from rdkit.Chem import Descriptors, Lipinski

        return {
            "molecular_weight": round(Descriptors.MolWt(mol), 2),
            "logp": round(Descriptors.MolLogP(mol), 2),
            "hbd": Lipinski.NumHDonors(mol),
            "hba": Lipinski.NumHAcceptors(mol),
            "tpsa": round(Descriptors.TPSA(mol), 2),
            "rotatable_bonds": Lipinski.NumRotatableBonds(mol),
            "num_atoms": mol.GetNumAtoms(),
            "num_heavy_atoms": Lipinski.HeavyAtomCount(mol),
            "num_rings": Lipinski.RingCount(mol),
            "fraction_sp3": round(Lipinski.FractionCSP3(mol), 2),
        }

    def calculate_properties(self, smiles: str) -> Optional[Dict[str, Any]]:
        """
        Calculate basic molecular properties.

        Args:
            smiles: SMILES notation

        Returns:
            Dictionary with keys ``molecular_weight``, ``logp``, ``hbd``,
            ``hba``, ``tpsa``, ``rotatable_bonds``, ``num_atoms``,
            ``num_heavy_atoms``, ``num_rings`` and ``fraction_sp3``; or None
            if RDKit is unavailable or parsing failed
        """
        mol = self.parse_smiles(smiles)
        if mol is None:
            return None
        return self._mol_properties(mol)

    def get_molecule_info(self, smiles: str, name: Optional[str] = None) -> "MoleculeInfo":
        """
        Get molecule information including descriptors and rendered images.

        The SMILES string is parsed once and the resulting molecule is
        reused for the descriptors and both renderings.

        Args:
            smiles: SMILES notation
            name: Optional molecule name

        Returns:
            MoleculeInfo carrying ``smiles`` and ``name``; descriptor and
            image fields are filled only when RDKit is available and the
            SMILES parses. ``fingerprint_svg`` is not populated here.
        """
        info = MoleculeInfo(smiles=smiles, name=name)

        mol = self.parse_smiles(smiles)
        if mol is None:
            return info

        props = self._mol_properties(mol)
        info.molecular_weight = props["molecular_weight"]
        info.logp = props["logp"]
        info.hbd = props["hbd"]
        info.hba = props["hba"]
        info.tpsa = props["tpsa"]
        info.rotatable_bonds = props["rotatable_bonds"]

        info.structure_2d_svg = self._mol_to_svg(mol, 400, 300)
        info.structure_2d_png_base64 = self._mol_to_png_base64(mol, 400, 300)
        return info

    # ------------------------------------------------------------------
    # Functional groups
    # ------------------------------------------------------------------

    def identify_functional_groups(self, smiles: str) -> List[Dict[str, Any]]:
        """
        Identify reactive functional groups in a molecule using SMARTS patterns.

        Args:
            smiles: SMILES notation

        Returns:
            One dict per matched group with keys ``id``, ``name_cn``,
            ``name_en``, ``property_type``, ``potential_reactions``,
            ``count`` and ``atom_indices`` (one index list per match).
            Empty list if RDKit is unavailable, parsing failed or nothing
            matched.
        """
        mol = self.parse_smiles(smiles)
        if mol is None:
            return []

        from rdkit import Chem

        identified_groups = []
        for group_id, group_info in self._FUNCTIONAL_GROUP_PATTERNS.items():
            pattern = Chem.MolFromSmarts(group_info["smarts"])
            if pattern is None:
                # A SMARTS string RDKit cannot compile is skipped.
                continue
            matches = mol.GetSubstructMatches(pattern)
            if not matches:
                continue
            identified_groups.append({
                "id": group_id,
                "name_cn": group_info["name_cn"],
                "name_en": group_info["name_en"],
                "property_type": group_info["property_type"],
                "potential_reactions": group_info["reactions"],
                "count": len(matches),
                "atom_indices": [list(m) for m in matches],
            })
        return identified_groups

    def get_functional_groups_summary(self, smiles: str) -> str:
        """
        Get a one-line text summary of identified functional groups.

        Args:
            smiles: SMILES notation

        Returns:
            Semicolon-joined entries of the form
            ``<name_cn>(<name_en>)[×count] - <property_type>``, or a fixed
            fallback message when no group was identified
        """
        groups = self.identify_functional_groups(smiles)
        if not groups:
            return "未能识别到特征官能团,请人工确认分子结构"

        lines = []
        for g in groups:
            count_str = f"×{g['count']}" if g['count'] > 1 else ""
            lines.append(f"{g['name_cn']}({g['name_en']}){count_str} - {g['property_type']}")
        return ";".join(lines)
def get_3dmol_script(smiles: str, container_id: str = "mol3d") -> str:
    """
    Generate a ``<script>`` element that starts a 3Dmol.js viewer.

    The returned markup expects 3Dmol.js to be loaded in the page already
    and an element with id ``container_id`` to exist. The SMILES string is
    handed to the viewer as-is with format ``'smi'``.

    Both arguments are emitted as JSON string literals, so backslashes
    (used in SMILES for double-bond direction), quotes and ``<`` cannot
    corrupt or break out of the generated JavaScript.

    Args:
        smiles: SMILES notation
        container_id: HTML container element ID

    Returns:
        HTML/JavaScript code string
    """
    import json

    def js_literal(value: str) -> str:
        # json.dumps yields a valid double-quoted JS string literal. '<' is
        # additionally escaped so "</script>" or "<!--" inside the value
        # cannot terminate the surrounding script element.
        return json.dumps(str(value)).replace("<", "\\u003c")

    smiles_js = js_literal(smiles)
    container_js = js_literal(container_id)

    return f"""
<script>
(function() {{
    let viewer = $3Dmol.createViewer(document.getElementById({container_js}), {{
        backgroundColor: 'white'
    }});
    // Use PubChem to get 3D structure from SMILES
    // Alternatively, generate conformer with RDKit
    let smiles = {smiles_js};
    // Add molecule from SMILES (requires 3Dmol.js SmilesParser)
    viewer.addModel(smiles, 'smi');
    viewer.setStyle({{}}, {{stick: {{}}}});
    viewer.zoomTo();
    viewer.render();
}})();
</script>
"""
def get_3dmol_html(
    smiles: str,
    width: int = 400,
    height: int = 300,
    container_id: str = "mol3d",
) -> str:
    """
    Generate complete HTML for a 3Dmol.js visualization.

    The output contains a sized wrapper ``<div>``, the viewer ``<div>``, a
    ``<script>`` tag loading 3Dmol.js from 3dmol.org, and the viewer start-up
    script produced by ``get_3dmol_script``.

    Args:
        smiles: SMILES notation
        width: Viewer width in pixels
        height: Viewer height in pixels
        container_id: ID for the viewer element; the wrapper gets
            ``<container_id>-container``. Pass a distinct value per viewer
            when embedding several on one page. The default reproduces the
            previously hard-coded ``mol3d`` / ``mol3d-container`` ids.

    Returns:
        Complete HTML string
    """
    import html

    # Escaped for use inside a double-quoted HTML attribute.
    safe_id = html.escape(container_id, quote=True)

    return f"""
<div id="{safe_id}-container" style="width: {width}px; height: {height}px; position: relative;">
<div id="{safe_id}" style="width: 100%; height: 100%;"></div>
</div>
<script src="https://3dmol.org/build/3Dmol-min.js"></script>
{get_3dmol_script(smiles, container_id)}
"""
# Shared module-level instance for convenient import. Creating it runs the
# RDKit availability check once, at import time.
renderer = MoleculeRenderer()