Vaishnav14220 committed on
Commit
d90ea25
·
1 Parent(s): 34c4a25

Fix NIST reaction rendering: Add comprehensive SMILES mapping for radicals, cyclic compounds, and complex formulas like C2H3, c-C3H2, etc. Handle spaces in formulas and special notation.

Browse files
Files changed (1) hide show
  1. app.py +180 -26
app.py CHANGED
@@ -525,6 +525,139 @@ def _fetch_all_nist_reactions(limit: int = 100) -> List[tuple[str, str]]:
525
  return []
526
 
527
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
528
  def _render_reaction_from_nist(reaction_text: str) -> str | None:
529
  """Render a reaction from NIST format to SVG using RDKit."""
530
  reaction_text = (reaction_text or "").strip()
@@ -535,34 +668,54 @@ def _render_reaction_from_nist(reaction_text: str) -> str | None:
535
  smiles_reaction = None
536
 
537
  # Handle different NIST reaction formats
538
- if " → " in reaction_text:
539
- # Format: "A + B → C + D"
540
- parts = reaction_text.split(" → ")
541
- if len(parts) == 2:
542
- reactants = parts[0].replace(" + ", ".").strip()
543
- products = parts[1].replace(" + ", ".").strip()
544
- smiles_reaction = f"{reactants}>>{products}"
545
- elif " -> " in reaction_text:
546
- # Alternative arrow format
547
- parts = reaction_text.split(" -> ")
548
- if len(parts) == 2:
549
- reactants = parts[0].replace(" + ", ".").strip()
550
- products = parts[1].replace(" + ", ".").strip()
551
- smiles_reaction = f"{reactants}>>{products}"
552
- elif " " in reaction_text:
553
- # Reversible reaction
554
- parts = reaction_text.split(" ")
555
- if len(parts) == 2:
556
- reactants = parts[0].replace(" + ", ".").strip()
557
- products = parts[1].replace(" + ", ".").strip()
558
- smiles_reaction = f"{reactants}>>{products}"
559
-
560
- # If we couldn't parse it, try using it directly
 
 
 
 
 
 
 
 
 
 
 
 
 
 
561
  if not smiles_reaction:
562
  if ">>" in reaction_text:
563
  smiles_reaction = reaction_text
564
  else:
565
- return None
 
 
 
 
 
 
566
 
567
  try:
568
  # Try parsing as SMILES reaction first
@@ -570,7 +723,8 @@ def _render_reaction_from_nist(reaction_text: str) -> str | None:
570
  if reaction is None:
571
  # Fall back to SMARTS parsing
572
  reaction = rdChemReactions.ReactionFromSmarts(smiles_reaction, useSmiles=False)
573
- except Exception:
 
574
  return None
575
 
576
  if reaction is None or (reaction.GetNumReactantTemplates() == 0 and reaction.GetNumProductTemplates() == 0):
@@ -580,7 +734,7 @@ def _render_reaction_from_nist(reaction_text: str) -> str | None:
580
  # Generate SVG with specified parameters
581
  svg = Draw.ReactionToImage(reaction, subImgSize=(200, 200), useSVG=True, drawOptions=None, returnPNG=False)
582
  except Exception as exc:
583
- print(f"Error rendering reaction: {exc}")
584
  return None
585
 
586
  if isinstance(svg, tuple):
 
525
  return []
526
 
527
 
528
+ def _clean_chemical_formula(formula: str) -> str:
529
+ """Clean and normalize chemical formulas from NIST format."""
530
+ if not formula:
531
+ return ""
532
+
533
+ # Remove extra spaces within formulas (C 2 H 3 -> C2H3)
534
+ import re
535
+
536
+ # Pattern to match element symbols followed by numbers with spaces
537
+ # This will convert "C 2 H 3" to "C2H3"
538
+ cleaned = re.sub(r'([A-Z][a-z]?)(\s+)(\d+)', r'\1\3', formula)
539
+
540
+ # Handle radicals and special notation
541
+ cleaned = cleaned.replace("·", "") # Remove radical dots
542
+ cleaned = cleaned.replace("•", "") # Remove alternative radical notation
543
+
544
+ # Keep c- prefix for cyclic compounds, remove other lowercase prefixes
545
+ if not cleaned.startswith(('c-', 'C-')):
546
+ cleaned = re.sub(r'^[a-z]-', '', cleaned)
547
+
548
+ return cleaned.strip()
549
+
550
+
551
+ def _nist_formula_to_smiles(formula: str) -> str | None:
552
+ """Convert NIST chemical formula to SMILES string for RDKit."""
553
+ if not formula:
554
+ return None
555
+
556
+ formula = _clean_chemical_formula(formula)
557
+
558
+ # Dictionary of common NIST formulas to SMILES
559
+ # This is a lookup table for frequently encountered species
560
+ nist_to_smiles = {
561
+ # Simple molecules
562
+ "H2": "[H][H]",
563
+ "O2": "O=O",
564
+ "N2": "N#N",
565
+ "CO": "[C-]#[O+]",
566
+ "CO2": "O=C=O",
567
+ "H2O": "O",
568
+ "CH4": "C",
569
+ "C2H6": "CC",
570
+ "C2H4": "C=C",
571
+ "C2H2": "C#C",
572
+ "C3H8": "CCC",
573
+ "C3H6": "C=CC",
574
+ "C6H6": "c1ccccc1",
575
+
576
+ # Radicals (simplified representations)
577
+ "H": "[H]",
578
+ "CH3": "[CH3]",
579
+ "C2H5": "C[CH2]",
580
+ "C2H3": "C=C[CH2]", # Propargyl radical
581
+ "C3H3": "C#CC", # Propynyl radical
582
+ "C": "[C]", # Carbon atom
583
+ "OH": "[OH]",
584
+ "O": "[O]",
585
+ "HO2": "O[O]",
586
+ "CH2": "[CH2]",
587
+
588
+ # Cyclic compounds
589
+ "c-C3H2": "C1=CC1", # Cyclopropenylidene (simplified)
590
+
591
+ # More complex species
592
+ "CH2O": "C=O",
593
+ "CH3OH": "CO",
594
+ "C2H5OH": "CCO",
595
+ "HCO": "[CH]=O",
596
+ "CH3CHO": "CC=O",
597
+ "C2H4O": "C=CO",
598
+
599
+ # Ions (simplified)
600
+ "H+": "[H+]",
601
+ "OH-": "[OH-]",
602
+ "O2-": "[O-][O]",
603
+
604
+ # Specific compounds from the failing reaction
605
+ "C2H3": "C=C[CH2]", # Propargyl radical C2H3
606
+ "c-C3H2": "C1=CC1", # Cyclopropenyl radical (c-C3H2)
607
+ "CC3H2": "C1=CC1", # Alternative notation
608
+ }
609
+
610
+ # Direct lookup
611
+ if formula in nist_to_smiles:
612
+ return nist_to_smiles[formula]
613
+
614
+ # Try to generate SMILES for simple hydrocarbons
615
+ if re.match(r'^C\d+H\d*$', formula):
616
+ # Parse C_nH_m
617
+ c_match = re.search(r'C(\d+)', formula)
618
+ h_match = re.search(r'H(\d+)', formula)
619
+
620
+ if c_match and h_match:
621
+ c_count = int(c_match.group(1))
622
+ h_count = int(h_match.group(1))
623
+
624
+ if c_count == 1 and h_count == 4:
625
+ return "C" # CH4
626
+ elif c_count == 2 and h_count == 6:
627
+ return "CC" # C2H6
628
+ elif c_count == 2 and h_count == 4:
629
+ return "C=C" # C2H4
630
+ elif c_count == 2 and h_count == 2:
631
+ return "C#C" # C2H2
632
+ elif c_count == 3 and h_count == 8:
633
+ return "CCC" # C3H8
634
+ elif c_count == 3 and h_count == 6:
635
+ return "C=CC" # C3H6
636
+
637
+ # For unknown formulas, try to create a simple representation
638
+ # This is a fallback that may not be chemically accurate
639
+ if re.match(r'^[A-Z][a-z]?\d*$', formula):
640
+ # Single atom with number (like O2, N2)
641
+ element_match = re.match(r'^([A-Z][a-z]?)(\d*)$', formula)
642
+ if element_match:
643
+ element = element_match.group(1)
644
+ count = element_match.group(2)
645
+
646
+ if count and int(count) > 1:
647
+ # For diatomic molecules
648
+ if element in ['O', 'N', 'H']:
649
+ if element == 'O':
650
+ return "O=O"
651
+ elif element == 'N':
652
+ return "N#N"
653
+ elif element == 'H':
654
+ return "[H][H]"
655
+ else:
656
+ return f"[{element}]"
657
+
658
+ return None # Could not convert
659
+
660
+
661
  def _render_reaction_from_nist(reaction_text: str) -> str | None:
662
  """Render a reaction from NIST format to SVG using RDKit."""
663
  reaction_text = (reaction_text or "").strip()
 
668
  smiles_reaction = None
669
 
670
  # Handle different NIST reaction formats
671
+ separators = [" → ", " -> ", " ↔ ", " ⇌ ", " →", " ->", " ⇌"]
672
+
673
+ parts = None
674
+ for sep in separators:
675
+ if sep in reaction_text:
676
+ parts = reaction_text.split(sep, 1)
677
+ break
678
+
679
+ if parts and len(parts) == 2:
680
+ reactants_text = parts[0].strip()
681
+ products_text = parts[1].strip()
682
+
683
+ # Split reactants and products by " + "
684
+ reactants = [r.strip() for r in reactants_text.split(" + ") if r.strip()]
685
+ products = [p.strip() for p in products_text.split(" + ") if p.strip()]
686
+
687
+ # Convert each compound to SMILES
688
+ reactant_smiles = []
689
+ product_smiles = []
690
+
691
+ for reactant in reactants:
692
+ smiles = _nist_formula_to_smiles(reactant)
693
+ if smiles:
694
+ reactant_smiles.append(smiles)
695
+
696
+ for product in products:
697
+ smiles = _nist_formula_to_smiles(product)
698
+ if smiles:
699
+ product_smiles.append(smiles)
700
+
701
+ # Only proceed if we have at least one reactant and one product
702
+ if reactant_smiles and product_smiles:
703
+ reactants_smiles_str = ".".join(reactant_smiles)
704
+ products_smiles_str = ".".join(product_smiles)
705
+ smiles_reaction = f"{reactants_smiles_str}>>{products_smiles_str}"
706
+
707
+ # If we couldn't parse it with separators, try using it directly
708
  if not smiles_reaction:
709
  if ">>" in reaction_text:
710
  smiles_reaction = reaction_text
711
  else:
712
+ # Last resort: try to clean the entire reaction text
713
+ cleaned = _clean_chemical_formula(reaction_text)
714
+ if ">>" in cleaned:
715
+ smiles_reaction = cleaned
716
+
717
+ if not smiles_reaction:
718
+ return None
719
 
720
  try:
721
  # Try parsing as SMILES reaction first
 
723
  if reaction is None:
724
  # Fall back to SMARTS parsing
725
  reaction = rdChemReactions.ReactionFromSmarts(smiles_reaction, useSmiles=False)
726
+ except Exception as exc:
727
+ print(f"RDKit parsing error for '{smiles_reaction}': {exc}")
728
  return None
729
 
730
  if reaction is None or (reaction.GetNumReactantTemplates() == 0 and reaction.GetNumProductTemplates() == 0):
 
734
  # Generate SVG with specified parameters
735
  svg = Draw.ReactionToImage(reaction, subImgSize=(200, 200), useSVG=True, drawOptions=None, returnPNG=False)
736
  except Exception as exc:
737
+ print(f"Error rendering reaction '{smiles_reaction}': {exc}")
738
  return None
739
 
740
  if isinstance(svg, tuple):