diff --git a/SynTool/__init__.py b/SynTool/__init__.py
new file mode 100755
index 0000000000000000000000000000000000000000..2a04536789e8e3660c5758c092982bd4507a79f5
--- /dev/null
+++ b/SynTool/__init__.py
@@ -0,0 +1,3 @@
+from .mcts import *
+
+__all__ = ["Tree"]
diff --git a/SynTool/chem/__init__.py b/SynTool/chem/__init__.py
new file mode 100755
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/SynTool/chem/__pycache__/__init__.cpython-310.pyc b/SynTool/chem/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..46b692ffe41b12b6ef257284ff49a790767a925f
Binary files /dev/null and b/SynTool/chem/__pycache__/__init__.cpython-310.pyc differ
diff --git a/SynTool/chem/__pycache__/reaction.cpython-310.pyc b/SynTool/chem/__pycache__/reaction.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bd5c09272e6aff74cc2a930d6ce2685506ff7e3b
Binary files /dev/null and b/SynTool/chem/__pycache__/reaction.cpython-310.pyc differ
diff --git a/SynTool/chem/__pycache__/retron.cpython-310.pyc b/SynTool/chem/__pycache__/retron.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e56eac834ed51580ae0305bbdf19250aeabe0c5c
Binary files /dev/null and b/SynTool/chem/__pycache__/retron.cpython-310.pyc differ
diff --git a/SynTool/chem/__pycache__/utils.cpython-310.pyc b/SynTool/chem/__pycache__/utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cbdaff60c4fac107754d6d17e6dd8c1b12fc246e
Binary files /dev/null and b/SynTool/chem/__pycache__/utils.cpython-310.pyc differ
diff --git a/SynTool/chem/data/__init__.py b/SynTool/chem/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/SynTool/chem/data/__pycache__/__init__.cpython-310.pyc b/SynTool/chem/data/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d722a048daf89b7f5338f588d495aee68bb4c8be
Binary files /dev/null and b/SynTool/chem/data/__pycache__/__init__.cpython-310.pyc differ
diff --git a/SynTool/chem/data/__pycache__/cleaning.cpython-310.pyc b/SynTool/chem/data/__pycache__/cleaning.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c8af0739bcddd86bcae4842a6b3813273382b488
Binary files /dev/null and b/SynTool/chem/data/__pycache__/cleaning.cpython-310.pyc differ
diff --git a/SynTool/chem/data/__pycache__/filtering.cpython-310.pyc b/SynTool/chem/data/__pycache__/filtering.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1c36901c582fa4586c1745eb6156c37bc8d5103c
Binary files /dev/null and b/SynTool/chem/data/__pycache__/filtering.cpython-310.pyc differ
diff --git a/SynTool/chem/data/__pycache__/mapping.cpython-310.pyc b/SynTool/chem/data/__pycache__/mapping.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..976460e6a2ccc64043d93a7d288802dcd78f6a0e
Binary files /dev/null and b/SynTool/chem/data/__pycache__/mapping.cpython-310.pyc differ
diff --git a/SynTool/chem/data/__pycache__/standardizer.cpython-310.pyc b/SynTool/chem/data/__pycache__/standardizer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8a3f9a0bede2b466bf554d247d508192242742fd
Binary files /dev/null and b/SynTool/chem/data/__pycache__/standardizer.cpython-310.pyc differ
diff --git a/SynTool/chem/data/cleaning.py b/SynTool/chem/data/cleaning.py
new file mode 100644
index 0000000000000000000000000000000000000000..344d9f765dbc01dde57c919234093264a2d660d0
--- /dev/null
+++ b/SynTool/chem/data/cleaning.py
@@ -0,0 +1,124 @@
+import os
+from multiprocessing import Queue, Process, Manager, Value
+from logging import getLogger, Logger
+from tqdm import tqdm
+from CGRtools.containers import ReactionContainer
+
+from .standardizer import Standardizer
+from SynTool.utils.files import ReactionReader, ReactionWriter
+from SynTool.utils.config import ReactionStandardizationConfig
+
+
def cleaner(reaction: ReactionContainer, logger: Logger, config: ReactionStandardizationConfig):
    """
    Run the external standardization protocol on a single reaction.

    :param reaction: reaction to clean/standardize
    :param logger: logger handed to the standardizer (used to silence its logging)
    :param config: user-controlled standardization flags
    :return: the standardized ReactionContainer, or an empty list when the reaction is discarded
    """
    # Fixed protocol settings combined with the user-controlled flags from config.
    options = {
        'id_tag': 'Reaction_ID',
        'action_on_isotopes': 2,
        'skip_tautomerize': True,
        'skip_errors': config.skip_errors,
        'keep_unbalanced_ions': config.keep_unbalanced_ions,
        'keep_reagents': config.keep_reagents,
        'ignore_mapping': config.ignore_mapping,
        'logger': logger,
    }
    return Standardizer(**options).standardize(reaction)
+
+
def worker_cleaner(to_clean: Queue, to_write: Queue, config: ReactionStandardizationConfig):
    """
    Consume raw reactions from ``to_clean``, standardize them, and push results to ``to_write``.

    Runs until the "Quit" sentinel is received from ``to_clean``.

    :param to_clean: queue of reactions to clean/standardize ("Quit" terminates the worker)
    :param to_write: queue receiving standardized outputs
    :param config: standardization settings forwarded to :func:`cleaner`
    :return: None
    """
    worker_logger = getLogger()
    worker_logger.disabled = True  # silence standardizer logging inside the worker process
    while True:
        task = to_clean.get()
        if task == "Quit":
            break
        to_write.put(cleaner(task, worker_logger, config))
    worker_logger.disabled = False
+
+
def cleaner_writer(output_file: str, to_write: Queue, cleaned_nb: Value, remove_old=True):
    """
    Writes to the output file the standardized reactions, skipping duplicates.

    Terminates when the "Quit" sentinel is received; the final count of written
    reactions is stored in ``cleaned_nb``.

    :param output_file: output file path
    :param to_write: queue of standardized ReactionContainer objects ("Quit" terminates the writer)
    :param cleaned_nb: shared counter receiving the number of reactions written
    :param remove_old: whether to remove an already existing output file first
    """

    if remove_old and os.path.isfile(output_file):
        os.remove(output_file)

    counter = 0
    # SMILES of reactions already written. A set gives O(1) membership tests;
    # the original list made the duplicate check accidentally O(n) per reaction.
    seen_reactions = set()
    with ReactionWriter(output_file) as out:
        while True:
            res = to_write.get()
            if res:
                if res == "Quit":
                    cleaned_nb.set(counter)
                    break
                elif isinstance(res, ReactionContainer):
                    # "m" format produces the mapped reaction SMILES used as dedup key.
                    smi = format(res, "m")
                    if smi not in seen_reactions:
                        out.write(res)
                        counter += 1
                        seen_reactions.add(smi)
+
+
def reactions_cleaner(config: ReactionStandardizationConfig,
                      input_file: str, output_file: str, num_cpus: int, batch_prep_size: int = 100):
    """
    Standardize reactions from the input file in parallel and write results to the output file.

    One process writes results while the remaining processes standardize reactions.

    :param config: reaction standardization settings
    :param input_file: input RDF file path
    :param output_file: output RDF file path
    :param num_cpus: number of CPUs to be parallelized over
    :param batch_prep_size: size of each batch per CPU
    """
    with Manager() as m:
        to_clean = m.Queue(maxsize=num_cpus * batch_prep_size)
        to_write = m.Queue(maxsize=batch_prep_size)
        cleaned_nb = m.Value(int, 0)

        writer = Process(target=cleaner_writer, args=(output_file, to_write, cleaned_nb))
        writer.start()

        # Reserve one CPU for the reader (this process) and one for the writer,
        # but always spawn at least one worker: the original `num_cpus - 2`
        # started no workers for num_cpus <= 2 and deadlocked once the bounded
        # to_clean queue filled up.
        workers = []
        for _ in range(max(1, num_cpus - 2)):
            w = Process(target=worker_cleaner, args=(to_clean, to_write, config))
            w.start()
            workers.append(w)

        n = 0
        with ReactionReader(input_file) as reactions:
            for raw_reaction in tqdm(reactions):
                if 'Reaction_ID' not in raw_reaction.meta:
                    # Give every reaction a stable identifier for the standardizer.
                    raw_reaction.meta['Reaction_ID'] = n
                to_clean.put(raw_reaction)
                n += 1

        # One "Quit" sentinel per worker, then wait for them to drain the queue.
        for _ in workers:
            to_clean.put("Quit")
        for w in workers:
            w.join()

        to_write.put("Quit")
        writer.join()

        print(f'Initial number of reactions: {n}')
        print(f'Removed number of reactions: {n - cleaned_nb.get()}')
diff --git a/SynTool/chem/data/filtering.py b/SynTool/chem/data/filtering.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a66fa7757e725a5ca882c0549b4278fb14041e7
--- /dev/null
+++ b/SynTool/chem/data/filtering.py
@@ -0,0 +1,917 @@
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Iterable, Tuple, Dict, Any, Optional
+from tqdm.auto import tqdm
+
+import numpy as np
+import ray
+import yaml
+from CGRtools.containers import ReactionContainer, MoleculeContainer, CGRContainer
+from StructureFingerprint import MorganFingerprint
+
+from SynTool.utils.files import ReactionReader, ReactionWriter
+from SynTool.chem.utils import remove_small_molecules, rebalance_reaction, remove_reagents
+from SynTool.utils.config import ConfigABC, convert_config_to_dict
+
+
@dataclass
class CompeteProductsConfig(ConfigABC):
    """Settings for the competing-products reaction check."""

    fingerprint_tanimoto_threshold: float = 0.3
    mcs_tanimoto_threshold: float = 0.6

    @staticmethod
    def from_dict(config_dict: Dict[str, Any]):
        """Create an instance of CompeteProductsConfig from a dictionary."""
        return CompeteProductsConfig(**config_dict)

    @staticmethod
    def from_yaml(file_path: str):
        """Deserialize a YAML file into a CompeteProductsConfig object."""
        with open(file_path, "r") as file:
            return CompeteProductsConfig.from_dict(yaml.safe_load(file))

    def _validate_params(self, params: Dict[str, Any]):
        """Validate configuration parameters (both thresholds must be floats in [0, 1])."""
        fp_threshold = params.get("fingerprint_tanimoto_threshold")
        if not isinstance(fp_threshold, float) or not 0 <= fp_threshold <= 1:
            raise ValueError("Invalid 'fingerprint_tanimoto_threshold'; expected a float between 0 and 1")

        mcs_threshold = params.get("mcs_tanimoto_threshold")
        if not isinstance(mcs_threshold, float) or not 0 <= mcs_threshold <= 1:
            raise ValueError("Invalid 'mcs_tanimoto_threshold'; expected a float between 0 and 1")
+
+
class CompeteProductsChecker:
    """Checks if there are compete reactions.

    A reaction is flagged when a reagent molecule is highly similar to a product
    molecule — first screened by Morgan-fingerprint Tanimoto similarity, then
    confirmed by maximum-common-substructure (MCS) Tanimoto similarity.
    """

    def __init__(
        self,
        fingerprint_tanimoto_threshold: float = 0.3,
        mcs_tanimoto_threshold: float = 0.6,
    ):
        # Minimum fingerprint similarity before the (expensive) MCS search runs.
        self.fingerprint_tanimoto_threshold = fingerprint_tanimoto_threshold
        # Minimum MCS-based similarity for a pair to count as competing.
        self.mcs_tanimoto_threshold = mcs_tanimoto_threshold

    @staticmethod
    def from_config(config: CompeteProductsConfig):
        """Creates an instance of CompeteProductsChecker from a configuration object."""
        return CompeteProductsChecker(
            config.fingerprint_tanimoto_threshold, config.mcs_tanimoto_threshold
        )

    def __call__(self, reaction: ReactionContainer) -> bool:
        """
        Returns True if the reaction has competing products, else False

        :param reaction: input reaction
        :return: True or False
        """
        mf = MorganFingerprint()
        is_compete = False

        # Check for compete products using both fingerprint similarity and maximum common substructure (MCS) similarity
        # NOTE(review): only reagent/product pairs are compared; reactants are not
        # considered — confirm this is the intended scope.
        for mol in reaction.reagents:
            for other_mol in reaction.products:
                # Pairs involving a small molecule (<= 6 atoms on either side) are skipped.
                if len(mol) > 6 and len(other_mol) > 6:
                    # Compute fingerprint similarity
                    molf = mf.transform([mol])
                    other_molf = mf.transform([other_mol])
                    fingerprint_tanimoto = tanimoto_kernel(molf, other_molf)[0][0]

                    # If fingerprint similarity is high enough, check for MCS similarity
                    if fingerprint_tanimoto > self.fingerprint_tanimoto_threshold:
                        try:
                            # Find the maximum common substructure (MCS) and compute its size
                            clique_size = len(next(mol.get_mcs_mapping(other_mol, limit=100)))

                            # Calculate MCS similarity based on MCS size
                            mcs_tanimoto = clique_size / (len(mol) + len(other_mol) - clique_size)

                            # If MCS similarity is also high enough, mark the reaction as having compete products
                            if mcs_tanimoto > self.mcs_tanimoto_threshold:
                                is_compete = True
                                # NOTE(review): this only exits the inner loop; the outer
                                # loop still runs, though the result cannot change back.
                                break
                        except StopIteration:
                            # No MCS mapping found within the search limit.
                            continue

        return is_compete
+
+
@dataclass
class DynamicBondsConfig(ConfigABC):
    """Settings for the dynamic-bonds-count reaction check."""

    min_bonds_number: int = 1
    max_bonds_number: int = 6

    @staticmethod
    def from_dict(config_dict: Dict[str, Any]):
        """Create an instance of DynamicBondsConfig from a dictionary."""
        return DynamicBondsConfig(**config_dict)

    @staticmethod
    def from_yaml(file_path: str):
        """Deserialize a YAML file into a DynamicBondsConfig object."""
        with open(file_path, "r") as file:
            return DynamicBondsConfig.from_dict(yaml.safe_load(file))

    def _validate_params(self, params: Dict[str, Any]):
        """Validate configuration parameters (non-negative ints, min <= max)."""
        minimum = params.get("min_bonds_number")
        if not isinstance(minimum, int) or minimum < 0:
            raise ValueError(
                "Invalid 'min_bonds_number'; expected a non-negative integer")

        maximum = params.get("max_bonds_number")
        if not isinstance(maximum, int) or maximum < 0:
            raise ValueError("Invalid 'max_bonds_number'; expected a non-negative integer")

        if params["min_bonds_number"] > params["max_bonds_number"]:
            raise ValueError("'min_bonds_number' cannot be greater than 'max_bonds_number'")
+
+
class DynamicBondsChecker:
    """Checks if there is an unacceptable number of dynamic bonds in CGR."""

    def __init__(self, min_bonds_number: int = 1, max_bonds_number: int = 6):
        self.min_bonds_number = min_bonds_number
        self.max_bonds_number = max_bonds_number

    @staticmethod
    def from_config(config: DynamicBondsConfig):
        """Creates an instance of DynamicBondsChecker from a configuration object."""
        return DynamicBondsChecker(config.min_bonds_number, config.max_bonds_number)

    def __call__(self, reaction: ReactionContainer) -> bool:
        """Return True when the CGR's dynamic-bond count falls outside the allowed range."""
        dynamic_bonds = len((~reaction).center_bonds)
        return dynamic_bonds < self.min_bonds_number or dynamic_bonds > self.max_bonds_number
+
+
@dataclass
class SmallMoleculesConfig(ConfigABC):
    """Settings for the small-molecules reaction check."""

    limit: int = 6

    @staticmethod
    def from_dict(config_dict: Dict[str, Any]):
        """Create an instance of SmallMoleculesConfig from a dictionary."""
        return SmallMoleculesConfig(**config_dict)

    @staticmethod
    def from_yaml(file_path: str):
        """Deserialize a YAML file into a SmallMoleculesConfig object."""
        with open(file_path, "r") as file:
            return SmallMoleculesConfig.from_dict(yaml.safe_load(file))

    def _validate_params(self, params: Dict[str, Any]):
        """Validate configuration parameters ('limit' must be a positive integer)."""
        limit = params.get("limit")
        if not isinstance(limit, int) or limit < 1:
            raise ValueError("Invalid 'limit'; expected a positive integer")
+
+
class SmallMoleculesChecker:
    """Checks if there are only small molecules in the reaction or if there is only one small reactant or product."""

    def __init__(self, limit: int = 6):
        self.limit = limit

    @staticmethod
    def from_config(config: SmallMoleculesConfig):
        """Creates an instance of SmallMoleculesChecker from a configuration object."""
        return SmallMoleculesChecker(config.limit)

    def __call__(self, reaction: ReactionContainer) -> bool:
        """Return True when the reaction is built from small molecules only (see class docstring)."""
        if len(reaction.reactants) == 1 and self.are_only_small_molecules(reaction.reactants):
            return True
        if len(reaction.products) == 1 and self.are_only_small_molecules(reaction.products):
            return True
        if self.are_only_small_molecules(reaction.reactants) and self.are_only_small_molecules(reaction.products):
            return True
        return False

    def are_only_small_molecules(self, molecules: Iterable[MoleculeContainer]) -> bool:
        """Checks if all molecules in the given iterable are small molecules (<= limit atoms)."""
        for molecule in molecules:
            if len(molecule) > self.limit:
                return False
        return True
+
+
@dataclass
class CGRConnectedComponentsConfig:
    """Settings for the CGR connected-components check (no parameters yet)."""
    # Currently empty, but can be extended in the future if needed
    pass
+
+
class CGRConnectedComponentsChecker:
    """Allows to check if CGR contains unrelated components (without reagents)."""

    @staticmethod
    def from_config(config: CGRConnectedComponentsConfig):  # TODO config class not used
        """Creates an instance of CGRConnectedComponentsChecker from a configuration object."""
        return CGRConnectedComponentsChecker()

    def __call__(self, reaction: ReactionContainer) -> bool:
        """Return True when the reagent-free CGR splits into more than one connected component."""
        # Rebuild the reaction from reactants/products only so reagents cannot link components.
        reagent_free = ReactionContainer(reaction.reactants, reaction.products)
        return (~reagent_free).connected_components_count > 1
+
+
@dataclass
class RingsChangeConfig:
    """Settings for the ring-count-change reaction check (no parameters yet)."""
    # Currently empty, but can be extended in the future if needed
    pass
+
+
class RingsChangeChecker:
    """Allows to check if there is changing rings number in the reaction."""

    @staticmethod
    def from_config(config: RingsChangeConfig):  # TODO config class not used
        """Creates an instance of RingsChecker from a configuration object."""
        return RingsChangeChecker()

    def __call__(self, reaction: ReactionContainer):
        """
        Returns True if there are valence mistakes in the reaction or there is a reaction with mismatch numbers of all
        rings or aromatic rings in reactants and products (reaction in rings)

        NOTE(review): ``kekule()``/``thiele()`` mutate the input reaction in place.

        :param reaction: input reaction
        :return: True or False
        """

        # Re-perceive aromaticity before counting rings so both sides are comparable.
        reaction.kekule()
        reaction.thiele()
        r_rings, r_arom_rings = self._calc_rings(reaction.reactants)
        p_rings, p_arom_rings = self._calc_rings(reaction.products)
        # Any change in total ring count or in aromatic ring count flags the reaction.
        if (r_arom_rings != p_arom_rings) or (r_rings != p_rings):
            return True
        else:
            return False

    @staticmethod
    def _calc_rings(molecules: Iterable) -> Tuple[int, int]:
        """
        Calculates number of all rings and number of aromatic rings in molecules

        :param molecules: set of molecules
        :return: number of all rings and number of aromatic rings in molecules
        """
        rings, arom_rings = 0, 0
        for mol in molecules:
            rings += mol.rings_count
            arom_rings += len(mol.aromatic_rings)
        return rings, arom_rings
+
+
@dataclass
class StrangeCarbonsConfig:
    """Settings for the strange-carbons reaction check (no parameters yet)."""
    # Currently empty, but can be extended in the future if needed
    pass
+
+
class StrangeCarbonsChecker:
    """Checks if there are 'strange' carbons in the reaction."""

    @staticmethod
    def from_config(config: StrangeCarbonsConfig):  # TODO config class not used
        """Creates an instance of StrangeCarbonsChecker from a configuration object."""
        return StrangeCarbonsChecker()

    def __call__(self, reaction: ReactionContainer) -> bool:
        """Return True when any molecule is pure carbon: a lone carbon atom, or a
        carbon-only molecule whose single bond order is not aromatic."""
        for molecule in reaction.reactants + reaction.products:
            symbols = {atom.atomic_symbol for _, atom in molecule.atoms()}
            if symbols != {"C"}:
                continue
            # A single carbon atom (methane) counts as strange.
            if len(molecule) == 1:
                return True
            orders = {int(bond) for _, _, bond in molecule.bonds()}
            # Carbon-only molecule with exactly one bond order that is not aromatic (4).
            if len(orders) == 1 and orders.pop() != 4:
                return True
        return False
+
+
@dataclass
class NoReactionConfig:
    """Settings for the no-reaction check (no parameters yet)."""
    # Currently empty, but can be extended in the future if needed
    pass
+
+
class NoReactionChecker:
    """Checks if there is no reaction in the provided reaction container."""

    @staticmethod
    def from_config(config: NoReactionConfig):  # TODO config class not used
        """Creates an instance of NoReactionChecker from a configuration object."""
        return NoReactionChecker()

    def __call__(self, reaction: ReactionContainer) -> bool:
        """Return True when the CGR has an empty reaction centre (no changed atoms or bonds)."""
        cgr = ~reaction
        has_center = cgr.center_atoms or cgr.center_bonds
        return not has_center
+
+
@dataclass
class MultiCenterConfig:
    """Settings for the multi-center reaction check (no parameters yet)."""
    # Currently empty, but can be extended in the future if needed
    pass
+
+
class MultiCenterChecker:
    """Checks if there is a multicenter reaction."""

    @staticmethod
    def from_config(config: MultiCenterConfig):  # TODO config class not used
        """Creates an instance of MultiCenterChecker from a configuration object."""
        return MultiCenterChecker()

    def __call__(self, reaction: ReactionContainer) -> bool:
        """Return True when the CGR contains more than one separate reaction centre."""
        return len((~reaction).centers_list) > 1
+
+
@dataclass
class WrongCHBreakingConfig:
    """Settings for the wrong C-H breaking check (no parameters yet)."""
    # Currently empty, but can be extended in the future if needed
    pass
+
+
class WrongCHBreakingChecker:
    """Checks for incorrect C-C bond formation from breaking a C-H bond."""

    @staticmethod
    def from_config(config: WrongCHBreakingConfig):  # TODO config class not used
        """Creates an instance of WrongCHBreakingChecker from a configuration object."""
        return WrongCHBreakingChecker()

    def __call__(self, reaction: ReactionContainer) -> bool:
        """
        Determines if a reaction involves incorrect C-C bond formation from breaking a C-H bond.

        NOTE(review): ``kekule()``/``thiele()`` mutate the input reaction in place.

        :param reaction: The reaction to be checked.
        :return: True if incorrect C-C bond formation is found, False otherwise.
        """

        reaction.kekule()
        # Reactions with valence errors are not analysed; they pass this check.
        if reaction.check_valence():
            return False
        reaction.thiele()

        # Work on a copy with explicit hydrogens so C-H bonds are visible in the CGR.
        copy_reaction = reaction.copy()
        copy_reaction.explicify_hydrogens()
        cgr = ~copy_reaction
        # Reaction centre plus its first environment suffices to see broken/formed bonds.
        reduced_cgr = cgr.augmented_substructure(cgr.center_atoms, deep=1)

        return self.is_wrong_c_h_breaking(reduced_cgr)

    @staticmethod
    def is_wrong_c_h_breaking(cgr: CGRContainer) -> bool:
        """
        Checks for incorrect C-C bond formation from breaking a C-H bond in a CGR.

        :param cgr: The CGR with explicified hydrogens.
        :return: True if incorrect C-C bond formation is found, False otherwise.
        """
        for atom_id in cgr.center_atoms:
            if cgr.atom(atom_id).atomic_symbol == "C":
                is_c_h_breaking, is_c_c_formation = False, False
                c_with_h_id, another_c_id = None, None

                for neighbour_id, bond in cgr._bonds[atom_id].items():
                    neighbour = cgr.atom(neighbour_id)

                    # Bond to H present in reactants (order) but gone in products
                    # (no p_order): a C-H bond is broken at this carbon.
                    if (
                        bond.order
                        and not bond.p_order
                        and neighbour.atomic_symbol == "H"
                    ):
                        is_c_h_breaking = True
                        c_with_h_id = atom_id

                    # Bond to C absent in reactants but present in products:
                    # a new C-C bond is formed at this carbon.
                    elif (
                        not bond.order
                        and bond.p_order
                        and neighbour.atomic_symbol == "C"
                    ):
                        is_c_c_formation = True
                        another_c_id = neighbour_id

                    if is_c_h_breaking and is_c_c_formation:
                        # Check for presence of heteroatoms in the first environment of 2 bonding carbons
                        if any(
                            cgr.atom(neighbour_id).atomic_symbol not in ("C", "H")
                            for neighbour_id in cgr._bonds[c_with_h_id]
                        ) or any(
                            cgr.atom(neighbour_id).atomic_symbol not in ("C", "H")
                            for neighbour_id in cgr._bonds[another_c_id]
                        ):
                            # A neighbouring heteroatom makes the transformation acceptable.
                            return False
                        return True

        return False
+
+
@dataclass
class CCsp3BreakingConfig:
    """Settings for the C(sp3)-C bond breaking check (no parameters yet)."""
    # Currently empty, but can be extended in the future if needed
    pass
+
+
class CCsp3BreakingChecker:
    """Checks if there is C(sp3)-C bond breaking."""

    @staticmethod
    def from_config(config: CCsp3BreakingConfig):  # TODO config class not used
        """Creates an instance of CCsp3BreakingChecker from a configuration object."""
        return CCsp3BreakingChecker()

    def __call__(self, reaction: ReactionContainer) -> bool:
        """
        Returns True if there is C(sp3)-C bonds breaking, else False

        :param reaction: input reaction
        :return: True or False
        """
        cgr = ~reaction
        # Restrict the search to the reaction centre plus its first environment.
        center = cgr.augmented_substructure(cgr.center_atoms, deep=1)
        for begin_id, end_id, bond in center.bonds():
            # A broken bond exists in reactants (order set) but not in products (p_order unset).
            if bond.order is None or bond.p_order is not None:
                continue
            begin_atom = center.atom(begin_id)
            end_atom = center.atom(end_id)
            if begin_atom.atomic_symbol != "C" or end_atom.atomic_symbol != "C":
                continue
            # Hybridization value 1 is treated as sp3 here.
            if begin_atom.hybridization == 1 or end_atom.hybridization == 1:
                return True
        return False
+
+
@dataclass
class CCRingBreakingConfig:
    """Settings for the ring C-C bond breaking check (no parameters yet)."""
    # Currently empty, but can be extended in the future if needed
    pass
+
+
class CCRingBreakingChecker:
    """Checks if a reaction involves ring C-C bond breaking."""

    @staticmethod
    def from_config(config: CCRingBreakingConfig):  # TODO config class not used
        """Creates an instance of CCRingBreakingChecker from a configuration object."""
        return CCRingBreakingChecker()

    def __call__(self, reaction: ReactionContainer) -> bool:
        """
        Returns True if the reaction involves ring C-C bond breaking, else False

        :param reaction: input reaction
        :return: True or False
        """
        cgr = ~reaction

        # Extract reactants' center atoms and their rings
        reactants_center_atoms = {}
        reactants_rings = set()
        for reactant in reaction.reactants:
            reactants_rings.update(reactant.sssr)
            for n, atom in reactant.atoms():
                if n in cgr.center_atoms:
                    reactants_center_atoms[n] = atom

        # Identify reaction center based on center atoms
        # (deep=0: the centre atoms only, without their environment)
        reaction_center = cgr.augmented_substructure(atoms=cgr.center_atoms, deep=0)

        # Iterate over bonds in the reaction center and check for ring C-C bond breaking
        for atom_id, neighbour_id, bond in reaction_center.bonds():
            try:
                # Retrieve corresponding atoms from reactants
                atom = reactants_center_atoms[atom_id]
                neighbour = reactants_center_atoms[neighbour_id]
            except KeyError:
                # One endpoint is not a reactant centre atom; skip this bond.
                continue
            else:
                # Check if the bond is broken and both atoms are carbons in rings of size 5, 6, or 7
                is_bond_broken = (bond.order is not None) and (bond.p_order is None)
                are_atoms_carbons = (
                    atom.atomic_symbol == "C" and neighbour.atomic_symbol == "C"
                )
                are_atoms_in_ring = (
                    set(atom.ring_sizes).intersection({5, 6, 7})
                    and set(neighbour.ring_sizes).intersection({5, 6, 7})
                    and any(
                        atom_id in ring and neighbour_id in ring
                        for ring in reactants_rings
                    )
                )

                # If all conditions are met, indicate ring C-C bond breaking
                if is_bond_broken and are_atoms_carbons and are_atoms_in_ring:
                    return True

        return False
+
+
@dataclass
class ReactionCheckConfig(ConfigABC):
    """
    Configuration class for reaction checks, inheriting from ConfigABC.

    This class manages configuration settings for various reaction checkers, including paths, file formats,
    and checker-specific parameters. A checker config set to None disables that checker
    (see :meth:`create_checkers`).

    Attributes:
        dynamic_bonds_config: Configuration for dynamic bonds checking.
        small_molecules_config: Configuration for small molecules checking.
        strange_carbons_config: Configuration for strange carbons checking.
        compete_products_config: Configuration for competing products checking.
        cgr_connected_components_config: Configuration for CGR connected components checking.
        rings_change_config: Configuration for rings change checking.
        no_reaction_config: Configuration for no reaction checking.
        multi_center_config: Configuration for multi-center checking.
        wrong_ch_breaking_config: Configuration for wrong C-H breaking checking.
        cc_sp3_breaking_config: Configuration for CC sp3 breaking checking.
        cc_ring_breaking_config: Configuration for CC ring breaking checking.
        rebalance_reaction: Whether to rebalance the reaction during filtering.
        remove_reagents: Whether to remove reagents during filtering.
        reagents_max_size: Size limit passed to the reagent-removal step.
        remove_small_molecules: Whether to remove small molecules during filtering.
        small_molecules_max_size: Atom-count limit passed to the small-molecule removal step.
    """

    # Configuration for reaction checkers
    dynamic_bonds_config: Optional[DynamicBondsConfig] = None
    small_molecules_config: Optional[SmallMoleculesConfig] = None
    strange_carbons_config: Optional[StrangeCarbonsConfig] = None
    compete_products_config: Optional[CompeteProductsConfig] = None
    cgr_connected_components_config: Optional[CGRConnectedComponentsConfig] = None
    rings_change_config: Optional[RingsChangeConfig] = None
    no_reaction_config: Optional[NoReactionConfig] = None
    multi_center_config: Optional[MultiCenterConfig] = None
    wrong_ch_breaking_config: Optional[WrongCHBreakingConfig] = None
    cc_sp3_breaking_config: Optional[CCsp3BreakingConfig] = None
    cc_ring_breaking_config: Optional[CCRingBreakingConfig] = None

    # Other configuration parameters
    rebalance_reaction: bool = False
    remove_reagents: bool = True
    reagents_max_size: int = 7
    remove_small_molecules: bool = False
    small_molecules_max_size: int = 6

    def to_dict(self):
        """
        Converts the configuration into a dictionary.

        Parameter-less checker configs serialize to an empty dict when enabled;
        disabled (None) checkers are dropped from the result entirely.
        """
        config_dict = {
            "dynamic_bonds_config": convert_config_to_dict(
                self.dynamic_bonds_config, DynamicBondsConfig
            ),
            "small_molecules_config": convert_config_to_dict(
                self.small_molecules_config, SmallMoleculesConfig
            ),
            "compete_products_config": convert_config_to_dict(
                self.compete_products_config, CompeteProductsConfig
            ),
            "cgr_connected_components_config": {}
            if self.cgr_connected_components_config is not None
            else None,
            "rings_change_config": {} if self.rings_change_config is not None else None,
            "strange_carbons_config": {}
            if self.strange_carbons_config is not None
            else None,
            "no_reaction_config": {} if self.no_reaction_config is not None else None,
            "multi_center_config": {} if self.multi_center_config is not None else None,
            "wrong_ch_breaking_config": {}
            if self.wrong_ch_breaking_config is not None
            else None,
            "cc_sp3_breaking_config": {}
            if self.cc_sp3_breaking_config is not None
            else None,
            "cc_ring_breaking_config": {}
            if self.cc_ring_breaking_config is not None
            else None,
            "rebalance_reaction": self.rebalance_reaction,
            "remove_reagents": self.remove_reagents,
            "reagents_max_size": self.reagents_max_size,
            "remove_small_molecules": self.remove_small_molecules,
            "small_molecules_max_size": self.small_molecules_max_size,
        }

        # Drop entries for disabled checkers (their value is None).
        filtered_config_dict = {k: v for k, v in config_dict.items() if v is not None}

        return filtered_config_dict

    @staticmethod
    def from_dict(config_dict: Dict[str, Any]):
        """
        Create an instance of ReactionCheckConfig from a dictionary.

        A checker is enabled by the mere presence of its key in the dictionary.
        """
        # Instantiate configuration objects if their corresponding dictionary is present
        dynamic_bonds_config = (
            DynamicBondsConfig(**config_dict["dynamic_bonds_config"])
            if "dynamic_bonds_config" in config_dict
            else None
        )
        small_molecules_config = (
            SmallMoleculesConfig(**config_dict["small_molecules_config"])
            if "small_molecules_config" in config_dict
            else None
        )
        compete_products_config = (
            CompeteProductsConfig(**config_dict["compete_products_config"])
            if "compete_products_config" in config_dict
            else None
        )
        cgr_connected_components_config = (
            CGRConnectedComponentsConfig()
            if "cgr_connected_components_config" in config_dict
            else None
        )
        rings_change_config = (
            RingsChangeConfig()
            if "rings_change_config" in config_dict
            else None
        )
        strange_carbons_config = (
            StrangeCarbonsConfig()
            if "strange_carbons_config" in config_dict
            else None
        )
        no_reaction_config = (
            NoReactionConfig()
            if "no_reaction_config" in config_dict
            else None
        )
        multi_center_config = (
            MultiCenterConfig()
            if "multi_center_config" in config_dict
            else None
        )
        wrong_ch_breaking_config = (
            WrongCHBreakingConfig()
            if "wrong_ch_breaking_config" in config_dict
            else None
        )
        cc_sp3_breaking_config = (
            CCsp3BreakingConfig()
            if "cc_sp3_breaking_config" in config_dict
            else None
        )
        cc_ring_breaking_config = (
            CCRingBreakingConfig()
            if "cc_ring_breaking_config" in config_dict
            else None
        )

        # Extract other simple configuration parameters
        rebalance_reaction = config_dict.get("rebalance_reaction", False)
        remove_reagents = config_dict.get("remove_reagents", True)
        reagents_max_size = config_dict.get("reagents_max_size", 7)
        remove_small_molecules = config_dict.get("remove_small_molecules", False)
        small_molecules_max_size = config_dict.get("small_molecules_max_size", 6)

        return ReactionCheckConfig(
            dynamic_bonds_config=dynamic_bonds_config,
            small_molecules_config=small_molecules_config,
            compete_products_config=compete_products_config,
            cgr_connected_components_config=cgr_connected_components_config,
            rings_change_config=rings_change_config,
            strange_carbons_config=strange_carbons_config,
            no_reaction_config=no_reaction_config,
            multi_center_config=multi_center_config,
            wrong_ch_breaking_config=wrong_ch_breaking_config,
            cc_sp3_breaking_config=cc_sp3_breaking_config,
            cc_ring_breaking_config=cc_ring_breaking_config,
            rebalance_reaction=rebalance_reaction,
            remove_reagents=remove_reagents,
            reagents_max_size=reagents_max_size,
            remove_small_molecules=remove_small_molecules,
            small_molecules_max_size=small_molecules_max_size,
        )

    @staticmethod
    def from_yaml(file_path):
        """
        Deserializes a YAML file into a ReactionCheckConfig object.
        """
        with open(file_path, "r") as file:
            config_dict = yaml.safe_load(file)
        return ReactionCheckConfig.from_dict(config_dict)

    def _validate_params(self, params: Dict[str, Any]):
        """Validate the simple (non-checker) configuration parameters."""
        if not isinstance(params["rebalance_reaction"], bool):
            raise ValueError("rebalance_reaction must be a boolean.")

        if not isinstance(params["remove_reagents"], bool):
            raise ValueError("remove_reagents must be a boolean.")

        if not isinstance(params["reagents_max_size"], int):
            raise ValueError("reagents_max_size must be an int.")

        if not isinstance(params["remove_small_molecules"], bool):
            raise ValueError("remove_small_molecules must be a boolean.")

        if not isinstance(params["small_molecules_max_size"], int):
            raise ValueError("small_molecules_max_size must be an int.")

    def create_checkers(self):
        """Instantiate one checker per non-None checker config, in a fixed order."""
        checker_instances = []

        if self.dynamic_bonds_config is not None:
            checker_instances.append(
                DynamicBondsChecker.from_config(self.dynamic_bonds_config)
            )

        if self.small_molecules_config is not None:
            checker_instances.append(
                SmallMoleculesChecker.from_config(self.small_molecules_config)
            )

        if self.strange_carbons_config is not None:
            checker_instances.append(
                StrangeCarbonsChecker.from_config(self.strange_carbons_config)
            )

        if self.compete_products_config is not None:
            checker_instances.append(
                CompeteProductsChecker.from_config(self.compete_products_config)
            )

        if self.cgr_connected_components_config is not None:
            checker_instances.append(
                CGRConnectedComponentsChecker.from_config(
                    self.cgr_connected_components_config
                )
            )

        if self.rings_change_config is not None:
            checker_instances.append(
                RingsChangeChecker.from_config(self.rings_change_config)
            )

        if self.no_reaction_config is not None:
            checker_instances.append(
                NoReactionChecker.from_config(self.no_reaction_config)
            )

        if self.multi_center_config is not None:
            checker_instances.append(
                MultiCenterChecker.from_config(self.multi_center_config)
            )

        if self.wrong_ch_breaking_config is not None:
            checker_instances.append(
                WrongCHBreakingChecker.from_config(self.wrong_ch_breaking_config)
            )

        if self.cc_sp3_breaking_config is not None:
            checker_instances.append(
                CCsp3BreakingChecker.from_config(self.cc_sp3_breaking_config)
            )

        if self.cc_ring_breaking_config is not None:
            checker_instances.append(
                CCRingBreakingChecker.from_config(self.cc_ring_breaking_config)
            )

        return checker_instances
+
+
def tanimoto_kernel(x, y):
    """
    Compute the pairwise Tanimoto (Jaccard) similarity between the rows of x and y.

    :param x: 2D array of shape (n, d).
    :param y: 2D array of shape (m, d).
    :return: ndarray of shape (n, m); pairs with a zero denominator yield 0.
    """
    x = x.astype(np.float64)
    y = y.astype(np.float64)
    x_dot = np.dot(x, y.T)
    x2 = np.sum(x**2, axis=1)
    y2 = np.sum(y**2, axis=1)

    # |x|^2 + |y|^2 - x.y via broadcasting; the original tiled Python lists
    # (np.array([x2] * len(y2)).T), which builds O(n*m) temporaries.
    denominator = x2[:, None] + y2[None, :] - x_dot
    return np.divide(x_dot, denominator, out=np.zeros_like(x_dot), where=denominator != 0)
+
+
def remove_file_if_exists(directory: Path, file_names):  # TODO not used
    """Delete each named file under *directory* if it exists, logging removals."""
    for file_name in file_names:
        file_path = directory / file_name
        if not file_path.is_file():
            continue
        file_path.unlink()
        logging.warning(f"Removed {file_path}")
+
+
def filter_reaction(reaction: ReactionContainer, config: ReactionCheckConfig, checkers: list):
    """
    Pre-process one reaction and run it through the configured checkers.

    :param reaction: the reaction to filter.
    :param config: configuration driving the pre-processing steps
        (small-molecule removal, reagent removal, rebalancing).
    :param checkers: checker callables; a checker returning True means the
        reaction FAILS that check.
    :return: tuple (is_filtered, processed_reaction); is_filtered is True when
        the reaction failed pre-processing or any checker.
    """
    is_filtered = False
    if config.remove_small_molecules:
        new_reaction = remove_small_molecules(reaction, number_of_atoms=config.small_molecules_max_size)
    else:
        new_reaction = reaction.copy()

    if new_reaction is None:
        is_filtered = True

    if config.remove_reagents and not is_filtered:
        new_reaction = remove_reagents(
            new_reaction,
            keep_reagents=True,
            reagents_max_size=config.reagents_max_size,
        )

    if new_reaction is None:
        is_filtered = True
        new_reaction = reaction.copy()
        # TODO you are specifying that if the reaction has only reagents, it is kept as it ?

    if not is_filtered:
        if config.rebalance_reaction:
            new_reaction = rebalance_reaction(new_reaction)
        for checker in checkers:
            try:  # TODO CGRTools: ValueError: mapping of graphs is not disjoint
                if checker(new_reaction):
                    # If checker returns True it means the reaction doesn't pass the check
                    new_reaction.meta["filtration_log"] = checker.__class__.__name__
                    is_filtered = True
            except Exception:
                # Narrowed from a bare except (which also swallowed
                # KeyboardInterrupt/SystemExit); a crashing checker still
                # marks the reaction as filtered.
                is_filtered = True

    return is_filtered, new_reaction
+
+
@ray.remote
def process_batch(batch, config: ReactionCheckConfig, checkers):
    """
    Filter a batch of (index, reaction) pairs inside a Ray worker.

    :param batch: iterable of (index, reaction) pairs.
    :param config: filtering configuration.
    :param checkers: instantiated reaction checkers.
    :return: list of (index, is_filtered, reaction) triples; a reaction that
        raises during filtering is reported as filtered and returned unchanged.
    """
    results = []
    for index, reaction in batch:
        try:  # TODO CGRtools.exceptions.MappingError: atoms with number {52} not equal
            is_filtered, processed_reaction = filter_reaction(reaction, config, checkers)
            results.append((index, is_filtered, processed_reaction))
        except Exception:
            # Narrowed from a bare except: any processing failure counts as filtered.
            results.append((index, True, reaction))
    return results
+
+
def process_completed_batches(futures, result_file, pbar, treated: int = 0, passed_filters: int = 0):
    """
    Wait for one Ray batch to finish, write its surviving reactions to the
    result file, and update the counters and progress bar.

    :param futures: dict whose keys are the in-flight Ray object refs.
    :param result_file: open writer for reactions that passed all filters.
    :param pbar: tqdm progress bar.
    :param treated: running count of processed reactions.
    :param passed_filters: running count of reactions that passed the filters.
    :return: updated (treated, passed_filters).
    """
    done, _ = ray.wait(list(futures.keys()), num_returns=1)
    finished_ref = done[0]
    batch_results = ray.get(finished_ref)

    # Persist the survivors of this batch and count everything processed.
    processed_in_batch = 0
    for _index, is_filtered, reaction in batch_results:
        processed_in_batch += 1
        if not is_filtered:
            result_file.write(reaction.meta['init_smiles'])
            passed_filters += 1

    # Drop the finished future and reflect progress.
    del futures[finished_ref]
    pbar.update(processed_in_batch)

    return treated + processed_in_batch, passed_filters
+
+
def filter_reactions(
    config: ReactionCheckConfig,
    reaction_database_path: str,
    result_reactions_file_name: str = "reaction_data_filtered.smi",
    append_results: bool = False,
    num_cpus: int = 1,
    batch_size: int = 100,
) -> None:
    """
    Processes a database of chemical reactions, applying checks based on the provided configuration,
    and writes the results to specified files. All configurations are provided by the ReactionCheckConfig object.

    :param config: ReactionCheckConfig object containing all configuration settings.
    :param reaction_database_path: Path to the reaction database file.
    :param result_reactions_file_name: Name for the file containing cleaned reactions.
    :param append_results: Flag indicating whether to append results to existing files.
    :param num_cpus: Number of CPUs to use for processing.
    :param batch_size: Size of the batch for processing reactions.
    :return: None. The function writes the reactions passing the filters to the result file.
    """

    checkers = config.create_checkers()

    ray.init(num_cpus=num_cpus, ignore_reinit_error=True, logging_level=logging.ERROR)
    max_concurrent_batches = num_cpus  # Limit the number of concurrent batches

    with ReactionReader(reaction_database_path) as reactions, \
            ReactionWriter(result_reactions_file_name, append_results) as result_file:

        pbar = tqdm(reactions, leave=True)  # TODO fix progress bars

        futures = {}
        batch = []
        treated = filtered = 0
        for index, reaction in enumerate(reactions):
            reaction.meta["reaction_index"] = index
            batch.append((index, reaction))
            if len(batch) == batch_size:
                futures[process_batch.remote(batch, config, checkers)] = None
                batch = []

            # Check and process completed tasks if we've reached the concurrency limit
            while len(futures) >= max_concurrent_batches:
                treated, filtered = process_completed_batches(futures, result_file, pbar, treated, filtered)

        # Process the last, possibly partial, batch
        if batch:
            futures[process_batch.remote(batch, config, checkers)] = None

        # Drain the remaining in-flight batches
        while futures:
            treated, filtered = process_completed_batches(futures, result_file, pbar, treated, filtered)

        pbar.close()

    ray.shutdown()
    # Stray trailing comma removed from the first print (it created a useless tuple).
    print(f'Initial number of reactions: {treated}')
    print(f'Removed number of reactions: {treated - filtered}')
diff --git a/SynTool/chem/data/mapping.py b/SynTool/chem/data/mapping.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8a4636c464b443bb26f4310778bd6001f01e793
--- /dev/null
+++ b/SynTool/chem/data/mapping.py
@@ -0,0 +1,96 @@
+from pathlib import Path
+from os.path import splitext
+from typing import Union
+from tqdm import tqdm
+
+from chython import smiles, RDFRead, RDFWrite, ReactionContainer
+from chython.exceptions import MappingError, IncorrectSmiles
+
+from SynTool.utils import path_type
+
+
def remove_reagents_and_map(rea: ReactionContainer, keep_reagent: bool = False) -> Union[ReactionContainer, None]:
    """
    Maps atoms of the reaction using chytorch.

    :param rea: reaction to map
    :type rea: ReactionContainer
    :param keep_reagent: whenever to remove reagent or not
    :type keep_reagent: bool

    :return: the mapped ReactionContainer, or None if reagent removal failed
    """
    try:
        rea.reset_mapping()
    except MappingError:
        rea.reset_mapping()  # Successive reset_mapping works
    if not keep_reagent:
        try:
            rea.remove_reagents()
        except Exception:
            # Narrowed from a bare except (which also caught KeyboardInterrupt);
            # a failed reagent removal drops the reaction.
            return None
    return rea
+
+
def remove_reagents_and_map_from_file(input_file: path_type, output_file: path_type, keep_reagent: bool = False) -> None:
    """
    Reads a file of reactions and maps atoms of the reactions using chytorch.

    :param input_file: the path and name of the input file (.smi or .rdf)
    :type input_file: path_type
    :param output_file: the path and name of the output file (.smi or .rdf)
    :type output_file: path_type
    :param keep_reagent: whenever to remove reagent or not
    :type keep_reagent: bool

    :return: None
    """
    input_file = str(Path(input_file).resolve(strict=True))
    _, input_ext = splitext(input_file)
    if input_ext == ".smi":
        input_file = open(input_file, "r")
    elif input_ext == ".rdf":
        input_file = RDFRead(input_file, indexable=True)
    else:
        raise ValueError("File extension not recognized. File:", input_file,
                         "- Please use smi or rdf file")
    # RDF readers are directly iterable; SMILES files are read line by line.
    enumerator = input_file if input_ext == ".rdf" else input_file.readlines()

    _, out_ext = splitext(output_file)
    if out_ext == ".smi":
        output_file = open(output_file, "w")
    elif out_ext == ".rdf":
        output_file = RDFWrite(output_file)
    else:
        raise ValueError("File extension not recognized. File:", output_file,
                         "- Please use smi or rdf file")

    mapping_errors = 0
    parsing_errors = 0
    try:
        for rea_raw in tqdm(enumerator):
            try:
                rea = smiles(rea_raw.strip('\n')) if input_ext == ".smi" else rea_raw
            except IncorrectSmiles:
                parsing_errors += 1
                continue
            try:
                rea_mapped = remove_reagents_and_map(rea, keep_reagent)
            except MappingError:
                try:
                    # Round-tripping through SMILES sometimes fixes the mapping failure.
                    rea_mapped = remove_reagents_and_map(smiles(str(rea)), keep_reagent)
                except MappingError:
                    mapping_errors += 1
                    continue
            if rea_mapped:
                # BUG FIX: write the mapped reaction (`rea_mapped`), not `rea`.
                # They differ when the SMILES round-trip fallback above was used;
                # writing `rea` discarded the remapped result.
                rea_output = format(rea_mapped, "m") + "\n" if out_ext == ".smi" else rea_mapped
                output_file.write(rea_output)
            else:
                mapping_errors += 1
    finally:
        # Close both handles even if an unexpected error occurs mid-file.
        input_file.close()
        output_file.close()

    if parsing_errors:
        print(parsing_errors, "reactions couldn't be parsed")
    if mapping_errors:
        print(mapping_errors, "reactions couldn't be mapped")
diff --git a/SynTool/chem/data/mapping.py.bk b/SynTool/chem/data/mapping.py.bk
new file mode 100644
index 0000000000000000000000000000000000000000..0796c137004f49a60ab6fe4569691fd51645490e
--- /dev/null
+++ b/SynTool/chem/data/mapping.py.bk
@@ -0,0 +1,90 @@
+from pathlib import Path
+from os.path import splitext
+from typing import Union
+from tqdm import tqdm
+
+from chython import smiles, RDFRead, RDFWrite, ReactionContainer
+from chython.exceptions import MappingError
+
+from Syntool.utils import path_type
+
+
def remove_reagents_and_map(rea: ReactionContainer) -> Union[ReactionContainer, None]:
    """
    Maps atoms of the reaction using chytorch.

    :param rea: reaction to map
    :type rea: ReactionContainer

    :return: ReactionContainer or None
    """
    try:
        rea.reset_mapping()
    except MappingError:
        # A second reset_mapping after a MappingError is the known workaround.
        rea.reset_mapping()
    try:
        rea.remove_reagents()
        return rea
    except:
        # NOTE(review): bare except silently drops the reaction on ANY failure
        # (including KeyboardInterrupt); kept as-is in this backup file.
        # print("Error", str(rea))
        return None
+
+
def remove_reagents_and_map_from_file(input_file: path_type, output_file: path_type) -> None:
    """
    Reads a file of reactions and maps atoms of the reactions using chytorch.

    :param input_file: the path and name of the input file (.smi or .rdf)
    :type input_file: path_type

    :param output_file: the path and name of the output file (.smi or .rdf)
    :type output_file: path_type

    :return: None
    """
    input_file = str(Path(input_file).resolve(strict=True))
    _, input_ext = splitext(input_file)
    if input_ext == ".smi":
        input_file = open(input_file, "r")
    elif input_ext == ".rdf":
        input_file = RDFRead(input_file, indexable=True)
    else:
        raise ValueError("File extension not recognized. File:", input_file,
                         "- Please use smi or rdf file")
    # RDF readers are directly iterable; SMILES files are read line by line.
    enumerator = input_file if input_ext == ".rdf" else input_file.readlines()

    _, out_ext = splitext(output_file)
    if out_ext == ".smi":
        output_file = open(output_file, "w")
    elif out_ext == ".rdf":
        output_file = RDFWrite(output_file)
    else:
        raise ValueError("File extension not recognized. File:", output_file,
                         "- Please use smi or rdf file")

    mapping_errors = 0
    parsing_errors = 0
    for rea_raw in tqdm(enumerator):
        try:
            rea = smiles(rea_raw.strip('\n')) if input_ext == ".smi" else rea_raw
        except:
            parsing_errors += 1
            print("Error", parsing_errors, rea_raw)
            continue
        try:
            rea_mapped = remove_reagents_and_map(rea)
        except:
            # NOTE(review): counted as a parsing error even though this is a
            # mapping/removal failure; kept as-is in this backup file.
            parsing_errors += 1
            print("Error for,", rea)
            continue
        if rea_mapped:
            # `rea` and `rea_mapped` are the same object here, since
            # remove_reagents_and_map mutates its argument in place.
            rea_output = format(rea, "m") + "\n" if out_ext == ".smi" else rea
            output_file.write(rea_output)
        else:
            mapping_errors += 1

    input_file.close()
    output_file.close()

    if mapping_errors:
        print(mapping_errors, "reactions couldn't be mapped")
diff --git a/SynTool/chem/data/standardizer.py b/SynTool/chem/data/standardizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ac41e1914570449db0b3769e9c7de9c4e6e3b96
--- /dev/null
+++ b/SynTool/chem/data/standardizer.py
@@ -0,0 +1,604 @@
+#############################################################################
+# Code issued from https://github.com/Laboratoire-de-Chemoinformatique/Reaction_Data_Cleaning
+# Reaction_Data_Cleaning/scripts/standardizer.py
+# version as it from commit 793475e54d8b2c7f714165a61e4eb439435d7d92
+# DOI 10.1002/minf.202100119
+#############################################################################
+# Chemical reactions data curation best practices
+# including optimized RDTool
+#############################################################################
+# GNU LGPL https://www.gnu.org/licenses/lgpl-3.0.en.html
+#############################################################################
+# Corresponding Authors: Timur Madzhidov and Alexandre Varnek
+# Corresponding Authors' emails: tmadzhidov@gmail.com and varnek@unistra.fr
+# Main contributors: Arkadii Lin, Natalia Duybankova, Ramil Nugmanov, Rail Suleymanov and Timur Madzhidov
+# Copyright: Copyright 2020,
+# MaDeSmart, Machine Design of Small Molecules by AI
+# VLAIO project HBC.2018.2287
+# Credits: Kazan Federal University, Russia
+# University of Strasbourg, France
+# University of Linz, Austria
+# University of Leuven, Belgium
+# Janssen Pharmaceutica N.V., Beerse, Belgium
+# Rail Suleymanov, Arcadia, St. Petersburg, Russia
+# License: GNU LGPL https://www.gnu.org/licenses/lgpl-3.0.en.html
+# Version: 00.02
+#############################################################################
+
+from CGRtools.files import RDFRead, RDFWrite, SDFWrite, SDFRead, SMILESRead
+from CGRtools.containers import MoleculeContainer, ReactionContainer
+import logging
+from ordered_set import OrderedSet
+import os
+import io
+import pathlib
+from pathlib import PurePosixPath
+
+
class Standardizer:
    """
    Reaction standardization pipeline, vendored from Reaction_Data_Cleaning
    (DOI 10.1002/minf.202100119). Reads reactions from RDF/SMILES files,
    applies a fixed sequence of CGRtools-based standardization steps, and
    de-duplicates the results.
    """

    def __init__(self, skip_errors=False, log_file=None, keep_unbalanced_ions=False, id_tag='Reaction_ID',
                 action_on_isotopes=0, keep_reagents=False, logger=None, ignore_mapping=False, jvm_path=None,
                 rdkit_dearomatization=False, remove_unchanged_parts=True, skip_tautomerize=True,
                 jchem_path=None, add_reagents_to_reactants=False) -> None:
        # skip_errors: when True, reactions that fail a step are dropped
        # instead of raising. action_on_isotopes: 0 keep, 1 drop, 2 clean.
        if logger is None:
            self.logger = self._config_log(log_file, logger_name='logger')
        else:
            self.logger = logger
        self._skip_errors = skip_errors
        self._keep_unbalanced_ions = keep_unbalanced_ions
        self._id_tag = id_tag
        self._action_on_isotopes = action_on_isotopes
        self._keep_reagents = keep_reagents
        self._ignore_mapping = ignore_mapping
        self._remove_unchanged_parts_flag = remove_unchanged_parts
        self._skip_tautomerize = skip_tautomerize
        self._dearomatize_by_rdkit = rdkit_dearomatization
        self._reagents_to_reactants = add_reagents_to_reactants
        # ChemAxon tautomerization needs a JVM + JChem on the classpath;
        # only wired up when tautomerization is actually requested.
        if not skip_tautomerize:
            if jvm_path:
                os.environ['JDK_HOME'] = jvm_path
                os.environ['JAVA_HOME'] = jvm_path
                os.environ['PATH'] += f';{PurePosixPath(jvm_path).joinpath("bin").joinpath("server")};' \
                                      f'{PurePosixPath(jvm_path).joinpath("bin").joinpath("server")};'
            if jchem_path:
                import jnius_config
                jnius_config.add_classpath(jchem_path)
            from jnius import autoclass
            # Local name deliberately shadows this class: it is the
            # ChemAxon Java Standardizer, used only here.
            Standardizer = autoclass('chemaxon.standardizer.Standardizer')
            self._Molecule = autoclass('chemaxon.struc.Molecule')
            self._MolHandler = autoclass('chemaxon.util.MolHandler')
            self._standardizer = Standardizer('tautomerize')

    def standardize_file(self, input_file=None) -> OrderedSet:
        """
        Standardize a set of reactions in a file. Returns an ordered set of ReactionContainer objects passed the
        standardization protocol.
        :param input_file: str
        :return: OrderedSet
        """
        if pathlib.Path(input_file).suffix == '.rdf':
            data = self._read_RDF(input_file)
        elif pathlib.Path(input_file).suffix == '.smi' or pathlib.Path(input_file).suffix == '.smiles':
            data = self._read_SMILES(input_file)
        else:
            raise ValueError('Data format is not recognized!')

        print("{0} reactions passed..".format(len(data)))
        return data

    def _read_RDF(self, input_file) -> OrderedSet:
        """
        Reads an RDF file. Returns an ordered set of ReactionContainer objects passed the standardization protocol.
        :param input_file: str
        :return: OrderedSet
        """
        data = OrderedSet()
        self.logger.info('Start..')
        # meta_searcher re-reads the raw file to recover reaction ids for
        # entries the parser failed on (returned as tuples, not containers).
        with RDFRead(input_file, ignore=self._ignore_mapping, store_log=True, remap=self._ignore_mapping) as ifile, \
                open(input_file) as meta_searcher:
            for reaction in ifile._data:
                # Parser failures come back as tuples carrying position/number/log.
                if isinstance(reaction, tuple):
                    meta_searcher.seek(reaction.position)
                    flag = False
                    for line in meta_searcher:
                        if flag and '$RFMT' in line:
                            self.logger.critical(f'Reaction id extraction problem rised for the reaction '
                                                 f'#{reaction.number + 1}: a reaction id was expected but $RFMT line '
                                                 f'was found!')
                        if flag:
                            self.logger.critical(f'Reaction {line.strip().split()[1]}: Parser has returned an error '
                                                 f'message\n{reaction.log}')
                            break
                        elif '$RFMT' in line:
                            self.logger.critical(f'Reaction #{reaction.number + 1} has no reaction id!')
                        elif f'$DTYPE {self._id_tag}' in line:
                            # The id value is on the following line.
                            flag = True
                    continue
                standardized_reaction = self.standardize(reaction)
                if standardized_reaction:
                    if standardized_reaction not in data:
                        data.add(standardized_reaction)
                    else:
                        # Duplicate: record its id on the kept reaction instead.
                        i = data.index(standardized_reaction)
                        if 'Extraction_IDs' not in data[i].meta:
                            data[i].meta['Extraction_IDs'] = ''
                        data[i].meta['Extraction_IDs'] = ','.join(data[i].meta['Extraction_IDs'].split(',') +
                                                                  [reaction.meta[self._id_tag]])
                        self.logger.info('Reaction {0} is a duplicate of the reaction {1}..'
                                         .format(reaction.meta[self._id_tag], data[i].meta[self._id_tag]))
        return data

    def _read_SMILES(self, input_file) -> OrderedSet:
        """
        Reads a SMILES file. Returns an ordered set of ReactionContainer objects passed the standardization protocol.
        :param input_file: str
        :return: OrderedSet
        """
        data = OrderedSet()
        self.logger.info('Start..')
        with SMILESRead(input_file, ignore=True, store_log=True, remap=self._ignore_mapping, header=True) as ifile, \
                open(input_file) as meta_searcher:
            # First line is the header; locate the reaction-id column.
            id_tag_position = meta_searcher.readline().strip().split().index(self._id_tag)
            if id_tag_position is None or id_tag_position == 0:
                self.logger.critical(f'No reaction ID tag was found in the header!')
                raise ValueError(f'No reaction ID tag was found in the header!')
            for reaction in ifile._data:
                # Parser failures come back as tuples carrying position/number/log.
                if isinstance(reaction, tuple):
                    meta_searcher.seek(reaction.position)
                    line = meta_searcher.readline().strip().split()
                    if len(line) <= id_tag_position:
                        self.logger.critical(f'No reaction ID tag was found in line {reaction.number}!')
                        raise ValueError(f'No reaction ID tag was found in line {reaction.number}!')
                    r_id = line[id_tag_position]
                    self.logger.critical(f'Reaction {r_id}: Parser has returned an error message\n{reaction.log}')
                    continue

                standardized_reaction = self.standardize(reaction)
                if standardized_reaction:
                    if standardized_reaction not in data:
                        data.add(standardized_reaction)
                    else:
                        # Duplicate: record its id on the kept reaction instead.
                        i = data.index(standardized_reaction)
                        if 'Extraction_IDs' not in data[i].meta:
                            data[i].meta['Extraction_IDs'] = ''
                        data[i].meta['Extraction_IDs'] = ','.join(data[i].meta['Extraction_IDs'].split(',') +
                                                                  [reaction.meta[self._id_tag]])
                        self.logger.info('Reaction {0} is a duplicate of the reaction {1}..'
                                         .format(reaction.meta[self._id_tag], data[i].meta[self._id_tag]))
        return data

    def standardize(self, reaction: ReactionContainer) -> ReactionContainer:
        """
        Standardization protocol: transform functional groups, kekulize, remove explicit hydrogens,
        check for radicals (remove if something was found), check for isotopes, regroup ions (if the total charge
        of reactants and/or products is not zero, and the 'keep_unbalanced_ions' option is False which is by default,
        such reactions are removed; if the 'keep_unbalanced_ions' option is set True, they are kept), check valences
        (remove if something is wrong), aromatize (thiele method), fix mapping (for symmetric functional groups) if
        such is in, remove unchanged parts.
        :param reaction: ReactionContainer
        :return: ReactionContainer (or None when the reaction is rejected/failed)
        """
        # Every step follows the same pattern: on failure, log the exception;
        # with skip_errors the reaction is silently dropped (returns None),
        # otherwise the error is re-raised for the caller.
        self.logger.info('Reaction {0}..'.format(reaction.meta[self._id_tag]))
        try:
            reaction.standardize()
        except:
            self.logger.exception(
                'Reaction {0}: Cannot standardize functional groups..'.format(reaction.meta[self._id_tag]))
            if not self._skip_errors:
                raise Exception(
                    'Reaction {0}: Cannot standardize functional groups..'.format(reaction.meta[self._id_tag]))
            else:
                return
        try:
            reaction.kekule()
        except:
            self.logger.exception('Reaction {0}: Cannot kekulize..'.format(reaction.meta[self._id_tag]))
            if not self._skip_errors:
                raise Exception('Reaction {0}: Cannot kekulize..'.format(reaction.meta[self._id_tag]))
            else:
                return
        try:
            if self._check_valence(reaction):
                self.logger.info(
                    'Reaction {0}: Bad valence: {1}'.format(reaction.meta[self._id_tag], reaction.meta['mistake']))
                return
        except:
            self.logger.exception('Reaction {0}: Cannot check valence..'.format(reaction.meta[self._id_tag]))
            if not self._skip_errors:
                self.logger.critical('Stop the algorithm!')
                raise Exception('Reaction {0}: Cannot check valence..'.format(reaction.meta[self._id_tag]))
            else:
                return
        try:
            if not self._skip_tautomerize:
                reaction = self._tautomerize(reaction)
        except:
            self.logger.exception('Reaction {0}: Cannot tautomerize..'.format(reaction.meta[self._id_tag]))
            if not self._skip_errors:
                raise Exception('Reaction {0}: Cannot tautomerize..'.format(reaction.meta[self._id_tag]))
            else:
                return
        try:
            reaction.implicify_hydrogens()
        except:
            self.logger.exception(
                'Reaction {0}: Cannot remove explicit hydrogens..'.format(reaction.meta[self._id_tag]))
            if not self._skip_errors:
                raise Exception('Reaction {0}: Cannot remove explicit hydrogens..'.format(reaction.meta[self._id_tag]))
            else:
                return
        try:
            if self._check_radicals(reaction):
                self.logger.info('Reaction {0}: Radicals were found..'.format(reaction.meta[self._id_tag]))
                return
        except:
            self.logger.exception('Reaction {0}: Cannot check radicals..'.format(reaction.meta[self._id_tag]))
            if not self._skip_errors:
                raise Exception('Reaction {0}: Cannot check radicals..'.format(reaction.meta[self._id_tag]))
            else:
                return
        try:
            # action_on_isotopes: 1 -> drop reactions with isotopes,
            # 2 -> strip isotopes but keep the reaction.
            if self._action_on_isotopes == 1 and self._check_isotopes(reaction):
                self.logger.info('Reaction {0}: Isotopes were found..'.format(reaction.meta[self._id_tag]))
                return
            elif self._action_on_isotopes == 2 and self._check_isotopes(reaction):
                reaction.clean_isotopes()
                self.logger.info('Reaction {0}: Isotopes were removed but the reaction was kept..'.format(
                    reaction.meta[self._id_tag]))
        except:
            self.logger.exception('Reaction {0}: Cannot check for isotopes..'.format(reaction.meta[self._id_tag]))
            if not self._skip_errors:
                raise Exception('Reaction {0}: Cannot check for isotopes..'.format(reaction.meta[self._id_tag]))
            else:
                return
        try:
            reaction, return_code = self._split_ions(reaction)
            if return_code == 1:
                self.logger.info('Reaction {0}: Ions were split..'.format(reaction.meta[self._id_tag]))
            elif return_code == 2:
                self.logger.info('Reaction {0}: Ions were split but the reaction is imbalanced..'.format(
                    reaction.meta[self._id_tag]))
                if not self._keep_unbalanced_ions:
                    return
        except:
            self.logger.exception('Reaction {0}: Cannot group ions..'.format(reaction.meta[self._id_tag]))
            if not self._skip_errors:
                raise Exception('Reaction {0}: Cannot group ions..'.format(reaction.meta[self._id_tag]))
            else:
                return
        try:
            reaction.thiele()
        except:
            self.logger.exception('Reaction {0}: Cannot aromatize..'.format(reaction.meta[self._id_tag]))
            if not self._skip_errors:
                raise Exception('Reaction {0}: Cannot aromatize..'.format(reaction.meta[self._id_tag]))
            else:
                return
        try:
            reaction.fix_mapping()
        except:
            self.logger.exception('Reaction {0}: Cannot fix mapping..'.format(reaction.meta[self._id_tag]))
            if not self._skip_errors:
                raise Exception('Reaction {0}: Cannot fix mapping..'.format(reaction.meta[self._id_tag]))
            else:
                return
        try:
            if self._remove_unchanged_parts_flag:
                reaction = self._remove_unchanged_parts(reaction)
                if not reaction.reactants and reaction.products:
                    self.logger.info('Reaction {0}: Reactants are empty..'.format(reaction.meta[self._id_tag]))
                    return
                if not reaction.products and reaction.reactants:
                    self.logger.info('Reaction {0}: Products are empty..'.format(reaction.meta[self._id_tag]))
                    return
                if not reaction.reactants and not reaction.products:
                    self.logger.exception(
                        'Reaction {0}: Cannot remove unchanged parts or the reaction is empty..'.format(
                            reaction.meta[self._id_tag]))
                    return
        except:
            self.logger.exception('Reaction {0}: Cannot remove unchanged parts or the reaction is empty..'.format(
                reaction.meta[self._id_tag]))
            if not self._skip_errors:
                raise Exception('Reaction {0}: Cannot remove unchanged parts or the reaction is empty..'.format(
                    reaction.meta[self._id_tag]))
            else:
                return
        self.logger.debug('Reaction {0} is done..'.format(reaction.meta[self._id_tag]))
        return reaction

    def write(self, output_file: str, data: OrderedSet) -> None:
        """
        Dump a set of reactions.
        :param data: OrderedSet
        :param output_file: str
        :return: None
        """
        with RDFWrite(output_file) as out:
            for r in data:
                out.write(r)

    def _check_valence(self, reaction: ReactionContainer) -> bool:
        """
        Checks valences. Returns True (and records details in
        reaction.meta['mistake']) when any atom has a bad valence.
        :param reaction: ReactionContainer
        :return: bool
        """
        mistakes = []
        for molecule in (reaction.reactants + reaction.products + reaction.reagents):
            valence_mistakes = molecule.check_valence()
            if valence_mistakes:
                mistakes.append(("|".join([str(num) for num in valence_mistakes]),
                                 "|".join([str(molecule.atom(n)) for n in valence_mistakes]), str(molecule)))
        if mistakes:
            message = ",".join([f'{atom_nums} at {atoms} in {smiles}' for atom_nums, atoms, smiles in mistakes])
            reaction.meta['mistake'] = f'Valence mistake: {message}'
            return True
        return False

    def _config_log(self, log_file: str, logger_name: str):
        # Builds a DEBUG-level file logger; clears pre-existing handlers so
        # repeated construction does not duplicate output.
        logger = logging.getLogger(logger_name)
        logger.setLevel(logging.DEBUG)
        formatter = logging.Formatter(fmt='%(asctime)s: %(message)s', datefmt='%d/%m/%Y %H:%M:%S')
        logger.handlers.clear()
        fileHandler = logging.FileHandler(filename=log_file, mode='w')
        fileHandler.setFormatter(formatter)
        fileHandler.setLevel(logging.DEBUG)
        logger.addHandler(fileHandler)
        # logging.basicConfig(filename=log_file, level=logging.info, filemode='w', format='%(asctime)s: %(message)s',
        #                     datefmt='%d/%m/%Y %H:%M:%S')
        return logger

    def _check_radicals(self, reaction: ReactionContainer) -> bool:
        """
        Checks radicals. Returns True when any atom in the reaction is a radical.
        :param reaction: ReactionContainer
        :return: bool
        """
        for molecule in (reaction.reactants + reaction.products + reaction.reagents):
            for n, atom in molecule.atoms():
                if atom.is_radical:
                    return True
        return False

    def _calc_charge(self, molecule: MoleculeContainer) -> int:
        """Computing charge of molecule.
        :param: molecule: MoleculeContainer
        :return: int
        """
        return sum(molecule._charges.values())

    def _group_ions(self, reaction: ReactionContainer):
        """
        Ungroup molecules recorded as ions, regroup ions. Returns a tuple with the corresponding ReactionContainer and
        return code as int (0 - nothing was changed, 1 - ions were regrouped, 2 - ions are unbalanced).
        :param reaction: current reaction
        :return: tuple[ReactionContainer, int]
        """
        meta = reaction.meta
        reaction_parts = []
        return_codes = []
        for molecules in (reaction.reactants, reaction.reagents, reaction.products):
            # Split multi-component records ("a.b") into individual molecules.
            divided_molecules = [x for m in molecules for x in m.split('.')]

            if len(divided_molecules) == 0:
                reaction_parts.append(())
                continue
            elif len(divided_molecules) == 1 and self._calc_charge(divided_molecules[0]) == 0:
                return_codes.append(0)
                reaction_parts.append(molecules)
                continue
            elif len(divided_molecules) == 1:
                # A single charged molecule cannot be balanced.
                return_codes.append(2)
                reaction_parts.append(molecules)
                continue

            new_molecules = []
            cations, anions, ions = [], [], []
            total_charge = 0
            for molecule in divided_molecules:
                mol_charge = self._calc_charge(molecule)
                total_charge += mol_charge
                if mol_charge == 0:
                    new_molecules.append(molecule)
                elif mol_charge > 0:
                    cations.append((mol_charge, molecule))
                    ions.append((mol_charge, molecule))
                else:
                    anions.append((mol_charge, molecule))
                    ions.append((mol_charge, molecule))

            if len(cations) == 0 and len(anions) == 0:
                return_codes.append(0)
                reaction_parts.append(tuple(new_molecules))
                continue
            elif total_charge != 0:
                return_codes.append(2)
                reaction_parts.append(tuple(divided_molecules))
                continue
            else:
                # Greedily merge ions into neutral salt groups; start a new
                # group every time the accumulated charge reaches zero.
                salt = MoleculeContainer()
                for ion_charge, ion in ions:
                    salt = salt.union(ion)
                    total_charge += ion_charge
                    if total_charge == 0:
                        new_molecules.append(salt)
                        salt = MoleculeContainer()
                if total_charge != 0:
                    new_molecules.append(salt)
                    return_codes.append(2)
                    reaction_parts.append(tuple(new_molecules))
                else:
                    return_codes.append(1)
                    reaction_parts.append(tuple(new_molecules))
        # NOTE(review): max(return_codes) raises if all three parts were
        # empty (no codes appended) — presumably never happens in practice.
        return ReactionContainer(reactants=reaction_parts[0], reagents=reaction_parts[1], products=reaction_parts[2],
                                 meta=meta), max(return_codes)

    def _split_ions(self, reaction: ReactionContainer):
        """
        Split ions in a reaction. Returns a tuple with the corresponding ReactionContainer and
        a return code as int (0 - nothing was changed, 1 - ions were split, 2 - ions were split but the reaction
        is imbalanced).
        :param reaction: current reaction
        :return: tuple[ReactionContainer, int]
        """
        meta = reaction.meta
        reaction_parts = []
        return_codes = []
        for molecules in (reaction.reactants, reaction.reagents, reaction.products):
            # Split multi-component records ("a.b") into individual molecules.
            divided_molecules = [x for m in molecules for x in m.split('.')]

            total_charge = 0
            ions_present = False
            for molecule in divided_molecules:
                mol_charge = self._calc_charge(molecule)
                total_charge += mol_charge
                if mol_charge != 0:
                    ions_present = True

            if ions_present and total_charge:
                return_codes.append(2)
            elif ions_present:
                return_codes.append(1)
            else:
                return_codes.append(0)

            reaction_parts.append(tuple(divided_molecules))

        return ReactionContainer(reactants=reaction_parts[0], reagents=reaction_parts[1], products=reaction_parts[2],
                                 meta=meta), max(return_codes)

    def _remove_unchanged_parts(self, reaction: ReactionContainer) -> ReactionContainer:
        """
        Ungroup molecules, remove unchanged parts from reactants and products.
        Molecules present on both sides are moved to reagents (then discarded
        unless keep_reagents is set).
        :param reaction: current reaction
        :return: ReactionContainer
        """
        meta = reaction.meta
        new_reactants = [m for m in reaction.reactants]
        new_reagents = [m for m in reaction.reagents]
        if self._reagents_to_reactants:
            new_reactants.extend(new_reagents)
            new_reagents = []
        # Iterate over a copy so removal from new_reactants is safe.
        reactants = new_reactants.copy()
        new_products = [m for m in reaction.products]

        for reactant in reactants:
            if reactant in new_products:
                new_reagents.append(reactant)
                new_reactants.remove(reactant)
                new_products.remove(reactant)
        if not self._keep_reagents:
            new_reagents = []
        return ReactionContainer(reactants=tuple(new_reactants), reagents=tuple(new_reagents),
                                 products=tuple(new_products), meta=meta)

    def _check_isotopes(self, reaction: ReactionContainer) -> bool:
        # True when any reactant/product atom carries an isotope label
        # (reagents are deliberately not inspected here).
        for molecules in (reaction.reactants, reaction.products):
            for molecule in molecules:
                for _, atom in molecule.atoms():
                    if atom.isotope:
                        return True
        return False

    def _tautomerize(self, reaction: ReactionContainer) -> ReactionContainer:
        """
        Perform ChemAxon tautomerization.
        :param reaction: reaction that needs to be tautomerized
        :return: ReactionContainer
        """
        new_molecules = []
        for part in [reaction.reactants, reaction.reagents, reaction.products]:
            tmp = []
            for mol in part:
                # Round-trip each molecule through SDF text into the ChemAxon
                # Java objects, standardize ('tautomerize'), and read it back.
                with io.StringIO() as f, SDFWrite(f) as i:
                    i.write(mol)
                    sdf = f.getvalue()
                mol_handler = self._MolHandler(sdf)
                mol_handler.clean(True, '2')
                molecule = mol_handler.getMolecule()
                self._standardizer.standardize(molecule)
                new_mol_handler = self._MolHandler(molecule)
                new_sdf = new_mol_handler.toFormat('SDF')
                with io.StringIO('\n  ' + new_sdf.strip()) as f, SDFRead(f, remap=False) as i:
                    new_mol = next(i)
                tmp.append(new_mol)
            new_molecules.append(tmp)
        return ReactionContainer(reactants=tuple(new_molecules[0]), reagents=tuple(new_molecules[1]),
                                 products=tuple(new_molecules[2]), meta=reaction.meta)

    # def _dearomatize_by_RDKit(self, reaction: ReactionContainer) -> ReactionContainer:
    #     """
    #     Dearomatizes by RDKit (needs in case of some mappers, such as RXNMapper).
    #     :param reaction: ReactionContainer
    #     :return: ReactionContainer
    #     """
    #     with io.StringIO() as f, RDFWrite(f) as i:
    #         i.write(reaction)
    #         s = '\n'.join(f.getvalue().split('\n')[3:])
    #     rxn = rdChemReactions.ReactionFromRxnBlock(s)
    #     reactants, reagents, products = [], [], []
    #     for mol in rxn.GetReactants():
    #         try:
    #             Chem.SanitizeMol(mol, Chem.SanitizeFlags.SANITIZE_KEKULIZE, catchErrors=True)
    #         except Chem.rdchem.KekulizeException:
    #             return reaction
    #         with io.StringIO(Chem.MolToMolBlock(mol)) as f2, SDFRead(f2, remap=False) as sdf_i:
    #             reactants.append(next(sdf_i))
    #     for mol in rxn.GetAgents():
    #         try:
    #             Chem.SanitizeMol(mol, Chem.SanitizeFlags.SANITIZE_KEKULIZE, catchErrors=True)
    #         except Chem.rdchem.KekulizeException:
    #             return reaction
    #         with io.StringIO(Chem.MolToMolBlock(mol)) as f2, SDFRead(f2, remap=False) as sdf_i:
    #             reagents.append(next(sdf_i))
    #     for mol in rxn.GetProducts():
    #         try:
    #             Chem.SanitizeMol(mol, Chem.SanitizeFlags.SANITIZE_KEKULIZE, catchErrors=True)
    #         except Chem.rdchem.KekulizeException:
    #             return reaction
    #         with io.StringIO(Chem.MolToMolBlock(mol)) as f2, SDFRead(f2, remap=False) as sdf_i:
    #             products.append(next(sdf_i))
    #
    #     new_reaction = ReactionContainer(reactants=tuple(reactants), reagents=tuple(reagents), products=tuple(products),
    #                                      meta=reaction.meta)
    #
    #     return new_reaction
+
+
if __name__ == '__main__':
    import argparse

    # Command-line entry point: standardize reactions from an input RDF file and
    # write the standardized reactions to an output RDF file.
    parser = argparse.ArgumentParser(description="This is a tool for reaction standardization.",
                                     epilog="Arkadii Lin, Strasbourg/Kazan 2020", prog="Standardizer")
    parser.add_argument("-i", "--input", type=str, help="Input RDF file.")
    parser.add_argument("-o", "--output", type=str, help="Output RDF file.")
    parser.add_argument("-id", "--idTag", default='Reaction_ID', type=str, help="ID tag in the RDF file.")
    parser.add_argument("--skipErrors", action="store_true", help="Skip errors.")
    parser.add_argument("--keep_unbalanced_ions", action="store_true", help="Will keep reactions with unbalanced ions.")
    parser.add_argument("--action_on_isotopes", type=int, default=0, help="Action performed if an isotope is "
                                                                          "found: 0 - to ignore isotopes; "
                                                                          "1 - to remove reactions with isotopes; "
                                                                          "2 - to clear isotopes' labels.")
    parser.add_argument("--keep_reagents", action="store_true", help="Will keep reagents from the reaction.")
    parser.add_argument("--add_reagents", action="store_true", help="Will add the given reagents to reactants.")
    parser.add_argument("--ignore_mapping", action="store_true", help="Will ignore the initial mapping in the file.")
    parser.add_argument("--keep_unchanged_parts", action="store_true", help="Will keep unchanged parts in a reaction.")
    parser.add_argument("--logFile", type=str, default='logFile.txt', help="Log file name.")
    parser.add_argument("--skip_tautomerize", action="store_true", help="Will skip generation of the major tautomer.")
    parser.add_argument("--rdkit_dearomatization", action="store_true", help="Will kekulize the reaction using RDKit "
                                                                             "facilities.")
    # JVM/JChem paths are only needed when ChemAxon tautomerization is enabled
    parser.add_argument("--jvm_path", type=str,
                        help="JVM path (e.g. C:\\Program Files\\Java\\jdk-13.0.2).")
    parser.add_argument("--jchem_path", type=str, help="JChem path (e.g. C:\\Users\\user\\JChemSuite\\lib\\jchem.jar).")
    args = parser.parse_args()

    # Note: CLI exposes "keep_unchanged_parts" while the class takes the inverse
    # flag "remove_unchanged_parts"; the negation below performs the translation.
    standardizer = Standardizer(skip_errors=args.skipErrors, log_file=args.logFile,
                                keep_unbalanced_ions=args.keep_unbalanced_ions, id_tag=args.idTag,
                                action_on_isotopes=args.action_on_isotopes, keep_reagents=args.keep_reagents,
                                ignore_mapping=args.ignore_mapping, skip_tautomerize=args.skip_tautomerize,
                                remove_unchanged_parts=(not args.keep_unchanged_parts), jvm_path=args.jvm_path,
                                jchem_path=args.jchem_path, rdkit_dearomatization=args.rdkit_dearomatization,
                                add_reagents_to_reactants=args.add_reagents)
    data = standardizer.standardize_file(input_file=args.input)
    standardizer.write(output_file=args.output, data=data)
diff --git a/SynTool/chem/reaction.py b/SynTool/chem/reaction.py
new file mode 100755
index 0000000000000000000000000000000000000000..77f789ff880b4181c2aa12684c98ed7856b3ac67
--- /dev/null
+++ b/SynTool/chem/reaction.py
@@ -0,0 +1,107 @@
+"""
+Module containing classes and functions for manipulating reactions and reaction rules
+"""
+
from itertools import islice
from typing import Iterator, Optional

from CGRtools.containers import MoleculeContainer, ReactionContainer
from CGRtools.exceptions import InvalidAromaticRing
from CGRtools.reactor import Reactor
+
+
class Reaction(ReactionContainer):
    """
    Reaction class can be used for a general representation of reaction for different chemoinformatics Python packages.

    Currently a thin subclass of CGRtools' ReactionContainer: it adds no behavior
    of its own and exists as an extension point for package-specific features.
    """

    def __init__(self, *args, **kwargs):
        """
        Initializes the reaction object by delegating to ReactionContainer.
        """
        super().__init__(*args, **kwargs)
+
+
def add_small_mols(big_mol, small_molecules=None):
    """
    Merge every given small molecule into a copy of ``big_mol`` and split the
    combined graph back into its connected components.

    :param big_mol: A molecule
    :param small_molecules: A list of small molecules that need to be added to the molecule
    :return: Returns a list of molecules.
    """
    if not small_molecules:
        # nothing to merge; the input molecule is returned unchanged
        return [big_mol]

    combined = big_mol.copy()
    for small_mol in small_molecules:
        atom_map = {}
        # copy the atoms of the small molecule into the combined graph,
        # remembering the freshly assigned atom numbers
        for number, atom in small_mol.atoms():
            atom_map[number] = combined.add_atom(atom.atomic_symbol)
        # then copy its bonds using the new numbering
        for source, target, bond in small_mol.bonds():
            combined.add_bond(atom_map[source], atom_map[target], bond)
    return combined.split()
+
+
def apply_reaction_rule(
    molecule: MoleculeContainer,
    reaction_rule: Reactor,
    sort_reactions: bool = False,
    top_reactions_num: int = 3,
    validate_products: bool = True,
    rebuild_with_cgr: bool = False,
) -> Iterator[Optional[list[MoleculeContainer]]]:
    """
    Applies a reaction rule to a given molecule and yields the resulting product sets.

    This is a generator (the previous ``-> list[MoleculeContainer]`` annotation was
    wrong). For every reaction produced by the reactor it yields the list of
    non-empty product molecules; when product validation fails (valence error or
    invalid aromatic ring) a ``None`` is yielded for the offending product before
    the product list itself, preserving the original best-effort behavior.

    :param molecule: molecule on which the reaction rule will be applied
    :param reaction_rule: a Reactor instance representing the reaction rule
    :param sort_reactions: if True, rank reactions by the number of "large" (>6 atoms) products first
    :param top_reactions_num: maximum number of reactions taken from the reactor
    :param validate_products: if True, kekulize/thiele each product and yield None on failure
    :param rebuild_with_cgr: if True, rebuild products by composing/decomposing the reaction CGR
    """
    # was small_molecules=False; None is the declared "no molecules" value
    reactants = add_small_mols(molecule, small_molecules=None)

    try:
        if sort_reactions:
            # Prefer reactions producing the most "large" (>6 atoms) products
            ranked = sorted(
                reaction_rule(reactants),
                key=lambda rxn: len([mol for mol in rxn.products if len(mol) > 6]),
                reverse=True,
            )
            reactions = ranked[:top_reactions_num]  # Take top-N reactions from reactor
        else:
            reactions = list(islice(reaction_rule(reactants), top_reactions_num))
    except IndexError:
        # The reactor can raise IndexError on inapplicable rules; treat as "no match"
        reactions = []

    for reaction in reactions:
        if rebuild_with_cgr:
            # Rebuild the product set from the condensed graph of the reaction
            cgr = reaction.compose()
            products = cgr.decompose()[1].split()
        else:
            products = reaction.products
        products = [mol for mol in products if len(mol) > 0]
        if validate_products:
            # renamed loop variable: previously shadowed the `molecule` argument
            for product in products:
                try:
                    product.kekule()
                    if product.check_valence():
                        yield None
                    product.thiele()
                except InvalidAromaticRing:
                    yield None
        yield products
diff --git a/SynTool/chem/reaction_rules/__init__.py b/SynTool/chem/reaction_rules/__init__.py
new file mode 100755
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/SynTool/chem/reaction_rules/__pycache__/__init__.cpython-310.pyc b/SynTool/chem/reaction_rules/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..979dbb4bef420d97b18a5a953c1cd4a5a645f1cf
Binary files /dev/null and b/SynTool/chem/reaction_rules/__pycache__/__init__.cpython-310.pyc differ
diff --git a/SynTool/chem/reaction_rules/__pycache__/extraction.cpython-310.pyc b/SynTool/chem/reaction_rules/__pycache__/extraction.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..97e635f0947372978b3298bd6f3f334536a9876b
Binary files /dev/null and b/SynTool/chem/reaction_rules/__pycache__/extraction.cpython-310.pyc differ
diff --git a/SynTool/chem/reaction_rules/extraction.py b/SynTool/chem/reaction_rules/extraction.py
new file mode 100755
index 0000000000000000000000000000000000000000..2a03beda431c6f61e3db531d5894ab687847185b
--- /dev/null
+++ b/SynTool/chem/reaction_rules/extraction.py
@@ -0,0 +1,679 @@
+"""
+Module containing functions with fixed protocol for reaction rules extraction
+"""
+import logging
+import pickle
+from collections import defaultdict
+from itertools import islice
+from pathlib import Path
+from typing import List, Union, Tuple, IO, Dict, Set, Iterable, Any
+from os.path import splitext
+
+
+import ray
+from CGRtools.containers import MoleculeContainer, QueryContainer, ReactionContainer
+from CGRtools.exceptions import InvalidAromaticRing
+from CGRtools.reactor import Reactor
+from tqdm.auto import tqdm
+
+from SynTool.chem.utils import reverse_reaction
+from SynTool.utils.config import RuleExtractionConfig
+from SynTool.utils.files import ReactionReader
+
+
def extract_rules_from_reactions(
    config: RuleExtractionConfig,
    reaction_file: str,
    rules_file_name: str = 'reaction_rules.pickle',
    num_cpus: int = 1,
    batch_size: int = 10,
) -> None:
    """
    Extracts reaction rules from a set of reactions based on the given configuration.

    This function initializes a Ray environment for distributed computing and processes each reaction
    in the provided reaction database to extract reaction rules. It handles the reactions in batches,
    parallelizing the rule extraction process. The function also sorts the rules based on their
    popularity and pickles the sorted rules.

    :param config: Configuration settings for rule extraction, including file paths, batch size, and other parameters.
    :param reaction_file: Path to the file containing reaction database.
    :param rules_file_name: Name of the file to store the extracted rules.
    :param num_cpus: Number of CPU cores to use for processing. Defaults to 1.
    :param batch_size: Number of reactions to process in each batch. Defaults to 10.

    :return: None
    """
    # read files
    reaction_file = Path(reaction_file).resolve(strict=True)

    ray.init(num_cpus=num_cpus, ignore_reinit_error=True, logging_level=logging.ERROR)

    rules_file_name, _ = splitext(rules_file_name)
    with ReactionReader(reaction_file) as reactions:
        pbar = tqdm(reactions, disable=False)

        futures = {}  # insertion-ordered set of in-flight Ray futures
        batch = []
        max_concurrent_batches = num_cpus
        # Size used to advance the progress bar in the drain loop below.
        # Initialized to batch_size so it is always bound: previously it was
        # assigned only inside `if batch:`, which raised a NameError when the
        # reaction count was an exact multiple of batch_size.
        remaining_size = batch_size

        extracted_rules_and_statistics = defaultdict(list)
        for index, reaction in enumerate(reactions):
            batch.append((index, reaction))
            if len(batch) == batch_size:
                future = process_reaction_batch.remote(batch, config)
                futures[future] = None
                batch = []

            # throttle: keep at most max_concurrent_batches tasks in flight
            while len(futures) >= max_concurrent_batches:
                process_completed_batches(futures, extracted_rules_and_statistics, pbar, batch_size)

        # submit the final, possibly smaller, batch
        if batch:
            remaining_size = len(batch)
            future = process_reaction_batch.remote(batch, config)
            futures[future] = None

        # drain all remaining in-flight tasks
        while futures:
            process_completed_batches(futures, extracted_rules_and_statistics, pbar, remaining_size)

        pbar.close()

    sorted_rules = sort_rules(
        extracted_rules_and_statistics,
        min_popularity=config.min_popularity,
        single_reactant_only=config.single_reactant_only,
    )

    with open(f"{rules_file_name}.pickle", "wb") as statistics_file:
        pickle.dump(sorted_rules, statistics_file)
    print(f'Number of extracted reaction rules: {len(sorted_rules)}')

    ray.shutdown()
+
+
@ray.remote
def process_reaction_batch(
    batch: List[Tuple[int, ReactionContainer]], config: RuleExtractionConfig
) -> list[tuple[int, list[ReactionContainer]]]:
    """
    Processes a batch of reactions to extract reaction rules based on the given configuration.

    This function operates as a remote task in a distributed system using Ray. For each
    (index, reaction) pair in the batch it extracts reaction rules as specified by the
    configuration object; reactions whose extraction fails are skipped (best-effort
    semantics), so the result may contain fewer entries than the input batch.

    :param batch: A list where each element is a tuple containing an index (int) and a ReactionContainer object.
        The index is typically used to keep track of the reaction's position in a larger dataset.
    :type batch: List[Tuple[int, ReactionContainer]]

    :param config: An instance of RuleExtractionConfig that provides settings and parameters for the rule
        extraction process.
    :type config: RuleExtractionConfig

    :return: A list where each element is a tuple. The first element of the tuple is an index (int), and the
        second is a list of ReactionContainer objects representing the extracted rules for that reaction.
    :rtype: list[tuple[int, list[ReactionContainer]]]
    """
    processed_batch = []
    for index, reaction in batch:
        try:
            extracted_rules = extract_rules(config, reaction)
        except Exception:
            # Deliberate best-effort skip of failing reactions. Narrowed from a
            # bare `except:`, which would also swallow KeyboardInterrupt and
            # SystemExit and make Ray worker shutdown unreliable.
            continue
        processed_batch.append((index, extracted_rules))
    return processed_batch
+
+
def process_completed_batches(
    futures: dict,
    rules_statistics: Dict[ReactionContainer, List[int]],
    pbar: tqdm,
    batch_size: int,
) -> None:
    """
    Waits for one completed batch of reactions and folds it into the rule statistics.

    Blocks until at least one of the in-flight Ray tasks finishes, records for every
    extracted rule the indices of the reactions that produced it, stamps a rule's
    metadata with the index of the first reaction it came from, removes the finished
    future, and advances the progress bar by the batch size.

    :param futures: A dictionary of futures representing ongoing batch processing tasks.
    :type futures: dict

    :param rules_statistics: A dictionary to keep track of statistics for each rule.
        Expected to be a defaultdict(list): looking up an unseen rule creates its entry.
    :type rules_statistics: Dict[ReactionContainer, List[int]]

    :param pbar: A tqdm progress bar instance for updating the progress of batch processing.
    :type pbar: tqdm

    :param batch_size: The number of reactions processed in each batch.
    :type batch_size: int

    :return: None
    """
    # block until at least one submitted batch has finished
    done, _ = ray.wait(list(futures.keys()), num_returns=1)
    completed_batch = ray.get(done[0])

    for index, extracted_rules in completed_batch:
        for rule in extracted_rules:
            # defaultdict access inserts unseen rules, so a growth in the dict
            # length means this rule was observed for the first time
            prev_stats_len = len(rules_statistics)
            rules_statistics[rule].append(index)
            if len(rules_statistics) != prev_stats_len:
                rule.meta["first_reaction_index"] = index

    # the finished future is no longer in flight
    del futures[done[0]]
    pbar.update(batch_size)
+
+
def extract_rules(
    config: RuleExtractionConfig, reaction: ReactionContainer
) -> list[ReactionContainer]:
    """
    Extracts reaction rules from a reaction according to the configuration.

    :param config: rule-extraction settings (multicenter handling, functional
        groups, rings, leaving/incoming groups, etc.)
    :param reaction: reaction (ReactionContainer) to derive rules from
    :return: list of ReactionContainer rules. With config.multicenter_rules a
        single rule covering all reaction centers is returned; otherwise one
        rule per distinct reaction center, capped at 15 centers.
    """
    if config.multicenter_rules:
        # one rule spanning every reaction center at once
        return [create_rule(config, reaction)]
    # otherwise derive one rule per distinct center, deduplicated via a set
    unique_rules = {
        create_rule(config, single_center)
        for single_center in islice(reaction.enumerate_centers(), 15)
    }
    return list(unique_rules)
+
+
def create_rule(
    config: RuleExtractionConfig, reaction: ReactionContainer
) -> ReactionContainer:
    """
    Creates a reaction rule from a given reaction based on the specified configuration.

    :param config: An instance of RuleExtractionConfig, containing various settings that determine how
                   the rule is created, such as environmental atom count, inclusion of functional groups,
                   rings, leaving and incoming groups, and other parameters.
    :param reaction: The reaction object (ReactionContainer) from which to create the rule. This object
                     represents a chemical reaction with specified reactants, products, and possibly reagents.
    :return: A ReactionContainer object representing the extracted reaction rule. This rule includes
             various elements of the reaction as specified by the configuration, such as reaction centers,
             environmental atoms, functional groups, and others.

    The function processes the reaction to create a rule that matches the configuration settings. It handles
    the inclusion of environmental atoms, functional groups, ring structures, and leaving and incoming groups.
    It also constructs substructures for reactants, products, and reagents, and cleans molecule representations
    if required. Optionally, it validates the rule using a reactor.
    """
    # ~reaction composes the reaction into its condensed graph representation (CGR)
    cgr = ~reaction
    center_atoms = set(cgr.center_atoms)

    # Add atoms of reaction environment based on config settings
    center_atoms = add_environment_atoms(
        cgr, center_atoms, config.environment_atom_count
    )

    # Include functional groups in the rule if specified in config
    if config.include_func_groups:
        rule_atoms = add_functional_groups(
            reaction, center_atoms, config.func_groups_list
        )
    else:
        rule_atoms = center_atoms.copy()

    # Include ring structures in the rule if specified in config
    if config.include_rings:
        rule_atoms = add_ring_structures(
            cgr,
            rule_atoms,
        )

    # Add leaving and incoming groups to the rule based on config settings
    rule_atoms, meta_debug = add_leaving_incoming_groups(
        reaction, rule_atoms, config.keep_leaving_groups, config.keep_incoming_groups
    )

    # Create substructures for reactants, products, and reagents
    (
        reactant_substructures,
        product_substructures,
        reagents,
    ) = create_substructures_and_reagents(
        reaction, rule_atoms, config.as_query_container, config.keep_reagents
    )

    # Clean atom marks in the molecules if they are being converted to query containers.
    # Note: cleaning keys off the *center* atoms (pre-functional-group set), not rule_atoms.
    if config.as_query_container:
        reactant_substructures = clean_molecules(
            reactant_substructures,
            reaction.reactants,
            center_atoms,
            config.atom_info_retention,
        )
        product_substructures = clean_molecules(
            product_substructures,
            reaction.products,
            center_atoms,
            config.atom_info_retention,
        )

    # Assemble the final rule including metadata if specified
    rule = assemble_final_rule(
        reactant_substructures,
        product_substructures,
        reagents,
        meta_debug,
        config.keep_metadata,
        reaction,
    )

    # Optionally flip the rule (and the reference reaction) for retro-direction use
    if config.reverse_rule:
        rule = reverse_reaction(rule)
        reaction = reverse_reaction(reaction)

    # Validate the rule using a reactor if validation is enabled in config
    if config.reactor_validation:
        if validate_rule(rule, reaction):
            rule.meta["reactor_validation"] = "passed"
        else:
            rule.meta["reactor_validation"] = "failed"

    return rule
+
+
def add_environment_atoms(cgr, center_atoms, environment_atom_count):
    """
    Extends the reaction-center atom set with its surrounding atoms.

    :param cgr: condensed graph representation of the reaction
    :param center_atoms: identifiers of the reaction-center atoms
    :param environment_atom_count: number of atom layers around the reaction
        center to include; 0 means the reaction center only
    :return: set of atom identifiers covering the center plus its environment
    """
    if not environment_atom_count:
        # no environment requested: return the reaction center unchanged
        return center_atoms
    environment = cgr.augmented_substructure(center_atoms, deep=environment_atom_count)
    # union of the original centers and every atom of the augmented substructure
    return center_atoms | set(environment)
+
+
def add_functional_groups(reaction, center_atoms, func_groups_list):
    """
    Extends the rule atoms with functional groups that touch the reaction center.

    :param reaction: reaction (ReactionContainer) providing the molecules to scan
    :param center_atoms: identifiers of the reaction-center atoms
    :param func_groups_list: functional-group patterns (MoleculeContainer or
        QueryContainer) to match against every molecule of the reaction
    :return: set of rule atoms, extended with every matched group that shares
        at least one atom with the reaction center
    """
    rule_atoms = set(center_atoms)
    for molecule in reaction.molecules():
        for group in func_groups_list:
            for mapping in group.get_mapping(molecule):
                # renumber the pattern onto the molecule for this match
                group.remap(mapping)
                group_atoms = set(group.atoms_numbers)
                # keep the group only when it overlaps the reaction center
                if group_atoms & center_atoms:
                    rule_atoms |= group_atoms
                # restore the original numbering before trying the next mapping
                group.remap({value: key for key, value in mapping.items()})
    return rule_atoms
+
+
def add_ring_structures(cgr, rule_atoms):
    """
    Absorbs into the rule every ring that shares at least one atom with it.

    :param cgr: condensed graph representation of the reaction (CGRContainer)
    :param rule_atoms: set of atom identifiers selected for the rule so far
    :return: rule atoms extended with all atoms of every intersecting ring
    """
    for ring_atoms in map(set, cgr.sssr):
        # as soon as a ring touches the current selection, take the whole ring
        # (earlier additions can make later rings intersect too)
        if ring_atoms & rule_atoms:
            rule_atoms |= ring_atoms
    return rule_atoms
+
+
def add_leaving_incoming_groups(
    reaction, rule_atoms, keep_leaving_groups, keep_incoming_groups
):
    """
    Optionally folds leaving and incoming groups into the rule atoms.

    :param reaction: reaction providing the reactant and product atom sets
    :param rule_atoms: current set of rule atom identifiers
    :param keep_leaving_groups: include atoms that disappear from the products
    :param keep_incoming_groups: include atoms that appear only in the products
    :return: tuple of (updated rule atoms, metadata dict with the newly added
        "leaving" and "incoming" atom sets)
    """
    groups_meta = {"leaving": set(), "incoming": set()}

    # flatten the atom identifiers of each side of the reaction
    reactant_atoms = {a for mol in reaction.reactants for a in mol}
    product_atoms = {a for mol in reaction.products for a in mol}

    if keep_leaving_groups:
        # leaving group = present in reactants but absent from products
        leaving = reactant_atoms - product_atoms
        # record only the atoms not already part of the rule
        groups_meta["leaving"] |= leaving - rule_atoms
        rule_atoms |= leaving

    if keep_incoming_groups:
        # incoming group = present in products but absent from reactants
        incoming = product_atoms - reactant_atoms
        groups_meta["incoming"] |= incoming - rule_atoms
        rule_atoms |= incoming

    return rule_atoms, groups_meta
+
+
def clean_molecules(
    rule_molecules: Iterable[QueryContainer],
    reaction_molecules: Iterable[MoleculeContainer],
    reaction_center_atoms: Set[int],
    atom_retention_details: Dict[str, Dict[str, bool]],
) -> List[QueryContainer]:
    """
    Cleans rule molecules by removing specified information about atoms based on retention details provided.

    :param rule_molecules: A list of query container objects representing the rule molecules.
    :param reaction_molecules: A list of molecule container objects involved in the reaction.
    :param reaction_center_atoms: A set of integers representing atom numbers in the reaction center.
    :param atom_retention_details: A dictionary specifying what atom information to retain or remove.
                                   This dictionary should have two keys: "reaction_center" and "environment",
                                   each mapping to another dictionary. The nested dictionaries should have
                                   keys representing atom attributes (like "neighbors", "hybridization",
                                   "implicit_hydrogens", "ring_sizes") and boolean values. A value of True
                                   indicates that the corresponding attribute should be retained,
                                   while False indicates it should be removed from the atom.

                                   For example:
                                   {
                                       "reaction_center": {"neighbors": True, "hybridization": False, ...},
                                       "environment": {"neighbors": True, "implicit_hydrogens": False, ...}
                                   }

    Returns:
        A list of QueryContainer objects representing the cleaned rule molecules.
    """
    cleaned_rule_molecules = []

    for rule_molecule in rule_molecules:
        for reaction_molecule in reaction_molecules:
            # find the parent reaction molecule: the rule substructure's atom
            # numbers must be a subset of the molecule's atom numbers
            if set(rule_molecule.atoms_numbers) <= set(reaction_molecule.atoms_numbers):
                # convert the full parent molecule to a query first, then take
                # the rule substructure from it so query marks are populated
                query_reaction_molecule = reaction_molecule.substructure(
                    reaction_molecule, as_query=True
                )
                query_rule_molecule = query_reaction_molecule.substructure(
                    rule_molecule
                )

                # Clean environment atoms
                if not all(
                    atom_retention_details["environment"].values()
                ):  # if everything True, we keep all marks
                    local_environment_atoms = (
                        set(rule_molecule.atoms_numbers) - reaction_center_atoms
                    )
                    for atom_number in local_environment_atoms:
                        query_rule_molecule = clean_atom(
                            query_rule_molecule,
                            atom_retention_details["environment"],
                            atom_number,
                        )

                # Clean reaction center atoms
                if not all(
                    atom_retention_details["reaction_center"].values()
                ):  # if everything True, we keep all marks
                    local_reaction_center_atoms = (
                        set(rule_molecule.atoms_numbers) & reaction_center_atoms
                    )
                    for atom_number in local_reaction_center_atoms:
                        query_rule_molecule = clean_atom(
                            query_rule_molecule,
                            atom_retention_details["reaction_center"],
                            atom_number,
                        )

                cleaned_rule_molecules.append(query_rule_molecule)
                # one parent molecule is enough; move on to the next rule molecule
                break

    return cleaned_rule_molecules
+
+
def clean_atom(
    query_molecule: QueryContainer,
    attributes_to_keep: Dict[str, bool],
    atom_number: int,
) -> QueryContainer:
    """
    Strips selected query marks from one atom of a query molecule (in place).

    :param query_molecule: the QueryContainer whose atom is edited
    :param attributes_to_keep: maps each of "neighbors", "hybridization",
        "implicit_hydrogens" and "ring_sizes" to True (keep the mark) or
        False (erase it by setting the attribute to None)
    :param atom_number: number of the atom to clean
    :return: the same query molecule, for call-chaining convenience
    """
    atom = query_molecule.atom(atom_number)
    # erase every mark whose flag is False; keep the rest untouched
    for attribute in ("neighbors", "hybridization", "implicit_hydrogens", "ring_sizes"):
        if not attributes_to_keep[attribute]:
            setattr(atom, attribute, None)
    return query_molecule
+
+
def create_substructures_and_reagents(
    reaction, rule_atoms, as_query_container, keep_reagents
):
    """
    Builds the reactant/product substructures of a rule and collects reagents.

    :param reaction: reaction (ReactionContainer) whose molecules are restricted
        to the rule atoms
    :param rule_atoms: set of atom identifiers defining the rule
    :param as_query_container: when reagents are kept, convert them to query
        containers for pattern matching
    :param keep_reagents: whether reagents are carried over into the rule
    :return: tuple (reactant substructures, product substructures, reagents)
    """
    def cut(molecules):
        # substructure of each molecule restricted to the rule atoms,
        # skipping molecules that share no atoms with the rule
        pieces = []
        for mol in molecules:
            shared = rule_atoms.intersection(mol.atoms_numbers)
            if shared:
                pieces.append(mol.substructure(shared))
        return pieces

    reactant_substructures = cut(reaction.reactants)
    product_substructures = cut(reaction.products)

    if not keep_reagents:
        reagents = []
    elif as_query_container:
        # self-substructure with as_query=True converts a molecule to a query
        reagents = [r.substructure(r, as_query=True) for r in reaction.reagents]
    else:
        reagents = reaction.reagents

    return reactant_substructures, product_substructures, reagents
+
+
def assemble_final_rule(
    reactant_substructures,
    product_substructures,
    reagents,
    meta_debug,
    keep_metadata,
    reaction,
):
    """
    Assembles the final reaction rule from the prepared substructures and metadata.

    :param reactant_substructures: rule substructures taken from the reactants.
    :param product_substructures: rule substructures taken from the products.
    :param reagents: reagents to embed in the rule (possibly empty).
    :param meta_debug: extra metadata about the reaction (e.g. leaving/incoming
        groups). The dict is no longer mutated: previously it was aliased and
        updated in place, silently changing the caller's data.
    :param keep_metadata: when True, copy the reaction metadata and name into the rule.
    :param reaction: the original reaction (ReactionContainer) providing metadata and name.

    :return: A ReactionContainer object representing the assembled reaction rule,
        including the substructures, reagents, and (optionally) metadata.
    """
    if keep_metadata:
        # merge into a fresh dict so the caller's meta_debug is not mutated;
        # reaction.meta entries win on key collisions, as before
        rule_metadata = {**meta_debug, **reaction.meta}
    else:
        rule_metadata = {}

    rule = ReactionContainer(
        reactant_substructures, product_substructures, reagents, rule_metadata
    )

    if keep_metadata:
        rule.name = reaction.name

    # drop any cached derived data so the rule is rebuilt from its new parts
    rule.flush_cache()
    return rule
+
+
def validate_rule(rule: ReactionContainer, reaction: ReactionContainer):
    """
    Validates a reaction rule by checking that it can regenerate the reaction products from the reactants.

    :param rule: The reaction rule to be validated. This is a ReactionContainer object representing a chemical
    reaction rule, which includes the necessary information to perform a reaction.
    :param reaction: The original reaction object (ReactionContainer) against which the rule is to be validated.
    This object contains the actual reactants and products of the reaction.

    :return: True if applying the rule to the reaction's reactants reproduces exactly the original set of
    products; False otherwise (including when the reactor fails internally).

    The function uses a chemical reactor to simulate the reaction based on the provided rule. Generated
    products that cannot be kekulized or that have broken valences are discarded before the comparison
    with the original products.
    """
    # Create a reactor with the given rule
    reactor = Reactor(rule)
    try:
        for result_reaction in reactor(reaction.reactants):
            result_products = []
            for result_product in result_reaction.products:
                # Validate on a copy so the generated product itself stays aromatic/untouched.
                tmp = result_product.copy()
                try:
                    tmp.kekule()
                    if tmp.check_valence():
                        # check_valence() reports problematic atoms; skip broken products.
                        continue
                except InvalidAromaticRing:
                    continue
                result_products.append(result_product)
            if set(reaction.products) == set(result_products) and len(
                reaction.products
            ) == len(result_products):
                return True
    except (KeyError, IndexError):
        # KeyError - iteration over reactor is finished and products are different from the original reaction
        # IndexError - mistake in __contract_ions, possibly problems with charges in rule?
        return False
    # No generated reaction matched the original products: report failure explicitly
    # (previously the function fell through and implicitly returned None).
    return False
+
+
def sort_rules(
    rules_stats: Dict[ReactionContainer, List[int]],
    min_popularity: int = 3,
    single_reactant_only: bool = True,
) -> List[Tuple[ReactionContainer, List[int]]]:
    """
    Sorts reaction rules by popularity, keeping only validated ones.

    A rule is kept when it was applied at least ``min_popularity`` times, passed
    reactor validation, and (optionally) has exactly one molecule on the right
    side of the reaction arrow. Kept rules are returned most popular first.

    :param rules_stats: A dictionary where each key is a reaction rule and the value is the list of
        indices of the reactions in which the rule was applied.
    :type rules_stats: Dict[ReactionContainer, List[int]]

    :param min_popularity: The minimum number of times a rule must be applied to be considered. Default is 3.
    :type min_popularity: int

    :param single_reactant_only: Whether to keep only reaction rules with a single reactant. Default is True.

    :return: A list of (rule, application indices) tuples sorted in descending order of popularity.
    :rtype: List[Tuple[ReactionContainer, List[int]]]
    """

    def _is_kept(rule, indices):
        # Popularity threshold first, then validation status, then reactant count.
        if len(indices) < min_popularity:
            return False
        if rule.meta["reactor_validation"] != "passed":
            return False
        return not single_reactant_only or len(rule.reactants) == 1

    kept = [pair for pair in rules_stats.items() if _is_kept(*pair)]
    # Stable sort keeps the original (insertion) order among equally popular rules,
    # matching sorted(..., key=lambda x: -len(x[1])).
    kept.sort(key=lambda pair: len(pair[1]), reverse=True)
    return kept
diff --git a/SynTool/chem/reaction_rules/manual/__init__.py b/SynTool/chem/reaction_rules/manual/__init__.py
new file mode 100755
index 0000000000000000000000000000000000000000..1ac0aa571b5a5fc26f40aaf6f3150c02a5a74b7e
--- /dev/null
+++ b/SynTool/chem/reaction_rules/manual/__init__.py
@@ -0,0 +1,6 @@
+from .decompositions import rules as d_rules
+from .transformations import rules as t_rules
+
+hardcoded_rules = t_rules + d_rules
+
+__all__ = ["hardcoded_rules"]
diff --git a/SynTool/chem/reaction_rules/manual/decompositions.py b/SynTool/chem/reaction_rules/manual/decompositions.py
new file mode 100755
index 0000000000000000000000000000000000000000..8192e62a579d4e0af1c092bb9ba615fd39d4a403
--- /dev/null
+++ b/SynTool/chem/reaction_rules/manual/decompositions.py
@@ -0,0 +1,415 @@
+"""
+Module containing hardcoded decomposition reaction rules
+"""
+
+from CGRtools import QueryContainer, ReactionContainer
+from CGRtools.periodictable import ListElement
+
+rules = []
+
+
def prepare():
    """
    Creates three fresh query containers, registers them in the module-level
    "rules" list as a new reaction rule (one reactant query, two product
    queries), and returns the containers for in-place construction.
    """
    reactant_query = QueryContainer()
    product_query_1 = QueryContainer()
    product_query_2 = QueryContainer()
    rules.append(ReactionContainer((reactant_query,), (product_query_1, product_query_2)))
    return reactant_query, product_query_1, product_query_2
+
+
# Each section below defines one retrosynthetic decomposition rule.
# prepare() returns a reactant query `q` and two product queries `p1`/`p2`
# already registered as ReactionContainer((q,), (p1, p2)) in the module-level
# `rules` list; the add_atom/add_bond calls then build the query patterns in
# place.  The `_map` keyword sets an explicit atom number, keeping the
# atom-to-atom correspondence between `q` and `p1`/`p2`.
# The SMARTS-like comment above each rule describes the intended pattern.

# R-amide/ester formation
# [C](-[N,O;D23;Zs])(-[C])=[O]>>[A].[C]-[C](-[O])=[O]
q, p1, p2 = prepare()
q.add_atom('C')
q.add_atom('C')
q.add_atom('O')
q.add_atom(ListElement(['N', 'O']), hybridization=1, neighbors=(2, 3))
q.add_bond(1, 2, 1)
q.add_bond(2, 3, 2)
q.add_bond(2, 4, 1)

p1.add_atom('C')
p1.add_atom('C')
p1.add_atom('O')
p1.add_atom('O', _map=5)
p1.add_bond(1, 2, 1)
p1.add_bond(2, 3, 2)
p1.add_bond(2, 5, 1)

p2.add_atom('A', _map=4)

# acyl group addition with aromatic carbon's case (Friedel-Crafts)
# [C;Za]-[C](-[C])=[O]>>[C].[C]-[C](-[Cl])=[O]
q, p1, p2 = prepare()
q.add_atom('C')
q.add_atom('C')
q.add_atom('O')
q.add_atom('C', hybridization=4)
q.add_bond(1, 2, 1)
q.add_bond(2, 3, 2)
q.add_bond(2, 4, 1)

p1.add_atom('C')
p1.add_atom('C')
p1.add_atom('O')
p1.add_atom('Cl', _map=5)
p1.add_bond(1, 2, 1)
p1.add_bond(2, 3, 2)
p1.add_bond(2, 5, 1)

p2.add_atom('C', _map=4)

# Williamson reaction
# [C;Za]-[O]-[C;Zs;W0]>>[C]-[Br].[C]-[O]
q, p1, p2 = prepare()
q.add_atom('C', hybridization=4)
q.add_atom('O')
q.add_atom('C', hybridization=1, heteroatoms=1)
q.add_bond(1, 2, 1)
q.add_bond(2, 3, 1)

p1.add_atom('C')
p1.add_atom('O')
p1.add_bond(1, 2, 1)

p2.add_atom('C', _map=3)
p2.add_atom('Br')
p2.add_bond(3, 4, 1)

# Buchwald-Hartwig amination
# [N;D23;Zs;W0]-[C;Za]>>[C]-[Br].[N]
q, p1, p2 = prepare()
q.add_atom('N', heteroatoms=0, hybridization=1, neighbors=(2, 3))
q.add_atom('C', hybridization=4)
q.add_bond(1, 2, 1)

p1.add_atom('C', _map=2)
p1.add_atom('Br')
p1.add_bond(2, 3, 1)

p2.add_atom('N')

# imidazole imine atom's alkylation
# [C;r5](:[N;r5]-[C;Zs;W1]):[N;D2;r5]>>[C]-[Br].[N]:[C]:[N]
q, p1, p2 = prepare()
q.add_atom('N', rings_sizes=5)
q.add_atom('C', rings_sizes=5)
q.add_atom('N', rings_sizes=5, neighbors=2)
q.add_atom('C', hybridization=1, heteroatoms=(1, 2))
q.add_bond(1, 2, 4)
q.add_bond(2, 3, 4)
q.add_bond(1, 4, 1)

p1.add_atom('N')
p1.add_atom('C')
p1.add_atom('N')
p1.add_bond(1, 2, 4)
p1.add_bond(2, 3, 4)

p2.add_atom('C', _map=4)
p2.add_atom('Br')
p2.add_bond(4, 5, 1)

# Knoevenagel condensation (nitrile and carboxyl case)
# [C]=[C](-[C]#[N])-[C](-[O])=[O]>>[C]=[O].[C](-[C]#[N])-[C](-[O])=[O]
q, p1, p2 = prepare()
q.add_atom('C')
q.add_atom('C')
q.add_atom('C')
q.add_atom('N')
q.add_atom('C')
q.add_atom('O')
q.add_atom('O')
q.add_bond(1, 2, 2)
q.add_bond(2, 3, 1)
q.add_bond(3, 4, 3)
q.add_bond(2, 5, 1)
q.add_bond(5, 6, 2)
q.add_bond(5, 7, 1)

p1.add_atom('C', _map=2)
p1.add_atom('C')
p1.add_atom('N')
p1.add_atom('C')
p1.add_atom('O')
p1.add_atom('O')
p1.add_bond(2, 3, 1)
p1.add_bond(3, 4, 3)
p1.add_bond(2, 5, 1)
p1.add_bond(5, 6, 2)
p1.add_bond(5, 7, 1)

p2.add_atom('C', _map=1)
p2.add_atom('O', _map=8)
p2.add_bond(1, 8, 2)

# Knoevenagel condensation (double nitrile case)
# [C]=[C](-[C]#[N])-[C]#[N]>>[C]=[O].[C](-[C]#[N])-[C]#[N]
q, p1, p2 = prepare()
q.add_atom('C')
q.add_atom('C')
q.add_atom('C')
q.add_atom('N')
q.add_atom('C')
q.add_atom('N')
q.add_bond(1, 2, 2)
q.add_bond(2, 3, 1)
q.add_bond(3, 4, 3)
q.add_bond(2, 5, 1)
q.add_bond(5, 6, 3)

p1.add_atom('C', _map=2)
p1.add_atom('C')
p1.add_atom('N')
p1.add_atom('C')
p1.add_atom('N')
p1.add_bond(2, 3, 1)
p1.add_bond(3, 4, 3)
p1.add_bond(2, 5, 1)
p1.add_bond(5, 6, 3)

p2.add_atom('C', _map=1)
p2.add_atom('O', _map=8)
p2.add_bond(1, 8, 2)

# Knoevenagel condensation (double carboxyl case)
# [C]=[C](-[C](-[O])=[O])-[C](-[O])=[O]>>[C]=[O].[C](-[C](-[O])=[O])-[C](-[O])=[O]
q, p1, p2 = prepare()
q.add_atom('C')
q.add_atom('C')
q.add_atom('C')
q.add_atom('O')
q.add_atom('O')
q.add_atom('C')
q.add_atom('O')
q.add_atom('O')
q.add_bond(1, 2, 2)
q.add_bond(2, 3, 1)
q.add_bond(3, 4, 2)
q.add_bond(3, 5, 1)
q.add_bond(2, 6, 1)
q.add_bond(6, 7, 2)
q.add_bond(6, 8, 1)

p1.add_atom('C', _map=2)
p1.add_atom('C')
p1.add_atom('O')
p1.add_atom('O')
p1.add_atom('C')
p1.add_atom('O')
p1.add_atom('O')
p1.add_bond(2, 3, 1)
p1.add_bond(3, 4, 2)
p1.add_bond(3, 5, 1)
p1.add_bond(2, 6, 1)
p1.add_bond(6, 7, 2)
p1.add_bond(6, 8, 1)

p2.add_atom('C', _map=1)
p2.add_atom('O', _map=9)
p2.add_bond(1, 9, 2)

# heterocyclization with guanidine
# [c]((-[N;W0;Zs])@[n]@[c](-[N;D1])@[c;W0])@[n]@[c]-[O; D1]>>[C](-[N])(=[N])-[N].[C](#[N])-[C]-[C](-[O])=[O]
q, p1, p2 = prepare()
q.add_atom('C')
q.add_atom('N', heteroatoms=0, hybridization=1)
q.add_atom('N')
q.add_atom('C')
q.add_atom('N', neighbors=1)
q.add_atom('C', heteroatoms=0)
q.add_atom('N')
q.add_atom('C')
q.add_atom('O', neighbors=1)
q.add_bond(1, 2, 1)
q.add_bond(1, 3, 4)
q.add_bond(3, 4, 4)
q.add_bond(4, 5, 1)
q.add_bond(4, 6, 4)
q.add_bond(1, 7, 4)
q.add_bond(7, 8, 4)
q.add_bond(8, 9, 1)

p1.add_atom('C')
p1.add_atom('N')
p1.add_atom('N')
p1.add_atom('N', _map=7)
p1.add_bond(1, 2, 1)
p1.add_bond(1, 3, 2)
p1.add_bond(1, 7, 1)

p2.add_atom('C', _map=4)
p2.add_atom('N')
p2.add_atom('C')
p2.add_atom('C', _map=8)
p2.add_atom('O', _map=9)
p2.add_atom('O')
p2.add_bond(4, 5, 3)
p2.add_bond(4, 6, 1)
p2.add_bond(6, 8, 1)
p2.add_bond(8, 9, 2)
p2.add_bond(8, 10, 1)

# alkylation of amine
# [C]-[N]-[C]>>[C]-[N]-[C].[C]-[Cl]
# NOTE(review): the original comment showed [C]-[Br], but the leaving group
# built below is Cl.
q, p1, p2 = prepare()
q.add_atom('C')
q.add_atom('N')
q.add_atom('C')
q.add_atom('C')
q.add_bond(1, 2, 1)
q.add_bond(2, 3, 1)
q.add_bond(2, 4, 1)

p1.add_atom('C')
p1.add_atom('N')
p1.add_atom('C')
p1.add_bond(1, 2, 1)
p1.add_bond(2, 3, 1)

p2.add_atom('C', _map=4)
p2.add_atom('Cl')
p2.add_bond(4, 5, 1)

# Synthesis of guanidines
#
q, p1, p2 = prepare()
q.add_atom('N')
q.add_atom('C')
q.add_atom('N', hybridization=1)
q.add_atom('N', hybridization=1)
q.add_bond(1, 2, 2)
q.add_bond(2, 3, 1)
q.add_bond(2, 4, 1)

p1.add_atom('N')
p1.add_atom('C')
p1.add_atom('N')
p1.add_bond(1, 2, 3)
p1.add_bond(2, 3, 1)

p2.add_atom('N', _map=4)

# Grignard reaction with nitrile
#
q, p1, p2 = prepare()
q.add_atom('C')
q.add_atom('C')
q.add_atom('O')
q.add_atom('C')
q.add_bond(1, 2, 1)
q.add_bond(2, 3, 2)
q.add_bond(2, 4, 1)

p1.add_atom('C')
p1.add_atom('C')
p1.add_atom('N')
p1.add_bond(1, 2, 1)
p1.add_bond(2, 3, 3)

p2.add_atom('C', _map=4)
p2.add_atom('Br')
p2.add_bond(4, 5, 1)

# Alkylation of alpha-carbon atom of nitrile
#
q, p1, p2 = prepare()
q.add_atom('N')
q.add_atom('C')
q.add_atom('C', neighbors=(3, 4))
q.add_atom('C', hybridization=1)
q.add_bond(1, 2, 3)
q.add_bond(2, 3, 1)
q.add_bond(3, 4, 1)

p1.add_atom('N')
p1.add_atom('C')
p1.add_atom('C')
p1.add_bond(1, 2, 3)
p1.add_bond(2, 3, 1)

p2.add_atom('C', _map=4)
p2.add_atom('Cl')
p2.add_bond(4, 5, 1)

# Gomberg-Bachmann reaction
#
q, p1, p2 = prepare()
q.add_atom('C', hybridization=4, heteroatoms=0)
q.add_atom('C', hybridization=4, heteroatoms=0)
q.add_bond(1, 2, 1)

p1.add_atom('C')
p1.add_atom('N', _map=3)
p1.add_bond(1, 3, 1)

p2.add_atom('C', _map=2)

# Cyclocondensation
#
q, p1, p2 = prepare()
q.add_atom('N', neighbors=2)
q.add_atom('C')
q.add_atom('C')
q.add_atom('C')
q.add_atom('N')
q.add_atom('C')
q.add_atom('C')
q.add_atom('O', neighbors=1)
q.add_bond(1, 2, 1)
q.add_bond(2, 3, 1)
q.add_bond(3, 4, 1)
q.add_bond(4, 5, 2)
q.add_bond(5, 6, 1)
q.add_bond(6, 7, 1)
q.add_bond(7, 8, 2)
q.add_bond(1, 7, 1)

p1.add_atom('N')
p1.add_atom('C')
p1.add_atom('C')
p1.add_atom('C')
p1.add_atom('O', _map=9)
p1.add_bond(1, 2, 1)
p1.add_bond(2, 3, 1)
p1.add_bond(3, 4, 1)
p1.add_bond(4, 9, 2)

p2.add_atom('N', _map=5)
p2.add_atom('C')
p2.add_atom('C')
p2.add_atom('O')
p2.add_atom('O', _map=10)
p2.add_bond(5, 6, 1)
p2.add_bond(6, 7, 1)
p2.add_bond(7, 8, 2)
p2.add_bond(7, 10, 1)

# heterocyclization dicarboxylic acids
#
q, p1, p2 = prepare()
q.add_atom('C', rings_sizes=(5, 6))
q.add_atom('O')
q.add_atom(ListElement(['O', 'N']))
q.add_atom('C', rings_sizes=(5, 6))
q.add_atom('O')
q.add_bond(1, 2, 2)
q.add_bond(1, 3, 1)
q.add_bond(3, 4, 1)
q.add_bond(4, 5, 2)

p1.add_atom('C')
p1.add_atom('O')
p1.add_atom('O', _map=6)
p1.add_bond(1, 2, 2)
p1.add_bond(1, 6, 1)

p2.add_atom('C', _map=4)
p2.add_atom('O')
p2.add_atom('O', _map=7)
p2.add_bond(4, 5, 2)
p2.add_bond(4, 7, 1)

__all__ = ['rules']
diff --git a/SynTool/chem/reaction_rules/manual/transformations.py b/SynTool/chem/reaction_rules/manual/transformations.py
new file mode 100755
index 0000000000000000000000000000000000000000..6a8890a43aed1a08e4dc7c11ea0ef13129ab09da
--- /dev/null
+++ b/SynTool/chem/reaction_rules/manual/transformations.py
@@ -0,0 +1,535 @@
+"""
+Module containing hardcoded transformation reaction rules
+"""
+
+from CGRtools import QueryContainer, ReactionContainer
+from CGRtools.periodictable import ListElement
+
+rules = []
+
+
def prepare():
    """
    Creates two fresh query containers (one reactant pattern, one product
    pattern), registers them in the module-level "rules" list as a new
    reaction rule, and returns both containers for in-place construction.
    """
    reactant_query = QueryContainer()
    product_query = QueryContainer()
    rules.append(ReactionContainer((reactant_query,), (product_query,)))
    return reactant_query, product_query
+
+
# Each section below defines one retrosynthetic transformation rule.
# prepare() returns a reactant query `q` and a product query `p` already
# registered as ReactionContainer((q,), (p,)) in the module-level `rules`
# list; the add_atom/add_bond calls then build the query patterns in place.
# The `_map` keyword sets an explicit atom number, keeping the atom
# correspondence between `q` and `p`.  The SMARTS-like comment above each
# rule describes the intended pattern.

# aryl nitro reduction
# [C;Za;W1]-[N;D1]>>[O-]-[N+](-[C])=[O]
q, p = prepare()
q.add_atom('N', neighbors=1)
q.add_atom('C', hybridization=4, heteroatoms=1)
q.add_bond(1, 2, 1)

p.add_atom('N', charge=1)
p.add_atom('C')
p.add_atom('O', charge=-1)
p.add_atom('O')
p.add_bond(1, 2, 1)
p.add_bond(1, 3, 1)
p.add_bond(1, 4, 2)

# aryl nitration
# [O-]-[N+](=[O])-[C;Za;W12]>>[C]
q, p = prepare()
q.add_atom('N', charge=1)
q.add_atom('C', hybridization=4, heteroatoms=(1, 2))
q.add_atom('O', charge=-1)
q.add_atom('O')
q.add_bond(1, 2, 1)
q.add_bond(1, 3, 1)
q.add_bond(1, 4, 2)

p.add_atom('C', _map=2)

# Beckmann rearrangement (oxime -> amide)
# [C]-[N;D2]-[C]=[O]>>[O]-[N]=[C]-[C]
q, p = prepare()
q.add_atom('C')
q.add_atom('N', neighbors=2)
q.add_atom('O')
q.add_atom('C')
q.add_bond(1, 2, 1)
q.add_bond(1, 3, 2)
q.add_bond(2, 4, 1)

p.add_atom('C')
p.add_atom('N')
p.add_atom('O')
p.add_atom('C')
p.add_bond(1, 2, 2)
p.add_bond(2, 3, 1)
p.add_bond(1, 4, 1)

# aldehydes or ketones into oxime/imine reaction
# [C;Zd;W1]=[N]>>[C]=[O]
q, p = prepare()
q.add_atom('C', hybridization=2, heteroatoms=1)
q.add_atom('N')
q.add_bond(1, 2, 2)

p.add_atom('C')
p.add_atom('O', _map=3)
p.add_bond(1, 3, 2)

# addition of halogen atom into phenol ring (ortho)
# [C](-[Cl,F,Br,I;D1]):[C]-[O,N;Zs]>>[C](-[A]):[C]
q, p = prepare()
q.add_atom(ListElement(['O', 'N']), hybridization=1)
q.add_atom('C')
q.add_atom('C')
q.add_atom(ListElement(['Cl', 'F', 'Br', 'I']), neighbors=1)
q.add_bond(1, 2, 1)
q.add_bond(2, 3, 4)
q.add_bond(3, 4, 1)

p.add_atom('A')
p.add_atom('C')
p.add_atom('C')
p.add_bond(1, 2, 1)
p.add_bond(2, 3, 4)

# addition of halogen atom into phenol ring (para)
# [C](:[C]:[C]:[C]-[O,N;Zs])-[Cl,F,Br,I;D1]>>[A]-[C]:[C]:[C]:[C]
q, p = prepare()
q.add_atom(ListElement(['O', 'N']), hybridization=1)
q.add_atom('C')
q.add_atom('C')
q.add_atom('C')
q.add_atom('C')
q.add_atom(ListElement(['Cl', 'F', 'Br', 'I']), neighbors=1)
q.add_bond(1, 2, 1)
q.add_bond(2, 3, 4)
q.add_bond(3, 4, 4)
q.add_bond(4, 5, 4)
q.add_bond(5, 6, 1)

p.add_atom('A')
p.add_atom('C')
p.add_atom('C')
p.add_atom('C')
p.add_atom('C')
p.add_bond(1, 2, 1)
p.add_bond(2, 3, 4)
p.add_bond(3, 4, 4)
p.add_bond(4, 5, 4)

# hard reduction of Ar-ketones
# [C;Za]-[C;D2;Zs;W0]>>[C]-[C]=[O]
q, p = prepare()
q.add_atom('C', hybridization=4)
q.add_atom('C', hybridization=1, neighbors=2, heteroatoms=0)
q.add_bond(1, 2, 1)

p.add_atom('C')
p.add_atom('C')
p.add_atom('O')
p.add_bond(1, 2, 1)
p.add_bond(2, 3, 2)

# reduction of alpha-hydroxy pyridine
# [C;W1]:[N;H0;r6]>>[C](:[N])-[O]
q, p = prepare()
q.add_atom('C', heteroatoms=1)
q.add_atom('N', rings_sizes=6, hydrogens=0)
q.add_bond(1, 2, 4)

p.add_atom('C')
p.add_atom('N')
p.add_atom('O')
p.add_bond(1, 2, 4)
p.add_bond(1, 3, 1)

# Reduction of alkene
# [C]-[C;D23;Zs;W0]-[C;D123;Zs;W0]>>[C](-[C])=[C]
q, p = prepare()
q.add_atom('C')
q.add_atom('C', heteroatoms=0, neighbors=(2, 3), hybridization=1)
q.add_atom('C', heteroatoms=0, neighbors=(1, 2, 3), hybridization=1)
q.add_bond(1, 2, 1)
q.add_bond(2, 3, 1)

p.add_atom('C')
p.add_atom('C')
p.add_atom('C')
p.add_bond(1, 2, 1)
p.add_bond(2, 3, 2)

# Kolbe-Schmitt reaction
# [C](:[C]-[O;D1])-[C](=[O])-[O;D1]>>[C](-[O]):[C]
q, p = prepare()
q.add_atom('O', neighbors=1)
q.add_atom('C')
q.add_atom('C')
q.add_atom('C')
q.add_atom('O', neighbors=1)
q.add_atom('O')
q.add_bond(1, 2, 1)
q.add_bond(2, 3, 4)
q.add_bond(3, 4, 1)
q.add_bond(4, 5, 1)
q.add_bond(4, 6, 2)

p.add_atom('O')
p.add_atom('C')
p.add_atom('C')
p.add_bond(1, 2, 1)
p.add_bond(2, 3, 4)

# reduction of carboxylic acid
# [O;D1]-[C;D2]-[C]>>[C]-[C](-[O])=[O]
q, p = prepare()
q.add_atom('C')
q.add_atom('C', neighbors=2)
q.add_atom('O', neighbors=1)
q.add_bond(1, 2, 1)
q.add_bond(2, 3, 1)

p.add_atom('C')
p.add_atom('C')
p.add_atom('O')
p.add_atom('O')
p.add_bond(1, 2, 1)
p.add_bond(2, 3, 1)
p.add_bond(2, 4, 2)

# halogenation of alcohols
# [C;Zs]-[Cl,Br;D1]>>[C]-[O]
q, p = prepare()
q.add_atom('C', hybridization=1, heteroatoms=1)
q.add_atom(ListElement(['Cl', 'Br']), neighbors=1)
q.add_bond(1, 2, 1)

p.add_atom('C')
p.add_atom('O', _map=3)
p.add_bond(1, 3, 1)

# Kolbe nitrilation
# [N]#[C]-[C;Zs;W0]>>[Br]-[C]
q, p = prepare()
q.add_atom('C', heteroatoms=0, hybridization=1)
q.add_atom('C')
q.add_atom('N')
q.add_bond(1, 2, 1)
q.add_bond(2, 3, 3)

p.add_atom('C')
p.add_atom('Br', _map=4)
p.add_bond(1, 4, 1)

# Nitrile hydrolysis
# [O;D1]-[C]=[O]>>[N]#[C]
q, p = prepare()
q.add_atom('C')
q.add_atom('O', neighbors=1)
q.add_atom('O')
q.add_bond(1, 2, 1)
q.add_bond(1, 3, 2)

p.add_atom('C')
p.add_atom('N', _map=4)
p.add_bond(1, 4, 3)

# sulfamidation
# [c]-[S](=[O])(=[O])-[N]>>[c]
q, p = prepare()
q.add_atom('C', hybridization=4)
q.add_atom('S')
q.add_atom('O')
q.add_atom('O')
q.add_atom('N', neighbors=1)
q.add_bond(1, 2, 1)
q.add_bond(2, 3, 2)
q.add_bond(2, 4, 2)
q.add_bond(2, 5, 1)

p.add_atom('C')

# Ring expansion rearrangement
#
q, p = prepare()
q.add_atom('C')
q.add_atom('N')
q.add_atom('C', rings_sizes=6)
q.add_atom('C')
q.add_atom('O')
q.add_atom('C')
q.add_atom('C')
q.add_bond(1, 2, 1)
q.add_bond(2, 3, 1)
q.add_bond(3, 4, 1)
q.add_bond(4, 5, 2)
q.add_bond(3, 6, 1)
q.add_bond(4, 7, 1)

p.add_atom('C')
p.add_atom('N')
p.add_atom('C')
p.add_atom('C')
p.add_atom('O')
p.add_atom('C')
p.add_atom('C')
p.add_bond(1, 2, 1)
p.add_bond(2, 3, 2)
p.add_bond(3, 4, 1)
p.add_bond(4, 5, 1)
p.add_bond(4, 6, 1)
p.add_bond(4, 7, 1)

# hydrolysis of bromide alkyl
#
q, p = prepare()
q.add_atom('C', hybridization=1)
q.add_atom('O', neighbors=1)
q.add_bond(1, 2, 1)

p.add_atom('C')
p.add_atom('Br')
p.add_bond(1, 2, 1)

# Condensation of ketones/aldehydes and amines into imines
#
q, p = prepare()
q.add_atom('N', neighbors=(1, 2))
q.add_atom('C', neighbors=(2, 3), heteroatoms=1)
q.add_bond(1, 2, 2)

p.add_atom('C', _map=2)
p.add_atom('O')
p.add_bond(2, 3, 2)

# Halogenation of alkanes
#
q, p = prepare()
q.add_atom('C', hybridization=1)
q.add_atom(ListElement(['F', 'Cl', 'Br']))
q.add_bond(1, 2, 1)

p.add_atom('C')

# heterocyclization
#
q, p = prepare()
q.add_atom('N', heteroatoms=0, hybridization=1, neighbors=(2, 3))
q.add_atom('C', heteroatoms=2)
q.add_atom('N', heteroatoms=0, neighbors=2)
q.add_bond(1, 2, 1)
q.add_bond(2, 3, 2)

p.add_atom('N')
p.add_atom('C')
p.add_atom('N')
p.add_atom('O')
p.add_bond(1, 2, 1)
p.add_bond(2, 4, 2)

# Reduction of nitrile
#
q, p = prepare()
q.add_atom('N', neighbors=1)
q.add_atom('C')
q.add_atom('C', hybridization=1)
q.add_bond(1, 2, 1)
q.add_bond(2, 3, 1)

p.add_atom('N')
p.add_atom('C')
p.add_atom('C')
p.add_bond(1, 2, 3)
p.add_bond(2, 3, 1)

# SPECIAL CASE
# Reduction of nitrile into methylamine
#
q, p = prepare()
q.add_atom('C', neighbors=1)
q.add_atom('N', neighbors=2)
q.add_atom('C')
q.add_atom('C', hybridization=1)
q.add_bond(1, 2, 1)
q.add_bond(2, 3, 1)
q.add_bond(3, 4, 1)

p.add_atom('N', _map=2)
p.add_atom('C')
p.add_atom('C')
p.add_bond(2, 3, 3)
p.add_bond(3, 4, 1)

# methylation of amides
#
q, p = prepare()
q.add_atom('O')
q.add_atom('C')
q.add_atom('N')
q.add_atom('C', neighbors=1)
q.add_bond(1, 2, 2)
q.add_bond(2, 3, 1)
q.add_bond(3, 4, 1)

p.add_atom('O')
p.add_atom('C')
p.add_atom('N')
p.add_bond(1, 2, 2)
p.add_bond(2, 3, 1)

# hydrocyanation of alkenes
#
q, p = prepare()
q.add_atom('C', hybridization=1)
q.add_atom('C')
q.add_atom('C')
q.add_atom('N')
q.add_bond(1, 2, 1)
q.add_bond(2, 3, 1)
q.add_bond(3, 4, 3)

p.add_atom('C')
p.add_atom('C')
p.add_bond(1, 2, 2)

# decarboxylation (alpha atom of nitrile)
#
q, p = prepare()
q.add_atom('N')
q.add_atom('C')
q.add_atom('C', neighbors=2)
q.add_bond(1, 2, 3)
q.add_bond(2, 3, 1)

p.add_atom('N')
p.add_atom('C')
p.add_atom('C')
p.add_atom('C')
p.add_atom('O')
p.add_atom('O')
p.add_bond(1, 2, 3)
p.add_bond(2, 3, 1)
p.add_bond(3, 4, 1)
p.add_bond(4, 5, 2)
p.add_bond(4, 6, 1)

# Bischler-Napieralski reaction
#
q, p = prepare()
q.add_atom('C', rings_sizes=(6,))
q.add_atom('C', rings_sizes=(6,))
q.add_atom('N', rings_sizes=(6,), neighbors=2)
q.add_atom('C')
q.add_atom('C')
q.add_atom('C')
q.add_atom('O')
q.add_atom('O')
q.add_atom('C')
q.add_atom('O', neighbors=1)
q.add_bond(1, 2, 4)
q.add_bond(2, 3, 1)
q.add_bond(3, 4, 1)
q.add_bond(4, 5, 2)
q.add_bond(5, 6, 1)
q.add_bond(6, 7, 2)
q.add_bond(6, 8, 1)
q.add_bond(5, 9, 4)
q.add_bond(9, 10, 1)
q.add_bond(1, 9, 1)

p.add_atom('C')
p.add_atom('C')
p.add_atom('N')
p.add_atom('C')
p.add_atom('C')
p.add_atom('C')
p.add_atom('O')
p.add_atom('O')
p.add_atom('C')
p.add_atom('O')
p.add_atom('O')
p.add_bond(1, 2, 4)
p.add_bond(2, 3, 1)
p.add_bond(3, 4, 1)
p.add_bond(4, 5, 2)
p.add_bond(5, 6, 1)
p.add_bond(6, 7, 2)
p.add_bond(6, 8, 1)
p.add_bond(5, 9, 1)
p.add_bond(9, 10, 2)
p.add_bond(9, 11, 1)

# heterocyclization in Prins reaction
#
q, p = prepare()
q.add_atom('C')
q.add_atom('O')
q.add_atom('C')
q.add_atom(ListElement(['N', 'O']), neighbors=2)
q.add_atom('C')
q.add_atom('C')
q.add_bond(1, 2, 1)
q.add_bond(2, 3, 1)
q.add_bond(3, 4, 1)
q.add_bond(4, 5, 1)
q.add_bond(5, 6, 1)
q.add_bond(1, 6, 1)

p.add_atom('C')
p.add_atom('C', _map=5)
p.add_bond(1, 5, 2)

# recyclization of tetrahydropyran through an opening the ring and dehydration
#
q, p = prepare()
q.add_atom('C')
q.add_atom('C')
q.add_atom('C')
q.add_atom(ListElement(['N', 'O']))
q.add_atom('C')
q.add_atom('C')
q.add_bond(1, 2, 1)
q.add_bond(2, 3, 1)
q.add_bond(3, 4, 1)
q.add_bond(4, 5, 1)
q.add_bond(5, 6, 1)
q.add_bond(1, 6, 2)

p.add_atom('C')
p.add_atom('C')
p.add_atom('C')
p.add_atom('A')
p.add_atom('C')
p.add_atom('C')
p.add_atom('O')
p.add_bond(1, 2, 1)
p.add_bond(1, 7, 1)
p.add_bond(3, 7, 1)
p.add_bond(3, 4, 1)
p.add_bond(4, 5, 1)
p.add_bond(5, 6, 1)
p.add_bond(1, 6, 1)

# alkenes + h2o/hHal
#
q, p = prepare()
q.add_atom('C', hybridization=1)
q.add_atom('C', hybridization=1)
q.add_atom(ListElement(['O', 'F', 'Cl', 'Br', 'I']), neighbors=1)
q.add_bond(1, 2, 1)
q.add_bond(2, 3, 1)

p.add_atom('C')
p.add_atom('C')
p.add_bond(1, 2, 2)

# methylation of dimethylamines
#
q, p = prepare()
q.add_atom('C', neighbors=1)
q.add_atom('N', neighbors=3)
q.add_bond(1, 2, 1)

p.add_atom('N', _map=2)

__all__ = ['rules']
diff --git a/SynTool/chem/retron.py b/SynTool/chem/retron.py
new file mode 100755
index 0000000000000000000000000000000000000000..3c763663edbc2699c3ea215b3948dde23f5bd721
--- /dev/null
+++ b/SynTool/chem/retron.py
@@ -0,0 +1,132 @@
+"""
+Module containing a class Retron that represents a retron (extend molecule object) in the search tree
+"""
+
+from CGRtools.containers import MoleculeContainer
+from CGRtools.exceptions import InvalidAromaticRing
+
+from SynTool.chem.utils import safe_canonicalization
+
+
class Retron:
    """
    Retron class is used to extend the molecule behavior needed for interaction with a tree in MCTS.

    A retron wraps a MoleculeContainer and delegates hashing, equality and size
    to it, while adding tree-search-specific state (previously seen retrons,
    an optional atom remapping) and helpers (validation, building-block check).
    """

    def __init__(self, molecule: MoleculeContainer, canonicalize: bool = True):
        """
        Initializes a Retron object wrapping the given molecule.

        :param molecule: The molecule to wrap.
        :type molecule: MoleculeContainer
        :param canonicalize: If True, the molecule is passed through
            safe_canonicalization before being stored.
        """
        self._molecule = safe_canonicalization(molecule) if canonicalize else molecule
        # Optional atom renumbering applied lazily by the `molecule` property.
        self._mapping = None
        # Retrons generated earlier on the same search path (filled externally).
        self.prev_retrons = []

    def __len__(self) -> int:
        """
        Returns the number of atoms in the Retron.
        """
        return len(self._molecule)

    def __hash__(self) -> int:
        """
        Returns the hash value of the wrapped molecule.
        """
        return hash(self._molecule)

    def __str__(self) -> str:
        return str(self._molecule)

    def __repr__(self) -> str:
        """
        Returns a SMILES of the retron.
        """
        return str(self._molecule)

    def __eq__(self, other):
        """
        Two retrons are equal when their wrapped molecules are equal.

        Returns NotImplemented for non-Retron operands so Python falls back to
        the reflected comparison instead of raising AttributeError (the previous
        implementation crashed when compared with a non-Retron object).
        """
        if not isinstance(other, Retron):
            return NotImplemented
        return self._molecule == other._molecule

    def validate_molecule(self) -> bool:
        """
        Checks that the wrapped molecule is chemically consistent: it must
        kekulize without errors, have no valence problems, and aromatize back.

        :return: True if the molecule is valid, False otherwise.
        """
        molecule = self._molecule.copy()
        try:
            molecule.kekule()
            # check_valence() reports atoms with broken valences; any report
            # invalidates the molecule.
            if molecule.check_valence():
                return False
            molecule.thiele()
        except InvalidAromaticRing:
            return False
        return True

    @property
    def molecule(self) -> MoleculeContainer:
        """
        Returns a copy of the wrapped molecule, remapped with self._mapping
        when a mapping is set and applicable to the molecule.
        """
        if self._mapping:
            try:
                # remap(..., copy=True) already returns a new container, so no
                # extra upfront copy is needed (the previous implementation
                # always made one).
                return self._molecule.remap(self._mapping, copy=True)
            except ValueError:
                # Mapping not applicable to this molecule - fall back to a plain copy.
                return self._molecule.copy()
        return self._molecule.copy()

    def is_building_block(self, stock, min_mol_size=6):
        """
        The function checks if a Retron is a building block.

        :param stock: The collection of building blocks, each represented by a SMILES string.
        :param min_mol_size: Molecules with at most this many atoms are always
            treated as building blocks.
        :return: True if the retron is small enough or is found in the stock.
        """
        return len(self._molecule) <= min_mol_size or str(self._molecule) in stock
+
+
def compose_retrons(retrons: list = None, exclude_small=True, min_mol_size: int = 6
                    ) -> MoleculeContainer:
    """
    Composes a list of retrons into a single molecule.

    The composed molecule is used for the prediction of synthesisability,
    characterizing the possible success of the path that includes the nodes
    with the given retrons.

    :param retrons: The list of retrons to be composed.
    :type retrons: list
    :param exclude_small: When True and at least one retron has more than
        min_mol_size atoms, only those big retrons take part in the composition.
    :param min_mol_size: Size threshold used together with exclude_small.
    :return: The composed retrons as a MoleculeContainer object.
    """

    if len(retrons) == 1:
        return retrons[0].molecule
    if len(retrons) > 1:
        if exclude_small:
            big_enough = [retron for retron in retrons if len(retron.molecule) > min_mol_size]
            if big_enough:
                retrons = big_enough
        # Start from a copy of the first retron's molecule and graft the
        # atoms/bonds of every remaining retron onto it.
        united = retrons[0].molecule.copy()
        for retron in retrons[1:]:
            molecule = retron.molecule
            # Maps the retron's own atom numbers onto the numbers assigned
            # by the growing composite molecule.
            atom_mapping = {}
            for number, atom in molecule.atoms():
                atom_mapping[number] = united.add_atom(atom.atomic_symbol)
            for start, end, bond in molecule.bonds():
                united.add_bond(atom_mapping[start], atom_mapping[end], bond)
        return united
diff --git a/SynTool/chem/utils.py b/SynTool/chem/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..78e641fb90ea2e3d1ab9929f58ec826f50666708
--- /dev/null
+++ b/SynTool/chem/utils.py
@@ -0,0 +1,227 @@
+from typing import List, Iterable, Tuple, Union
+
+from CGRtools.containers import MoleculeContainer, ReactionContainer, QueryContainer
+from CGRtools.exceptions import InvalidAromaticRing
+
+
def query_to_mol(query: QueryContainer) -> MoleculeContainer:
    """
    Converts a QueryContainer object into a MoleculeContainer object.

    Atom numbers, charges and radical flags are carried over; query bonds are
    converted to plain bond orders via int().

    :param query: A QueryContainer object representing the query structure.
    :return: A MoleculeContainer object that replicates the structure of the query.
    """
    molecule = MoleculeContainer()
    for number, atom in query.atoms():
        molecule.add_atom(
            atom.atomic_symbol, number, charge=atom.charge, is_radical=atom.is_radical
        )
    for first, second, bond in query.bonds():
        molecule.add_bond(first, second, int(bond))
    return molecule
+
+
def reaction_query_to_reaction(rule: ReactionContainer) -> ReactionContainer:
    """
    Converts a ReactionContainer object with query structures into a ReactionContainer with molecular structures.

    :param rule: A ReactionContainer object where reactants and products are QueryContainer objects.
    :return: A new ReactionContainer object where reactants, products and reagents are MoleculeContainer objects;
        meta and name are copied from the rule.
    """
    reactants = [query_to_mol(q) for q in rule.reactants]
    products = [query_to_mol(q) for q in rule.products]
    reagents = [query_to_mol(q) for q in rule.reagents]  # Assuming reagents are also part of the rule
    reaction = ReactionContainer(reactants, products, reagents, rule.meta)
    reaction.name = rule.name
    return reaction
+
+
def unite_molecules(molecules: Iterable[MoleculeContainer]) -> MoleculeContainer:
    """
    Unites several molecules into one (possibly disconnected) MoleculeContainer.

    Starting from an empty container, each molecule is folded in with union(),
    so the result holds all atoms and bonds of every input molecule.

    :param molecules: Molecules to be united.
    :return: A single MoleculeContainer representing the union of all inputs.
    """
    united = MoleculeContainer()
    for part in molecules:
        united = united.union(part)
    return united
+
+
def safe_canonicalization(molecule: MoleculeContainer):
    """
    Attempts to canonicalize a molecule, falling back to the original on failure.

    The atom dictionary of the input molecule is re-ordered in place by atom
    number first. Canonicalization is then tried on a copy, so the input is
    never left half-canonicalized: if InvalidAromaticRing is raised, the
    (re-ordered) original molecule is returned unchanged.

    :param molecule: The given molecule to be canonicalized.
    :return: The canonicalized copy if successful, otherwise the original molecule.
    """
    molecule._atoms = dict(sorted(molecule._atoms.items()))

    candidate = molecule.copy()
    try:
        candidate.canonicalize()
    except InvalidAromaticRing:
        return molecule
    return candidate
+
+
def split_molecules(molecules: Iterable, number_of_atoms: int) -> Tuple[List, List]:
    """
    Partitions molecules by heavy-atom count.

    :param molecules: Iterable of molecules (anything supporting len()).
    :param number_of_atoms: Molecules strictly larger than this threshold are
        classified as "big"; the rest as "small".
    :return: Tuple (big_molecules, small_molecules).
    """
    big, small = [], []
    for molecule in molecules:
        target = big if len(molecule) > number_of_atoms else small
        target.append(molecule)
    return big, small
+
+
def remove_small_molecules(
    reaction: ReactionContainer,
    number_of_atoms: int = 6,
    small_molecules_to_meta: bool = True
) -> Union[ReactionContainer, None]:
    """
    Processes a reaction by removing small molecules.

    :param reaction: ReactionContainer object.
    :param number_of_atoms: Molecules with the number of atoms equal to or below this will be removed.
    :param small_molecules_to_meta: If True, deleted molecules are saved to meta.
    :return: Processed ReactionContainer without small molecules, or None if either
        side of the reaction would become empty.
    """
    new_reactants, small_reactants = split_molecules(reaction.reactants, number_of_atoms)
    new_products, small_products = split_molecules(reaction.products, number_of_atoms)

    # Bug fix: the original condition tested new_reactants twice, so a reaction
    # whose products were all removed slipped through instead of returning None.
    if sum(len(mol) for mol in new_reactants) == 0 or sum(len(mol) for mol in new_products) == 0:
        return None

    new_reaction = ReactionContainer(new_reactants, new_products, reaction.reagents, reaction.meta)
    new_reaction.name = reaction.name

    if small_molecules_to_meta:
        # Removed molecules are kept as single united SMILES strings in meta
        # so they can be recovered later.
        united_small_reactants = unite_molecules(small_reactants)
        new_reaction.meta["small_reactants"] = str(united_small_reactants)

        united_small_products = unite_molecules(small_products)
        new_reaction.meta["small_products"] = str(united_small_products)

    return new_reaction
+
+
def rebalance_reaction(reaction: ReactionContainer) -> ReactionContainer:
    """
    Rebalances the reaction by assembling CGR and then decomposing it. Works for all reactions for which the correct
    CGR can be assembled.

    :param reaction: a reaction object
    :return: a rebalanced reaction with the original reagents, meta and name preserved
    """
    # Condense reactants/products (reagents excluded) into a CGR with ~,
    # then decompose it back into balanced reactant/product composites;
    # split() separates each composite into individual molecules.
    tmp_reaction = ReactionContainer(reaction.reactants, reaction.products)
    cgr = ~tmp_reaction
    reactants, products = ~cgr
    rebalanced_reaction = ReactionContainer(reactants.split(), products.split(), reaction.reagents, reaction.meta)
    rebalanced_reaction.name = reaction.name
    return rebalanced_reaction
+
+
def reverse_reaction(reaction: ReactionContainer) -> ReactionContainer:
    """
    Builds the reverse of the given reaction.

    Products and reactants swap places; reagents, meta and name are preserved.

    :param reaction: a reaction object
    :return: the reversed reaction
    """
    flipped = ReactionContainer(
        reaction.products, reaction.reactants, reaction.reagents, reaction.meta
    )
    flipped.name = reaction.name
    return flipped
+
+
def remove_reagents(
    reaction: ReactionContainer,
    keep_reagents: bool = True,
    reagents_max_size: int = 7
) -> Union[ReactionContainer, None]:
    """
    Removes reagents (not changed molecules or molecules not involved in the reaction) from reactants and products.

    :param reaction: a reaction object
    :param keep_reagents: if True, the detected reagents (deduplicated, up to
        reagents_max_size atoms) are written to the ReactionContainer; bigger ones are dropped
    :param reagents_max_size: max size of molecules that are called reagents, bigger are deleted
    :return: cleaned reaction, or None if either reactants or products become empty
    """
    not_changed_molecules = set(reaction.reactants).intersection(reaction.products)

    # Atoms of the reaction centre: a molecule sharing no atoms with it
    # did not participate in the transformation and is treated as a reagent.
    cgr = ~reaction
    center_atoms = set(cgr.center_atoms)

    new_reactants = []
    new_products = []
    new_reagents = []

    for molecule in reaction.reactants:
        if center_atoms.isdisjoint(molecule) or molecule in not_changed_molecules:
            new_reagents.append(molecule)
        else:
            new_reactants.append(molecule)

    for molecule in reaction.products:
        if center_atoms.isdisjoint(molecule) or molecule in not_changed_molecules:
            new_reagents.append(molecule)
        else:
            new_products.append(molecule)

    # Bug fix: the original condition tested new_reactants twice, so a reaction
    # whose products were all classified as reagents was not rejected.
    if sum(len(mol) for mol in new_reactants) == 0 or sum(len(mol) for mol in new_products) == 0:
        return None

    if keep_reagents:
        # set comprehension also deduplicates repeated reagents
        new_reagents = {molecule for molecule in new_reagents if len(molecule) <= reagents_max_size}
    else:
        new_reagents = []

    new_reaction = ReactionContainer(new_reactants, new_products, new_reagents, reaction.meta)
    new_reaction.name = reaction.name

    return new_reaction
+
+
def to_reaction_smiles_record(reaction):
    """
    Builds a reaction SMILES record string.

    Plain strings are assumed to already be reaction SMILES and are returned
    unchanged. For reaction objects, the mapped SMILES (format spec "m") is
    emitted; meta fields are currently written as empty strings, so the
    record is effectively the reaction SMILES alone.
    """
    if isinstance(reaction, str):
        return reaction

    record_parts = [format(reaction, "m")]
    for _key, _value in sorted(reaction.meta.items(), key=lambda item: item[0]):
        meta_text = ''  # TODO decide what to do with meta
        record_parts.append(";".join(meta_text.split("\n")))
    return "".join(record_parts)
+
+
def cgr_from_rule(rule: ReactionContainer):
    """
    Builds the condensed graph of reaction (CGR) for a reaction rule.

    The query-based rule is first converted into a molecule-based reaction,
    which is then condensed with the ~ operator.
    """
    return ~reaction_query_to_reaction(rule)
+
+
+def hash_from_rule(reaction_rule: ReactionContainer):
+ reactants_hash = tuple(sorted(hash(r) for r in reaction_rule.reactants))
+ reagents_hash = tuple(sorted(hash(r) for r in reaction_rule.reagents))
+ products_hash = tuple(sorted(hash(r) for r in reaction_rule.products))
+ return hash((reactants_hash, reagents_hash, products_hash))
diff --git a/SynTool/interfaces/__init__.py b/SynTool/interfaces/__init__.py
new file mode 100755
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/SynTool/interfaces/__pycache__/__init__.cpython-310.pyc b/SynTool/interfaces/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e1690b536c1c159a54b0386e9305953efc009d5a
Binary files /dev/null and b/SynTool/interfaces/__pycache__/__init__.cpython-310.pyc differ
diff --git a/SynTool/interfaces/__pycache__/visualisation.cpython-310.pyc b/SynTool/interfaces/__pycache__/visualisation.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7011e8fef7284cfb2f84aedee236c4f425d6fd0c
Binary files /dev/null and b/SynTool/interfaces/__pycache__/visualisation.cpython-310.pyc differ
diff --git a/SynTool/interfaces/cli.py b/SynTool/interfaces/cli.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cdde6f6f99c3e60a7d002f9a87d202e0e37f304
--- /dev/null
+++ b/SynTool/interfaces/cli.py
@@ -0,0 +1,530 @@
+"""
+Module containing commands line scripts for training and planning mode
+"""
+
+import os
+import shutil
+import yaml
+import warnings
+from pathlib import Path
+
+import click
+import gdown
+
+from SynTool.chem.data.cleaning import reactions_cleaner
+from SynTool.chem.data.filtering import filter_reactions, ReactionCheckConfig
+from SynTool.utils.loading import standardize_building_blocks
+from SynTool.chem.reaction_rules.extraction import extract_rules_from_reactions
+from SynTool.mcts.search import tree_search
+from SynTool.ml.training.reinforcement import run_reinforcement_tuning
+from SynTool.ml.training.supervised import create_policy_dataset, run_policy_training
+from SynTool.utils.config import ReinforcementConfig, TreeConfig, PolicyNetworkConfig, ValueNetworkConfig
+from SynTool.utils.config import ReactionStandardizationConfig, RuleExtractionConfig
+from SynTool.chem.data.mapping import remove_reagents_and_map_from_file
+
+warnings.filterwarnings("ignore")
+
+
# Root command group of the SynTool CLI; all subcommands below attach to it.
# (Intentionally no docstring: click would display it as group help text.)
@click.group(name="syntool")
def syntool():
    pass
+
+
@syntool.command(name="download_planning_data")
@click.option(
    "--root_dir",
    required=True,
    type=click.Path(exists=True),
    # Fixed copy-pasted help text: this option is the extraction directory,
    # not a reaction database path.
    help="Directory where the downloaded planning data will be extracted.",
)
def download_planning_data_cli(root_dir='.'):
    """
    Downloads data for retrosynthesis planning
    """
    remote_id = "1ygq9BvQgH2Tq_rL72BvSOdASSSbPFTsL"
    archive_path = os.path.join(root_dir, "syntool_planning_data.zip")

    gdown.download(output=archive_path, id=remote_id, quiet=False)
    shutil.unpack_archive(archive_path, root_dir)

    # the archive is no longer needed after extraction
    os.remove(archive_path)
+
+
@syntool.command(name='download_training_data')
@click.option(
    "--root_dir",
    required=True,
    type=click.Path(exists=True),
    # Fixed copy-pasted help text: this option is the extraction directory,
    # not a reaction database path.
    help="Directory where the downloaded training data will be extracted.",
)
def download_training_data_cli(root_dir='.'):
    """
    Downloads data for retrosynthetic models training
    """
    remote_id = "1ckhO1l6xud0_bnC0rCDMkIlKRUMG_xs8"
    archive_path = os.path.join(root_dir, "syntool_training_data.zip")

    gdown.download(output=archive_path, id=remote_id, quiet=False)
    shutil.unpack_archive(archive_path, root_dir)

    # the archive is no longer needed after extraction
    os.remove(archive_path)
+
+
@syntool.command(name="building_blocks")
@click.option(
    "--input",
    "input_file",
    required=True,
    type=click.Path(exists=True),
    # Fixed copy-pasted help text: this command processes building blocks,
    # not a reaction database.
    help="Path to the file with building blocks to be standardized.",
)
@click.option(
    "--output",
    "output_file",
    required=True,
    type=click.Path(),
    help="File where the standardized building blocks will be stored.",
)
def building_blocks_cli(input_file, output_file):
    """
    Standardizes building blocks
    """

    standardize_building_blocks(input_file=input_file, output_file=output_file)
+
+
@syntool.command(name="reaction_mapping")
@click.option(
    "--config",
    "config_path",
    required=True,
    type=click.Path(exists=True),
    help="Path to the configuration file. This file contains settings for mapping and standardizing reactions.",
)
@click.option(
    "--input",
    "input_file",
    required=True,
    type=click.Path(exists=True),
    help="Path to the reaction database file that will be mapped.",
)
@click.option(
    "--output",
    "output_file",
    default=Path("reaction_data_standardized.smi"),
    type=click.Path(),
    help="File where the results will be stored.",
)
def reaction_mapping_cli(config_path, input_file, output_file):
    """
    Reaction data mapping
    """
    # The standardization config is read only for its keep_reagents flag,
    # which controls whether reagents survive the mapping step.
    stand_config = ReactionStandardizationConfig.from_yaml(config_path)
    # NOTE(review): the keyword is keep_reagent (singular) while the config
    # attribute is keep_reagents — confirm it matches the callee's signature.
    remove_reagents_and_map_from_file(input_file=input_file, output_file=output_file, keep_reagent=stand_config.keep_reagents)
+
+
@syntool.command(name="reaction_standardizing")
@click.option(
    "--config",
    "config_path",
    required=True,
    type=click.Path(exists=True),
    help="Path to the configuration file. This file contains settings for mapping and standardizing reactions.",
)
@click.option(
    "--input",
    "input_file",
    required=True,
    type=click.Path(exists=True),
    # Fixed copy-pasted help text: this command standardizes, it does not map.
    help="Path to the reaction database file to be standardized.",
)
@click.option(
    "--output",
    "output_file",
    type=click.Path(),
    help="File where the results will be stored.",
)
@click.option(
    "--num_cpus",
    default=8,
    type=int,
    # Fixed help text: the actual default is 8, not 1.
    help="Number of CPUs to use for processing. Defaults to 8.",
)
def reaction_standardizing_cli(config_path, input_file, output_file, num_cpus):
    """
    Standardizes reactions and removes duplicates
    """

    stand_config = ReactionStandardizationConfig.from_yaml(config_path)
    reactions_cleaner(config=stand_config,
                      input_file=input_file,
                      output_file=output_file,
                      num_cpus=num_cpus)
+
+
@syntool.command(name="reaction_filtering")
@click.option(
    "--config",
    "config_path",
    required=True,
    type=click.Path(exists=True),
    help="Path to the configuration file. This file contains settings for filtering reactions.",
)
@click.option(
    "--input",
    "input_file",
    required=True,
    type=click.Path(exists=True),
    # Fixed copy-pasted help text: this command filters, it does not map.
    help="Path to the reaction database file to be filtered.",
)
@click.option(
    "--output",
    "output_file",
    default=Path("./"),
    type=click.Path(),
    help="File where the results will be stored.",
)
@click.option(
    "--append_results",
    is_flag=True,
    default=False,
    help="If set, results will be appended to existing files. By default, new files are created.",
)
@click.option(
    "--batch_size",
    default=100,
    type=int,
    # Fixed help text: the actual default is 100, not 10.
    help="Size of the batch for processing reactions. Defaults to 100.",
)
@click.option(
    "--num_cpus",
    default=8,
    type=int,
    # Fixed help text: the actual default is 8, not 1.
    help="Number of CPUs to use for processing. Defaults to 8.",
)
def reaction_filtering_cli(config_path,
                           input_file,
                           output_file,
                           append_results,
                           batch_size,
                           num_cpus):
    """
    Filters erroneous reactions
    """
    # Consistency fix: call from_yaml on the class like every other command
    # does, instead of instantiating a throwaway config first.
    reaction_check_config = ReactionCheckConfig.from_yaml(config_path)
    filter_reactions(
        config=reaction_check_config,
        reaction_database_path=input_file,
        result_reactions_file_name=output_file,
        append_results=append_results,
        num_cpus=num_cpus,
        batch_size=batch_size,
    )
+
+
@syntool.command(name="rule_extracting")
@click.option(
    "--config",
    "config_path",
    required=True,
    type=click.Path(exists=True),
    help="Path to the configuration file. This file contains settings for reaction rules extraction.",
)
@click.option(
    "--input",
    "input_file",
    required=True,
    type=click.Path(exists=True),
    # Fixed copy-pasted help text: rules are extracted from this file.
    help="Path to the reaction database file from which rules will be extracted.",
)
@click.option(
    "--output",
    "output_file",
    required=True,
    type=click.Path(),
    help="File where the results will be stored.",
)
@click.option(
    "--batch_size",
    default=100,
    type=int,
    # Fixed help text: the actual default is 100, not 10.
    help="Size of the batch for processing reactions. Defaults to 100.",
)
@click.option(
    "--num_cpus",
    default=4,
    type=int,
    help="Number of CPUs to use for processing. Defaults to 4.",
)
def rule_extracting_cli(
    config_path,
    input_file,
    output_file,
    num_cpus,
    batch_size,
):
    """
    Extracts reaction rules
    """

    reaction_rule_config = RuleExtractionConfig.from_yaml(config_path)
    extract_rules_from_reactions(config=reaction_rule_config,
                                 reaction_file=input_file,
                                 rules_file_name=output_file,
                                 num_cpus=num_cpus,
                                 batch_size=batch_size)
+
+
@syntool.command(name="supervised_ranking_policy_training")
@click.option(
    "--config",
    "config_path",
    required=True,
    type=click.Path(exists=True),
    help="Path to the configuration file. This file contains settings for policy training.",
)
@click.option(
    "--reaction_data",
    required=True,
    type=click.Path(exists=True),
    # Fixed copy-pasted help text for the two data options below.
    help="Path to the reaction data file used to train the ranking policy network.",
)
@click.option(
    "--reaction_rules",
    required=True,
    type=click.Path(exists=True),
    help="Path to the file with extracted reaction rules.",
)
@click.option(
    "--results_dir",
    default=Path("."),
    type=click.Path(),
    help="Root directory where the results will be stored.",
)
@click.option(
    "--num_cpus",
    default=4,
    type=int,
    help="Number of CPUs to use for processing. Defaults to 4.",
)
def supervised_ranking_policy_training_cli(config_path, reaction_data, reaction_rules, results_dir, num_cpus):
    """
    Trains ranking policy network
    """

    policy_config = PolicyNetworkConfig.from_yaml(config_path)

    policy_dataset_file = os.path.join(results_dir, 'policy_dataset.dt')

    datamodule = create_policy_dataset(reaction_rules_path=reaction_rules,
                                       molecules_or_reactions_path=reaction_data,
                                       output_path=policy_dataset_file,
                                       dataset_type='ranking',
                                       batch_size=policy_config.batch_size,
                                       num_cpus=num_cpus)

    run_policy_training(datamodule, config=policy_config, results_path=results_dir)
+
+
@syntool.command(name="supervised_filtering_policy_training")
@click.option(
    "--config",
    "config_path",
    required=True,
    type=click.Path(exists=True),
    help="Path to the configuration file. This file contains settings for policy training.",
)
@click.option(
    "--molecules_file",
    required=True,
    type=click.Path(exists=True),
    # Fixed copy-pasted help text for the two data options below.
    help="Path to the file with molecules used to train the filtering policy network.",
)
@click.option(
    "--reaction_rules",
    required=True,
    type=click.Path(exists=True),
    help="Path to the file with extracted reaction rules.",
)
@click.option(
    "--results_dir",
    default=Path("."),
    type=click.Path(),
    help="Root directory where the results will be stored.",
)
@click.option(
    "--num_cpus",
    default=8,
    type=int,
    # Fixed help text: the actual default is 8, not 1.
    help="Number of CPUs to use for processing. Defaults to 8.",
)
def supervised_filtering_policy_training_cli(config_path, molecules_file, reaction_rules, results_dir, num_cpus):
    """
    Trains filtering policy network
    """

    policy_config = PolicyNetworkConfig.from_yaml(config_path)

    # NOTE(review): the ranking command writes 'policy_dataset.dt' — confirm
    # the '.ckpt' extension here is intentional.
    policy_dataset_file = os.path.join(results_dir, 'policy_dataset.ckpt')
    datamodule = create_policy_dataset(reaction_rules_path=reaction_rules,
                                       molecules_or_reactions_path=molecules_file,
                                       output_path=policy_dataset_file,
                                       dataset_type='filtering',
                                       batch_size=policy_config.batch_size,
                                       num_cpus=num_cpus)

    run_policy_training(datamodule, config=policy_config, results_path=results_dir)
+
+
@syntool.command(name="reinforcement_value_network_training")
@click.option(
    "--config",
    required=True,
    type=click.Path(exists=True),
    # All option help texts below were identical copy-pastes; each now
    # describes its own option.
    help="Path to the configuration file with tree search, value network and reinforcement settings.",
)
@click.option(
    "--targets",
    required=True,
    type=click.Path(exists=True),
    help="Path to the file with target molecules used in planning simulations.",
)
@click.option(
    "--reaction_rules",
    required=True,
    type=click.Path(exists=True),
    help="Path to the file with extracted reaction rules.",
)
@click.option(
    "--building_blocks",
    required=True,
    type=click.Path(exists=True),
    help="Path to the file with building blocks.",
)
@click.option(
    "--policy_network",
    required=True,
    type=click.Path(exists=True),
    help="Path to the file with trained policy network weights.",
)
@click.option(
    "--value_network",
    default=None,
    type=click.Path(exists=True),
    help="Path to the file with pretrained value network weights. If omitted, a new network is trained from scratch.",
)
@click.option(
    "--results_dir",
    default='.',
    type=click.Path(exists=False),
    help="Directory where the training results will be stored.",
)
def reinforcement_value_network_training_cli(config,
                                             targets,
                                             reaction_rules,
                                             building_blocks,
                                             policy_network,
                                             value_network,
                                             results_dir):
    """
    Trains value network with reinforcement learning
    """

    with open(config, "r") as file:
        config = yaml.safe_load(file)

    policy_config = PolicyNetworkConfig.from_dict(config['node_expansion'])
    policy_config.weights_path = policy_network

    value_config = ValueNetworkConfig.from_dict(config['value_network'])
    if value_network is None:
        # No pretrained weights supplied: store newly trained weights under results_dir.
        value_config.weights_path = os.path.join(results_dir, 'weights', 'value_network.ckpt')

    tree_config = TreeConfig.from_dict(config['tree'])
    reinforce_config = ReinforcementConfig.from_dict(config['reinforcement'])

    run_reinforcement_tuning(targets_path=targets,
                             tree_config=tree_config,
                             policy_config=policy_config,
                             value_config=value_config,
                             reinforce_config=reinforce_config,
                             reaction_rules_path=reaction_rules,
                             building_blocks_path=building_blocks,
                             results_root=results_dir)
+
+
@syntool.command(name="planning")
@click.option(
    "--config",
    "config_path",
    required=True,
    type=click.Path(exists=True),
    # All option help texts below were identical copy-pastes; each now
    # describes its own option.
    help="Path to the configuration file with tree search and node evaluation settings.",
)
@click.option(
    "--targets",
    required=True,
    type=click.Path(exists=True),
    help="Path to the file with target molecules for retrosynthesis planning.",
)
@click.option(
    "--reaction_rules",
    required=True,
    type=click.Path(exists=True),
    help="Path to the file with extracted reaction rules.",
)
@click.option(
    "--building_blocks",
    required=True,
    type=click.Path(exists=True),
    help="Path to the file with building blocks.",
)
@click.option(
    "--policy_network",
    required=True,
    type=click.Path(exists=True),
    help="Path to the file with trained policy network weights.",
)
@click.option(
    "--value_network",
    default=None,
    type=click.Path(exists=True),
    help="Path to the file with trained value network weights. Optional.",
)
@click.option(
    "--results_dir",
    default='.',
    type=click.Path(exists=False),
    help="Directory where the planning results will be stored.",
)
def planning_cli(config_path,
                 targets,
                 reaction_rules,
                 building_blocks,
                 policy_network,
                 value_network,
                 results_dir):
    """
    Runs retrosynthesis planning
    """

    with open(config_path, "r") as file:
        config = yaml.safe_load(file)

    tree_config = TreeConfig.from_dict({**config['tree'], **config['node_evaluation']})
    policy_config = PolicyNetworkConfig.from_dict({**config['node_expansion'], **{'weights_path': policy_network}})

    tree_search(targets_path=targets,
                tree_config=tree_config,
                policy_config=policy_config,
                reaction_rules_path=reaction_rules,
                building_blocks_path=building_blocks,
                value_weights_path=value_network,
                results_root=results_dir)
+
+
if __name__ == '__main__':
    # CLI entry point when this module is executed directly.
    syntool()
+
+
diff --git a/SynTool/interfaces/cli.py.bk b/SynTool/interfaces/cli.py.bk
new file mode 100644
index 0000000000000000000000000000000000000000..550f128644c43402dabe3db437e847fb049c24f9
--- /dev/null
+++ b/SynTool/interfaces/cli.py.bk
@@ -0,0 +1,241 @@
+"""
+Module containing commands line scripts for training and planning mode
+"""
+
+import warnings
+import os
+import shutil
+from pathlib import Path
+import click
+import gdown
+from datetime import datetime
+
+from Syntool.chem.reaction_rules.extraction import extract_rules_from_reactions
+from Syntool.chem.data.cleaning import reactions_cleaner
+from Syntool.chem.data.mapping import remove_reagents_and_map_from_file
+from Syntool.chem.loading import standardize_building_blocks
+from Syntool.ml.training import create_policy_dataset, run_policy_training
+from Syntool.ml.training.reinforcement import run_self_tuning
+from Syntool.ml.networks.policy import PolicyNetworkConfig
+from Syntool.utils.config import read_planning_config, read_training_config, TreeConfig
+from Syntool.mcts.search import tree_search
+
+from Syntool.chem.data.filtering import (
+ filter_reactions,
+ ReactionCheckConfig,
+ CCRingBreakingConfig,
+ WrongCHBreakingConfig,
+ CCsp3BreakingConfig,
+ DynamicBondsConfig,
+ MultiCenterConfig,
+ NoReactionConfig,
+ SmallMoleculesConfig,
+)
+
+warnings.filterwarnings("ignore")
+main = click.Group()
+
+
@main.command(name='planning_data')
def planning_data_cli():
    """
    Downloads a file from Google Drive using its remote ID, saves it as a zip file, extracts the contents,
    and then deletes the zip file
    """
    remote_id = '1c5YJDT-rP1ZvFA-ELmPNTUj0b8an4yFf'
    archive_name = 'synto_planning_data.zip'
    gdown.download(output=archive_name, id=remote_id, quiet=True)
    shutil.unpack_archive(archive_name, './')
    os.remove(archive_name)
+
+
@main.command(name='training_data')
def training_data_cli():
    """
    Downloads a file from Google Drive using its remote ID, saves it as a zip file, extracts the contents,
    and then deletes the zip file
    """
    remote_id = '1r4I7OskGvzg-zxYNJ7WVYpVR2HSYW10N'
    archive_name = 'synto_training_data.zip'
    gdown.download(output=archive_name, id=remote_id, quiet=True)
    shutil.unpack_archive(archive_name, './')
    os.remove(archive_name)
+
+
@main.command(name='syntool_planning')
@click.option("--config",
              "config_path",
              required=True,
              help="Path to the config YAML molecules_path.",
              type=click.Path(exists=True, path_type=Path),
              )
def syntool_planning_cli(config_path):
    """
    Launches tree search for the given target molecules and stores search statistics and found retrosynthetic paths

    :param config_path: The path to the configuration file that contains the settings and parameters for the tree search
    """
    config = read_planning_config(config_path)
    # suppress per-iteration tree output in batch planning mode
    config['Tree']['silent'] = True

    # standardize building blocks (in place: input and output are the same file)
    if config['InputData']['standardize_building_blocks']:
        print('STANDARDIZE BUILDING BLOCKS ...')
        standardize_building_blocks(config['InputData']['building_blocks_path'],
                                    config['InputData']['building_blocks_path'])
    # run planning
    print('\nRUN PLANNING ...')
    tree_config = TreeConfig.from_dict(config['Tree'])
    tree_search(targets=config['General']['targets_path'],
                tree_config=tree_config,
                reaction_rules_path=config['InputData']['reaction_rules_path'],
                building_blocks_path=config['InputData']['building_blocks_path'],
                policy_weights_path=config['PolicyNetwork']['weights_path'],
                value_weights_paths=config['ValueNetwork']['weights_path'],
                results_root=config['General']['results_root'])
+
+
@main.command(name='syntool_training')
@click.option(
    "--config",
    "config_path",
    required=True,
    help="Path to the config YAML file.",
    type=click.Path(exists=True, path_type=Path)
    )
def syntool_training_cli(config_path):
    # End-to-end training pipeline: mapping -> cleaning -> filtering ->
    # rule extraction -> policy network training -> value network self-tuning.
    # Each stage prints its wall-clock duration. reaction_data_file is
    # re-pointed to the output of each enabled preprocessing stage.
    # (Intentionally no docstring: click would display it as command help.)

    # read training config
    print('READ CONFIG ...')
    config = read_training_config(config_path)
    print('Config is read')

    reaction_data_file = config['InputData']['reaction_data_path']

    # reaction data mapping
    startTime0 = datetime.now()
    data_output_folder = os.path.join(config['General']['results_root'], 'reaction_data')
    Path(data_output_folder).mkdir(parents=True, exist_ok=True)
    mapped_data_file = os.path.join(data_output_folder, 'reaction_data_mapped.smi')
    if config['DataCleaning']['map_reactions']:
        print('\nMAP REACTION DATA ...')

        remove_reagents_and_map_from_file(input_file=config['InputData']['reaction_data_path'],
                                          output_file=mapped_data_file)

        reaction_data_file = mapped_data_file
    print("remove_reagents_and_map_from_file:", datetime.now() - startTime0)

    # reaction data cleaning
    startTime0 = datetime.now()
    cleaned_data_file = os.path.join(data_output_folder, 'reaction_data_cleaned.rdf')
    if config['DataCleaning']['clean_reactions']:
        print('\nCLEAN REACTION DATA ...')

        reactions_cleaner(input_file=reaction_data_file,
                          output_file=cleaned_data_file,
                          num_cpus=config['General']['num_cpus'])

        reaction_data_file = cleaned_data_file
    print("reactions_cleaner:", datetime.now() - startTime0)

    # reactions data filtering
    startTime0 = datetime.now()
    if config['DataCleaning']['filter_reactions']:
        print('\nFILTER REACTION DATA ...')
        # filtering settings are hard-coded here rather than read from config
        filtration_config = ReactionCheckConfig(
            remove_small_molecules=False,
            small_molecules_config=SmallMoleculesConfig(limit=6),
            dynamic_bonds_config=DynamicBondsConfig(min_bonds_number=1, max_bonds_number=6),
            no_reaction_config=NoReactionConfig(),
            multi_center_config=MultiCenterConfig(),
            wrong_ch_breaking_config=WrongCHBreakingConfig(),
            cc_sp3_breaking_config=CCsp3BreakingConfig(),
            cc_ring_breaking_config=CCRingBreakingConfig()
        )

        # NOTE(review): filtered_data_file is assumed to match the file that
        # filter_reactions writes from result_reactions_file_name — confirm.
        filtered_data_file = os.path.join(data_output_folder, 'reaction_data_filtered.rdf')
        filter_reactions(config=filtration_config,
                         reaction_database_path=reaction_data_file,
                         result_directory_path=data_output_folder,
                         result_reactions_file_name='reaction_data_filtered',
                         num_cpus=config['General']['num_cpus'],
                         batch_size=100)

        reaction_data_file = filtered_data_file
    print("filter_reactions:", datetime.now() - startTime0)

    # standardize building blocks (in place: input and output are the same file)
    startTime0 = datetime.now()
    if config['DataCleaning']['standardize_building_blocks']:
        print('\nSTANDARDIZE BUILDING BLOCKS ...')

        standardize_building_blocks(config['InputData']['building_blocks_path'],
                                    config['InputData']['building_blocks_path'])
    print("standardize_building_blocks:", datetime.now() - startTime0)

    # reaction rules extraction
    startTime0 = datetime.now()
    print('\nEXTRACT REACTION RULES ...')

    rules_output_folder = os.path.join(config['General']['results_root'], 'reaction_rules')
    Path(rules_output_folder).mkdir(parents=True, exist_ok=True)
    reaction_rules_path = os.path.join(rules_output_folder, 'reaction_rules_filtered.pickle')
    config['InputData']['reaction_rules_path'] = reaction_rules_path

    extract_rules_from_reactions(config=config,
                                 reaction_file=reaction_data_file,
                                 results_root=rules_output_folder,
                                 num_cpus=config['General']['num_cpus'])
    print("extract_rules_from_reactions:", datetime.now() - startTime0)

    # create policy network dataset
    startTime0 = datetime.now()
    print('\nCREATE POLICY NETWORK DATASET ...')
    policy_output_folder = os.path.join(config['General']['results_root'], 'policy_network')
    Path(policy_output_folder).mkdir(parents=True, exist_ok=True)
    policy_data_file = os.path.join(policy_output_folder, 'policy_dataset.pt')

    # ranking policies train on reactions; filtering policies on molecules
    if config['PolicyNetwork']['policy_type'] == 'ranking':
        molecules_or_reactions_path = reaction_data_file
    elif config['PolicyNetwork']['policy_type'] == 'filtering':
        molecules_or_reactions_path = config['InputData']['policy_data_path']
    else:
        raise ValueError(
            "Invalid policy_type. Allowed values are 'ranking', 'filtering'."
        )

    datamodule = create_policy_dataset(reaction_rules_path=reaction_rules_path,
                                       molecules_or_reactions_path=molecules_or_reactions_path,
                                       output_path=policy_data_file,
                                       dataset_type=config['PolicyNetwork']['policy_type'],
                                       batch_size=config['PolicyNetwork']['batch_size'],
                                       num_cpus=config['General']['num_cpus'])
    print("datamodule:", datetime.now() - startTime0)

    # train policy network
    startTime0 = datetime.now()
    print('\nTRAIN POLICY NETWORK ...')
    policy_config = PolicyNetworkConfig.from_dict(config['PolicyNetwork'])
    run_policy_training(datamodule, config=policy_config, results_path=policy_output_folder)
    # record the trained weights so the value-network stage can load them
    config['PolicyNetwork']['weights_path'] = os.path.join(policy_output_folder, 'weights', 'policy_network.ckpt')
    print("run_policy_training:", datetime.now() - startTime0)

    # self-tuning value network training
    startTime0 = datetime.now()
    print('\nTRAIN VALUE NETWORK ...')
    value_output_folder = os.path.join(config['General']['results_root'], 'value_network')
    Path(value_output_folder).mkdir(parents=True, exist_ok=True)
    config['ValueNetwork']['weights_path'] = os.path.join(value_output_folder, 'weights', 'value_network.ckpt')

    run_self_tuning(config, results_root=value_output_folder)
    print("run_self_tuning:", datetime.now() - startTime0)
+
+
if __name__ == '__main__':
    # CLI entry point when this module is executed directly.
    main()
diff --git a/SynTool/interfaces/visualisation.py b/SynTool/interfaces/visualisation.py
new file mode 100755
index 0000000000000000000000000000000000000000..e82ecf851c16afab365cb7444f3e1ae32ad47797
--- /dev/null
+++ b/SynTool/interfaces/visualisation.py
@@ -0,0 +1,346 @@
+"""
+Module containing functions for analysis and visualization of the built search tree
+"""
+
+from itertools import count, islice
+
+from CGRtools.containers import MoleculeContainer
+
+from SynTool import Tree
+from SynTool.utils import path_type
+
+
def get_child_nodes(tree, molecule, graph):
    """
    Recursively encode the reaction expanding *molecule* in a found route.

    :param tree: the search tree; its ``building_blocks`` collection is used
        to flag precursors that are purchasable
    :param molecule: the molecule whose precursors are being encoded
    :param graph: mapping of molecule -> list of precursor molecules for one route
    :return: a ``{"type": "reaction", "children": [...]}`` dictionary, or an
        empty list when *molecule* has no successors in *graph* (route leaf)
    """
    # Leaf molecule: nothing expands it in this route.
    if molecule not in graph:
        return []
    nodes = []
    for retron in graph[molecule]:
        child = {
            "smiles": str(retron),
            "type": "mol",
            "in_stock": str(retron) in tree.building_blocks,
        }
        sub_node = get_child_nodes(tree, retron, graph)
        # An empty list (leaf) is falsy, so leaves get no "children" key.
        if sub_node:
            child["children"] = [sub_node]
        nodes.append(child)
    return {"type": "reaction", "children": nodes}
+
+
def extract_routes(tree, extended=False):
    """
    Encode every found retrosynthetic route of the tree as a nested dictionary.

    :param tree: the built search tree
    :param extended: if True, treat every solved node as a route endpoint
        instead of only the registered ``tree.winning_nodes``
    :return: a list of dictionaries; each dictionary contains the target
        SMILES, an ``in_stock`` flag (target is a building block), and the
        nested ``children`` reactions of one route. When no route was found,
        a single entry with empty ``children`` is returned.
    """
    target = tree.nodes[1].retrons_to_expand[0].molecule
    target_in_stock = tree.nodes[1].curr_retron.is_building_block(tree.building_blocks)

    if extended:
        # Gather every solved node, not only the registered winners.
        winning_nodes = [i for i, node in tree.nodes.items() if node.is_solved()]
    else:
        winning_nodes = tree.winning_nodes

    paths_block = []
    if winning_nodes:
        for winning_node in winning_nodes:
            # Build the successor mapping (molecule -> precursors) along the
            # path from the root to this winning node.
            nodes = tree.path_to_node(winning_node)
            graph = {}
            for before, after in zip(nodes, nodes[1:]):
                graph[before.curr_retron.molecule] = [x.molecule for x in after.new_retrons]

            paths_block.append({"type": "mol",
                                "smiles": str(target),
                                "in_stock": target_in_stock,
                                "children": [get_child_nodes(tree, target, graph)]})
    else:
        paths_block = [{"type": "mol", "smiles": str(target), "in_stock": target_in_stock, "children": []}]
    return paths_block
+
+
+def path_graph(tree, node: int) -> str:
+ """
+ Visualizes reaction path
+
+ :param node: node id
+ :type node: int
+ :return: The SVG string.
+ """
+ nodes = tree.path_to_node(node)
+ # Set up node_id types for different box colors
+ for node in nodes:
+ for retron in node.new_retrons:
+ retron._molecule.meta["status"] = "instock" if retron.is_building_block(
+ tree.building_blocks) else "mulecule"
+ nodes[0].curr_retron._molecule.meta["status"] = "target"
+ # Box colors
+ box_colors = {"target": "#98EEFF", # 152, 238, 255
+ "mulecule": "#F0AB90", # 240, 171, 144
+ "instock": "#9BFAB3", # 155, 250, 179
+ }
+
+ # first column is target
+ # second column are first new retrons_to_expand
+ columns = [[nodes[0].curr_retron.molecule], [x.molecule for x in nodes[1].new_retrons], ]
+ pred = {x: 0 for x in range(1, len(columns[1]) + 1)}
+ cx = [n for n, x in enumerate(nodes[1].new_retrons, 1) if not x.is_building_block(tree.building_blocks)]
+ size = len(cx)
+ nodes = iter(nodes[2:])
+ cy = count(len(columns[1]) + 1)
+ while size:
+ layer = []
+ for s in islice(nodes, size):
+ n = cx.pop(0)
+ for x in s.new_retrons:
+ layer.append(x)
+ m = next(cy)
+ if not x.is_building_block(tree.building_blocks):
+ cx.append(m)
+ pred[m] = n
+ size = len(cx)
+ columns.append([x.molecule for x in layer])
+
+ columns = [columns[::-1] for columns in columns[::-1]] # Reverse array to make retrosynthetic graph
+ pred = tuple( # Change dict to tuple to make multiple retrons_to_expand available
+ (abs(source - len(pred)), abs(target - len(pred))) for target, source in pred.items())
+
+ # now we have columns for visualizing
+ # lets start recalculate XY
+ x_shift = 0.0
+ c_max_x = 0.0
+ c_max_y = 0.0
+ render = []
+ cx = count()
+ cy = count()
+ arrow_points = {}
+ for ms in columns:
+ heights = []
+ for m in ms:
+ m.clean2d()
+ # X-shift for target
+ min_x = min(x for x, y in m._plane.values()) - x_shift
+ min_y = min(y for x, y in m._plane.values())
+ m._plane = {n: (x - min_x, y - min_y) for n, (x, y) in m._plane.items()}
+ max_x = max(x for x, y in m._plane.values())
+ if max_x > c_max_x:
+ c_max_x = max_x
+ arrow_points[next(cx)] = [x_shift, max_x]
+ heights.append(max(y for x, y in m._plane.values()))
+
+ x_shift = c_max_x + 5.0 # between columns gap
+ # calculate Y-shift
+ y_shift = sum(heights) + 3.0 * (len(heights) - 1)
+ if y_shift > c_max_y:
+ c_max_y = y_shift
+ y_shift /= 2.0
+ for m, h in zip(ms, heights):
+ m._plane = {n: (x, y - y_shift) for n, (x, y) in m._plane.items()}
+
+ # Calculate coordinates for boxes
+ max_x = max(x for x, y in m._plane.values()) + 0.9 # Max x
+ min_x = min(x for x, y in m._plane.values()) - 0.6 # Min x
+ max_y = -(max(y for x, y in m._plane.values()) + 0.45) # Max y
+ min_y = -(min(y for x, y in m._plane.values()) - 0.45) # Min y
+ x_delta = abs(max_x - min_x)
+ y_delta = abs(max_y - min_y)
+ box = (
+ f'