SynPlanner

Sleeping

App Files Community

Gilmullin Almaz committed on May 23, 2025

Commit

914ea41

1 Parent(s): f2f3593

Refactor code structure and remove redundant sections for improved readability and maintainability

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

app.py +1148 -591
cluster/clustering.py +0 -174
cluster/generalized_cgr.py +0 -204
cluster/reduced_g_cgr.py +0 -159
cluster/subcluster.py +0 -33
cluster/utils.py +0 -314
cluster/visualize.py +0 -481
synplan/__init__.py +3 -0
synplan/chem/__init__.py +3 -0
{cluster → synplan/chem/data}/__init__.py +0 -0
synplan/chem/data/filtering.py +962 -0
synplan/chem/data/standardizing.py +1187 -0
synplan/chem/precursor.py +100 -0
synplan/chem/reaction.py +125 -0
synplan/chem/reaction_routes/__init__.py +0 -0
synplan/chem/reaction_routes/clustering.py +857 -0
synplan/chem/reaction_routes/io.py +286 -0
synplan/chem/reaction_routes/leaving_groups.py +131 -0
synplan/chem/reaction_routes/route_cgr.py +570 -0
synplan/chem/reaction_routes/visualisation.py +903 -0
synplan/chem/reaction_rules/__init__.py +0 -0
synplan/chem/reaction_rules/extraction.py +744 -0
synplan/chem/reaction_rules/manual/__init__.py +6 -0
synplan/chem/reaction_rules/manual/decompositions.py +413 -0
synplan/chem/reaction_rules/manual/transformations.py +532 -0
synplan/chem/utils.py +225 -0
synplan/interfaces/__init__.py +0 -0
synplan/interfaces/building_blocks/building_blocks_em_sa_ln.smi +0 -0
synplan/interfaces/cli.py +506 -0
synplan/interfaces/gui.py +1304 -0
synplan/interfaces/uspto/uspto_reaction_rules.pickle +3 -0
synplan/interfaces/uspto/weights/ranking_policy_network.ckpt +3 -0
synplan/mcts/__init__.py +8 -0
synplan/mcts/evaluation.py +45 -0
synplan/mcts/expansion.py +96 -0
synplan/mcts/node.py +47 -0
synplan/mcts/search.py +199 -0
synplan/mcts/tree.py +635 -0
synplan/ml/__init__.py +0 -0
synplan/ml/networks/__init__.py +0 -0
synplan/ml/networks/modules.py +234 -0
synplan/ml/networks/policy.py +137 -0
synplan/ml/networks/value.py +67 -0
synplan/ml/training/__init__.py +11 -0
synplan/ml/training/preprocessing.py +516 -0
synplan/ml/training/reinforcement.py +379 -0
synplan/ml/training/supervised.py +153 -0
synplan/utils/__init__.py +4 -0
synplan/utils/config.py +543 -0
synplan/utils/files.py +226 -0

app.py CHANGED Viewed

@@ -2,6 +2,8 @@ import base64
 import pickle
 import re
 import uuid
 import pandas as pd
 import streamlit as st
@@ -15,18 +17,19 @@ from synplan.mcts.expansion import PolicyNetworkFunction
 from synplan.mcts.search import extract_tree_stats
 from synplan.mcts.tree import Tree
 from synplan.chem.utils import mol_from_smiles
 from synplan.utils.config import TreeConfig, PolicyNetworkConfig
 from synplan.utils.loading import load_reaction_rules, load_building_blocks
-from synplan.utils.visualisation import generate_results_html, get_route_svg
-from cluster.generalized_cgr import *
-from cluster.reduced_g_cgr import *
-from cluster.clustering import *
-from cluster.visualize import *
-from cluster.utils import *
-from cluster.subcluster import *
-from StructureFingerprint import MorganFingerprint
 import psutil
 import gc
@@ -35,8 +38,13 @@ import gc
 disable_progress_bars("huggingface_hub")
 smiles_parser = SMILESRead.create_parser(ignore=True)
-def download_button(object_to_download, download_filename, button_text, pickle_it=False):
     """
     Issued from
     Generates a link to download the given object_to_download.
@@ -68,21 +76,17 @@ def download_button(object_to_download, download_filename, button_text, pickle_i
             pass
         elif isinstance(object_to_download, pd.DataFrame):
-            object_to_download = object_to_download.to_csv(index=False).encode('utf-8')
-    # Try JSON encode for everything else  # else:  #     object_to_download = json.dumps(object_to_download)
     try:
-        # some strings <-> bytes conversions necessary here
         b64 = base64.b64encode(object_to_download.encode()).decode()
     except AttributeError:
         b64 = base64.b64encode(object_to_download).decode()
-    button_uuid = str(uuid.uuid4()).replace('-', '')
-    button_id = re.sub('\d+', '', button_uuid)
-    custom_css = f"""
         <style>
             #{button_id} {{
                 background-color: rgb(255, 255, 255);
@@ -93,7 +97,7 @@ def download_button(object_to_download, download_filename, button_text, pickle_i
                 border-style: solid;
                 border-color: rgb(230, 234, 241);
                 border-image: initial;
-            }}
             #{button_id}:hover {{
                 border-color: rgb(246, 51, 102);
                 color: rgb(246, 51, 102);
@@ -105,644 +109,1197 @@ def download_button(object_to_download, download_filename, button_text, pickle_i
                 }}
         </style> """
-    dl_link = custom_css + f'<a download="{download_filename}" id="{button_id}" href="data:file/txt;base64,{b64}">{button_text}</a><br></br>'
     return dl_link
-st.set_page_config(page_title="SynPlanner GUI", page_icon="🧪", layout="wide")
-# Initialize session state variables if they don't exist.
-if "planning_done" not in st.session_state:
-    st.session_state.planning_done = False
-if "tree" not in st.session_state:
-    st.session_state.tree = None
-if "res" not in st.session_state:
-    st.session_state.res = None
-if "target_smiles" not in st.session_state:
-    st.session_state.target_smiles = ''
-# Clustering state
-if "clustering_done" not in st.session_state:
-    st.session_state.clustering_done = False
-if "clusters" not in st.session_state:
-    st.session_state.clusters = None
-if "reactions_dict" not in st.session_state:
-    st.session_state.reactions_dict = None
-if "num_clusters_setting" not in st.session_state: # Store the setting used
-    st.session_state.num_clusters_setting = 10
-# Subclustering state
-if "subclustering_done" not in st.session_state:
-    st.session_state.subclustering_done = False
-if "sub" not in st.session_state:
-    st.session_state.sub = None
-# Download state (less critical now with direct download links)
-if 'clusters_downloaded' not in st.session_state: # Example, might not be needed
-    st.session_state.clusters_downloaded = False
-intro_text = '''
-This is a demo of the graphical user interface of
-[SynPlanner](https://github.com/Laboratoire-de-Chemoinformatique/SynPlanner/).
-SynPlanner is a comprehensive tool for reaction data curation, rule extraction, model training and retrosynthetic planning.
-More information on SynPlanner is available in the [official docs](https://synplanner.readthedocs.io/en/latest/index.html).
-'''
-st.title("`SynPlanner GUI`")
-st.write(intro_text)
-st.header('Molecule input')
-st.markdown(
-    '''
-    You can provide a molecular structure by either providing:
-    * SMILES string + Enter
-    * Draw it + Apply
-    '''
-)
-DEFAULT_MOL = 'c1cc(ccc1Cl)C(CCO)NC(C2(CCN(CC2)c3c4cc[nH]c4ncn3)N)=O'
-molecule = st.text_input("SMILES:", DEFAULT_MOL)
-smile_code = st_ketcher(molecule)
-target_molecule = mol_from_smiles(smile_code)
-if 'target_smiles' in st.session_state and smile_code != st.session_state.target_smiles:
-    # If the SMILES changes, invalidate previous results
-    st.warning("Molecule structure changed. Please re-run planning.")
-    st.session_state.planning_done = False
-    st.session_state.clustering_done = False
-    st.session_state.subclustering_done = False
-    st.session_state.tree = None
-    st.session_state.res = None
-    st.session_state.clusters = None
-    st.session_state.reactions_dict = None
-    st.session_state.sub = None
 @st.cache_resource
-def load_planning_resources():
     building_blocks_path = hf_hub_download(
-            repo_id="Laboratoire-De-Chemoinformatique/SynPlanner",
-            filename="building_blocks_em_sa_ln.smi",
-            subfolder="building_blocks",
-            local_dir="."
-        )
     ranking_policy_weights_path = hf_hub_download(
-            repo_id="Laboratoire-De-Chemoinformatique/SynPlanner",
-            filename="ranking_policy_network.ckpt",
-            subfolder="uspto/weights",
-            local_dir="."
-        )
     reaction_rules_path = hf_hub_download(
-            repo_id="Laboratoire-De-Chemoinformatique/SynPlanner",
-            filename="uspto_reaction_rules.pickle",
-            subfolder="uspto",
-            local_dir="."
-        )
     return building_blocks_path, ranking_policy_weights_path, reaction_rules_path
-building_blocks_path, ranking_policy_weights_path, reaction_rules_path = load_planning_resources()
-st.header('Launch calculation')
-st.markdown(
-    '''If you modified the structure, please ensure you clicked on `Apply` (bottom right of the molecular editor).'''
-)
-st.markdown(f"The molecule SMILES is actually: ``{smile_code}``")
-st.subheader('Planning options')
-st.markdown(
-    '''
-    The description of each option can be found in the
-    [Retrosynthetic Planning Tutorial](https://synplanner.readthedocs.io/en/latest/tutorial_files/retrosynthetic_planning.html#Configuring-search-tree).
-    '''
-)
-col_options_1, col_options_2 = st.columns(2, gap="medium")
-with col_options_1:
-    search_strategy_input = st.selectbox(label='Search strategy', options=('Expansion first', 'Evaluation first',), index=0)
-    ucb_type = st.selectbox(label='Search strategy', options=('uct', 'puct', 'value'), index=0)
-    c_ucb = st.number_input("C coefficient of UCB", value=0.1, placeholder="Type a number...")
-with col_options_2:
-    max_iterations = st.slider('Total number of MCTS iterations', min_value=50, max_value=1000, value=300)
-    max_depth = st.slider('Maximal number of reaction steps', min_value=3, max_value=9, value=6)
-    min_mol_size = st.slider('Minimum size of a molecule to be precursor', min_value=0, max_value=7, value=0)
-search_strategy_translator = {
-    "Expansion first": "expansion_first",
-    "Evaluation first": "evaluation_first",
-}
-search_strategy = search_strategy_translator[search_strategy_input]
-submit_planning = st.button('Start retrosynthetic planning')
-if submit_planning:
-    # Reset downstream states if replanning
-    st.session_state.planning_done = False
-    st.session_state.clustering_done = False
-    st.session_state.subclustering_done = False
-    st.session_state.tree = None
-    st.session_state.res = None
-    st.session_state.clusters = None
-    st.session_state.reactions_dict = None
-    st.session_state.sub = None
-    st.session_state.target_smiles = smile_code # Store the SMILES used for this run
-    try:
-        target_molecule = mol_from_smiles(smile_code)
-        if target_molecule is None:
-             st.error(f"Could not parse the input SMILES: {smile_code}")
-        else:
-            with st.spinner("Running retrosynthetic planning..."):
-                with st.status("Loading resources...", expanded=False) as status:
-                    st.write("Loading building blocks...")
-                    building_blocks = load_building_blocks(building_blocks_path, standardize=False)
-                    st.write('Loading reaction rules...')
-                    reaction_rules = load_reaction_rules(reaction_rules_path)
-                    st.write('Loading policy network...')
-                    policy_config = PolicyNetworkConfig(weights_path=ranking_policy_weights_path)
-                    policy_function = PolicyNetworkFunction(policy_config=policy_config)
-                    status.update(label="Resources loaded!", state="complete")
-                tree_config = TreeConfig(
-                    search_strategy=search_strategy,
-                    evaluation_type="rollout",
-                    max_iterations=max_iterations,
-                    max_depth=max_depth,
-                    min_mol_size=min_mol_size,
-                    init_node_value=0.5,
-                    ucb_type=ucb_type,
-                    c_ucb=c_ucb,
-                    silent=True
-                )
-                tree = Tree(
-                    target=target_molecule,
-                    config=tree_config,
-                    reaction_rules=reaction_rules,
-                    building_blocks=building_blocks,
-                    expansion_function=policy_function,
-                    evaluation_function=None,
-                )
-                mcts_progress_text = "Running MCTS iterations..."
-                mcts_bar = st.progress(0, text=mcts_progress_text)
-                for step, (solved, node_id) in enumerate(tree):
-                    progress_value = min(1.0, (step + 1) / max_iterations)
-                    mcts_bar.progress(progress_value, text=f"{mcts_progress_text} ({step+1}/{max_iterations})")
-                res = extract_tree_stats(tree, target_molecule)
-                # Store planning outputs in session_state
-                st.session_state['tree'] = tree
-                st.session_state['res'] = res
-                st.session_state.planning_done = True
-                st.rerun() # Rerun to display results cleanly
-    except Exception as e:
-        st.error(f"An error occurred during planning: {e}")
-        st.session_state.planning_done = False # Ensure state reflects failure
-# Display results if planning has been completed
-if st.session_state.get('planning_done', False):
-    res = st.session_state.res
-    tree = st.session_state.tree
-    if res is None or tree is None:
-        st.error("Planning results are missing from session state. Please re-run planning.")
-        st.session_state.planning_done = False # Reset state
-    elif res["solved"]:
-        st.header('Planning Results')
-        # st.balloons() # Optional fun
-        winning_nodes = sorted(set(tree.winning_nodes)) if hasattr(tree, 'winning_nodes') and tree.winning_nodes else []
-        st.subheader(f"Number of unique routes found: {len(winning_nodes)}")
-        st.subheader("Examples of found retrosynthetic routes")
-        image_counter = 0
-        visualised_node_ids = set()
-        # Ensure winning_nodes is iterable and not empty
-        if not winning_nodes:
-             st.warning("Planning solved, but no winning nodes found in the tree object.")
-        else:
-            for n, node_id in enumerate(winning_nodes):
-                if image_counter >= 3: # Use >= for clarity
-                    break
-                # Simple display logic: show first 3 unique routes
-                if node_id not in visualised_node_ids:
-                    try:
-                        visualised_node_ids.add(node_id)
-                        num_steps = len(tree.synthesis_route(node_id))
-                        route_score = round(tree.route_score(node_id), 3)
-                        svg = get_route_svg(tree, node_id)
-                        if svg:
-                            st.image(svg, caption=f"Route {node_id}; {num_steps} steps; Route score: {route_score}")
-                            image_counter += 1
-                        else:
-                            st.warning(f"Could not generate SVG for route {node_id}.")
-                    except Exception as e:
-                        st.error(f"Error displaying route {node_id}: {e}")
-        stat_col, download_col = st.columns(2, gap="medium")
-        with stat_col:
-            st.subheader("Statistics")
-            try:
-                # Ensure 'target_smiles' exists in res, if not, use the stored one
-                if 'target_smiles' not in res:
-                    res['target_smiles'] = st.session_state.target_smiles
-                # Select only existing columns safely
-                cols_to_show = [col for col in ["target_smiles", "num_routes", "num_nodes", "num_iter", "search_time"] if col in res]
-                df = pd.DataFrame(res, index=[0])[cols_to_show]
-                st.dataframe(df) # Use dataframe for better display
-            except Exception as e:
-                st.error(f"Error displaying statistics: {e}")
-                st.write(res) # Show raw dict if DataFrame fails
-        with download_col:
-            st.subheader("Downloads")
-            try:
-                html_body = generate_results_html(tree, html_path=None, extended=True)
-                dl_html = download_button(html_body, 'results_synplanner.html', 'Download results (HTML)')
-                if dl_html: st.markdown(dl_html, unsafe_allow_html=True)
-                # Ensure res is suitable for DataFrame before creating/downloading
                 try:
-                    res_df = pd.DataFrame(res, index=[0])
-                    dl_csv = download_button(res_df, 'results_synplanner.csv', 'Download statistics (CSV)')
-                    if dl_csv: st.markdown(dl_csv, unsafe_allow_html=True)
                 except Exception as e:
-                     st.error(f"Could not prepare statistics CSV for download: {e}")
             except Exception as e:
-                st.error(f"Error generating download links: {e}")
-        st.divider()
         st.header("Clustering the retrosynthetic routes")
-        if 'num_clusters' not in st.session_state:
-            st.session_state['num_clusters'] = 10
-        cluster_box, _ = st.columns(2, gap="medium")
-        with cluster_box:
-            num_clusters_input = st.slider(
-                'Max number of clusters to generate',
-                min_value=2,
-                max_value=min(50, res.get("num_routes", 50)), # Sensible max based on routes found
-                value=st.session_state.num_clusters_setting,
-                key='cluster_slider'
-            )
-        if st.button('Run Clustering', key='submit_clustering'):
-             # Update the setting in session state when the button is clicked
-            st.session_state.num_clusters_setting = num_clusters_input
-            # Reset downstream states
             st.session_state.clustering_done = False
             st.session_state.subclustering_done = False
             st.session_state.clusters = None
             st.session_state.reactions_dict = None
-            st.session_state.sub = None
             with st.spinner("Performing clustering..."):
                 try:
-                    # Ensure tree is available from session state
                     current_tree = st.session_state.tree
                     if not current_tree:
                         st.error("Tree object not found. Please re-run planning.")
                     else:
-                        st.write("Calculating Generalized CGRs...")
-                        g_cgrs_dict = reassign_nums(current_tree) # Assuming this needs the tree
-                        st.write("Processing RG-CGRs...")
-                        reduced_g_cgrs_dict = process_all_rg_cgrs(g_cgrs_dict) # Assuming this uses the previous output
-                        mfp = MorganFingerprint()
-                        st.write(f"Clustering into max {st.session_state.num_clusters_setting} clusters...")
-                        results = cluster_molecules(reduced_g_cgrs_dict, mfp, max_clusters=st.session_state.num_clusters_setting)
-                        st.session_state.clusters = results.get('clusters_dict')
-                        st.session_state.rg_cgrs_dict = reduced_g_cgrs_dict
-                        # Extract reactions *after* clustering if needed, ensure tree is passed
-                        st.write("Extracting reactions...")
-                        st.session_state.reactions_dict = extract_reactions(current_tree)
-                        if st.session_state.clusters and st.session_state.reactions_dict:
-                             st.session_state.clustering_done = True
-                             st.success(f"Clustering complete. Found {len(st.session_state.clusters)} clusters.")
-                        else:
-                             st.error("Clustering failed or returned empty results.")
-                             st.session_state.clustering_done = False
-                        # Clean up large intermediate objects if possible
-                        del g_cgrs_dict
-                        del results
-                        gc.collect()
-                        st.rerun() # Rerun to display clustering results cleanly
                 except Exception as e:
                     st.error(f"An error occurred during clustering: {e}")
                     st.session_state.clustering_done = False
-        # --- Display Clustering Results (if done) ---
-        if st.session_state.get('clustering_done', False):
-            clusters = st.session_state.clusters
-            reactions_dict = st.session_state.reactions_dict # Needed for download
-            tree = st.session_state.tree # Needed for display and download
-            if not clusters or not reactions_dict or not tree:
-                 st.error("Clustering results are missing from session state. Please re-run clustering.")
-                 st.session_state.clustering_done = False # Reset flag
-            else:
-                st.subheader(f"Best routes from {len(clusters)} Found Clusters")
-                # Display first route from first few clusters
-                displayed_clusters = 0
-                for cluster_num, node_id_list in clusters.items():
-                    if displayed_clusters >= 10: # Limit displayed clusters
-                         st.write(f"... and {len(clusters) - displayed_clusters} more clusters.")
-                         break
-                    if not node_id_list: continue # Skip empty clusters
-                    st.markdown(f"**Cluster {cluster_num}** (Size: {len(node_id_list)}) - Example Route:")
-                    node_id = node_id_list[0] # Display the first route as example
                     try:
                         num_steps = len(tree.synthesis_route(node_id))
                         route_score = round(tree.route_score(node_id), 3)
                         svg = get_route_svg(tree, node_id)
-                        if svg:
-                            st.image(svg, caption=f"Route {node_id}; {num_steps} steps; Route score: {route_score}")
                         else:
-                            st.warning(f"Could not generate SVG for route {node_id}.")
-                        displayed_clusters += 1
                     except Exception as e:
-                        st.error(f"Error displaying route {node_id} for cluster {cluster_num+1}: {e}")
-                cluster_sizes = [len(cluster) for cluster in clusters.values()]
-                cluster_stat_col, cluster_download_col = st.columns(2, gap="medium")
-                with cluster_stat_col:
-                    st.subheader("Cluster Statistics")
-                    if cluster_sizes:
-                        cluster_df = pd.DataFrame({'Cluster': range(1, len(cluster_sizes) + 1), 'Number of Routes': cluster_sizes})
-                        st.dataframe(cluster_df)
-                        # Display Pie Chart using Matplotlib
-                        # try:
-                        #     fig, ax = plt.subplots(figsize=(5, 4)) # Adjust size if needed
-                        #     ax.pie(cluster_sizes, labels=[f'C{i+1}' for i in range(len(cluster_sizes))], autopct='%1.1f%%', startangle=90)
-                        #     ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
-                        #     st.pyplot(fig)
-                        #     plt.close(fig) # Close the figure to free memory
-                        # except Exception as e:
-                        #     st.error(f"Could not generate pie chart: {e}")
                     else:
-                        st.write("No cluster data to display statistics for.")
-                with cluster_download_col:
-                    st.subheader("Cluster Reports") # Changed subheader
-                    # Retrieve necessary data from session state
-                    tree_for_html = st.session_state.get('tree')
-                    clusters_for_html = st.session_state.get('clusters')
-                    rg_cgrs_for_html = st.session_state.get('rg_cgrs_dict')
-                    if not tree_for_html:
-                        st.warning("MCTS Tree data not found. Cannot generate reports.")
-                    elif not clusters_for_html:
-                        st.warning("Cluster data not found. Cannot generate reports.")
-                    else:
-                        st.write("Generate downloadable HTML reports for each cluster:")
-                        # Limit the number of download links shown directly if there are many clusters
-                        MAX_DOWNLOAD_LINKS_DISPLAYED = 15 # Adjust as needed
-                        num_clusters_total = len(clusters_for_html)
-                        clusters_items = list(clusters_for_html.items()) # Get items to slice
-                        for i, (cluster_num_idx, node_ids) in enumerate(clusters_items):
-                            if i >= MAX_DOWNLOAD_LINKS_DISPLAYED:
-                                st.caption(f"... plus {num_clusters_total - MAX_DOWNLOAD_LINKS_DISPLAYED} more cluster reports available.")
-                                # Consider adding a button to download all as a zip if needed
-                                break
-                            cluster_num_display = int(cluster_num_idx) # Use 1-based index
-                            if not node_ids: # Skip empty clusters
-                                st.caption(f"Cluster {cluster_num_display} is empty, no report generated.")
-                                continue
-                            try:
                                 try:
-                                    cluster_html_content = generate_cluster_html(
-                                        tree=tree_for_html,
-                                        cluster_node_ids=node_ids,
-                                        cluster_num=cluster_num_display,
-                                        rg_cgrs_dict=rg_cgrs_for_html,
-                                        aam=False
                                     )
                                 except Exception as e:
-                                    st.error(f"Error generating report/link for Cluster {cluster_num_display}: {e}")
-                                # --- Create the download button using the existing function ---
-                                download_filename = f"cluster_{cluster_num_display}_report.html"
-                                button_text = f"Cluster {cluster_num_display} Report (HTML)"
-                                dl_button_html = download_button(
-                                    object_to_download=cluster_html_content,
-                                    download_filename=download_filename,
-                                    button_text=button_text
-                                    # pickle_it=False # Ensure it's not pickled
-                                )
-                                if dl_button_html:
-                                    st.markdown(dl_button_html, unsafe_allow_html=True)
-                                else:
-                                    # Error message if button creation failed (e.g., encoding error)
-                                    st.error(f"Failed to create download link for Cluster {cluster_num_display}.")
-                            except Exception as e:
-                                # Catch errors during HTML generation or button creation for a specific cluster
-                                st.error(f"Error generating report/link for Cluster {cluster_num_display}: {e}")
-                                # Optionally add more detailed logging here:
-                                # import traceback
-                                # st.error(traceback.format_exc())
-                        if num_clusters_total > MAX_DOWNLOAD_LINKS_DISPLAYED:
-                            # Optional: Add a button here to generate and download a ZIP file
-                            # containing all cluster reports. This requires more implementation
-                            # (using zipfile library in memory).
-                            # e.g., if st.button("Download All Reports as ZIP"): ...
-                            pass
-                st.divider()
-                # --- Subclustering Section ---
-                st.header("Sub-Clustering within a selected Cluster")
-                # Button to trigger the subclustering calculation
-                if st.button("Run Subclustering Analysis", key="submit_subclustering"):
-                    st.session_state.subclustering_done = False # Reset flag
-                    st.session_state.sub = None # Clear old results
-                    with st.spinner("Performing subclustering analysis..."):
-                         try:
-                              # Retrieve necessary data from session state
-                              clusters_for_sub = st.session_state.get('clusters')
-                              reactions_for_sub = st.session_state.get('reactions_dict')
-                              if clusters_for_sub and reactions_for_sub:
-                                   sub = sublcuster_all(clusters_for_sub, reactions_for_sub)
-                                   st.session_state.sub = sub
-                                   st.session_state.subclustering_done = True
-                                   st.success("Subclustering analysis complete.")
-                                   # Clean up intermediates if possible
-                                   gc.collect()
-                                   st.rerun() # Rerun to display results/inputs cleanly
-                              else:
-                                   st.error("Missing cluster or reaction data needed for subclustering.")
-                         except Exception as e:
-                              st.error(f"An error occurred during subclustering: {e}")
-                              st.session_state.subclustering_done = False
-                # Display subclustering inputs and results ONLY if subclustering is done
-                if st.session_state.get('subclustering_done', False):
-                    sub = st.session_state.sub
-                    tree = st.session_state.tree
-                    clusters_for_sub = st.session_state.get('clusters')
-                    if not sub or not tree:
-                        st.error("Subclustering results are missing from session state. Please re-run subclustering.")
-                        st.session_state.subclustering_done = False
-                    else:
-                        sub_input_col, sub_display_col = st.columns([0.2, 0.8]) # Adjust column ratio if needed
-                        with sub_input_col:
-                            st.subheader("Select Cluster and Step")
-                            # Cluster selection (use cluster numbers as displayed, usually 1-based)
-                            available_cluster_nums = [int(k) for k in sub.keys()] # Use 1-based indexing for UI
-                            if not available_cluster_nums:
-                                st.warning("No clusters available in subclustering results.")
-                            else:
-                                # Key is essential here to maintain state across reruns
-                                user_input_cluster_num_display = st.selectbox(
-                                    "Select Cluster #:",
-                                    options=sorted(available_cluster_nums),
-                                    key='subcluster_num_select'
-                                )
-                                # Convert back to 0-based index for accessing 'sub' dictionary
-                                selected_cluster_idx = user_input_cluster_num_display
-                                if selected_cluster_idx in sub:
-                                    sub_step_cluster = sub[selected_cluster_idx]
-                                    allowed_step_numbers = sorted(list(sub_step_cluster.keys()))
-                                    if not allowed_step_numbers:
-                                        st.warning(f"No reaction steps found for Cluster {user_input_cluster_num_display}.")
-                                    else:
-                                        # Key is essential here
-                                        selected_step_number = st.selectbox(
-                                            "Select Number of Steps:",
-                                            options=allowed_step_numbers,
-                                            key='subcluster_steps_select'
-                                        )
-                                        # --- Display logic moved to the right column ---
-                                        rg_cgrs = st.session_state.get('rg_cgrs_dict')
-                                        cluster_rg_cgr = rg_cgrs[clusters_for_sub[user_input_cluster_num_display][0]]
-                                        cluster_rg_cgr.clean2d()
-                                        st.image(cluster_rg_cgr.depict(), caption=f"RG-CGR of cluster")
-                                else:
-                                    st.warning(f"Selected cluster {user_input_cluster_num_display} (index {selected_cluster_idx}) not found in subclustering results.")
-                        with sub_display_col:
-                            st.subheader("Subcluster Results")
-                            # Check if inputs are valid before trying to display
-                            if 'user_input_cluster_num_display' in locals() and \
-                            'selected_cluster_idx' in locals() and \
-                            selected_cluster_idx in sub and \
-                            'selected_step_number' in locals() and \
-                            selected_step_number in sub[selected_cluster_idx]:
-                                subclusters = sub[selected_cluster_idx][selected_step_number]
-                                st.write(f"Displaying **{len(subclusters)}** subclusters for **Cluster {user_input_cluster_num_display}** with **{selected_step_number} steps**:")
-                                if not subclusters:
-                                    st.info("No subclusters found for this selection.")
-                                else:
-                                    # Limit the display if there are too many subclusters/routes
-                                    MAX_DISPLAY_SUBCLUSTERS = 20
-                                    MAX_ROUTES_PER_SUBCLUSTER = 10
-                                    for subcluster_num, subcluster_set in enumerate(subclusters):
-                                        if subcluster_num >= MAX_DISPLAY_SUBCLUSTERS:
-                                            st.write(f"... and {len(subclusters) - MAX_DISPLAY_SUBCLUSTERS} more subclusters.")
-                                            break
-                                        st.markdown(f"--- \n**Subcluster {subcluster_num + 1}** (Size: {len(subcluster_set)})")
-                                        routes_shown = 0
-                                        for route_id in subcluster_set:
-                                            if routes_shown >= MAX_ROUTES_PER_SUBCLUSTER:
-                                                st.write(f"(Showing first {MAX_ROUTES_PER_SUBCLUSTER} routes)")
-                                                break
-                                            try:
-                                                # Need num_steps and route_score for caption (optional but nice)
-                                                num_steps_sub = len(tree.synthesis_route(route_id))
-                                                route_score_sub = round(tree.route_score(route_id), 3)
-                                                svg_sub = get_route_svg(tree, route_id)
-                                                if svg_sub:
-                                                    st.image(svg_sub, caption=f"Route {route_id}; {num_steps_sub} steps; Score: {route_score_sub}")
-                                                else:
-                                                    st.warning(f"Could not generate SVG for route {route_id}.")
-                                                routes_shown += 1
-                                            except Exception as e:
-                                                st.error(f"Error displaying route {route_id} in subcluster {subcluster_num+1}: {e}")
-                            else:
-                                st.info("Select a cluster and step number to view subclusters.")
-    # --- Handling No Solution Case ---
-    elif not st.session_state.get('planning_done', False):
-        # Only show this if planning was attempted but failed (or not run yet)
-        # Avoid showing it if just molecule changed
-        if submit_planning: # Check if the button was just pressed
-            st.warning("Planning did not complete successfully or is still running.")
-    else: # Planning done, but res["solved"] is False
-        st.header('Planning Results')
-        st.warning("No reaction path found for the target molecule with the current settings.")
-        st.write("Consider adjusting planning options (e.g., increase iterations, adjust depth, check molecule validity).")
-        # Optionally display basic stats even if not solved
-        stat_col, _ = st.columns(2)
-        with stat_col:
-            st.subheader("Run Statistics (No Solution)")
             try:
-                 if 'target_smiles' not in res: res['target_smiles'] = st.session_state.target_smiles
-                 cols_to_show = [col for col in ["target_smiles", "num_nodes", "num_iter", "search_time"] if col in res]
-                 df = pd.DataFrame(res, index=[0])[cols_to_show]
-                 st.dataframe(df)
             except Exception as e:
-                 st.error(f"Error displaying statistics: {e}")
-                 st.write(res)
-# --- Restart Button ---
-st.divider()
-st.header('Restart Application State')
-if st.button("Clear All Results & Restart"):
-    # Clear all relevant session state keys
-    keys_to_clear = [
-        "planning_done", "tree", "res", "target_smiles",
-        "clustering_done", "clusters", "reactions_dict", "num_clusters_setting", "rg_cgrs_dict",
-        "subclustering_done", "sub",
-        "clusters_downloaded" # Add any other state keys you use
-    ]
-    for key in keys_to_clear:
-        if key in st.session_state:
-            del st.session_state[key]
-    # Clear ketcher state by assigning a default value (or empty string)
-    st.session_state.ketcher = DEFAULT_MOL # Reset ketcher to default
-    st.rerun()

 import pickle
 import re
 import uuid
+import io
+import zipfile
 import pandas as pd
 import streamlit as st
 from synplan.mcts.search import extract_tree_stats
 from synplan.mcts.tree import Tree
 from synplan.chem.utils import mol_from_smiles
+from synplan.chem.reaction_routes.route_cgr import *
+from synplan.chem.reaction_routes.clustering import *
+from synplan.utils.visualisation import (
+    routes_clustering_report,
+    routes_subclustering_report,
+    generate_results_html,
+    html_top_routes_cluster,
+    get_route_svg,
+)
 from synplan.utils.config import TreeConfig, PolicyNetworkConfig
 from synplan.utils.loading import load_reaction_rules, load_building_blocks
 import psutil
 import gc
 disable_progress_bars("huggingface_hub")
 smiles_parser = SMILESRead.create_parser(ignore=True)
+DEFAULT_MOL = "c1cc(ccc1Cl)C(CCO)NC(C2(CCN(CC2)c3c4cc[nH]c4ncn3)N)=O"
+# --- Helper Functions ---
+def download_button(
+    object_to_download, download_filename, button_text, pickle_it=False
+):
     """
     Issued from
     Generates a link to download the given object_to_download.
             pass
         elif isinstance(object_to_download, pd.DataFrame):
+            object_to_download = object_to_download.to_csv(index=False).encode("utf-8")
     try:
         b64 = base64.b64encode(object_to_download.encode()).decode()
     except AttributeError:
         b64 = base64.b64encode(object_to_download).decode()
+    button_uuid = str(uuid.uuid4()).replace("-", "")
+    button_id = re.sub("\d+", "", button_uuid)
+    custom_css = f"""
         <style>
             #{button_id} {{
                 background-color: rgb(255, 255, 255);
                 border-style: solid;
                 border-color: rgb(230, 234, 241);
                 border-image: initial;
+            }}
             #{button_id}:hover {{
                 border-color: rgb(246, 51, 102);
                 color: rgb(246, 51, 102);
                 }}
         </style> """
+    dl_link = (
+        custom_css
+        + f'<a download="{download_filename}" id="{button_id}" href="data:file/txt;base64,{b64}">{button_text}</a><br></br>'
+    )
     return dl_link
 @st.cache_resource
+def load_planning_resources_cached():
+    """Download the three planning assets from the Hugging Face Hub.
+
+    Every file is requested from the SynPlanner repository with
+    ``hf_hub_download`` and placed under the current directory. The function
+    is decorated with ``st.cache_resource``.
+
+    Returns:
+        tuple: ``(building_blocks_path, ranking_policy_weights_path,
+        reaction_rules_path)`` as returned by ``hf_hub_download``.
+    """
+    def fetch(filename, subfolder):
+        # All assets share the same repository and local target directory.
+        return hf_hub_download(
+            repo_id="Laboratoire-De-Chemoinformatique/SynPlanner",
+            filename=filename,
+            subfolder=subfolder,
+            local_dir=".",
+        )
+    bb_path = fetch("building_blocks_em_sa_ln.smi", "building_blocks")
+    weights_path = fetch("ranking_policy_network.ckpt", "uspto/weights")
+    rules_path = fetch("uspto_reaction_rules.pickle", "uspto")
+    return bb_path, weights_path, rules_path
+# --- GUI Sections ---
+def initialize_app():
+    """1. Initialization: configure the page, seed session state, show the intro.
+
+    Session-state defaults are written only for keys that are not present
+    yet, so values already stored in the session are left untouched.
+    """
+    st.set_page_config(page_title="SynPlanner GUI", page_icon="🧪", layout="wide")
+    session_defaults = {
+        # Planning state
+        "planning_done": False,
+        "tree": None,
+        "res": None,
+        "target_smiles": "",  # SMILES of the last planning run, empty until then
+        # Clustering state
+        "clustering_done": False,
+        "clusters": None,
+        "reactions_dict": None,
+        "num_clusters_setting": 10,
+        "route_cgrs_dict": None,
+        "r_route_cgrs_dict": None,
+        # Subclustering state
+        "subclustering_done": False,
+        "subclusters": None,
+        # Download flag and the SMILES shown in the molecule editor
+        "clusters_downloaded": False,
+        "ketcher": DEFAULT_MOL,
+    }
+    for state_key, initial_value in session_defaults.items():
+        if state_key not in st.session_state:
+            st.session_state[state_key] = initial_value
+    intro_text = """
+    This is a demo of the graphical user interface of
+    [SynPlanner](https://github.com/Laboratoire-de-Chemoinformatique/SynPlanner/).
+    SynPlanner is a comprehensive tool for reaction data curation, rule extraction, model training and retrosynthetic planning.
+    More information on SynPlanner is available in the [official docs](https://synplanner.readthedocs.io/en/latest/index.html).
+    """
+    st.title("`SynPlanner GUI`")
+    st.write(intro_text)
+def setup_sidebar():
+    """2. Sidebar: render the titled reference links in the sidebar."""
+    sidebar = st.sidebar
+    # NOTE: the logo image call (img/logo.png) is currently disabled.
+    link_sections = (
+        ("Docs", "https://synplanner.readthedocs.io/en/latest/"),
+        (
+            "Tutorials",
+            "https://github.com/Laboratoire-de-Chemoinformatique/SynPlanner/tree/main/tutorials",
+        ),
+        (
+            "Paper",
+            "https://chemrxiv.org/engage/chemrxiv/article-details/66add90bc9c6a5c07ae65796",
+        ),
+        (
+            "Issues",
+            "[Report a bug 🐞](https://github.com/Laboratoire-de-Chemoinformatique/SynPlanner/issues/new?assignees=&labels=bug&projects=&template=bug_report.md&title=%5BBUG%5D)",
+        ),
+    )
+    # Each section is a title followed by one markdown body, in display order.
+    for section_title, section_body in link_sections:
+        sidebar.title(section_title)
+        sidebar.markdown(section_body)
+def handle_molecule_input():
+    """3. Molecule Input: render the SMILES text box and the Ketcher editor.
+
+    The text box is seeded from ``st.session_state.ketcher`` and its content
+    is handed to the Ketcher widget; the value returned by Ketcher is treated
+    as the current molecule and is always written back to
+    ``st.session_state.ketcher``.
+
+    When a previous planning target exists and the current molecule differs
+    from it, all planning, clustering and subclustering results are cleared
+    and the user is asked to re-run planning.
+
+    Returns:
+        The SMILES value returned by the Ketcher widget.
+    """
+    st.header("Molecule input")
+    st.markdown(
+        """
+        You can provide a molecular structure by either providing:
+        * SMILES string + Enter
+        * Draw it + Apply
+        """
+    )
+    molecule_text_input = st.text_input(
+        "SMILES:", value=st.session_state.ketcher, key="smiles_text_input_key"
+    )
+    # The output of the editor is the definitive SMILES.
+    current_smile_code = st_ketcher(molecule_text_input, key="ketcher_widget")
+    # Mirror the editor on every run. Syncing only when the molecule differs
+    # from the last target left a stale value behind after an A -> B -> A
+    # edit, and planning reads this key to decide which molecule to process.
+    st.session_state.ketcher = current_smile_code
+    # An empty target means planning has not been launched yet, so there is
+    # nothing to invalidate and no reason to warn about a "changed" molecule.
+    previous_target = st.session_state.get("target_smiles")
+    if previous_target and current_smile_code != previous_target:
+        st.warning("Molecule structure changed. Please re-run planning.")
+        st.session_state.planning_done = False
+        st.session_state.clustering_done = False
+        st.session_state.subclustering_done = False
+        st.session_state.tree = None
+        st.session_state.res = None
+        st.session_state.clusters = None
+        st.session_state.reactions_dict = None
+        st.session_state.subclusters = None
+        # Cleared as well, matching the reset done when planning is started.
+        st.session_state.route_cgrs_dict = None
+        st.session_state.r_route_cgrs_dict = None
+    return current_smile_code
+def setup_planning_options():
+    """4. Planning: render the planning options and run the search on demand.
+
+    Draws the option widgets. When the start button is pressed, results of
+    earlier runs are cleared, a search tree is built for the SMILES stored in
+    ``st.session_state.ketcher`` and iterated while a progress bar is
+    updated; ``tree`` and ``res`` are then stored in the session state and the
+    script is rerun. Exceptions raised along the way are reported with
+    ``st.error``.
+    """
+    st.header("Launch calculation")
+    st.markdown(
+        """If you modified the structure, please ensure you clicked on `Apply` (bottom right of the molecular editor)."""
+    )
+    # The SMILES shown is whatever is currently stored under the "ketcher" key.
+    st.markdown(
+        f"The molecule SMILES is actually: ``{st.session_state.get('ketcher', DEFAULT_MOL)}``"
+    )
+    st.subheader("Planning options")
+    st.markdown(
+        """
+        The description of each option can be found in the
+        [Retrosynthetic Planning Tutorial](https://synplanner.readthedocs.io/en/latest/tutorial_files/retrosynthetic_planning.html#Configuring-search-tree).
+        """
+    )
+    left_col, right_col = st.columns(2, gap="medium")
+    with left_col:
+        strategy_label = st.selectbox(
+            label="Search strategy",
+            options=(
+                "Expansion first",
+                "Evaluation first",
+            ),
+            index=0,
+            key="search_strategy_input",
+        )
+        ucb_type = st.selectbox(
+            label="UCB type",
+            options=("uct", "puct", "value"),
+            index=0,
+            key="ucb_type_input",
+        )
+        c_ucb = st.number_input(
+            "C coefficient of UCB",
+            value=0.1,
+            placeholder="Type a number...",
+            key="c_ucb_input",
+        )
+    with right_col:
+        max_iterations = st.slider(
+            "Total number of MCTS iterations",
+            min_value=50,
+            max_value=1000,
+            value=300,
+            key="max_iterations_slider",
+        )
+        max_depth = st.slider(
+            "Maximal number of reaction steps",
+            min_value=3,
+            max_value=9,
+            value=6,
+            key="max_depth_slider",
+        )
+        min_mol_size = st.slider(
+            "Minimum size of a molecule to be precursor",
+            min_value=0,
+            max_value=7,
+            value=0,
+            key="min_mol_size_slider",
+            help="Number of non-hydrogen atoms in molecule",
+        )
+    # Map the label shown in the UI onto the identifier given to TreeConfig.
+    strategy_by_label = {
+        "Expansion first": "expansion_first",
+        "Evaluation first": "evaluation_first",
+    }
+    search_strategy = strategy_by_label[strategy_label]
+    if not st.button("Start retrosynthetic planning", key="submit_planning_button"):
+        return
+    # A new run invalidates everything derived from the previous one.
+    st.session_state.planning_done = False
+    st.session_state.clustering_done = False
+    st.session_state.subclustering_done = False
+    for stale_key in (
+        "tree",
+        "res",
+        "clusters",
+        "reactions_dict",
+        "subclusters",
+        "route_cgrs_dict",
+        "r_route_cgrs_dict",
+    ):
+        st.session_state[stale_key] = None
+    active_smile_code = st.session_state.get("ketcher", DEFAULT_MOL)
+    # Remember which SMILES this run was launched for.
+    st.session_state.target_smiles = active_smile_code
+    try:
+        target_molecule = mol_from_smiles(active_smile_code)
+        if target_molecule is None:
+            st.error(f"Could not parse the input SMILES: {active_smile_code}")
+            return
+        bb_path, weights_path, rules_path = load_planning_resources_cached()
+        with st.spinner("Running retrosynthetic planning..."):
+            with st.status("Loading resources...", expanded=False) as status:
+                st.write("Loading building blocks...")
+                building_blocks = load_building_blocks(bb_path, standardize=False)
+                st.write("Loading reaction rules...")
+                reaction_rules = load_reaction_rules(rules_path)
+                st.write("Loading policy network...")
+                policy_function = PolicyNetworkFunction(
+                    policy_config=PolicyNetworkConfig(weights_path=weights_path)
+                )
+                status.update(label="Resources loaded!", state="complete")
+            # evaluation_type, init_node_value and silent are fixed values;
+            # the remaining settings come from the widgets above.
+            tree_config = TreeConfig(
+                search_strategy=search_strategy,
+                evaluation_type="rollout",
+                max_iterations=max_iterations,
+                max_depth=max_depth,
+                min_mol_size=min_mol_size,
+                init_node_value=0.5,
+                ucb_type=ucb_type,
+                c_ucb=c_ucb,
+                silent=True,
+            )
+            tree = Tree(
+                target=target_molecule,
+                config=tree_config,
+                reaction_rules=reaction_rules,
+                building_blocks=building_blocks,
+                expansion_function=policy_function,
+                evaluation_function=None,
+            )
+            mcts_progress_text = "Running MCTS iterations..."
+            mcts_bar = st.progress(0, text=mcts_progress_text)
+            # Iterating the tree performs the search; the yielded pair is
+            # not needed here, only the iteration count drives the bar.
+            for done_iters, (_solved, _node_id) in enumerate(tree, start=1):
+                mcts_bar.progress(
+                    min(1.0, done_iters / max_iterations),
+                    text=f"{mcts_progress_text} ({done_iters}/{max_iterations})",
+                )
+            st.session_state["tree"] = tree
+            st.session_state["res"] = extract_tree_stats(tree, target_molecule)
+            st.session_state.planning_done = True
+            st.rerun()
+    except Exception as e:
+        st.error(f"An error occurred during planning: {e}")
+        st.session_state.planning_done = False
+def display_planning_results():
+    """5. Planning Results Display: show found routes or a no-solution summary.
+
+    Does nothing until planning is flagged as done. For a solved run, up to
+    three routes are drawn from the tree's winning nodes; otherwise a warning
+    and the available run statistics are shown.
+    """
+    if not st.session_state.get("planning_done", False):
+        return
+    res = st.session_state.res
+    tree = st.session_state.tree
+    if res is None or tree is None:
+        st.error(
+            "Planning results are missing from session state. Please re-run planning."
+        )
+        st.session_state.planning_done = False
+        return
+    st.header("Planning Results")
+    if not res.get("solved", False):
+        st.warning(
+            "No reaction path found for the target molecule with the current settings."
+        )
+        st.write(
+            "Consider adjusting planning options (e.g., increase iterations, adjust depth, check molecule validity)."
+        )
+        stat_col, _ = st.columns(2)
+        with stat_col:
+            st.subheader("Run Statistics (No Solution)")
+            try:
+                # Add the target SMILES to the statistics when it is absent.
+                if (
+                    "target_smiles" not in res
+                    and "target_smiles" in st.session_state
+                ):
+                    res["target_smiles"] = st.session_state.target_smiles
+                wanted_columns = (
+                    "target_smiles",
+                    "num_nodes",
+                    "num_iter",
+                    "search_time",
+                )
+                cols_to_show = [name for name in wanted_columns if name in res]
+                if cols_to_show:
+                    st.dataframe(pd.DataFrame(res, index=[0])[cols_to_show])
+                else:
+                    st.write("No statistics to display for the unsuccessful run.")
+            except Exception as e:
+                st.error(f"Error displaying statistics: {e}")
+                st.write(res)
+        return
+    found_nodes = getattr(tree, "winning_nodes", None)
+    # Sorting a set yields unique node ids in ascending order.
+    winning_nodes = sorted(set(found_nodes)) if found_nodes else []
+    st.subheader(f"Number of unique routes found: {len(winning_nodes)}")
+    st.subheader("Examples of found retrosynthetic routes")
+    if not winning_nodes:
+        st.warning("Planning solved, but no winning nodes found in the tree object.")
+        return
+    routes_drawn = 0
+    for node_id in winning_nodes:
+        # Only successfully drawn routes count towards the limit of three.
+        if routes_drawn >= 3:
+            break
+        try:
+            num_steps = len(tree.synthesis_route(node_id))
+            route_score = round(tree.route_score(node_id), 3)
+            svg = get_route_svg(tree, node_id)
+            if not svg:
+                st.warning(f"Could not generate SVG for route {node_id}.")
+                continue
+            st.image(
+                svg,
+                caption=f"Route {node_id}; {num_steps} steps; Route score: {route_score}",
+            )
+            routes_drawn += 1
+        except Exception as e:
+            st.error(f"Error displaying route {node_id}: {e}")
+def download_planning_results():
+    """6. Planning Results Download: offer the HTML report and the CSV statistics.
+
+    Links are rendered only for a finished, solved planning run. A failure
+    while preparing the CSV is reported on its own; any other failure is
+    reported as a download-link error.
+    """
+    stats = st.session_state.res
+    if not (
+        st.session_state.get("planning_done", False)
+        and stats
+        and stats.get("solved", False)
+    ):
+        return
+    tree = st.session_state.tree
+    try:
+        report_html = generate_results_html(tree, html_path=None, extended=True)
+        html_link = download_button(
+            report_html,
+            f"results_synplanner_{st.session_state.target_smiles}.html",
+            "Download results (HTML)",
+        )
+        if html_link:
+            st.markdown(html_link, unsafe_allow_html=True)
+        try:
+            csv_link = download_button(
+                pd.DataFrame(stats, index=[0]),
+                f"stats_synplanner_{st.session_state.target_smiles}.csv",
+                "Download statistics (CSV)",
+            )
+            if csv_link:
+                st.markdown(csv_link, unsafe_allow_html=True)
+        except Exception as e:
+            st.error(f"Could not prepare statistics CSV for download: {e}")
+    except Exception as e:
+        st.error(f"Error generating download links for planning results: {e}")
+def setup_clustering():
+    """7. Clustering: render the clustering section and run it on demand.
+
+    The section is shown only for a finished, solved planning run. Pressing
+    the button clears earlier clustering and subclustering results, composes
+    the route CGRs and their reduced forms from the stored tree, clusters the
+    routes, extracts the reactions and stores everything in the session state.
+
+    The script is rerun only after a successful clustering; on failure the
+    error message is left on screen instead of being wiped by a rerun.
+    """
+    planning_stats = st.session_state.res
+    if not (
+        st.session_state.get("planning_done", False)
+        and planning_stats
+        and planning_stats.get("solved", False)
+    ):
+        return
+    st.divider()
+    st.header("Clustering the retrosynthetic routes")
+    if not st.button("Run Clustering", key="submit_clustering_button"):
+        return
+    # A new clustering run invalidates everything derived from the old one.
+    st.session_state.clustering_done = False
+    st.session_state.subclustering_done = False
+    st.session_state.clusters = None
+    st.session_state.reactions_dict = None
+    st.session_state.subclusters = None
+    st.session_state.route_cgrs_dict = None
+    st.session_state.r_route_cgrs_dict = None
+    with st.spinner("Performing clustering..."):
+        try:
+            current_tree = st.session_state.tree
+            if not current_tree:
+                st.error("Tree object not found. Please re-run planning.")
+                return
+            st.write("Calculating RoutesCGRs...")
+            route_cgrs_dict = compose_all_route_cgrs(current_tree)
+            st.write("Processing ReducedRoutesCGRs...")
+            r_route_cgrs_dict = compose_all_reduced_route_cgrs(route_cgrs_dict)
+            results = cluster_routes(r_route_cgrs_dict, use_strat=False)
+            if results is not None:
+                # Order clusters by the numeric value of their key. Guarded so
+                # that a missing result reaches the failure branch below
+                # instead of raising on ``None.items()``.
+                results = dict(
+                    sorted(results.items(), key=lambda item: float(item[0]))
+                )
+            st.session_state.clusters = results
+            st.session_state.route_cgrs_dict = route_cgrs_dict
+            st.session_state.r_route_cgrs_dict = r_route_cgrs_dict
+            st.write("Extracting reactions...")
+            st.session_state.reactions_dict = extract_reactions(current_tree)
+            clustering_ok = (
+                st.session_state.clusters is not None
+                and st.session_state.reactions_dict is not None
+            )
+            st.session_state.clustering_done = clustering_ok
+            if clustering_ok:
+                st.success(
+                    f"Clustering complete. Found {len(st.session_state.clusters)} clusters."
+                )
+            else:
+                st.error("Clustering failed or returned empty results.")
+            # The dictionaries themselves stay referenced by the session state.
+            del results
+            gc.collect()
+            if clustering_ok:
+                # Rerun only on success: a rerun discards what was just
+                # rendered, which would hide the failure message.
+                st.rerun()
+        except Exception as e:
+            st.error(f"An error occurred during clustering: {e}")
+            st.session_state.clustering_done = False
def _render_cluster_preview(tree, cluster_num, group_data, in_expander=False):
    """Render the first route of one cluster next to its ReducedRouteCGR.

    Args:
        tree: Search tree; only ``synthesis_route`` and ``route_score`` are
            called on it here, and it is forwarded to ``get_route_svg``.
        cluster_num: Key of the cluster, used only in labels and messages.
        group_data: Mapping describing the cluster. ``node_ids`` is required;
            ``group_size`` and ``r_route_cgr`` are optional.
        in_expander: True when called for the clusters hidden in the expander;
            only changes the wording of the "no data" warning.
    """
    if not group_data or not group_data.get("node_ids"):
        suffix = " in expansion" if in_expander else ""
        st.warning(f"Cluster {cluster_num}{suffix} has no data or node_ids.")
        return

    st.markdown(
        f"**Cluster {cluster_num}** (Size: {group_data.get('group_size', 'N/A')})"
    )
    # The first node id is the one shown as the cluster's representative.
    node_id = group_data["node_ids"][0]
    try:
        num_steps = len(tree.synthesis_route(node_id))
        route_score = round(tree.route_score(node_id), 3)
        svg = get_route_svg(tree, node_id)

        r_route_cgr = group_data.get("r_route_cgr")
        r_route_cgr_svg = None
        if r_route_cgr:
            r_route_cgr.clean2d()
            r_route_cgr_svg = cgr_display(r_route_cgr)

        if not svg:
            st.warning(
                f"Could not generate SVG for route {node_id} or its ReducedRouteCGR."
            )
            return

        caption = f"Route {node_id}; {num_steps} steps; Route score: {route_score}"
        if r_route_cgr_svg:
            col1, col2 = st.columns([0.2, 0.8])
            with col1:
                st.image(r_route_cgr_svg, caption="ReducedRouteCGR")
            with col2:
                st.image(svg, caption=caption)
        else:
            # Only the route drawing is available.
            st.image(svg, caption=caption)
            st.warning(
                f"ReducedRouteCGR could not be displayed for cluster {cluster_num}."
            )
    except Exception as e:
        # One broken cluster must not stop the remaining ones from rendering.
        st.error(f"Error displaying route {node_id} for cluster {cluster_num}: {e}")


def display_clustering_results():
    """8. Clustering Results Display: show one route per cluster.

    Does nothing unless ``st.session_state.clustering_done`` is truthy. The
    first ``MAX_DISPLAY_CLUSTERS_DATA`` clusters are rendered directly and the
    rest inside an expander. If the clusters or the tree are missing, an error
    is shown and ``clustering_done`` is reset to False.
    """
    if not st.session_state.get("clustering_done", False):
        return

    clusters = st.session_state.clusters
    tree = st.session_state.tree
    MAX_DISPLAY_CLUSTERS_DATA = 10

    if clusters is None or tree is None:
        st.error(
            "Clustering results (clusters or tree) are missing. Please re-run clustering."
        )
        st.session_state.clustering_done = False
        return

    st.subheader(f"Best routes from {len(clusters)} Found Clusters")
    clusters_items = list(clusters.items())
    first_items = clusters_items[:MAX_DISPLAY_CLUSTERS_DATA]
    remaining_items = clusters_items[MAX_DISPLAY_CLUSTERS_DATA:]

    for cluster_num, group_data in first_items:
        _render_cluster_preview(tree, cluster_num, group_data)

    if remaining_items:
        with st.expander(f"... and {len(remaining_items)} more clusters"):
            for cluster_num, group_data in remaining_items:
                _render_cluster_preview(
                    tree, cluster_num, group_data, in_expander=True
                )
def _safe_filename_part(text):
    """Return ``text`` as a string with path separators replaced by ``_``.

    SMILES strings may contain ``/`` and ``\\``; inside a ZIP member name a
    ``/`` would create nested directories instead of a single file.
    """
    return str(text).replace("/", "_").replace("\\", "_")


def download_clustering_results():
    """10. Clustering Results Download: offer per-cluster HTML reports and a ZIP.

    Does nothing unless ``st.session_state.clustering_done`` is truthy. Each
    report is generated once with ``routes_clustering_report`` and reused for
    both its own download button and the ZIP archive. The first
    ``MAX_DOWNLOAD_LINKS_DISPLAYED`` buttons are shown directly, the rest inside
    an expander. Reports that fail to generate are reported with ``st.error``
    and left out of the ZIP.
    """
    if not st.session_state.get("clustering_done", False):
        return

    tree_for_html = st.session_state.get("tree")
    clusters_for_html = st.session_state.get("clusters")
    # May be None; it is passed through to routes_clustering_report unchanged.
    r_route_cgrs_for_html = st.session_state.get("r_route_cgrs_dict")

    if not tree_for_html:
        st.warning("MCTS Tree data not found. Cannot generate cluster reports.")
        return
    if not clusters_for_html:
        st.warning("Cluster data not found. Cannot generate cluster reports.")
        return

    st.subheader("Cluster Reports")
    st.write("Generate downloadable HTML reports for each cluster:")

    MAX_DOWNLOAD_LINKS_DISPLAYED = 10
    target_tag = _safe_filename_part(st.session_state.get("target_smiles", ""))
    cluster_ids = list(clusters_for_html)
    generated_reports = {}  # cluster id -> HTML, reused when building the ZIP

    def offer_report(cluster_id, key_prefix, error_suffix=""):
        """Generate one report, remember it, and show its download button."""
        try:
            html_content = routes_clustering_report(
                tree_for_html,
                clusters_for_html,
                str(cluster_id),
                r_route_cgrs_for_html,
                aam=False,
            )
            generated_reports[cluster_id] = html_content
            st.download_button(
                label=f"Download report for cluster {cluster_id}",
                data=html_content,
                file_name=f"cluster_{cluster_id}_{target_tag}.html",
                mime="text/html",
                key=f"{key_prefix}{cluster_id}",
            )
        except Exception as e:
            st.error(
                f"Error generating report for cluster {cluster_id}{error_suffix}: {e}"
            )

    for cluster_id in cluster_ids[:MAX_DOWNLOAD_LINKS_DISPLAYED]:
        offer_report(cluster_id, "download_cluster_")

    remaining_ids = cluster_ids[MAX_DOWNLOAD_LINKS_DISPLAYED:]
    if remaining_ids:
        with st.expander(f"Show remaining {len(remaining_ids)} cluster reports"):
            for cluster_id in remaining_ids:
                offer_report(
                    cluster_id, "download_cluster_expanded_", " (expanded)"
                )

    if not generated_reports:
        st.warning("No cluster reports could be generated; ZIP download skipped.")
        return

    try:
        buffer = io.BytesIO()
        with zipfile.ZipFile(
            buffer, mode="w", compression=zipfile.ZIP_DEFLATED
        ) as zf:
            for cluster_id, html_content in generated_reports.items():
                zf.writestr(f"cluster_{cluster_id}_{target_tag}.html", html_content)
        buffer.seek(0)
        st.download_button(
            label="📦 Download all cluster reports as ZIP",
            data=buffer,
            file_name=f"all_cluster_reports_{target_tag}.zip",
            mime="application/zip",
            key="download_all_clusters_zip",
        )
    except Exception as e:
        st.error(f"Error generating ZIP file for cluster reports: {e}")
def setup_subclustering():
    """11. Subclustering: show the "Run Subclustering Analysis" button and run it.

    Only rendered once ``st.session_state.clustering_done`` is truthy. On a
    button press the previous subclustering state is cleared, then
    ``subcluster_all_clusters`` is called with the stored clusters,
    ReducedRouteCGRs and RouteCGRs. The result is saved to
    ``st.session_state.subclusters`` and the script is rerun. If any input is
    missing or empty, an error naming the missing pieces is shown instead.
    """
    if not st.session_state.get("clustering_done", False):
        return

    st.divider()
    st.header("Sub-Clustering within a selected Cluster")
    if not st.button("Run Subclustering Analysis", key="submit_subclustering_button"):
        return

    # Drop any earlier result before starting a new analysis.
    st.session_state.subclustering_done = False
    st.session_state.subclusters = None

    with st.spinner("Performing subclustering analysis..."):
        try:
            # (label used in the error message, stored value) for each input.
            required_inputs = (
                ("clusters", st.session_state.get("clusters")),
                (
                    "ReducedRouteCGRs dictionary",
                    st.session_state.get("r_route_cgrs_dict"),
                ),
                ("RouteCGRs dictionary", st.session_state.get("route_cgrs_dict")),
            )
            absent = [label for label, value in required_inputs if not value]
            if absent:
                st.error(
                    f"Cannot run subclustering. Missing data: {', '.join(absent)}. Please ensure clustering ran successfully."
                )
                st.session_state.subclustering_done = False
            else:
                cluster_map, reduced_cgrs, full_cgrs = (
                    value for _, value in required_inputs
                )
                st.session_state.subclusters = subcluster_all_clusters(
                    cluster_map,
                    reduced_cgrs,
                    full_cgrs,
                )
                st.session_state.subclustering_done = True
                st.success("Subclustering analysis complete.")
                gc.collect()
                st.rerun()
        except Exception as e:
            st.error(f"An error occurred during subclustering: {e}")
            st.session_state.subclustering_done = False
def _render_subcluster_route(tree, route_id, expanded=False):
    """Draw one route of a subcluster with its score as the caption.

    Args:
        tree: Search tree; ``route_score`` is called on it and it is forwarded
            to ``get_route_svg``.
        route_id: Identifier of the route to draw.
        expanded: True when called from inside the "more routes" expander;
            only changes the wording of the error message.
    """
    try:
        route_score_sub = round(tree.route_score(route_id), 3)
        svg_sub = get_route_svg(tree, route_id)
        if svg_sub:
            st.image(
                svg_sub,
                caption=f"Route {route_id}; Score: {route_score_sub}",
            )
        else:
            st.warning(f"Could not generate SVG for route {route_id}.")
    except Exception as e:
        suffix = " (expanded)" if expanded else ""
        st.error(f"Error displaying route {route_id} in subcluster{suffix}: {e}")


def display_subclustering_results():
    """12. Subclustering Results Display: pick a subcluster and show its routes.

    Does nothing unless ``st.session_state.subclustering_done`` is truthy. The
    left column holds two select boxes (cluster, then subcluster index) and the
    ReducedRouteCGR stored with the selected subcluster. The right column shows
    the subcluster's synthon reaction and its routes, with the first
    ``MAX_ROUTES_PER_SUBCLUSTER`` shown directly and the rest in an expander.
    If the subclusters or the tree are missing, an error is shown and
    ``subclustering_done`` is reset to False.
    """
    if not st.session_state.get("subclustering_done", False):
        return

    sub = st.session_state.get("subclusters")
    tree = st.session_state.get("tree")
    if not sub or not tree:
        st.error(
            "Subclustering results (subclusters or tree) are missing. Please re-run subclustering."
        )
        st.session_state.subclustering_done = False
        return

    sub_input_col, sub_display_col = st.columns([0.25, 0.75])

    with sub_input_col:
        st.subheader("Select Cluster and Subcluster")
        available_cluster_nums = list(sub.keys())
        if not available_cluster_nums:
            st.warning("No clusters available in subclustering results.")
            return

        user_input_cluster_num_display = st.selectbox(
            "Select Cluster #:",
            options=sorted(available_cluster_nums),
            key="subcluster_num_select_key",
        )
        # Fallback index used when the cluster has no subclusters to choose from.
        selected_subcluster_idx = 0

        if user_input_cluster_num_display not in sub:
            st.warning(
                f"Selected cluster {user_input_cluster_num_display} not found in subclustering results."
            )
            return

        sub_step_cluster = sub[user_input_cluster_num_display]
        allowed_subclusters_indices = sorted(sub_step_cluster.keys())
        if not allowed_subclusters_indices:
            st.warning(
                f"No reaction steps (subclusters) found for Cluster {user_input_cluster_num_display}."
            )
        else:
            selected_subcluster_idx = st.selectbox(
                "Select Subcluster Index:",
                options=allowed_subclusters_indices,
                key="subcluster_index_select_key",
            )
            if selected_subcluster_idx in sub_step_cluster:
                current_subcluster_data = sub_step_cluster[selected_subcluster_idx]
                if "r_route_cgr" in current_subcluster_data:
                    cluster_r_route_cgr_display = current_subcluster_data[
                        "r_route_cgr"
                    ]
                    try:
                        cluster_r_route_cgr_display.clean2d()
                        st.image(
                            cluster_r_route_cgr_display.depict(),
                            caption=f"ReducedRouteCGR of parent Cluster {user_input_cluster_num_display}",
                        )
                    except Exception as e_depict:
                        # A depiction failure should not take the page down.
                        st.warning(f"Could not depict ReducedRouteCGR: {e_depict}")
                else:
                    st.warning("ReducedRouteCGR for this subcluster not found.")

    with sub_display_col:
        st.subheader("Subcluster Details")
        if not (
            user_input_cluster_num_display in sub
            and selected_subcluster_idx in sub[user_input_cluster_num_display]
        ):
            st.info("Select a valid cluster and subcluster index to see details.")
            return

        subcluster_to_display = sub[user_input_cluster_num_display][
            selected_subcluster_idx
        ]
        if not subcluster_to_display or not subcluster_to_display.get("nodes_data"):
            st.info("No routes or data found for this subcluster selection.")
            return

        MAX_ROUTES_PER_SUBCLUSTER = 5
        all_route_ids_in_subcluster = list(subcluster_to_display["nodes_data"].keys())
        routes_to_display_direct = all_route_ids_in_subcluster[
            :MAX_ROUTES_PER_SUBCLUSTER
        ]
        remaining_routes_sub = all_route_ids_in_subcluster[MAX_ROUTES_PER_SUBCLUSTER:]

        st.markdown(
            f"--- \n**Subcluster {user_input_cluster_num_display}.{selected_subcluster_idx}** (Size: {len(all_route_ids_in_subcluster)})"
        )

        if "synthon_reaction" in subcluster_to_display:
            synthon_reaction = subcluster_to_display["synthon_reaction"]
            try:
                # Layout and depiction are both covered by the same handler.
                synthon_reaction.clean2d()
                st.image(
                    depict_custom_reaction(synthon_reaction),
                    caption="Markush-like pseudo reaction of subcluster",
                )
            except Exception as e_depict:
                st.warning(f"Could not depict synthon reaction: {e_depict}")
        else:
            st.info("No synthon reaction data for this subcluster.")

        for route_id in routes_to_display_direct:
            _render_subcluster_route(tree, route_id)

        if remaining_routes_sub:
            with st.expander(
                f"... and {len(remaining_routes_sub)} more routes in this subcluster"
            ):
                for route_id in remaining_routes_sub:
                    _render_subcluster_route(tree, route_id, expanded=True)
def download_subclustering_results():
    """13. Subclustering Results Download: offer an HTML report for the selection.

    Runs only when subclustering is done and both select-box keys
    (``subcluster_num_select_key``, ``subcluster_index_select_key``) exist in
    the session state. The selected subcluster is passed through
    ``post_process_subgroup``, gets a ``group_lgs`` entry from
    ``group_by_identical_values``, and is rendered with
    ``routes_subclustering_report``. Any failure along that path is shown with
    ``st.error`` instead of propagating. Nothing is shown when the stored
    selection does not exist in the subclustering results.
    """
    if not (
        st.session_state.get("subclustering_done", False)
        and "subcluster_num_select_key" in st.session_state
        and "subcluster_index_select_key" in st.session_state
    ):
        return

    sub = st.session_state.get("subclusters")
    tree = st.session_state.get("tree")
    r_route_cgrs_for_report = st.session_state.get("r_route_cgrs_dict")
    user_input_cluster_num_display = st.session_state.subcluster_num_select_key
    selected_subcluster_idx = st.session_state.subcluster_index_select_key

    if not tree or not sub or not r_route_cgrs_for_report:
        st.warning(
            "Missing data for subclustering report generation (tree, subclusters, or ReducedRouteCGRs)."
        )
        return

    if not (
        user_input_cluster_num_display in sub
        and selected_subcluster_idx in sub[user_input_cluster_num_display]
    ):
        return

    subcluster_data_for_report = sub[user_input_cluster_num_display][
        selected_subcluster_idx
    ]
    try:
        # NOTE(review): the on-screen display currently shows the raw subcluster
        # data (its post-processing call is commented out), so the report may
        # differ from what is displayed — confirm this is intended.
        processed_subcluster_data = post_process_subgroup(subcluster_data_for_report)
        nodes_data = subcluster_data_for_report.get("nodes_data")
        if isinstance(nodes_data, dict):
            processed_subcluster_data["group_lgs"] = group_by_identical_values(
                nodes_data
            )
        else:
            processed_subcluster_data["group_lgs"] = {}

        subcluster_html_content = routes_subclustering_report(
            tree,
            processed_subcluster_data,
            user_input_cluster_num_display,
            selected_subcluster_idx,
            r_route_cgrs_for_report,
            if_lg_group=True,
        )
        st.download_button(
            label=f"Download report for subcluster {user_input_cluster_num_display}.{selected_subcluster_idx}",
            data=subcluster_html_content,
            file_name=f"subcluster_{user_input_cluster_num_display}.{selected_subcluster_idx}_{st.session_state.target_smiles}.html",
            mime="text/html",
            key=f"download_subcluster_{user_input_cluster_num_display}_{selected_subcluster_idx}",
        )
    except Exception as e:
        st.error(
            f"Error generating download report for subcluster {user_input_cluster_num_display}.{selected_subcluster_idx}: {e}"
        )
def implement_restart():
    """14. Restart: render the reset button and wipe stored results on click.

    When the button is pressed, every listed session-state key that exists is
    deleted, ``ketcher`` is set back to ``DEFAULT_MOL``, ``target_smiles`` is
    set to an empty string, and the script is rerun.
    """
    st.divider()
    st.header("Restart Application State")
    if not st.button("Clear All Results & Restart", key="restart_button"):
        return

    planning_keys = ("planning_done", "tree", "res", "target_smiles")
    clustering_keys = (
        "clustering_done",
        "clusters",
        "reactions_dict",
        "num_clusters_setting",
        "route_cgrs_dict",
        "r_route_cgrs_dict",
    )
    subclustering_keys = ("subclustering_done", "subclusters", "clusters_downloaded")
    # Keys that belong to widgets rather than to computed results.
    widget_keys = (
        "ketcher_widget",
        "smiles_text_input_key",
        "subcluster_num_select_key",
        "subcluster_index_select_key",
    )

    for stale_key in planning_keys + clustering_keys + subclustering_keys + widget_keys:
        if stale_key in st.session_state:
            del st.session_state[stale_key]

    # Restore the molecule input and make sure no stale target is left behind.
    st.session_state.ketcher = DEFAULT_MOL
    st.session_state.target_smiles = ""
    st.rerun()
# --- Main Application Flow ---
def _planning_solved():
    """Return True when planning finished and its result is flagged as solved."""
    return bool(
        st.session_state.get("planning_done", False)
        and st.session_state.res
        and st.session_state.res.get("solved", False)
    )


def _render_planning_statistics():
    """Show the planning statistics table, falling back to the raw result dict."""
    st.subheader("Statistics")
    # Bound before the try so the fallback in the except branch can always use it.
    res = st.session_state.res
    try:
        if "target_smiles" not in res and "target_smiles" in st.session_state:
            res["target_smiles"] = st.session_state.target_smiles
        wanted_columns = (
            "target_smiles",
            "num_routes",
            "num_nodes",
            "num_iter",
            "search_time",
        )
        cols_to_show = [col for col in wanted_columns if col in res]
        if cols_to_show:
            df = pd.DataFrame(res, index=[0])[cols_to_show]
            st.dataframe(df)
        else:
            st.write("No statistics to display from planning results.")
    except Exception as e:
        st.error(f"Error displaying statistics: {e}")
        st.write(res)  # Show the raw dict if the DataFrame could not be built


def _render_cluster_statistics():
    """Show the routes-per-cluster table and the best-route download button."""
    clusters = st.session_state.clusters
    st.subheader("Cluster Statistics")
    # Empty clusters are skipped; a missing "group_size" counts as 0 instead
    # of raising KeyError.
    rows = [
        (cluster_id, cluster.get("group_size", 0))
        for cluster_id, cluster in clusters.items()
        if cluster
    ]
    if not rows:
        st.write("No cluster data to display statistics for.")
        return

    cluster_df = pd.DataFrame(
        {
            "Cluster": [cluster_id for cluster_id, _ in rows],
            "Number of Routes": [size for _, size in rows],
        }
    )
    cluster_df.index += 1  # start the row index at 1
    st.dataframe(cluster_df)

    best_route_html = html_top_routes_cluster(
        clusters,
        st.session_state.tree,
        st.session_state.target_smiles,
    )
    st.download_button(
        label="Download best route from each cluster",
        data=best_route_html,
        file_name=f"cluster_best_{st.session_state.target_smiles}.html",
        mime="text/html",
        key="download_cluster_best",
    )


def main():
    """Run the page top to bottom: input, planning, clustering, subclustering, restart.

    Each section is rendered only when the session-state flag of the previous
    stage (``planning_done`` with a solved result, ``clustering_done``,
    ``subclustering_done``) is set.
    """
    initialize_app()
    setup_sidebar()
    current_smile_code = handle_molecule_input()
    # Keep the stored molecule in sync with the input; no rerun is triggered.
    if st.session_state.get("ketcher") != current_smile_code:
        st.session_state.ketcher = current_smile_code

    setup_planning_options()

    # Planning results, statistics and downloads.
    if st.session_state.get("planning_done", False):
        display_planning_results()
        if st.session_state.res and st.session_state.res.get("solved", False):
            stat_col, download_col = st.columns(2, gap="medium")
            with stat_col:
                _render_planning_statistics()
            with download_col:
                st.subheader("Planning Downloads")
                download_planning_results()

    # Clustering section (button, display, statistics, downloads).
    if _planning_solved():
        setup_clustering()
        if st.session_state.get("clustering_done", False):
            display_clustering_results()
        # The flag is read again on purpose: display_clustering_results() resets
        # it when clusters or the tree are missing, and the statistics below
        # would otherwise dereference missing data.
        if st.session_state.get("clustering_done", False):
            cluster_stat_col, cluster_download_col = st.columns(2, gap="medium")
            with cluster_stat_col:
                _render_cluster_statistics()
            with cluster_download_col:
                download_clustering_results()

    # Subclustering section (button, display, download).
    if st.session_state.get("clustering_done", False):
        setup_subclustering()
        if st.session_state.get("subclustering_done", False):
            display_subclustering_results()
            # Must run after the display so the select-box values exist.
            download_subclustering_results()

    implement_restart()


if __name__ == "__main__":
    main()

cluster/clustering.py DELETED Viewed

@@ -1,174 +0,0 @@
-import numpy as np
-import pandas as pd
-from scipy.spatial.distance import squareform
-from scipy.cluster.hierarchy import fcluster
-from sklearn.metrics import silhouette_score, calinski_harabasz_score
-import fastcluster
-def tanimoto_similarity_continuous(matrix_1, matrix_2):
-    """
-    Compute the pairwise continuous Tanimoto (Jaccard) coefficient between rows.
-    For a row x of matrix_1 and a row y of matrix_2 the coefficient is
-    <x, y> / (|x|^2 + |y|^2 - <x, y>).
-    Adopted from https://github.com/cimm-kzn/CIMtools/blob/master/CIMtools/metrics/pairwise.py
-    :param matrix_1: 2D array of features, one sample per row.
-    :param matrix_2: 2D array of features, one sample per row.
-    :return: 2D array with one row per sample of matrix_1 and one column per sample
-        of matrix_2. Undefined 0/0 entries are set to 0; when both inputs have the
-        same shape the diagonal is forced to 1.0.
-    """
-    x_dot = np.dot(matrix_1, matrix_2.T)
-    x2 = np.asarray((matrix_1**2).sum(axis=1))
-    y2 = np.asarray((matrix_2**2).sum(axis=1))
-    # Broadcasting builds the |x|^2 + |y|^2 grid without replicating the vectors in Python lists.
-    denominator = x2[:, np.newaxis] + y2[np.newaxis, :] - x_dot
-    # All-zero rows produce 0/0; the warning is expected and the NaN is mapped to 0 below.
-    with np.errstate(divide="ignore", invalid="ignore"):
-        result = x_dot / denominator
-    result[np.isnan(result)] = 0
-    if matrix_1.shape == matrix_2.shape:
-        # Keeps self-similarity at 1 even for all-zero rows (the NaN -> 0 step would otherwise give 0).
-        np.fill_diagonal(result, 1.0)
-    return result
-def calculate_fingerprints(cgrs, fingerprint_method):
-    """Build the fingerprint array for every CGR in the collection.
-    Each CGR is handed to ``fingerprint_method.transform`` on its own, wrapped in a
-    one-element list, and the first row of the output is kept.
-    Args:
-        cgrs (dict): Dictionary of CGRs; only the values are used
-        fingerprint_method: Initialized fingerprint calculator exposing ``transform``
-    Returns:
-        np.ndarray: Array of fingerprints, one entry per CGR in dictionary order
-    """
-    per_cgr = [fingerprint_method.transform([item])[0] for item in cgrs.values()]
-    return np.array(per_cgr)
-def create_similarity_matrix(fingerprints, labels):
-    """Compute the all-against-all Tanimoto similarity of the fingerprints.
-    Args:
-        fingerprints (np.ndarray): Array of fingerprints, one per row
-        labels (list): Labels used for both the rows and the columns
-    Returns:
-        pd.DataFrame: Square similarity matrix indexed by ``labels`` on both axes
-    """
-    pairwise = tanimoto_similarity_continuous(fingerprints, fingerprints)
-    return pd.DataFrame(data=pairwise, index=labels, columns=labels)
-def calculate_linkage(similarity_df, method='average'):
-    """Turn a similarity matrix into a hierarchical-clustering linkage matrix.
-    The similarities are converted to distances as ``1 - similarity`` and
-    condensed with ``squareform`` before being passed to ``fastcluster.linkage``.
-    Args:
-        similarity_df (pd.DataFrame): Square similarity matrix
-        method (str): Linkage method forwarded to ``fastcluster.linkage``
-    Returns:
-        np.ndarray: Linkage matrix
-    """
-    condensed = squareform(1 - similarity_df)
-    return fastcluster.linkage(condensed, method=method)
-def optimal_cluster_num(Z, distance_matrix, max_clusters=10):
-    """Find optimal number of clusters using silhouette score.
-    The linkage is cut with the 'maxclust' criterion for every candidate count and
-    the candidate with the highest silhouette score is returned.
-    Args:
-        Z (np.ndarray): Linkage matrix
-        distance_matrix (np.ndarray): Precomputed square distance matrix
-        max_clusters (int): Upper bound of the candidate range. The bound is
-            exclusive: candidates are 2 .. max_clusters - 1.
-    Returns:
-        int: Optimal number of clusters
-    """
-    # NOTE(review): range() excludes max_clusters itself, and for max_clusters <= 2 the
-    # range is empty, so np.argmax below would receive an empty list - confirm intent.
-    cluster_range = range(2, max_clusters)
-    silhouette_scores = []
-    for n_clusters in cluster_range:
-        cluster_labels = fcluster(Z, n_clusters, criterion='maxclust')
-        # The distances are supplied directly, hence metric='precomputed'.
-        score = silhouette_score(distance_matrix, cluster_labels, metric='precomputed')
-        silhouette_scores.append(score)
-    # argmax returns the first best position, so ties go to the smaller cluster count.
-    return cluster_range[np.argmax(silhouette_scores)]
-def perform_clustering(Z, similarity_df, threshold=0.0, max_clusters=10):
-    """Cut the linkage by distance, falling back to a silhouette-optimised cut.
-    The tree is first cut at ``threshold``. Only when that produces a cluster label
-    larger than ``max_clusters`` is the number of clusters chosen with
-    ``optimal_cluster_num`` and the tree re-cut with the 'maxclust' criterion.
-    Args:
-        Z (np.ndarray): Linkage matrix
-        similarity_df (pd.DataFrame): Similarity matrix; ``1 - similarity_df`` is
-            used as the distance matrix in the fallback
-        threshold (float): Distance threshold for initial clustering
-        max_clusters (int): Maximum number of clusters
-    Returns:
-        np.ndarray: Cluster labels
-    """
-    labels_by_distance = fcluster(Z, t=threshold, criterion='distance')
-    if np.unique(labels_by_distance).max() <= max_clusters:
-        return labels_by_distance
-    best_n = optimal_cluster_num(Z, 1 - similarity_df, max_clusters)
-    return fcluster(Z, best_n, criterion='maxclust')
-def create_clusters_dict(cluster_labels, labels):
-    """Group item labels by the cluster they were assigned to.
-    Args:
-        cluster_labels (np.ndarray): Cluster assignment of every item
-        labels (np.ndarray): Item labels in the same order; must support indexing
-            with an integer index array
-    Returns:
-        dict: Dictionary mapping cluster numbers to lists of member labels
-    """
-    return {
-        cluster_id: list(labels[np.where(cluster_labels == cluster_id)[0]])
-        for cluster_id in np.unique(cluster_labels)
-    }
-def cluster_molecules(cgrs, fingerprint_method, threshold=0.0, max_clusters=10, linkage_method='average'):
-    """Run the full clustering pipeline: fingerprints, similarity, linkage, cut.
-    Args:
-        cgrs (dict): Dictionary of CGRs; the keys become the item labels
-        fingerprint_method: Initialized fingerprint calculator
-        threshold (float): Distance threshold for clustering
-        max_clusters (int): Maximum number of clusters
-        linkage_method (str): Method for hierarchical clustering
-    Returns:
-        dict: Clustering results with the keys 'clusters_dict', 'cluster_labels',
-            'similarity_matrix' and 'linkage_matrix'
-    """
-    route_labels = list(cgrs.keys())
-    # Fingerprints -> labelled similarity matrix.
-    similarity_df = create_similarity_matrix(
-        calculate_fingerprints(cgrs, fingerprint_method), route_labels
-    )
-    # Hierarchical clustering on the derived distances.
-    Z = calculate_linkage(similarity_df, method=linkage_method)
-    cluster_labels = perform_clustering(Z, similarity_df, threshold, max_clusters)
-    return dict(
-        clusters_dict=create_clusters_dict(cluster_labels, np.array(route_labels)),
-        cluster_labels=cluster_labels,
-        similarity_matrix=similarity_df,
-        linkage_matrix=Z,
-    )

cluster/generalized_cgr.py DELETED Viewed

@@ -1,204 +0,0 @@
-def find_next_atom_num(accum_cgr, reactions):
-    """Return one more than the largest atom number found in any reaction CGR.
-    Every reaction is composed into a CGR and the keys of its ``_atoms`` mapping are
-    inspected. ``accum_cgr`` is accepted but not used. With no reactions the
-    result is 1.
-    """
-    highest = max([0] + [max(rxn.compose()._atoms.keys()) for rxn in reactions])
-    return highest + 1
-def get_clean_mapping(curr_prod, prod, reverse=False):
-    """Build an atom-number remapping from the first mapping between two molecules.
-    Only the first mapping produced by ``curr_prod.get_mapping(prod)`` is used.
-    A pair is dropped when the atom maps onto itself, when the mapped number is
-    itself a key that does not map straight back (anything but a plain swap), or
-    when the target number already exists in the checked molecule (``curr_prod``
-    when ``reverse`` is True, ``prod`` otherwise).
-    NOTE(review): if ``get_mapping`` yields ``{curr_prod atom: prod atom}``, every
-    target is already present in the checked molecule and the result would always
-    be empty - confirm the intended direction against the CGRtools API.
-    Args:
-        curr_prod: Molecule providing ``get_mapping`` and ``_atoms``.
-        prod: Molecule to map onto; must provide ``_atoms``.
-        reverse (bool): If True, emit ``value -> key`` pairs instead of ``key -> value``.
-    Returns:
-        dict: Source atom number -> target atom number; empty when no mapping exists.
-    """
-    # Take the first mapping lazily: enumerating every mapping only to use the first
-    # one is wasted work, especially for symmetric molecules.
-    first_mapping = next(iter(curr_prod.get_mapping(prod)), None)
-    if first_mapping is None:
-        return {}
-    # Atom numbers already taken in the molecule that would receive the new numbers.
-    occupied = set(curr_prod._atoms.keys()) if reverse else set(prod._atoms.keys())
-    dict_map = {}
-    for key, value in first_mapping.items():
-        if key == value:
-            continue
-        if value in first_mapping and first_mapping[value] != key:
-            # Skip cyclic mappings that could cause conflicts
-            continue
-        source, target = (value, key) if reverse else (key, value)
-        if target in occupied:
-            continue
-        dict_map[source] = target
-    return dict_map
-def validate_molecule_components(curr_mol, node_id):
-    """Report when a molecule consists of more than one connected component.
-    Nothing is raised or returned; the problem is only printed.
-    Args:
-        curr_mol: Molecule exposing ``connected_components``.
-        node_id: Identifier of the route node, used in the message.
-    """
-    # Counting the components is enough; building a substructure for each one is not needed.
-    component_count = sum(1 for _ in curr_mol.connected_components)
-    if component_count > 1:
-        print(f'Error tree {node_id}: We have more than one molecule in one node')
-def get_leaving_groups(products):
-    """Collect the atom numbers of every product except the first one.
-    The first product is treated as the main product and skipped; the ``_atoms``
-    keys of all remaining products are concatenated in order.
-    """
-    return [
-        atom_num
-        for position, product in enumerate(products)
-        if position != 0
-        for atom_num in product._atoms.keys()
-    ]
-def process_first_reaction(first_react, tree, node_id, min_mol_size):
-    """Process first reaction in the route and initialize building block set.
-    A reactant counts as a building block when it has at most ``min_mol_size`` atoms
-    or its string form is listed in ``tree.building_blocks``. Every reactant is also
-    checked for being a single connected molecule.
-    Args:
-        first_react: Reaction whose ``reactants`` are inspected.
-        tree: Synthesis tree providing ``building_blocks``.
-        node_id: Route node identifier, used for diagnostics only.
-        min_mol_size (int): Size limit below which a reactant is a building block.
-    Returns:
-        set: Atom numbers of all building-block reactants.
-    """
-    bb_set = set()
-    for curr_mol in first_react.reactants:
-        if len(curr_mol) <= min_mol_size or str(curr_mol) in tree.building_blocks:
-            # Accumulate rather than overwrite, so that every building-block reactant
-            # is kept (update_reaction_dict accumulates in the same way).
-            bb_set |= set(curr_mol._atoms)
-        validate_molecule_components(curr_mol, node_id)
-    return bb_set
-def update_reaction_dict(reaction, node_id, mapping, react_dict, tree, min_mol_size, bb_set, prev_remap=None):
-    """Record, per reactant, the part of the remapping that touches its atoms.
-    For every reactant the entries of ``mapping`` (and then of ``prev_remap``, which
-    takes precedence) whose keys are atoms of that reactant are stored in
-    ``react_dict`` under the tuple of the reactant's atom numbers. Reactants that
-    qualify as building blocks add their atoms to ``bb_set``.
-    Returns the updated ``react_dict`` (modified in place) and the new ``bb_set``.
-    """
-    for reactant in reaction.reactants:
-        atom_key = tuple(reactant._atoms)
-        atom_nums = set(atom_key)
-        validate_molecule_components(reactant, node_id)
-        is_building_block = len(reactant) <= min_mol_size or str(reactant) in tree.building_blocks
-        if is_building_block:
-            bb_set = bb_set.union(atom_nums)
-        # Keep only the remapping entries whose source atom belongs to this reactant.
-        entry = {}
-        for old_num, new_num in mapping.items():
-            if old_num in atom_nums:
-                entry[old_num] = new_num
-        if prev_remap:
-            for old_num, new_num in prev_remap.items():
-                if old_num in atom_nums:
-                    entry[old_num] = new_num
-        react_dict[atom_key] = entry
-    return react_dict, bb_set
-def process_target_blocks(curr_products, curr_prod, lg_atom_nums, curr_lg_atom_nums, bb_set):
-    """Process and collect target blocks for remapping.
-    Only runs when there is more than one product. For every product whose atom
-    numbers differ from those of ``curr_prod``, an atom number is collected once if
-    it is a leaving-group atom (``lg_atom_nums`` or ``curr_lg_atom_nums``) and once
-    more if it is in ``bb_set``, so a number may appear twice.
-    Args:
-        curr_products: Products of the current step, each exposing ``_atoms``.
-        curr_prod: Main product of the current step.
-        lg_atom_nums: Leaving-group atom numbers of the accumulated CGR.
-        curr_lg_atom_nums: Leaving-group atom numbers of the current step.
-        bb_set: Atom numbers belonging to building blocks.
-    Returns:
-        list: Atom numbers that need remapping, in encounter order.
-    """
-    target_block = []
-    if len(curr_products) <= 1:
-        return target_block
-    # One set lookup instead of scanning two lists for every atom.
-    leaving = set(lg_atom_nums).union(curr_lg_atom_nums)
-    main_atoms = curr_prod._atoms.keys()
-    for prod in curr_products:
-        # The per-product mapping computed here previously was never used and was dropped.
-        if prod._atoms.keys() == main_atoms:
-            continue
-        for key in list(prod._atoms.keys()):
-            if key in leaving:
-                target_block.append(key)
-            if key in bb_set:
-                target_block.append(key)
-    return target_block
-def process_single_route(tree, node_id, min_mol_size=6):
-    """Process a single synthesis route maintaining consistent state.
-    Walks the reactions returned by ``tree.synthesis_route(node_id)`` from the last
-    list element towards the first, renumbering atoms where needed and composing
-    every step CGR with the CGR accumulated so far.
-    Args:
-        tree: Synthesis tree providing ``synthesis_route`` and ``building_blocks``.
-        node_id: Identifier of the route node to process.
-        min_mol_size (int): Size limit below which a reactant counts as a building block.
-    Returns:
-        dict | None: ``{'cgr': <accumulated CGR>}``, or None when any exception
-        occurred (the error is printed, not raised).
-    """
-    try:
-        reactions = tree.synthesis_route(node_id)
-        # The accumulated CGR starts from the LAST reaction of the list.
-        first_react = reactions[-1]
-        accum_cgr = first_react.compose()
-        bb_set = process_first_reaction(first_react, tree, node_id, min_mol_size)
-        # Maps a tuple of atom numbers to the remapping recorded for that molecule.
-        react_dict = {}
-        # Next atom number not used by any reaction of the route.
-        max_num = find_next_atom_num(accum_cgr, reactions)
-        # Remaining reactions are visited in reverse list order.
-        for step in range(len(reactions) - 2, -1, -1):
-            # print("\nProcessing step:", step + 1)
-            reaction = reactions[step]
-            curr_cgr = reaction.compose()
-            curr_prod = reaction.products[0]
-            accum_products = accum_cgr.decompose()[1].split()
-            lg_atom_nums = get_leaving_groups(accum_products)
-            # NOTE: taken before the prev_remap below, so these carry the original numbering.
-            curr_products = curr_cgr.decompose()[1].split()
-            tuple_atoms = tuple(curr_prod._atoms)
-            prev_remap = {}
-            # Re-apply a remapping recorded earlier for a molecule with the same atom numbers.
-            if tuple_atoms in react_dict.keys() and len(react_dict[tuple_atoms]) != 0:
-                prev_remap = react_dict[tuple_atoms]
-                curr_cgr = curr_cgr.remap(prev_remap, copy=True)
-            # Atoms of every product except the first one of the current step.
-            curr_lg_atom_nums = []
-            for i in range(1, len(curr_products)):
-                prod = curr_products[i]
-                curr_lg_atom_nums += list(prod._atoms.keys())
-            target_block = process_target_blocks(curr_products, curr_prod, lg_atom_nums, curr_lg_atom_nums, bb_set)
-            # Give each colliding atom that is present in the accumulated CGR a fresh number.
-            mapping = {}
-            for atom_num in sorted(target_block):
-                if atom_num in accum_cgr._atoms and atom_num not in mapping:
-                    mapping[atom_num] = max_num
-                    max_num += 1
-            # NOTE(review): dict_map is overwritten on every iteration, so only the mapping
-            # for the last accumulated product reaches the remap below; it is also undefined
-            # when accum_products is empty - confirm whether the remap belongs inside the loop.
-            for i in range(len(accum_products)):
-                accum_prod = accum_products[i]
-                dict_map = get_clean_mapping(curr_prod, accum_prod, reverse=True)
-            if dict_map:
-                curr_cgr.remap(dict_map)
-            # Idea noted by the author: maybe remap, then decompose and pass to the building blocks.
-            react_dict, bb_set = update_reaction_dict(reaction, node_id, mapping, react_dict, tree, min_mol_size, bb_set, prev_remap)
-            if mapping:
-                curr_cgr.remap(mapping)
-            accum_cgr = curr_cgr.compose(accum_cgr)
-        return {
-            'cgr': accum_cgr,
-        }
-    except Exception as e:
-        # Best-effort: any failure drops this route instead of aborting the caller.
-        print(f"Error processing node {node_id}: {e}")
-        return None
-def reassign_nums(tree, node_id=None, min_mol_size=6):
-    """
-    Renumber atoms along one route or along every winning route of the tree.
-    Args:
-        tree: Synthesis tree
-        node_id: Node ID of a single route to process. If None, all nodes in
-            ``tree.winning_nodes`` are processed
-        min_mol_size: Size limit below which a molecule counts as a building block
-    Returns:
-        If node_id is specified:
-            The result of ``process_single_route`` for that node
-        If node_id is None:
-            dict: Node ID -> processed CGR, ordered by node ID; routes that could
-            not be processed are left out
-    """
-    if node_id is not None:
-        return process_single_route(tree, node_id, min_mol_size)
-    cgr_by_node = {}
-    for winning_id in set(tree.winning_nodes):
-        outcome = process_single_route(tree, winning_id, min_mol_size)
-        if not outcome:
-            continue
-        cgr_by_node[winning_id] = outcome['cgr']
-    return {key: cgr_by_node[key] for key in sorted(cgr_by_node)}

cluster/reduced_g_cgr.py DELETED Viewed

@@ -1,159 +0,0 @@
-from CGRtools.containers.bonds import DynamicBond
-def reducing_g_cgr(g_cgr):
-    """
-    Reduces a Generalized Condensed Graph of Reaction (G-CGR) by performing the following steps:
-    1. Extracts substructures corresponding to connected components from the input G-CGR.
-    2. Selects the first substructure as the target to work on.
-    3. Iterates over all bonds in the target G-CGR:
-       - If ``p_order`` is None while ``order`` is defined (treated as a leaving-group bond),
-         the bond is removed.
-       - If ``p_order`` and ``order`` are both integers and differ (in either direction),
-         the bond is deleted and re-added as ``DynamicBond(p_order, p_order)``.
-    4. After bond modifications, takes the first connected component of the target again
-       (now called the reduced G-CGR or RG-CGR).
-    5. If ``_p_charges`` and ``_charges`` differ, sets the charge of every atom with a
-       non-zero ``_charges`` entry to zero.
-    Finally, returns the reduced G-CGR.
-    """
-    # Get all connected components of the G-CGR as separate substructures.
-    cgr_prods = [g_cgr.substructure(c) for c in g_cgr.connected_components]
-    target_cgr = cgr_prods[0]  # Choose the first substructure (main product) for further reduction.
-    # Iterate over a snapshot of the bonds, because bonds are deleted and added inside the loop.
-    bond_items = list(target_cgr._bonds.items())
-    for atom1, bond_set in bond_items:
-        bond_set_items = list(bond_set.items())
-        for atom2, bond in bond_set_items:
-            # Removing bonds corresponding to leaving groups:
-            # If p_order is None (indicating a leaving group) but order is defined,
-            # delete the bond.
-            if bond.p_order is None and bond.order is not None:
-                target_cgr.delete_bond(atom1, atom2)
-            # For bonds whose two integer orders differ (the check is !=, not "less than"):
-            # remove the bond and re-add it as a DynamicBond with p_order for both orders.
-            elif type(bond.p_order) is int and type(bond.order) is int and bond.p_order != bond.order:
-                p_order = int(bond.p_order)
-                target_cgr.delete_bond(atom1, atom2)
-                target_cgr.add_bond(atom1, atom2, DynamicBond(p_order, p_order))
-    # After modifying bonds, extract the reduced G-CGR from the target's connected components.
-    rg_cgr = [target_cgr.substructure(c) for c in target_cgr.connected_components][0]
-    # Neutralize charges if _p_charges and _charges differ.
-    if rg_cgr._p_charges != rg_cgr._charges:
-        for num, charge in rg_cgr._charges.items():
-            if charge != 0:
-                rg_cgr._atoms[num].charge = 0
-    return rg_cgr
-def process_all_rg_cgrs(g_cgrs_dict):
-    """
-    Builds the reduced form (RG-CGR) of every G-CGR in a dictionary.
-    Applies reducing_g_cgr to each value of the input and keeps the original keys.
-    Returns:
-        A dictionary mapping each input key to the RG-CGR obtained from its G-CGR.
-    """
-    return {num: reducing_g_cgr(cgr) for num, cgr in g_cgrs_dict.items()}
-def report_strategic_bonds(result, target_cgr):
-    """
-    Prints one line per strategic bond.
-    Every entry of 'result' holds the atom pair at position 0 and the bond order
-    (p_order) at position 1. The two atoms are looked up in target_cgr._atoms and
-    printed after a tab, followed by the bond order.
-    """
-    atoms = target_cgr._atoms
-    for entry in result:
-        first_num, second_num = entry[0][0], entry[0][1]
-        print('\t', atoms[first_num], atoms[second_num], entry[1])
-def extract_strategic_bonds(target_cgr, report=True):
-    """
-    Collects the strategic bonds of a reduced G-CGR (RG-CGR) and optionally prints them.
-    A bond is strategic when its 'order' is None and its 'p_order' is not None.
-    Each bond is stored once under the sorted pair of its atom numbers, in the
-    order in which it is first met while walking target_cgr._bonds.
-    Returns:
-        A list of [bond_key (sorted tuple of atom numbers), p_order] pairs.
-    """
-    strategic = {}
-    for atom1, neighbours in target_cgr._bonds.items():
-        for atom2, bond in neighbours.items():
-            if bond.order is not None or bond.p_order is None:
-                continue
-            # setdefault keeps the first occurrence, so the mirrored entry is ignored.
-            strategic.setdefault(tuple(sorted((atom1, atom2))), bond.p_order)
-    result = [[bond_key, p_order] for bond_key, p_order in strategic.items()]
-    if report:
-        print('Strategic bonds in RG-CGR:')
-        report_strategic_bonds(result, target_cgr)
-    return result
-def compare_rg_cgr_by_strategic_bonds(rg_cgr1, rg_cgr2, report=True):
-    """
-    Compares two reduced G-CGRs (RG-CGRs) based on their strategic bonds.
-    The function performs the following steps:
-    1. Extracts the list of strategic bonds for each RG-CGR.
-    2. Converts each list into a set of (bond key, bond order) tuples.
-    3. Identifies common bonds, and bonds unique to each RG-CGR.
-    4. If 'report' is True, prints the common bonds, the bonds unique to the first
-       RG-CGR, and the bonds unique to the second RG-CGR.
-    Atoms of common bonds and of bonds unique to the first RG-CGR are looked up in
-    rg_cgr1; atoms of bonds unique to the second RG-CGR are looked up in rg_cgr2.
-    The function returns nothing.
-    """
-    # Extract strategic bonds from both RG-CGRs without reporting.
-    l1 = extract_strategic_bonds(rg_cgr1, report=False)
-    l2 = extract_strategic_bonds(rg_cgr2, report=False)
-    # Create sets of (atom pair, bond order) tuples for both RG-CGRs.
-    set_l1 = {(tuple(item[0]), item[1]) for item in l1}
-    set_l2 = {(tuple(item[0]), item[1]) for item in l2}
-    # Identify common bonds and bonds unique to each list.
-    common = set_l1 & set_l2
-    unique_l1 = set_l1 - set_l2
-    unique_l2 = set_l2 - set_l1
-    # Convert the sets back to list format for reporting.
-    common_list = [[atom_pair, order] for atom_pair, order in common]
-    unique_l1_list = [[atom_pair, order] for atom_pair, order in unique_l1]
-    unique_l2_list = [[atom_pair, order] for atom_pair, order in unique_l2]
-    if report:
-        print("Common:")
-        report_strategic_bonds(common_list, rg_cgr1)
-        print("Unique for first RG-CGR:")
-        report_strategic_bonds(unique_l1_list, rg_cgr1)
-        print("Unique for second RG-CGR:")
-        # Bonds found only in the second RG-CGR must be resolved against its own atoms;
-        # their atom numbers need not exist in the first one.
-        report_strategic_bonds(unique_l2_list, rg_cgr2)

cluster/subcluster.py DELETED Viewed

@@ -1,33 +0,0 @@
-from collections import defaultdict
-def split_ids_by_length(ids, data):
-    """Bucket ids by ``len(data[id])``; ids missing from ``data`` are skipped.
-    Returns a defaultdict mapping a length to the list of ids with that length.
-    """
-    by_length = defaultdict(list)
-    for known_id in (candidate for candidate in ids if candidate in data):
-        by_length[len(data[known_id])].append(known_id)
-    return by_length
-def group_ids_by_intermediate_products(ids, reactions_dict):
-    """Group ids whose routes share the same sequence of first products.
-    The grouping key of an id is the tuple made of ``reaction.products[0]`` for
-    every reaction in ``reactions_dict[id]``. Returns the groups as a list of id
-    lists, in the order in which each key was first seen.
-    """
-    grouped = defaultdict(list)
-    for route_id in ids:
-        signature = tuple(step.products[0] for step in reactions_dict[route_id])
-        grouped[signature].append(route_id)
-    return [members for members in grouped.values()]
-def sublcuster_all(cluster_dict, reactions_dict):
-    """Split every cluster into subclusters by route length and intermediate products.
-    For each cluster the ids are first bucketed with split_ids_by_length and each
-    bucket is then grouped with group_ids_by_intermediate_products.
-    Returns a dict: cluster number -> {route length -> list of id groups}.
-    """
-    return {
-        num: {
-            steps: group_ids_by_intermediate_products(same_length_ids, reactions_dict)
-            for steps, same_length_ids in split_ids_by_length(cluster, reactions_dict).items()
-        }
-        for num, cluster in cluster_dict.items()
-    }

cluster/utils.py DELETED Viewed

@@ -1,314 +0,0 @@
-from synplan.mcts.tree import Tree
-from synplan.utils.visualisation import get_route_svg
-from CGRtools.containers import MoleculeContainer
-import pickle
-import os
-def extract_reactions(tree):
-    """Return ``{node_id: tree.synthesis_route(node_id)}`` for every distinct winning node."""
-    return {
-        node_id: tree.synthesis_route(node_id)
-        for node_id in set(tree.winning_nodes)
-    }
-def extract_rules_from_route(node_id, tree):
-    """Collect the 'reactor_id' values found along the route to a node, in reverse order.
-    For every node returned by ``tree.route_to_node(node_id)`` the first entry of
-    ``new_precursors`` is inspected; empty precursors and molecules without a
-    'reactor_id' meta entry are skipped.
-    """
-    rule_ids = []
-    for node in tree.route_to_node(node_id):
-        precursor = node.new_precursors[0]
-        if len(precursor) == 0:
-            continue
-        meta = precursor.molecule.meta
-        if 'reactor_id' in meta:
-            rule_ids.append(meta['reactor_id'])
-    rule_ids.reverse()
-    return rule_ids
-def save_smarts(mol_id, config, reactions_dict):
-    """Write every route to ``smarts/smarts_mol_{mol_id}_{config}.txt``.
-    For each entry of ``reactions_dict`` the node id is written on its own line,
-    followed by one line per reaction.
-    """
-    out_path = f'smarts/smarts_mol_{mol_id}_{config}.txt'
-    with open(out_path, "w") as file:
-        for node_id, reactions in reactions_dict.items():
-            file.write(f"{node_id}\n")
-            file.writelines(f"{reaction}\n" for reaction in reactions)
-def get_highest_route_nodes(tree, node_dict):
-    """For every key keep the node ids whose rounded route score is the highest.
-    Scores come from ``tree.route_score`` and are rounded to 3 decimals before they
-    are compared, so nodes that tie after rounding are all kept, in input order.
-    Returns a dict with the same keys as ``node_dict``.
-    """
-    highest_nodes = {}
-    for key, node_ids in node_dict.items():
-        top_score = float('-inf')
-        winners = []
-        for node_id in node_ids:
-            score = round(tree.route_score(node_id), 3)
-            if score == top_score:
-                winners.append(node_id)
-            elif score > top_score:
-                top_score, winners = score, [node_id]
-        highest_nodes[key] = winners
-    return highest_nodes
-class TreeWrapper:
-    """Pickle-friendly wrapper that saves a Tree to disk and loads it back.
-    The file is stored under BASE_DIR as 'tree_{mol_id}_{config}.pkl'. The tree's
-    '_tqdm', 'policy_network' and 'value_network' attributes are reset before
-    pickling, so a loaded tree comes back without its networks.
-    """
-    # Directory (relative to the working directory) that holds the pickled trees.
-    BASE_DIR = 'forest'
-    def __init__(self, tree, mol_id, config):
-        """Initializes the TreeWrapper."""
-        self.tree = tree
-        self.mol_id = mol_id
-        self.config = config
-        # Ensure the directory exists before creating the filename
-        os.makedirs(self.BASE_DIR, exist_ok=True)
-        self.filename = os.path.join(self.BASE_DIR, f'tree_{mol_id}_{config}.pkl')
-    def __getstate__(self):
-        """Return the wrapper state with the tree replaced by a copy of its attribute dict."""
-        state = self.__dict__.copy()
-        # Shallow copy: the live tree keeps its own attributes untouched.
-        tree_state = self.tree.__dict__.copy()
-        # Reset or remove non-pickleable attributes (e.g., _tqdm, policy_network, value_network)
-        if '_tqdm' in tree_state:
-            tree_state['_tqdm'] = True  # Reset to a simple flag
-        for attr in ['policy_network', 'value_network']:
-            if attr in tree_state:
-                tree_state[attr] = None
-        state['tree_state'] = tree_state
-        del state['tree']
-        return state
-    def __setstate__(self, state):
-        """Restore the wrapper and rebuild the tree from the stored attribute dict."""
-        tree_state = state.pop('tree_state')
-        self.__dict__.update(state)
-        # Tree.__new__ creates the instance without running Tree.__init__.
-        new_tree = Tree.__new__(Tree)
-        new_tree.__dict__.update(tree_state)
-        self.tree = new_tree
-    def save_tree(self):
-        """Saves the TreeWrapper instance (including the tree state) to a file."""
-        try:
-            with open(self.filename, 'wb') as f:
-                pickle.dump(self, f)
-            print(f"Tree wrapper for mol_id '{self.mol_id}', config '{self.config}' saved to '{self.filename}'.")
-        except Exception as e:
-            # Best-effort: a failed save is reported, not raised.
-            print(f"Error saving tree to {self.filename}: {e}")
-    @classmethod
-    def load_tree_from_id(cls, mol_id, config):
-        """
-        Loads a Tree object from a saved file using mol_id and config.
-        Every failure is printed and turned into a None return value.
-        Args:
-            mol_id: The molecule ID used for saving.
-            config: The configuration used for saving.
-        Returns:
-            The loaded Tree object, or None if loading fails.
-        """
-        filename = os.path.join(cls.BASE_DIR, f'tree_{mol_id}_{config}.pkl')
-        print(f"Attempting to load tree from: {filename}")
-        try:
-            # Ensure the 'Tree' class is defined in the current scope
-            # NOTE(review): Tree is imported at module level, so this check looks redundant - confirm.
-            if 'Tree' not in globals() and 'Tree' not in locals():
-                 raise NameError("The 'Tree' class definition is required to load the object.")
-            # SECURITY: pickle.load can execute arbitrary code; only load files from a trusted source.
-            with open(filename, 'rb') as f:
-                loaded_wrapper = pickle.load(f) # This implicitly calls __setstate__
-            # Check if the loaded object is indeed a TreeWrapper instance (optional sanity check)
-            if not isinstance(loaded_wrapper, cls):
-                print(f"Warning: Loaded object from {filename} is not a TreeWrapper instance.")
-                return None # Or raise an error
-            print(f"Tree object for mol_id '{mol_id}', config '{config}' successfully loaded from '{filename}'.")
-            # The __setstate__ method already reconstructed the tree inside the wrapper
-            return loaded_wrapper.tree
-        except FileNotFoundError:
-            print(f"Error: File not found at {filename}")
-            return None
-        except (pickle.UnpicklingError, EOFError) as e:
-            print(f"Error: Could not unpickle file {filename}. It might be corrupted or empty. Details: {e}")
-            return None
-        except NameError as e:
-             print(f"Error during loading: {e}. Ensure 'Tree' class is defined.")
-             return None
-        except Exception as e:
-            print(f"An unexpected error occurred loading tree from {filename}: {e}")
-            return None
-def generate_cluster_html(
-        tree: Tree,
-        cluster_node_ids: list,
-        cluster_num: int,
-        rg_cgrs_dict: dict, # <--- New parameter
-        aam: bool = False,
-    ) -> str:
-        # ... (initial setup, validation, filtering routes remains the same) ...
-    """
-    Generates an HTML page report for a specific cluster's synthesis routes.
-    :param tree: The built MCTS tree.
-    :param cluster_node_ids: List of route node IDs belonging to this cluster.
-    :param cluster_num: The identifier number for this cluster (used in title/header).
-    :param aam: If True, depict atom-to-atom mapping in route SVGs.
-    # :param scg_svg: Optional SVG string for the cluster's representative SCG.
-    :return: A string containing the complete HTML report.
-    """
-    # --- Depict Settings (Optional: Keep if get_route_svg depends on it) ---
-    # Uncomment if MoleculeContainer is used and needed:
-    try:
-        if aam:
-            MoleculeContainer.depict_settings(aam=True)
-        else:
-            MoleculeContainer.depict_settings(aam=False)
-    except NameError:
-         # If MoleculeContainer isn't available/needed, just pass
-         pass
-    except Exception as e:
-         print(f"Warning: Error setting MoleculeContainer depict settings: {e}")
-    # --- Validate Input ---
-    if not isinstance(cluster_node_ids, list):
-        return "<html><body>Error: cluster_node_ids must be a list.</body></html>"
-    if not tree or not isinstance(tree, Tree):
-        return "<html><body>Error: Invalid tree object provided.</body></html>"
-    # Filter out node IDs not actually present or not solved in the tree
-    valid_routes_in_cluster = []
-    for node_id in cluster_node_ids:
-        if node_id in tree.nodes and tree.nodes[node_id].is_solved():
-             valid_routes_in_cluster.append(node_id)
-        # Optionally log or warn about invalid/unsolved nodes removed
-    if not valid_routes_in_cluster:
-        # Return a minimal HTML page indicating no valid routes
-        return f"""
-        <!doctype html><html lang="en"><head><meta charset="utf-8">
-        <title>Cluster {cluster_num} Report</title></head><body>
-        <h3>Cluster {cluster_num} Report</h3>
-        <p>No valid/solved routes found for this cluster.</p>
-        </body></html>"""
-    # --- HTML Templates & Tags ---
-    # (Keep tags like th, td, fonts as they were)
-    th = '<th style="text-align: left; background-color:#978785; border: 1px solid black; border-spacing: 0">'
-    td = '<td style="text-align: left; border: 1px solid black; border-spacing: 0">'
-    # font_red = "<font color='red' style='font-weight: bold'>" # Consider using CSS classes instead
-    # font_green = "<font color='light-green' style='font-weight: bold'>"
-    font_head = "<font style='font-weight: bold; font-size: 18px'>"
-    font_normal = "<font style='font-weight: normal; font-size: 18px'>"
-    font_close = "</font>"
-    template_begin = f"""
-    <!doctype html>
-    <html lang="en">
-    <head>
-    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css"
-    rel="stylesheet"
-    integrity="sha384-1BmE4kWBq78iYhFldvKuhfTAU6auU8tT94WrHftjDbrCEXSU1oBoqyl2QvZ6jIW3"
-    crossorigin="anonymous">
-    <meta charset="utf-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1">
-    <title>Cluster {cluster_num} Routes Report</title>
-    <style>
-        /* Optional: Add some basic styling */
-        .table {{ border-collapse: collapse; width: 100%; }}
-        th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
-        tr:nth-child(even) {{ background-color: #f2f2f2; }}
-        caption {{ caption-side: top; font-size: 1.5em; margin: 1em 0; }}
-        svg {{ max-width: 100%; height: auto; }} /* Make SVGs responsive */
-    </style>
-    </head>
-    <body>
-    <div class="container"> """
-    template_end = """
-    </div> <script
-    src="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/js/bootstrap.bundle.min.js"
-    integrity="sha384-ka7Sk0Gln4gmtz2MlQnikT1wXgYsOg+OMhuP+IlRH9sENBO0LRn5q+8nbTov4+1p"
-    crossorigin="anonymous">
-    </script>
-    </body>
-    </html>
-    """
-    box_mark = """
-    <svg width="30" height="30" viewBox="0 0 1 1" xmlns="http://www.w3.org/2000/svg" style="vertical-align: middle; margin-right: 5px;">
-    <circle cx="0.5" cy="0.5" r="0.5" fill="rgb()" fill-opacity="0.35" />
-    </svg>
-    """
-    # --- Build HTML Table ---
-    table = f"""
-    <table class="table table-striped table-hover caption-top">
-    <caption><h3>Retrosynthetic Routes Report - Cluster {cluster_num}</h3></caption>
-    <tbody>"""
-    try:
-        target_smiles_str = str(tree.nodes[1].curr_precursor) if 1 in tree.nodes else "N/A"
-    except Exception:
-        target_smiles_str = "Error retrieving target SMILES"
-    table += f"<tr>{td}{font_normal}Target Molecule: {target_smiles_str}{font_close}</td></tr>"
-    table += f"<tr>{td}{font_normal}Cluster Number: {cluster_num}{font_close}</td></tr>"
-    table += f"<tr>{td}{font_normal}Size of Cluster: {len(valid_routes_in_cluster)}{font_close} routes</td></tr>"
-    # --- Add RG-CGR Image ---
-    # Get the node_id of the first valid route in the cluster
-    first_route_id = valid_routes_in_cluster[0] if valid_routes_in_cluster else None
-    if first_route_id and rg_cgrs_dict and first_route_id in rg_cgrs_dict:
-        try:
-            rg_cgr = rg_cgrs_dict[first_route_id]
-            rg_cgr.clean2d()
-            rg_cgr_svg = rg_cgr.depict()
-            # Validate if it looks like SVG (basic check)
-            if rg_cgr_svg.strip().startswith("<svg"):
-                    table += f"<tr>{td}{font_normal}Cluster Representative RG-CGR (from Route {first_route_id}):{font_close}<br>{rg_cgr_svg}</td></tr>"
-            else:
-                    # Handle case where it's not SVG as expected
-                    table += f"<tr>{td}{font_normal}Cluster Representative RG-CGR (from Route {first_route_id}):{font_close}<br><i>Invalid SVG format retrieved.</i></td></tr>"
-                    print(f"Warning: Expected SVG for RG-CGR of node {first_route_id}, but got: {rg_cgr_svg[:100]}...") # Log a warning
-        except Exception as e:
-            table += f"<tr>{td}{font_normal}Cluster Representative RG-CGR (from Route {first_route_id}):{font_close}<br><i>Error retrieving/displaying RG-CGR: {e}</i></td></tr>"
-    else:
-        # Handle cases where RG-CGR data is missing
-        if first_route_id:
-                table += f"<tr>{td}{font_normal}Cluster Representative RG-CGR (from Route {first_route_id}):{font_close}<br><i>Not found in provided RG-CGR dictionary.</i></td></tr>"
-        else:
-                # This case shouldn't happen due to earlier check, but as fallback:
-                table += f"<tr>{td}{font_normal}Cluster Representative RG-CGR:{font_close}<br><i>No valid routes in cluster to select from.</i></td></tr>"
-    # --- Legend ---
-    table += f"""
-    <tr>{td}
-        <div style="display: flex; align-items: center; flex-wrap: wrap; gap: 15px;">
-            <span>{box_mark.replace("rgb()", "rgb(152, 238, 255)")} Target Molecule</span>
-            <span>{box_mark.replace("rgb()", "rgb(240, 171, 144)")} Molecule Not In Stock</span>
-            <span>{box_mark.replace("rgb()", "rgb(155, 250, 179)")} Molecule In Stock</span>
-        </div>
-    </td></tr>
-    """
-    # --- Add Routes for this Cluster ---
-    for route_id in valid_routes_in_cluster:
-        try:
-            svg = get_route_svg(tree, route_id)  # get SVG
-            full_route = tree.synthesis_route(route_id)  # get route steps
-            reactions = ""
-            for i, synth_step in enumerate(full_route):
-                reactions += f"<b>Step {i + 1}:</b> {str(synth_step)}<br>"
-            route_score = round(tree.route_score(route_id), 3)
-            table += (
-                f'<tr style="line-height: 1.8;">{td}{font_head}Route {route_id} | ' # Use | for separation
-                f"Steps: {len(full_route)} | "
-                f"Score: {route_score}{font_close}</td></tr>"
-            )
-            table += f"<tr>{td}{svg if svg else '<i>Error generating route visualization</i>'}</td></tr>"
-            table += f"<tr>{td}{reactions if reactions else '<i>No reaction steps found</i>'}</td></tr>"
-        except Exception as e:
-            table += f'<tr><td colspan="1" style="color: red;">Error processing route {route_id}: {e}</td></tr>' # Use colspan if needed based on final table structure
-    table += "</tbody></table>"
-    # --- Combine and Return Full HTML ---
-    full_html = template_begin + table + template_end
-    return full_html

cluster/visualize.py DELETED Viewed

@@ -1,481 +0,0 @@
-import os
-import re
-from collections import Counter
-import numpy as np
-import matplotlib.pyplot as plt
-import seaborn as sns
-from IPython.display import SVG, display
-import io
-import sys
-from synplan.utils.visualisation import get_route_svg
-from scipy.cluster.hierarchy import dendrogram
-from .reduced_g_cgr import extract_strategic_bonds, compare_rg_cgr_by_strategic_bonds
-def report_2_dissimilar(similarity_df, tree, rg_cgrs_dict):
-    min_index = similarity_df.stack().idxmin()
-    row_index, col_index = min_index
-    print(f'Most dissimilar routes are {row_index} and {col_index}, Tanimoto index = {"%.2f" % similarity_df[row_index][col_index]}')
-    print('Route ID', row_index)
-    rg_cgr_1 = rg_cgrs_dict[row_index]
-    rg_cgr_1.clean2d()
-    display(SVG(rg_cgr_1.depict()))
-    extract_strategic_bonds(rg_cgr_1)
-    display(SVG(get_route_svg(tree, row_index)))
-    print('Route ID', col_index)
-    rg_cgr_2 = rg_cgrs_dict[col_index]
-    rg_cgr_2.clean2d()
-    display(SVG(rg_cgr_2.depict()))
-    extract_strategic_bonds(rg_cgr_2)
-    display(SVG(get_route_svg(tree, col_index)))
-    print('Summary:')
-    compare_rg_cgr_by_strategic_bonds(rg_cgr_1, rg_cgr_2)
-def save_clusters_html(clusters, best_by_score, tree, rg_cgrs_dict, mol_id, config):
-    # Prepare a list to accumulate HTML parts for each cluster
-    os.makedirs("./final_clusters", exist_ok=True)
-    html_parts = []
-    # Loop over your clusters
-    for cluster_num, node_id_list in clusters.items():
-        parts = []  # to accumulate parts for this cluster
-        # Generate text output
-        best_route_in_cluster = best_by_score[cluster_num][0]
-        score = round(tree.route_score(best_route_in_cluster), 3)
-        parts.append(f"{cluster_num} ||| Size: {len(clusters[cluster_num])}\n")
-        parts.append(f"Example: {best_route_in_cluster}  Route score: {score}\n")
-        # Insert the first SVG immediately after its marker text
-        svg1 = get_route_svg(tree, best_route_in_cluster)
-        parts.append(svg1 + "\n")
-        # Continue with additional text and SVGs
-        parts.append("The RG-CGR:\n")
-        rg_cgr = rg_cgrs_dict[best_route_in_cluster]
-        rg_cgr.clean2d()
-        svg2 = rg_cgr.depict()
-        parts.append(svg2 + "\n")
-        # Capture output from extract_strategic_bonds, if it prints something
-        buf = io.StringIO()
-        old_stdout = sys.stdout
-        sys.stdout = buf
-        extract_strategic_bonds(rg_cgr)
-        sys.stdout = old_stdout
-        strategic_text = buf.getvalue()
-        parts.append(strategic_text + "\n")
-        # Wrap this cluster's output in a <pre> tag for formatting and add some spacing
-        cluster_html = f'<div class="cluster" style="margin-bottom: 2em;"><pre>{"".join(parts)}</pre></div>'
-        html_parts.append(cluster_html)
-    # Combine all parts into a full HTML document
-    html_content = f"""
-    <html>
-      <head>
-        <meta charset="utf-8">
-        <title>Captured Cluster Outputs</title>
-      </head>
-      <body>
-        {''.join(html_parts)}
-      </body>
-    </html>
-    """
-    # Write the HTML content to a file
-    with open(f"final_clusters/htmls/mol_{mol_id}_{config}.html", "w", encoding="utf-8") as f:
-        f.write(html_content)
-def report_2_dissimilar_to_html(similarity_df, tree, rg_cgrs_dict, mol_id=1, config=2,output_filename=None):
-    """Generates an HTML report of the two most dissimilar routes based on a similarity DataFrame."""
-    os.makedirs("./dissimilars", exist_ok=True)
-    output_filename=f"dissimilars/report_dissimilar_mol_{mol_id}_{config}.html"
-    # Identify the two most dissimilar routes
-    min_index = similarity_df.stack().idxmin()
-    row_index, col_index = min_index
-    # Capture text output in a buffer
-    buf = io.StringIO()
-    old_stdout = sys.stdout
-    sys.stdout = buf
-    print(f'Most dissimilar routes are {row_index} and {col_index}, Tanimoto index = {"%.2f" % similarity_df[row_index][col_index]}')
-    # Store HTML content
-    html_parts = []
-    # Function to capture and append text, SVGs, and function outputs
-    def capture_route_info(route_id):
-        rg_cgr = rg_cgrs_dict[route_id]
-        rg_cgr.clean2d()
-        # Capture the first SVG (RG-CGR depiction)
-        svg1 = rg_cgr.depict()
-        # Capture the second SVG (Route depiction)
-        svg2 = get_route_svg(tree, route_id)
-        # Capture output of extract_strategic_bonds
-        buf_extract = io.StringIO()
-        sys.stdout = buf_extract
-        extract_strategic_bonds(rg_cgr)
-        sys.stdout = old_stdout
-        extract_output = buf_extract.getvalue()
-        # Store text + SVGs in HTML format
-        html_parts.append(f"""
-        <div class="route-section">
-            <pre>{buf.getvalue()}</pre>
-            <div class="svg1">{svg1}</div>
-            <pre>{extract_output}</pre>
-            <div class="svg2">{svg2}</div>
-        </div>
-        """)
-        buf.truncate(0)  # Clear buffer for next route
-        buf.seek(0)
-    # Process the first route
-    capture_route_info(row_index)
-    # Process the second route
-    capture_route_info(col_index)
-    # Capture and store final summary
-    buf_summary = io.StringIO()
-    sys.stdout = buf_summary
-    compare_rg_cgr_by_strategic_bonds(rg_cgrs_dict[row_index], rg_cgrs_dict[col_index])
-    sys.stdout = old_stdout
-    summary_output = buf_summary.getvalue()
-    html_parts.append(f"<h2>Summary</h2><pre>{summary_output}</pre>")
-    # Restore standard stdout
-    sys.stdout = old_stdout
-    # Build the full HTML file
-    html_content = f"""
-    <html>
-    <head>
-        <meta charset="utf-8">
-        <title>Route Dissimilarity Report</title>
-    </head>
-    <body>
-        {''.join(html_parts)}
-    </body>
-    </html>
-    """
-    # Write the HTML file
-    with open(output_filename, "w", encoding="utf-8") as f:
-        f.write(html_content)
-    print(f"Report saved as {output_filename}")
-def pie_chart(cluster_sizes, sub='', input_cluster_num=1, input_step_nums=None):
-    labels = [f'{sub}Cluster {i+1}' for i in range(len(cluster_sizes))]
-    sns.set_style("whitegrid")
-    fig, ax = plt.subplots(figsize=(6, 6))
-    wedges, texts, autotexts = ax.pie(
-        cluster_sizes, labels=None, autopct='%1.1f%%', colors=sns.color_palette("pastel"),
-        startangle=140, wedgeprops={'edgecolor': 'black'}
-    )
-    ax.legend(wedges, labels, title=f"{sub}Clusters", loc="center left", bbox_to_anchor=(1, 0.5))
-    if sub == '':
-        ax.set_title(f"{sub}Cluster Size Distribution for {sum(cluster_sizes)} routes")
-    else:
-        ax.set_title(f"{sub}cluster Size Distribution for {sum(cluster_sizes)} routes in cluster {input_cluster_num} with number of steps {input_step_nums}")
-    plt.close(fig)
-    # plt.show()
-    return fig
-def save_dendrogram(df, Z, mol_id, config):
-    plt.figure(figsize=(14, 7)) # figsize=(14, 7)
-    dendrogram(Z, labels=df.columns, leaf_rotation=90)
-    plt.title(f"Hierarchical Clustering Dendrogram for routes generated for molecule #{mol_id}")
-    plt.xlabel("Route node id")
-    plt.ylabel("Distance (1 - Similarity)")
-    # Get current y-axis limits and add a gap below zero
-    ax = plt.gca()
-    ymin, ymax = ax.get_ylim()
-    # Add a gap that is 5% of the current y-range below zero
-    gap = 0.05 * (ymax - ymin)
-    ax.set_ylim(ymin - gap, ymax)
-    ax.grid(False)
-    ax.autoscale(enable=None, axis="x", tight=True)
-    plt.tight_layout()
-    plt.savefig(f'dendrograms/av_link_mol{mol_id}_{config}.png', dpi=100)
-def distribution_by_depth(tree, complex_cgr_dict):
-    if len(complex_cgr_dict) == 0:
-        print('Error: Empty dictionary')
-        return None
-    depths = np.zeros(len(complex_cgr_dict))
-    for n, node in enumerate(complex_cgr_dict.keys()):
-        reactions = tree.synthesis_route(node)
-        depths[n] = len(reactions)
-    return depths
-def histogram_by_depth(depths, mol_id=1, config=1, save=False):
-    if len(depths) == 0:
-        print('Error no depths')
-        return None
-    # Count frequency of each depth
-    counter = Counter(depths)
-    bins, counts = zip(*sorted(counter.items()))
-    # Plot the histogram
-    plt.bar(bins, counts, width=0.5, color='skyblue', edgecolor='black')
-    plt.xlabel('Number of reactions')
-    plt.ylabel('Frequency')
-    plt.title(f'Frequency Histogram of Number of reactions in one tree of total {len(depths)}')
-    plt.xticks(bins)
-    if save:
-        plt.savefig(f'histograms/by_depth_mol{mol_id}_{config}.png', dpi=100)
-    else:
-        plt.show()
-def group_routes_by_depth(depths):
-    """
-    Group route IDs by their reaction count (depth).
-    Args:
-        depths: Dictionary with node_ids as keys and reaction tuples as values
-    Returns:
-        dict: Dictionary with depths as keys and lists of node_ids as values
-    """
-    depth_groups = {}
-    for node_id, reactions in depths.items():
-        depth = len(reactions)
-        if depth not in depth_groups:
-            depth_groups[depth] = []
-        depth_groups[depth].append(node_id)
-    return depth_groups
-def create_route_svg(tree, node_ids, mol_id, config, depths, depth=None):
-    """Create SVG file for specified routes with optimized spacing."""
-    # First pass: analyze all SVGs to find maximum width
-    max_width_cm = 0
-    all_route_svgs = []  # Store SVGs to avoid calling get_route_svg twice
-    for g in node_ids:
-        route_svg = get_route_svg(tree, g)
-        all_route_svgs.append(route_svg)
-        # Extract the actual SVG content
-        svg_match = re.search(r'<svg[^>]*>', route_svg)
-        if svg_match:
-            svg_header = svg_match.group(0)
-            # Try to get width from cm attribute
-            width_match = re.search(r'width="([0-9.]+)cm"', svg_header)
-            if width_match:
-                try:
-                    width_cm = float(width_match.group(1))
-                    max_width_cm = max(max_width_cm, width_cm)
-                except ValueError:
-                    pass
-    # Convert cm to pixels (1cm ≈ 37.8 pixels)
-    CM_TO_PX = 37.8
-    max_width_px = max_width_cm * CM_TO_PX
-    # Add margins
-    left_margin = 50
-    right_margin = 100
-    composite_width = max_width_px + left_margin + right_margin
-    # Continue with SVG creation using calculated width
-    vertical_spacing = 20
-    text_height = 20
-    route_spacing = 250
-    current_y = 30
-    entries = []
-    size = len(node_ids)
-    for num, (g, route_svg_str) in enumerate(zip(node_ids, all_route_svgs), 1):
-        # Calculate dimensions
-        route_px_height = 200
-        # Create entry with optimized spacing
-        entry_parts = []
-        entry_parts.append(f'<g transform="translate({left_margin}, {current_y})">')
-        entry_parts.append(f'  <text x="0" y="{text_height}" font-size="12" fill="black">{num}  (Node ID: {g}, Number of reactions: {len(depths[g])})</text>')
-        inner_y = text_height + 25
-        entry_parts.append(f'  <g transform="translate(0, {inner_y})">{route_svg_str}</g>')
-        total_entry_height = inner_y + route_px_height + 250
-        entry_parts.append('</g>')
-        entry_block = "\n".join(entry_parts)
-        entry_bottom_y = current_y + total_entry_height
-        entries.append((entry_block, entry_bottom_y))
-        current_y = entry_bottom_y + route_spacing - 50
-    # Create master SVG with adjusted dimensions
-    master_width = composite_width
-    master_height = current_y + vertical_spacing
-    final_parts = []
-    for entry_block, bottom_y in entries:
-        final_parts.append(entry_block)
-        final_parts.append(f'<line x1="0" y1="{bottom_y}" x2="{master_width}" y2="{bottom_y}" stroke="black" stroke-width="1" />')
-    master_svg = f'<svg xmlns="http://www.w3.org/2000/svg" width="{master_width}" height="{master_height}" viewBox="0 0 {master_width} {master_height}">\n'
-    master_svg += "\n".join(final_parts)
-    master_svg += "\n</svg>"
-    # Save file with appropriate name
-    if depth is None:
-        path_name = f"./routes_img/mol_{mol_id}/mol{mol_id}_{config}_all_{size}.svg"
-    else:
-        path_name = f"./routes_img/mol_{mol_id}/mol{mol_id}_{config}_depth_{depth}_{size}.svg"
-    with open(path_name, "w") as f:
-        f.write(master_svg)
-    print(f"Saved: {path_name}")
-def create_route_svg_cluster(tree, node_ids, mol_id, config, depths, cluster_num):
-    """
-    Create SVG file for specified routes with optimized spacing, grouped by cluster.
-    """
-    # First pass: analyze all SVGs to find maximum width
-    max_width_cm = 0
-    all_route_svgs = []  # Store SVGs to avoid calling get_route_svg twice
-    for g in node_ids:
-        route_svg = get_route_svg(tree, g)
-        all_route_svgs.append(route_svg)
-        # Extract the actual SVG content
-        svg_match = re.search(r'<svg[^>]*>', route_svg)
-        if svg_match:
-            svg_header = svg_match.group(0)
-            # Try to get width from cm attribute
-            width_match = re.search(r'width="([0-9.]+)cm"', svg_header)
-            if width_match:
-                try:
-                    width_cm = float(width_match.group(1))
-                    max_width_cm = max(max_width_cm, width_cm)
-                except ValueError:
-                    pass
-    # Convert cm to pixels (1cm ≈ 37.8 pixels)
-    CM_TO_PX = 37.8
-    max_width_px = max_width_cm * CM_TO_PX
-    # Add margins
-    left_margin = 50
-    right_margin = 100
-    composite_width = max_width_px + left_margin + right_margin
-    # Continue with SVG creation using calculated width
-    vertical_spacing = 20
-    text_height = 20
-    route_spacing = 250
-    current_y = 30
-    entries = []
-    size = len(node_ids)
-    for num, (g, route_svg_str) in enumerate(zip(node_ids, all_route_svgs), 1):
-        # Calculate dimensions
-        route_px_height = 200
-        # Create entry with optimized spacing
-        entry_parts = []
-        entry_parts.append(f'<g transform="translate({left_margin}, {current_y})">')
-        entry_parts.append(f'  <text x="0" y="{text_height}" font-size="12" fill="black">{num}  (Node ID: {g}, Number of reactions: {len(depths[g])})</text>')
-        inner_y = text_height + 25
-        entry_parts.append(f'  <g transform="translate(0, {inner_y})">{route_svg_str}</g>')
-        total_entry_height = inner_y + route_px_height + 350
-        entry_parts.append('</g>')
-        entry_block = "\n".join(entry_parts)
-        entry_bottom_y = current_y + total_entry_height
-        entries.append((entry_block, entry_bottom_y))
-        current_y = entry_bottom_y + route_spacing - 50
-    # Create master SVG with adjusted dimensions
-    master_width = composite_width
-    master_height = current_y + vertical_spacing
-    final_parts = []
-    for entry_block, bottom_y in entries:
-        final_parts.append(entry_block)
-        final_parts.append(f'<line x1="0" y1="{bottom_y}" x2="{master_width}" y2="{bottom_y}" stroke="black" stroke-width="1" />')
-    master_svg = f'<svg xmlns="http://www.w3.org/2000/svg" width="{master_width}" height="{master_height}" viewBox="0 0 {master_width} {master_height}">\n'
-    master_svg += "\n".join(final_parts)
-    master_svg += "\n</svg>"
-    # Save file with cluster-specific name
-    path_name = f"./routes_img/mol_{mol_id}/mol{mol_id}_{config}_cluster_{cluster_num}_{size}.svg"
-    with open(path_name, "w") as f:
-        f.write(master_svg)
-    print(f"Saved: {path_name}")
-def save_route_images(tree, depths, mol_id, config, cluster_dict=None):
-    """
-    Save route images grouped by depth and/or cluster.
-    Args:
-        tree: Synthesis tree
-        routes: Dictionary of routes
-        depths: Dictionary of reaction depths
-        mol_id: Molecule ID
-        config: Configuration value
-        cluster_dict: Optional dictionary mapping cluster numbers to lists of node_ids
-    """
-    # Create directory if it doesn't exist
-    os.makedirs("./routes_img", exist_ok=True)
-    os.makedirs(f"./routes_img/mol_{mol_id}", exist_ok=True)
-    # Save complete image with all routes
-    all_node_ids = sorted(depths.keys())
-    create_route_svg(tree, all_node_ids, mol_id, config, depths)
-    # Group routes by depth and save separate images
-    depth_groups = group_routes_by_depth(depths)
-    for depth, node_ids in depth_groups.items():
-        create_route_svg(tree, sorted(node_ids), mol_id, config, depths, depth)
-    # If cluster dictionary is provided, save routes grouped by cluster
-    if cluster_dict is not None:
-        for cluster_num, node_ids in cluster_dict.items():
-            # Filter node_ids to only include those that exist in routes
-            valid_node_ids = [nid for nid in node_ids if nid in depths]
-            if valid_node_ids:
-                create_route_svg_cluster(tree, sorted(valid_node_ids),
-                                      mol_id, config, depths, cluster_num)

synplan/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ from .mcts import *
2	+
3	+ __all__ = ["Tree"]

synplan/chem/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ from CGRtools.files import SMILESRead
2	+
3	+ smiles_parser = SMILESRead.create_parser(ignore=True)

{cluster → synplan/chem/data}/__init__.py RENAMED Viewed

File without changes

synplan/chem/data/filtering.py ADDED Viewed

	@@ -0,0 +1,962 @@

+"""Module containing classes and functions for reactions filtering."""
+import logging
+from dataclasses import dataclass
+from io import TextIOWrapper
+from typing import Any, Dict, Iterable, List, Optional, Tuple
+import numpy as np
+import ray
+import yaml
+from CGRtools.containers import CGRContainer, MoleculeContainer, ReactionContainer
+from chython.algorithms.fingerprints.morgan import MorganFingerprint
+from tqdm import tqdm
+from synplan.chem.data.standardizing import (
+    AromaticFormStandardizer,
+    KekuleFormStandardizer,
+    RemoveReagentsStandardizer,
+)
+from synplan.chem.utils import cgrtools_to_chython_molecule
+from synplan.utils.config import ConfigABC, convert_config_to_dict
+from synplan.utils.files import ReactionReader, ReactionWriter
+@dataclass
+class CompeteProductsConfig(ConfigABC):
+    """Thresholds used by the competing-products reaction filter.
+    :ivar fingerprint_tanimoto_threshold: Fingerprint Tanimoto similarity (0..1) that a
+        molecule pair must exceed before the MCS comparison is performed.
+    :ivar mcs_tanimoto_threshold: MCS-based Tanimoto similarity (0..1) that a molecule
+        pair must exceed for the reaction to be flagged.
+    """
+    fingerprint_tanimoto_threshold: float = 0.3
+    mcs_tanimoto_threshold: float = 0.6
+    @staticmethod
+    def from_dict(config_dict: Dict[str, Any]) -> "CompeteProductsConfig":
+        """Create an instance of CompeteProductsConfig from a dictionary."""
+        return CompeteProductsConfig(**config_dict)
+    @staticmethod
+    def from_yaml(file_path: str) -> "CompeteProductsConfig":
+        """Deserialize a YAML file into a CompeteProductsConfig object.
+        An empty YAML document yields a configuration with default values.
+        :param file_path: Path to the YAML file.
+        :return: The configuration built from the file content.
+        """
+        with open(file_path, "r", encoding="utf-8") as file:
+            config_dict = yaml.safe_load(file)
+        if config_dict is None:
+            # safe_load returns None for an empty document; unpacking None with
+            # ** would raise TypeError, so fall back to the defaults instead.
+            config_dict = {}
+        return CompeteProductsConfig.from_dict(config_dict)
+    def _validate_params(self, params: Dict[str, Any]) -> None:
+        """Validate configuration parameters.
+        :param params: Mapping of parameter names to the values to check.
+        :raises ValueError: If a threshold is missing, is not a real number or lies
+            outside the closed interval [0, 1].
+        """
+        for name in ("fingerprint_tanimoto_threshold", "mcs_tanimoto_threshold"):
+            value = params.get(name)
+            # YAML parses ``1`` and ``0`` as int, so in-range integers are accepted;
+            # bool is rejected explicitly because it is a subclass of int.
+            is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
+            if not is_number or not 0 <= value <= 1:
+                raise ValueError(
+                    f"Invalid '{name}'; expected a float between 0 and 1"
+                )
+class CompeteProductsFilter:
+    """Flags reactions in which a reagent closely resembles a product.
+    A reagent/product pair is first compared by Morgan-fingerprint Tanimoto similarity;
+    only pairs above ``fingerprint_tanimoto_threshold`` are then compared by the
+    Tanimoto similarity of their maximum common substructure (MCS).
+    """
+    def __init__(
+        self,
+        fingerprint_tanimoto_threshold: float = 0.3,
+        mcs_tanimoto_threshold: float = 0.6,
+    ):
+        """Store the two similarity thresholds used by ``__call__``."""
+        self.fingerprint_tanimoto_threshold = fingerprint_tanimoto_threshold
+        self.mcs_tanimoto_threshold = mcs_tanimoto_threshold
+    @staticmethod
+    def from_config(config: CompeteProductsConfig) -> "CompeteProductsFilter":
+        """Creates an instance of CompeteProductsFilter from a configuration object."""
+        return CompeteProductsFilter(
+            config.fingerprint_tanimoto_threshold, config.mcs_tanimoto_threshold
+        )
+    def __call__(self, reaction: ReactionContainer) -> bool:
+        """Checks if the reaction has competing products.
+        Only pairs in which both molecules have ``len()`` greater than 6 are compared.
+        :param reaction: Input reaction.
+        :return: Returns True if the reaction has competing products, else False.
+        """
+        # NOTE(review): pairs are formed from reaction.reagents and reaction.products;
+        # confirm that reagents (rather than reactants) is the intended side.
+        mf = MorganFingerprint()
+        for mol in reaction.reagents:
+            if len(mol) <= 6:
+                continue
+            molf = None  # reagent fingerprint, computed once on first need
+            for other_mol in reaction.products:
+                if len(other_mol) <= 6:
+                    continue
+                if molf is None:
+                    molf = mf.transform([cgrtools_to_chython_molecule(mol)])
+                other_molf = mf.transform([cgrtools_to_chython_molecule(other_mol)])
+                fingerprint_tanimoto = tanimoto_kernel(molf, other_molf)[0][0]
+                # the MCS search runs only for pairs that pass the fingerprint check
+                if fingerprint_tanimoto > self.fingerprint_tanimoto_threshold:
+                    try:
+                        # size of the first MCS mapping returned by the search
+                        clique_size = len(
+                            next(mol.get_mcs_mapping(other_mol, limit=100))
+                        )
+                    except StopIteration:
+                        # no common substructure mapping was produced for this pair
+                        continue
+                    # Tanimoto similarity computed from atom counts: |MCS| / |union|
+                    mcs_tanimoto = clique_size / (
+                        len(mol) + len(other_mol) - clique_size
+                    )
+                    if mcs_tanimoto > self.mcs_tanimoto_threshold:
+                        # one hit decides the result; skip all remaining pairs
+                        return True
+        return False
+@dataclass
+class DynamicBondsConfig(ConfigABC):
+    """Limits used by the dynamic-bonds reaction filter.
+    :ivar min_bonds_number: Smallest accepted number of CGR center bonds.
+    :ivar max_bonds_number: Largest accepted number of CGR center bonds.
+    """
+    min_bonds_number: int = 1
+    max_bonds_number: int = 6
+    @staticmethod
+    def from_dict(config_dict: Dict[str, Any]) -> "DynamicBondsConfig":
+        """Create an instance of DynamicBondsConfig from a dictionary."""
+        return DynamicBondsConfig(**config_dict)
+    @staticmethod
+    def from_yaml(file_path: str) -> "DynamicBondsConfig":
+        """Deserialize a YAML file into a DynamicBondsConfig object.
+        The file is read as UTF-8; an empty YAML document yields the default values.
+        :param file_path: Path to the YAML file.
+        :return: The configuration built from the file content.
+        """
+        # explicit encoding keeps the result independent of the platform default
+        with open(file_path, "r", encoding="utf-8") as file:
+            config_dict = yaml.safe_load(file)
+        if config_dict is None:
+            # safe_load returns None for an empty document
+            config_dict = {}
+        return DynamicBondsConfig.from_dict(config_dict)
+    def _validate_params(self, params: Dict[str, Any]) -> None:
+        """Validate configuration parameters.
+        :param params: Mapping of parameter names to the values to check.
+        :raises ValueError: If a limit is missing, is not a non-negative integer, or
+            the minimum exceeds the maximum.
+        """
+        for name in ("min_bonds_number", "max_bonds_number"):
+            value = params.get(name)
+            if not isinstance(value, int) or value < 0:
+                raise ValueError(f"Invalid '{name}'; expected a non-negative integer")
+        if params["min_bonds_number"] > params["max_bonds_number"]:
+            raise ValueError(
+                "'min_bonds_number' cannot be greater than 'max_bonds_number'"
+            )
+class DynamicBondsFilter:
+    """Flags reactions whose CGR has too few or too many dynamic (center) bonds."""
+    def __init__(self, min_bonds_number: int = 1, max_bonds_number: int = 6):
+        """Store the inclusive lower and upper limits for the center-bond count."""
+        self.min_bonds_number, self.max_bonds_number = (
+            min_bonds_number,
+            max_bonds_number,
+        )
+    @staticmethod
+    def from_config(config: DynamicBondsConfig):
+        """Build a DynamicBondsFilter from the bond limits stored in ``config``."""
+        lower, upper = config.min_bonds_number, config.max_bonds_number
+        return DynamicBondsFilter(lower, upper)
+    def __call__(self, reaction: ReactionContainer) -> bool:
+        """Tell whether the reaction's center-bond count is outside the allowed range.
+        :param reaction: Input reaction; ``~reaction`` is used to obtain its CGR.
+        :return: False when ``min_bonds_number <= len(cgr.center_bonds) <=
+            max_bonds_number``, True otherwise.
+        """
+        dynamic_bonds_count = len((~reaction).center_bonds)
+        within_limits = (
+            self.min_bonds_number <= dynamic_bonds_count <= self.max_bonds_number
+        )
+        return not within_limits
+@dataclass
+class SmallMoleculesConfig(ConfigABC):
+    """Configuration of the small-molecules reaction filter.
+    :ivar mol_max_size: Largest ``len(molecule)`` still treated as a small molecule.
+    """
+    mol_max_size: int = 6
+    @staticmethod
+    def from_dict(config_dict: Dict[str, Any]) -> "SmallMoleculesConfig":
+        """Creates an instance of SmallMoleculesConfig from a dictionary."""
+        return SmallMoleculesConfig(**config_dict)
+    @staticmethod
+    def from_yaml(file_path: str) -> "SmallMoleculesConfig":
+        """Deserialize a YAML file into a SmallMoleculesConfig object.
+        The file is read as UTF-8; an empty YAML document yields the default values.
+        :param file_path: Path to the YAML file.
+        :return: The configuration built from the file content.
+        """
+        # explicit encoding keeps the result independent of the platform default
+        with open(file_path, "r", encoding="utf-8") as file:
+            config_dict = yaml.safe_load(file)
+        if config_dict is None:
+            # safe_load returns None for an empty document
+            config_dict = {}
+        return SmallMoleculesConfig.from_dict(config_dict)
+    def _validate_params(self, params: Dict[str, Any]) -> None:
+        """Validate configuration parameters.
+        :param params: Mapping of parameter names to the values to check.
+        :raises ValueError: If ``mol_max_size`` is missing or is not a positive integer.
+        """
+        if (
+            not isinstance(params.get("mol_max_size"), int)
+            or params["mol_max_size"] < 1
+        ):
+            raise ValueError("Invalid 'mol_max_size'; expected a positive integer")
+class SmallMoleculesFilter:
+    """Flags reactions that consist of small molecules only, or whose single reactant
+    or single product is a small molecule."""
+    def __init__(self, mol_max_size: int = 6):
+        """Store the largest ``len(molecule)`` that still counts as small."""
+        self.limit = mol_max_size
+    @staticmethod
+    def from_config(config: SmallMoleculesConfig) -> "SmallMoleculesFilter":
+        """Build a SmallMoleculesFilter from ``config.mol_max_size``."""
+        return SmallMoleculesFilter(mol_max_size=config.mol_max_size)
+    def __call__(self, reaction: ReactionContainer) -> bool:
+        """Tell whether the reaction is dominated by small molecules.
+        :param reaction: Input reaction.
+        :return: True if the only reactant is small, the only product is small, or
+            every reactant and every product is small; otherwise False.
+        """
+        reactants, products = reaction.reactants, reaction.products
+        small_reactants = self.are_only_small_molecules(reactants)
+        if small_reactants and len(reactants) == 1:
+            return True
+        small_products = self.are_only_small_molecules(products)
+        if small_products and len(products) == 1:
+            return True
+        return small_reactants and small_products
+    def are_only_small_molecules(self, molecules: Iterable[MoleculeContainer]) -> bool:
+        """Return True when no molecule in ``molecules`` is larger than the limit."""
+        for molecule in molecules:
+            if len(molecule) > self.limit:
+                return False
+        return True
+@dataclass
+class CGRConnectedComponentsConfig:
+    """Configuration placeholder for CGRConnectedComponentsFilter; it has no fields."""
+class CGRConnectedComponentsFilter:
+    """Flags reactions whose CGR, built without reagents, is not a single connected
+    component."""
+    @staticmethod
+    def from_config(
+        config: CGRConnectedComponentsConfig,
+    ) -> "CGRConnectedComponentsFilter":
+        """Build a CGRConnectedComponentsFilter; ``config`` is accepted but not read."""
+        return CGRConnectedComponentsFilter()
+    def __call__(self, reaction: ReactionContainer) -> bool:
+        """Tell whether the reagent-free CGR of the reaction is disconnected.
+        :param reaction: Input reaction.
+        :return: True if the CGR has more than one connected component, else False.
+        """
+        # Rebuild the reaction from reactants and products only, leaving reagents out.
+        reagent_free = ReactionContainer(reaction.reactants, reaction.products)
+        components = (~reagent_free).connected_components_count
+        return components > 1
+@dataclass
+class RingsChangeConfig:
+    """Configuration placeholder for RingsChangeFilter; it has no fields."""
+class RingsChangeFilter:
+    """Flags reactions in which the number of rings changes."""
+    @staticmethod
+    def from_config(config: RingsChangeConfig) -> "RingsChangeFilter":
+        """Build a RingsChangeFilter; ``config`` is accepted but not read."""
+        return RingsChangeFilter()
+    def __call__(self, reaction: ReactionContainer):
+        """
+        Compares the ring counts of the reactants with those of the products.
+        :param reaction: Input reaction.
+        :return: Returns True if the total number of rings or the number of aromatic
+        rings differs between reactants and products, else False.
+        """
+        reactant_counts = self._calc_rings(reaction.reactants)
+        product_counts = self._calc_rings(reaction.products)
+        # the (rings, aromatic rings) tuples differ when either count differs
+        return reactant_counts != product_counts
+    @staticmethod
+    def _calc_rings(molecules: Iterable) -> Tuple[int, int]:
+        """
+        Sums ``rings_count`` and ``len(aromatic_rings)`` over the given molecules.
+        :param molecules: Set of molecules.
+        :return: Number of all rings and number of aromatic rings in molecules
+        """
+        mols = tuple(molecules)  # materialised so both totals can be computed
+        total_rings = sum(mol.rings_count for mol in mols)
+        total_aromatic = sum(len(mol.aromatic_rings) for mol in mols)
+        return total_rings, total_aromatic
+@dataclass
+class StrangeCarbonsConfig:
+    # currently empty, but can be extended in the future if needed
+    pass
+class StrangeCarbonsFilter:
+    """Checks if there are 'strange' carbons in the reaction."""
+    @staticmethod
+    def from_config(config: StrangeCarbonsConfig) -> "StrangeCarbonsFilter":
+        """Creates an instance of StrangeCarbonsChecker from a configuration object."""
+        return StrangeCarbonsFilter()
+    def __call__(self, reaction: ReactionContainer) -> bool:
+        for molecule in reaction.reactants + reaction.products:
+            atoms_types = {
+                a.atomic_symbol for _, a in molecule.atoms()
+            }  # atoms types in molecule
+            if len(atoms_types) == 1 and atoms_types.pop() == "C":
+                if len(molecule) == 1:  # methane
+                    return True
+                bond_types = {int(b) for _, _, b in molecule.bonds()}
+                if len(bond_types) == 1 and bond_types.pop() != 4:
+                    return True  # C molecules with only one type of bond (not aromatic)
+        return False
+@dataclass
+class NoReactionConfig:
+    # Currently empty, but can be extended in the future if needed
+    pass
+class NoReactionFilter:
+    """Checks if there is no reaction in the provided reaction container."""
+    @staticmethod
+    def from_config(config: NoReactionConfig) -> "NoReactionFilter":
+        """Creates an instance of NoReactionChecker from a configuration object."""
+        return NoReactionFilter()
+    def __call__(self, reaction: ReactionContainer) -> bool:
+        cgr = ~reaction
+        return not cgr.center_atoms and not cgr.center_bonds
+@dataclass
+class MultiCenterConfig:
+    pass
+class MultiCenterFilter:
+    """Checks if there is a multicenter reaction."""
+    @staticmethod
+    def from_config(config: MultiCenterConfig) -> "MultiCenterFilter":
+        return MultiCenterFilter()
+    def __call__(self, reaction: ReactionContainer) -> bool:
+        cgr = ~reaction
+        return len(cgr.centers_list) > 1
+@dataclass
+class WrongCHBreakingConfig:
+    pass
+class WrongCHBreakingFilter:
+    """Checks for incorrect C-C bond formation from breaking a C-H bond."""
+    @staticmethod
+    def from_config(config: WrongCHBreakingConfig) -> "WrongCHBreakingFilter":
+        return WrongCHBreakingFilter()
+    def __call__(self, reaction: ReactionContainer) -> bool:
+        """
+        Determines if a reaction involves incorrect C-C bond formation from breaking
+        a C-H bond.
+        :param reaction: The reaction to be filtered.
+        :return: True if incorrect C-C bond formation is found, False otherwise.
+        """
+        if reaction.check_valence():
+            return False
+        copy_reaction = reaction.copy()
+        copy_reaction.explicify_hydrogens()
+        cgr = ~copy_reaction
+        reduced_cgr = cgr.augmented_substructure(cgr.center_atoms, deep=1)
+        return self.is_wrong_c_h_breaking(reduced_cgr)
+    @staticmethod
+    def is_wrong_c_h_breaking(cgr: CGRContainer) -> bool:
+        """
+        Checks for incorrect C-C bond formation from breaking a C-H bond in a CGR.
+        :param cgr: The CGR with explicified hydrogens.
+        :return: True if incorrect C-C bond formation is found, False otherwise.
+        """
+        for atom_id in cgr.center_atoms:
+            if cgr.atom(atom_id).atomic_symbol == "C":
+                is_c_h_breaking, is_c_c_formation = False, False
+                c_with_h_id, another_c_id = None, None
+                for neighbour_id, bond in cgr._bonds[atom_id].items():
+                    neighbour = cgr.atom(neighbour_id)
+                    if (
+                        bond.order
+                        and not bond.p_order
+                        and neighbour.atomic_symbol == "H"
+                    ):
+                        is_c_h_breaking = True
+                        c_with_h_id = atom_id
+                    elif (
+                        not bond.order
+                        and bond.p_order
+                        and neighbour.atomic_symbol == "C"
+                    ):
+                        is_c_c_formation = True
+                        another_c_id = neighbour_id
+                if is_c_h_breaking and is_c_c_formation:
+                    # check for presence of heteroatoms in the first environment of 2 bonding carbons
+                    if any(
+                        cgr.atom(neighbour_id).atomic_symbol not in ("C", "H")
+                        for neighbour_id in cgr._bonds[c_with_h_id]
+                    ) or any(
+                        cgr.atom(neighbour_id).atomic_symbol not in ("C", "H")
+                        for neighbour_id in cgr._bonds[another_c_id]
+                    ):
+                        return False
+                    return True
+        return False
+@dataclass
+class CCsp3BreakingConfig:
+    pass
+class CCsp3BreakingFilter:
+    """Checks if there is C(sp3)-C bond breaking."""
+    @staticmethod
+    def from_config(config: CCsp3BreakingConfig) -> "CCsp3BreakingFilter":
+        return CCsp3BreakingFilter()
+    def __call__(self, reaction: ReactionContainer) -> bool:
+        """
+        Returns True if there is C(sp3)-C bonds breaking, else False.
+        :param reaction: Input reaction
+        :return: Returns True if there is C(sp3)-C bonds breaking, else False.
+        """
+        cgr = ~reaction
+        reaction_center = cgr.augmented_substructure(cgr.center_atoms, deep=1)
+        for atom_id, neighbour_id, bond in reaction_center.bonds():
+            atom = reaction_center.atom(atom_id)
+            neighbour = reaction_center.atom(neighbour_id)
+            is_bond_broken = bond.order is not None and bond.p_order is None
+            are_atoms_carbons = (
+                atom.atomic_symbol == "C" and neighbour.atomic_symbol == "C"
+            )
+            is_atom_sp3 = atom.hybridization == 1 or neighbour.hybridization == 1
+            if is_bond_broken and are_atoms_carbons and is_atom_sp3:
+                return True
+        return False
+@dataclass
+class CCRingBreakingConfig:
+    """
+    Object to pass to ReactionFilterConfig if you want to enable C-C ring breaking filter
+    """
+    pass
+class CCRingBreakingFilter:
+    """Checks if a reaction involves ring C-C bond breaking."""
+    @staticmethod
+    def from_config(config: CCRingBreakingConfig) -> "CCRingBreakingFilter":
+        """Builds the filter; the configuration object carries no parameters."""
+        return CCRingBreakingFilter()
+    def __call__(self, reaction: ReactionContainer) -> bool:
+        """
+        Returns True if the reaction involves ring C-C bond breaking, else False.
+        :param reaction: Input reaction
+        :return: Returns True if the reaction involves ring C-C bond breaking, else
+            False.
+        """
+        cgr = ~reaction
+        # Extract reactants' center atoms and their rings
+        reactants_center_atoms = {}
+        reactants_rings = set()
+        for reactant in reaction.reactants:
+            reactants_rings.update(reactant.sssr)
+            for n, atom in reactant.atoms():
+                if n in cgr.center_atoms:
+                    reactants_center_atoms[n] = atom
+        # identify reaction center based on center atoms
+        reaction_center = cgr.augmented_substructure(atoms=cgr.center_atoms, deep=0)
+        # iterate over bonds in the reaction center and filter for ring C-C bond breaking
+        for atom_id, neighbour_id, bond in reaction_center.bonds():
+            try:
+                # Retrieve corresponding atoms from reactants
+                atom = reactants_center_atoms[atom_id]
+                neighbour = reactants_center_atoms[neighbour_id]
+            except KeyError:
+                # at least one end of the bond is not a reactant center atom: skip the bond
+                continue
+            else:
+                # Check if the bond is broken and both atoms are carbons in rings of size 5, 6, or 7
+                is_bond_broken = (bond.order is not None) and (bond.p_order is None)
+                are_atoms_carbons = (
+                    atom.atomic_symbol == "C" and neighbour.atomic_symbol == "C"
+                )
+                # besides the ring sizes, both atoms must be members of one common reactant ring
+                are_atoms_in_ring = (
+                    set(atom.ring_sizes).intersection({5, 6, 7})
+                    and set(neighbour.ring_sizes).intersection({5, 6, 7})
+                    and any(
+                        atom_id in ring and neighbour_id in ring
+                        for ring in reactants_rings
+                    )
+                )
+                # If all conditions are met, indicate ring C-C bond breaking
+                if is_bond_broken and are_atoms_carbons and are_atoms_in_ring:
+                    return True
+        return False
+@dataclass
+class ReactionFilterConfig(ConfigABC):
+    """
+    Configuration class for reaction filtering. This class manages configuration
+    settings for various reaction filters, including paths, file formats, and filter-
+    specific parameters.
+    :ivar dynamic_bonds_config: Configuration for dynamic bonds checking.
+    :ivar small_molecules_config: Configuration for small molecules checking.
+    :ivar strange_carbons_config: Configuration for strange carbons checking.
+    :ivar compete_products_config: Configuration for competing products checking.
+    :ivar cgr_connected_components_config: Configuration for CGR connected components checking.
+    :ivar rings_change_config: Configuration for rings change checking.
+    :ivar no_reaction_config: Configuration for no reaction checking.
+    :ivar multi_center_config: Configuration for multi-center checking.
+    :ivar wrong_ch_breaking_config: Configuration for wrong C-H breaking checking.
+    :ivar cc_sp3_breaking_config: Configuration for CC sp3 breaking checking.
+    :ivar cc_ring_breaking_config: Configuration for CC ring breaking checking.
+    """
+    # configuration for reaction filters
+    dynamic_bonds_config: Optional[DynamicBondsConfig] = None
+    small_molecules_config: Optional[SmallMoleculesConfig] = None
+    strange_carbons_config: Optional[StrangeCarbonsConfig] = None
+    compete_products_config: Optional[CompeteProductsConfig] = None
+    cgr_connected_components_config: Optional[CGRConnectedComponentsConfig] = None
+    rings_change_config: Optional[RingsChangeConfig] = None
+    no_reaction_config: Optional[NoReactionConfig] = None
+    multi_center_config: Optional[MultiCenterConfig] = None
+    wrong_ch_breaking_config: Optional[WrongCHBreakingConfig] = None
+    cc_sp3_breaking_config: Optional[CCsp3BreakingConfig] = None
+    cc_ring_breaking_config: Optional[CCRingBreakingConfig] = None
+    def to_dict(self):
+        """Converts the configuration into a dictionary."""
+        config_dict = {
+            "dynamic_bonds_config": convert_config_to_dict(
+                self.dynamic_bonds_config, DynamicBondsConfig
+            ),
+            "small_molecules_config": convert_config_to_dict(
+                self.small_molecules_config, SmallMoleculesConfig
+            ),
+            "compete_products_config": convert_config_to_dict(
+                self.compete_products_config, CompeteProductsConfig
+            ),
+            "cgr_connected_components_config": (
+                {} if self.cgr_connected_components_config is not None else None
+            ),
+            "rings_change_config": {} if self.rings_change_config is not None else None,
+            "strange_carbons_config": (
+                {} if self.strange_carbons_config is not None else None
+            ),
+            "no_reaction_config": {} if self.no_reaction_config is not None else None,
+            "multi_center_config": {} if self.multi_center_config is not None else None,
+            "wrong_ch_breaking_config": (
+                {} if self.wrong_ch_breaking_config is not None else None
+            ),
+            "cc_sp3_breaking_config": (
+                {} if self.cc_sp3_breaking_config is not None else None
+            ),
+            "cc_ring_breaking_config": (
+                {} if self.cc_ring_breaking_config is not None else None
+            ),
+        }
+        filtered_config_dict = {k: v for k, v in config_dict.items() if v is not None}
+        return filtered_config_dict
+    @staticmethod
+    def from_dict(config_dict: Dict[str, Any]) -> "ReactionFilterConfig":
+        """Create an instance of ReactionCheckConfig from a dictionary."""
+        # Instantiate configuration objects if their corresponding dictionary is present
+        dynamic_bonds_config = (
+            DynamicBondsConfig(**config_dict["dynamic_bonds_config"])
+            if "dynamic_bonds_config" in config_dict
+            else None
+        )
+        small_molecules_config = (
+            SmallMoleculesConfig(**config_dict["small_molecules_config"])
+            if "small_molecules_config" in config_dict
+            else None
+        )
+        compete_products_config = (
+            CompeteProductsConfig(**config_dict["compete_products_config"])
+            if "compete_products_config" in config_dict
+            else None
+        )
+        cgr_connected_components_config = (
+            CGRConnectedComponentsConfig()
+            if "cgr_connected_components_config" in config_dict
+            else None
+        )
+        rings_change_config = (
+            RingsChangeConfig() if "rings_change_config" in config_dict else None
+        )
+        strange_carbons_config = (
+            StrangeCarbonsConfig() if "strange_carbons_config" in config_dict else None
+        )
+        no_reaction_config = (
+            NoReactionConfig() if "no_reaction_config" in config_dict else None
+        )
+        multi_center_config = (
+            MultiCenterConfig() if "multi_center_config" in config_dict else None
+        )
+        wrong_ch_breaking_config = (
+            WrongCHBreakingConfig()
+            if "wrong_ch_breaking_config" in config_dict
+            else None
+        )
+        cc_sp3_breaking_config = (
+            CCsp3BreakingConfig() if "cc_sp3_breaking_config" in config_dict else None
+        )
+        cc_ring_breaking_config = (
+            CCRingBreakingConfig() if "cc_ring_breaking_config" in config_dict else None
+        )
+        return ReactionFilterConfig(
+            dynamic_bonds_config=dynamic_bonds_config,
+            small_molecules_config=small_molecules_config,
+            compete_products_config=compete_products_config,
+            cgr_connected_components_config=cgr_connected_components_config,
+            rings_change_config=rings_change_config,
+            strange_carbons_config=strange_carbons_config,
+            no_reaction_config=no_reaction_config,
+            multi_center_config=multi_center_config,
+            wrong_ch_breaking_config=wrong_ch_breaking_config,
+            cc_sp3_breaking_config=cc_sp3_breaking_config,
+            cc_ring_breaking_config=cc_ring_breaking_config,
+        )
+    @staticmethod
+    def from_yaml(file_path: str) -> "ReactionFilterConfig":
+        """Deserializes a YAML file into a ReactionCheckConfig object."""
+        with open(file_path, "r", encoding="utf-8") as file:
+            config_dict = yaml.safe_load(file)
+        return ReactionFilterConfig.from_dict(config_dict)
+    def _validate_params(self, params: Dict[str, Any]):
+        pass
+    def create_filters(self):
+        filter_instances = []
+        if self.dynamic_bonds_config is not None:
+            filter_instances.append(
+                DynamicBondsFilter.from_config(self.dynamic_bonds_config)
+            )
+        if self.small_molecules_config is not None:
+            filter_instances.append(
+                SmallMoleculesFilter.from_config(self.small_molecules_config)
+            )
+        if self.strange_carbons_config is not None:
+            filter_instances.append(
+                StrangeCarbonsFilter.from_config(self.strange_carbons_config)
+            )
+        if self.compete_products_config is not None:
+            filter_instances.append(
+                CompeteProductsFilter.from_config(self.compete_products_config)
+            )
+        if self.cgr_connected_components_config is not None:
+            filter_instances.append(
+                CGRConnectedComponentsFilter.from_config(
+                    self.cgr_connected_components_config
+                )
+            )
+        if self.rings_change_config is not None:
+            filter_instances.append(
+                RingsChangeFilter.from_config(self.rings_change_config)
+            )
+        if self.no_reaction_config is not None:
+            filter_instances.append(
+                NoReactionFilter.from_config(self.no_reaction_config)
+            )
+        if self.multi_center_config is not None:
+            filter_instances.append(
+                MultiCenterFilter.from_config(self.multi_center_config)
+            )
+        if self.wrong_ch_breaking_config is not None:
+            filter_instances.append(
+                WrongCHBreakingFilter.from_config(self.wrong_ch_breaking_config)
+            )
+        if self.cc_sp3_breaking_config is not None:
+            filter_instances.append(
+                CCsp3BreakingFilter.from_config(self.cc_sp3_breaking_config)
+            )
+        if self.cc_ring_breaking_config is not None:
+            filter_instances.append(
+                CCRingBreakingFilter.from_config(self.cc_ring_breaking_config)
+            )
+        return filter_instances
+def tanimoto_kernel(x: MorganFingerprint, y: MorganFingerprint) -> float:
+    """Calculate the Tanimoto coefficient between each element of arrays x and y."""
+    x = x.astype(np.float64)
+    y = y.astype(np.float64)
+    x_dot = np.dot(x, y.T)
+    x2 = np.sum(x**2, axis=1)
+    y2 = np.sum(y**2, axis=1)
+    denominator = np.array([x2] * len(y2)).T + np.array([y2] * len(x2)) - x_dot
+    result = np.divide(
+        x_dot, denominator, out=np.zeros_like(x_dot), where=denominator != 0
+    )
+    return result
+def filter_reaction(
+    reaction: ReactionContainer, config: ReactionFilterConfig, filters: list
+) -> Tuple[bool, ReactionContainer]:
+    """Checks the input reaction. Returns True if reaction is detected as erroneous and
+    returns reaction itself, which sometimes is modified and does not necessarily
+    correspond to the initial reaction.
+    :param reaction: Reaction to be filtered.
+    :param config: Reaction filtration configuration.
+    :param filters: The list of reaction filters.
+    :return: False and reaction if reaction is correct and True and reaction if reaction
+        is filtered (erroneous).
+    """
+    is_filtered = False
+    # run reaction standardization
+    standardizers = [
+        RemoveReagentsStandardizer(),
+        KekuleFormStandardizer(),
+        AromaticFormStandardizer(),
+    ]
+    for reaction_standardizer in standardizers:
+        reaction = reaction_standardizer(reaction)
+        if not reaction:
+            is_filtered = True
+            break
+    # run reaction filtration
+    if not is_filtered:
+        for reaction_filter in filters:
+            try:  # CGRTools ValueError: mapping of graphs is not disjoint
+                if reaction_filter(reaction):
+                    # if filter returns True it means the reaction doesn't pass the filter
+                    reaction.meta["filtration_log"] = reaction_filter.__class__.__name__
+                    is_filtered = True
+            except Exception as e:
+                logging.debug(e)
+                is_filtered = True
+    return is_filtered, reaction
+@ray.remote
+def process_batch(
+    batch: List[ReactionContainer],
+    config: ReactionFilterConfig,
+    filters: list,
+) -> List[Tuple[bool, ReactionContainer]]:
+    """
+    Filters a batch of reactions with the given filters. This function operates as a
+    remote task in a distributed system using Ray.
+    :param batch: A list of reactions; each element is passed to filter_reaction
+        as is.
+    :param config: Reaction filtration configuration.
+    :param filters: The list of reaction filters.
+    :return: The list of tuples, one per input reaction, where each tuple includes
+        whether the reaction is filtered or not (True/False) and the reaction itself.
+    """
+    processed_reaction_list = []
+    for reaction in batch:
+        try:  # CGRtools.exceptions.MappingError: atoms with number {52} not equal
+            is_filtered, processed_reaction = filter_reaction(reaction, config, filters)
+            processed_reaction_list.append((is_filtered, processed_reaction))
+        except Exception as e:
+            # a reaction that cannot be processed is reported as filtered, unchanged
+            logging.debug(e)
+            processed_reaction_list.append((True, reaction))
+    return processed_reaction_list
+def process_completed_batch(
+    futures: Dict,
+    result_file: TextIOWrapper,
+    n_filtered: int = 0,
+) -> int:
+    """
+    Processes completed batches of reactions.
+    :param futures: A dictionary of futures representing ongoing batch processing tasks.
+    :param result_file: The path to the file where filtered reactions will be stored.
+    :param n_filtered: The number of processed reactions.
+    :return: The numbers of filtered and correct reactions.
+    """
+    ready_id, running_id = ray.wait(list(futures.keys()), num_returns=1)
+    completed_batch = ray.get(ready_id[0])
+    # write results of the completed batch to file
+    for is_filtered, reaction in completed_batch:
+        if not is_filtered:
+            result_file.write(reaction)
+            n_filtered += 1
+    # remove completed future and update progress bar
+    del futures[ready_id[0]]
+    return n_filtered
+def filter_reactions_from_file(
+    config: ReactionFilterConfig,
+    input_reaction_data_path: str,
+    filtered_reaction_data_path: str = "reaction_data_filtered.smi",
+    num_cpus: int = 1,
+    batch_size: int = 100,
+) -> None:
+    """
+    Processes reaction data, applying reaction filters based on the provided
+    configuration, and writes the results to specified files.
+    :param config: ReactionCheckConfig object containing all filtration configuration
+        settings.
+    :param input_reaction_data_path: Path to the reaction data file.
+    :param filtered_reaction_data_path: Name for the file that will contain filtered
+        reactions.
+    :param num_cpus: Number of CPUs to use for processing.
+    :param batch_size: Size of the batch for processing reactions.
+    :return: None. The function writes the processed reactions to specified RDF/smi
+        files.
+    """
+    filters = config.create_filters()
+    ray.init(num_cpus=num_cpus, ignore_reinit_error=True, logging_level=logging.ERROR)
+    max_concurrent_batches = num_cpus  # limit the number of concurrent batches
+    lines_counter = 0
+    with ReactionReader(input_reaction_data_path) as reactions, ReactionWriter(
+        filtered_reaction_data_path
+    ) as result_file:
+        batches_to_process, batch = {}, []
+        n_filtered = 0
+        for index, reaction in tqdm(
+            enumerate(reactions),
+            desc="Number of reactions processed: ",
+            bar_format="{desc}{n} [{elapsed}]",
+        ):
+            lines_counter += 1
+            batch.append(reaction)
+            if len(batch) == batch_size:
+                batch_results = process_batch.remote(batch, config, filters)
+                batches_to_process[batch_results] = None
+                batch = []
+                # check and process completed tasks if we've reached the concurrency limit
+                while len(batches_to_process) >= max_concurrent_batches:
+                    n_filtered = process_completed_batch(
+                        batches_to_process,
+                        result_file,
+                        n_filtered,
+                    )
+        # process the last batch if it's not empty
+        if batch:
+            batch_results = process_batch.remote(batch, config, filters)
+            batches_to_process[batch_results] = None
+        # process remaining batches
+        while batches_to_process:
+            n_filtered = process_completed_batch(
+                batches_to_process,
+                result_file,
+                n_filtered,
+            )
+    ray.shutdown()
+    print(f"Initial number of reactions: {lines_counter}")
+    print(f"Filtered number of reactions: {n_filtered}")

synplan/chem/data/standardizing.py ADDED Viewed

	@@ -0,0 +1,1187 @@

+"""Module containing classes and functions for reactions standardizing.
+This module contains the open-source code from
+https://github.com/Laboratoire-de-Chemoinformatique/Reaction_Data_Cleaning/blob/master/scripts/standardizer.py
+"""
+from __future__ import annotations
+import logging
+from contextlib import suppress
+from dataclasses import dataclass
+from io import TextIOWrapper
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union, Sequence, TextIO
+from abc import ABC, abstractmethod
+from pathlib import Path
+import sys
+import ray
+import yaml
+from CGRtools import smiles as smiles_cgrtools
+from CGRtools.containers import MoleculeContainer
+from CGRtools.containers import ReactionContainer
+from CGRtools.containers import ReactionContainer as ReactionContainerCGRTools
+from chython import ReactionContainer as ReactionContainerChython
+from chython import smiles as smiles_chython
+from tqdm.auto import tqdm
+from synplan.chem.utils import unite_molecules
+from synplan.utils.config import ConfigABC
+from synplan.utils.files import ReactionReader, ReactionWriter
+from synplan.utils.logging import init_logger, init_ray_logging
+logger = logging.getLogger("synplan.chem.data.standardizing")
+class StandardizationError(RuntimeError):
+    """Wraps the original exception and the reaction string that failed."""
+    def __init__(self, stage: str, reaction: str, original: Exception):
+        super().__init__(f"{stage} failed on {reaction}: {original}")
+        self.stage = stage
+        self.reaction = reaction
+        self.original = original
+class BaseStandardizer(ABC):
+    """Template: subclasses override `_run` only."""
+    @classmethod
+    def from_config(cls, _cfg: object) -> "BaseStandardizer":
+        return cls()
+    @abstractmethod
+    def _run(self, rxn: ReactionContainer) -> ReactionContainer:
+        """Run the standardization step on the reaction.
+        Args:
+            rxn: The reaction to standardize
+        Returns:
+            The standardized reaction
+        Raises:
+            StandardizationError: If standardization fails
+        """
+        ...
+    def __call__(self, rxn: ReactionContainer) -> ReactionContainer:
+        """Execute the standardization step with proper error handling.
+        Args:
+            rxn: The reaction to standardize
+        Returns:
+            The standardized reaction
+        Raises:
+            StandardizationError: If standardization fails
+        """
+        try:
+            return self._run(rxn)
+        except Exception as exc:
+            logging.debug("%s: %s", self.__class__.__name__, exc, exc_info=True)
+            raise StandardizationError(self.__class__.__name__, str(rxn), exc)
+# Configuration classes
+@dataclass
+class ReactionMappingConfig:
+    pass
+class ReactionMappingStandardizer(BaseStandardizer):
+    """Maps atoms of the reaction using chython (chytorch)."""
+    def _map_and_remove_reagents(
+        self, reaction: ReactionContainerChython
+    ) -> ReactionContainerChython:
+        """Map and remove reagents from the reaction.
+        Args:
+            reaction: Input reaction
+        Returns:
+            The mapped reaction with reagents removed
+        """
+        reaction.reset_mapping()
+        reaction.remove_reagents()
+        return reaction
+    def _run(self, rxn: ReactionContainerCGRTools) -> ReactionContainerCGRTools:
+        """Map atoms of the reaction using chython.
+        Args:
+            rxn: Input reaction
+        Returns:
+            The mapped reaction
+        Raises:
+            StandardizationError: If mapping fails
+        """
+        try:
+            # Convert to chython format
+            if isinstance(rxn, str):
+                chython_reaction = smiles_chython(rxn)
+            else:
+                # Convert CGRtools reaction to SMILES string, preserving reagents
+                reactants = ".".join(str(m) for m in rxn.reactants)
+                reagents = ".".join(str(m) for m in rxn.reagents)
+                products = ".".join(str(m) for m in rxn.products)
+                smiles = f"{reactants}>{reagents}>{products}"
+                # Parse SMILES string with chython
+                chython_reaction = smiles_chython(smiles)
+            # Map and remove reagents
+            reaction_mapped = self._map_and_remove_reagents(chython_reaction)
+            if not reaction_mapped:
+                raise StandardizationError(
+                    "ReactionMapping", str(rxn), ValueError("Mapping failed")
+                )
+            # Convert back to CGRtools format
+            mapped_smiles = format(chython_reaction, "m")
+            result = smiles_cgrtools(mapped_smiles)
+            result.meta.update(rxn.meta)  # Preserve metadata
+            return result
+        except Exception as e:
+            raise StandardizationError("ReactionMapping", str(rxn), e)
+@dataclass
+class FunctionalGroupsConfig:
+    """Parameter-free configuration of FunctionalGroupsStandardizer."""
+    pass
+class FunctionalGroupsStandardizer(BaseStandardizer):
+    """Functional groups standardization."""
+    def _run(self, rxn: ReactionContainer) -> ReactionContainer:
+        """Standardize functional groups in the reaction.
+        Args:
+            rxn: Input reaction
+        Returns:
+            The reaction with standardized functional groups
+        Raises:
+            StandardizationError: If standardization fails
+        """
+        # the return value of standardize() is ignored; the same object is returned
+        rxn.standardize()
+        return rxn
+@dataclass
+class KekuleFormConfig:
+    """Parameter-free configuration of KekuleFormStandardizer."""
+    pass
+class KekuleFormStandardizer(BaseStandardizer):
+    """Reactants/reagents/products kekulization."""
+    def _run(self, rxn: ReactionContainer) -> ReactionContainer:
+        """Kekulize the reaction.
+        Args:
+            rxn: The reaction to kekulize
+        Returns:
+            The kekulized reaction
+        Raises:
+            StandardizationError: If kekulization fails
+        """
+        # the return value of kekule() is ignored; the same object is returned
+        rxn.kekule()
+        return rxn
+@dataclass
+class CheckValenceConfig:
+    """Parameter-free marker config for CheckValenceStandardizer (no settings)."""
+class CheckValenceStandardizer(BaseStandardizer):
+    """Reject reactions that contain a molecule with valence errors."""
+    def _run(self, rxn: ReactionContainer) -> ReactionContainer:
+        """Run ``check_valence()`` on every reactant, product and reagent.
+        Args:
+            rxn: Reaction to inspect
+        Returns:
+            The unchanged reaction when no molecule reports valence mistakes
+        Raises:
+            StandardizationError: On the first molecule whose ``check_valence()``
+                result is non-empty
+        """
+        every_molecule = rxn.reactants + rxn.products + rxn.reagents
+        for mol in every_molecule:
+            problems = mol.check_valence()
+            if not problems:
+                continue
+            # Stop at the first offending molecule and report what it returned.
+            cause = ValueError(f"Valence errors: {problems}")
+            raise StandardizationError("CheckValence", str(rxn), cause)
+        return rxn
+@dataclass
+class ImplicifyHydrogensConfig:
+    """Parameter-free marker config for ImplicifyHydrogensStandardizer (no settings)."""
+class ImplicifyHydrogensStandardizer(BaseStandardizer):
+    """Standardizer step that makes explicit hydrogens implicit."""
+    def _run(self, rxn: ReactionContainer) -> ReactionContainer:
+        """Call ``rxn.implicify_hydrogens()`` and hand the same reaction back.
+        Args:
+            rxn: Reaction to process; the call's return value is discarded
+        Returns:
+            The very same reaction instance that was passed in
+        Raises:
+            Nothing is raised here directly; errors from ``implicify_hydrogens()``
+            propagate (presumably wrapped into StandardizationError by the base
+            class - TODO confirm).
+        """
+        reaction = rxn
+        reaction.implicify_hydrogens()
+        return reaction
+@dataclass
+class CheckIsotopesConfig:
+    """Parameter-free marker config for CheckIsotopesStandardizer (no settings)."""
+class CheckIsotopesStandardizer(BaseStandardizer):
+    """Strip isotope labels when any reactant or product atom carries one."""
+    def _run(self, rxn: ReactionContainer) -> ReactionContainer:
+        """Detect isotope-labelled atoms and clean the reaction if any exist.
+        Only reactants and products are scanned (reagents are not); when a labelled
+        atom is found, ``rxn.clean_isotopes()`` is called on the whole reaction.
+        Args:
+            rxn: Reaction to inspect
+        Returns:
+            The same reaction instance, cleaned if a labelled atom was found
+        Raises:
+            Nothing is raised here directly; errors from the reaction API propagate.
+        """
+        # any() stops at the first labelled atom, like the original nested breaks.
+        has_labelled_atom = any(
+            atom.isotope
+            for mol in rxn.reactants + rxn.products
+            for _, atom in mol.atoms()
+        )
+        if has_labelled_atom:
+            rxn.clean_isotopes()
+        return rxn
+@dataclass
+class SplitIonsConfig:
+    """Parameter-free marker config for SplitIonsStandardizer (no settings)."""
+class SplitIonsStandardizer(BaseStandardizer):
+    """Split multi-component molecules into separate species and check charge balance."""
+    def _run(self, rxn: ReactionContainer) -> ReactionContainer:
+        """Split ions in the reaction.
+        Args:
+            rxn: Input reaction
+        Returns:
+            A new reaction whose molecules are split into their components
+        Raises:
+            StandardizationError: If any side of the reaction keeps a non-zero net
+                charge after splitting (return code 2 from ``_split_ions``)
+        """
+        reaction, return_code = self._split_ions(rxn)
+        if return_code == 2:  # ions were split but the reaction is imbalanced
+            raise StandardizationError(
+                "SplitIons",
+                str(rxn),
+                ValueError("Reaction is imbalanced after ion splitting"),
+            )
+        return reaction
+    def _calc_charge(self, molecule: MoleculeContainer) -> int:
+        """Compute total charge of a molecule.
+        Args:
+            molecule: Input molecule
+        Returns:
+            The sum of the per-atom charges stored in ``molecule._charges``
+        """
+        return sum(molecule._charges.values())
+    def _split_ions(self, reaction: ReactionContainer) -> Tuple[ReactionContainer, int]:
+        """Split every molecule of a reaction into its components.
+        Reactants, reagents and products are processed independently. Each side
+        gets a code: 0 if no component is charged, 1 if charged components exist
+        but the side's net charge is zero, 2 if the net charge is non-zero.
+        Args:
+            reaction: Input reaction
+        Returns:
+            A tuple containing:
+            - A new reaction built from the split components (meta and name kept)
+            - The highest code over the three sides
+        """
+        meta = reaction.meta
+        reaction_parts = []
+        return_codes = []
+        for molecules in (reaction.reactants, reaction.reagents, reaction.products):
+            # Split molecules into individual components
+            divided_molecules = []
+            for molecule in molecules:
+                if isinstance(molecule, str):
+                    # If it's a string, try to parse it as a molecule
+                    try:
+                        molecule = smiles_cgrtools(molecule)
+                    except Exception as e:
+                        # Module logger, so worker file handlers receive the message.
+                        logger.warning("Failed to parse molecule %s: %s", molecule, e)
+                        continue
+                # Use the split method from CGRtools
+                try:
+                    divided_molecules.extend(molecule.split())
+                except Exception as e:
+                    # Best effort: keep the molecule unsplit rather than dropping it.
+                    logger.warning("Failed to split molecule %s: %s", molecule, e)
+                    divided_molecules.append(molecule)
+            total_charge = 0
+            ions_present = False
+            for molecule in divided_molecules:
+                try:
+                    mol_charge = self._calc_charge(molecule)
+                except Exception as e:
+                    logger.warning(
+                        "Failed to calculate charge for molecule %s: %s", molecule, e
+                    )
+                    continue
+                total_charge += mol_charge
+                if mol_charge != 0:
+                    ions_present = True
+            if ions_present and total_charge:
+                return_codes.append(2)
+            elif ions_present:
+                return_codes.append(1)
+            else:
+                return_codes.append(0)
+            reaction_parts.append(tuple(divided_molecules))
+        new_reaction = ReactionContainer(
+            reactants=reaction_parts[0],
+            reagents=reaction_parts[1],
+            products=reaction_parts[2],
+            meta=meta,
+        )
+        # Keep the reaction name, as the other rebuilding standardizers do.
+        new_reaction.name = reaction.name
+        return new_reaction, max(return_codes)
+@dataclass
+class AromaticFormConfig:
+    """Parameter-free marker config for AromaticFormStandardizer (no settings)."""
+class AromaticFormStandardizer(BaseStandardizer):
+    """Standardizer step that converts the reaction to aromatic form."""
+    def _run(self, rxn: ReactionContainer) -> ReactionContainer:
+        """Call ``rxn.thiele()`` and hand the same reaction object back.
+        Args:
+            rxn: Reaction to aromatize; the call's return value is discarded
+        Returns:
+            The very same reaction instance that was passed in
+        Raises:
+            Nothing is raised here directly; errors from ``thiele()`` propagate
+            (presumably wrapped into StandardizationError by the base class -
+            TODO confirm).
+        """
+        reaction = rxn
+        reaction.thiele()
+        return reaction
+@dataclass
+class MappingFixConfig:
+    """Parameter-free marker config for MappingFixStandardizer (no settings)."""
+class MappingFixStandardizer(BaseStandardizer):
+    """Standardizer step that repairs the atom-to-atom mapping of a reaction."""
+    def _run(self, rxn: ReactionContainer) -> ReactionContainer:
+        """Call ``rxn.fix_mapping()`` and hand the same reaction object back.
+        Args:
+            rxn: Reaction whose mapping should be fixed; the call's return value
+                is discarded
+        Returns:
+            The very same reaction instance that was passed in
+        Raises:
+            Nothing is raised here directly; errors from ``fix_mapping()`` propagate
+            (presumably wrapped into StandardizationError by the base class -
+            TODO confirm).
+        """
+        reaction = rxn
+        reaction.fix_mapping()
+        return reaction
+@dataclass
+class UnchangedPartsConfig:
+    """Parameter-free marker config for UnchangedPartsStandardizer (no settings)."""
+class UnchangedPartsStandardizer(BaseStandardizer):
+    """Move molecules present on both sides of a reaction out of reactants/products."""
+    def __init__(
+        self,
+        add_reagents_to_reactants: bool = False,
+        keep_reagents: bool = False,
+    ):
+        # add_reagents_to_reactants: treat the input reagents as reactants first.
+        # keep_reagents: keep the collected reagents instead of discarding them.
+        self.add_reagents_to_reactants = add_reagents_to_reactants
+        self.keep_reagents = keep_reagents
+    @classmethod
+    def from_config(cls, config: UnchangedPartsConfig) -> "UnchangedPartsStandardizer":
+        # The config has no fields, so the defaults of __init__ are used.
+        return cls()
+    def _run(self, rxn: ReactionContainer) -> ReactionContainer:
+        """Strip molecules that occur unchanged among reactants and products.
+        Args:
+            rxn: Input reaction
+        Returns:
+            A new reaction (same meta and name) without the unchanged molecules;
+            those are kept as reagents only when ``keep_reagents`` is set
+        Raises:
+            StandardizationError: If no reactants, no products, or no molecules at
+                all remain afterwards
+        """
+        reactant_pool = list(rxn.reactants)
+        reagent_pool = list(rxn.reagents)
+        product_pool = list(rxn.products)
+        if self.add_reagents_to_reactants:
+            reactant_pool += reagent_pool
+            reagent_pool = []
+        # Walk a snapshot, because reactant_pool shrinks inside the loop.
+        for candidate in tuple(reactant_pool):
+            if candidate not in product_pool:
+                continue
+            reagent_pool.append(candidate)
+            reactant_pool.remove(candidate)
+            product_pool.remove(candidate)
+        if not self.keep_reagents:
+            reagent_pool = []
+        if not (reactant_pool and product_pool):
+            if product_pool:
+                reason = "No reactants left"
+            elif reactant_pool:
+                reason = "No products left"
+            else:
+                reason = "No molecules left"
+            raise StandardizationError("UnchangedParts", str(rxn), ValueError(reason))
+        stripped = ReactionContainer(
+            reactants=tuple(reactant_pool),
+            reagents=tuple(reagent_pool),
+            products=tuple(product_pool),
+            meta=rxn.meta,
+        )
+        stripped.name = rxn.name
+        return stripped
+@dataclass
+class SmallMoleculesConfig:
+    """Settings for SmallMoleculesStandardizer.
+    Attributes:
+        mol_max_size: Size threshold handed to SmallMoleculesStandardizer.
+    """
+    mol_max_size: int = 6
+    @staticmethod
+    def from_dict(config_dict: Dict[str, Any]) -> "SmallMoleculesConfig":
+        """Create an instance of SmallMoleculesConfig from a dictionary.
+        ``None`` (e.g. an empty YAML section) is treated as "use the defaults".
+        """
+        return SmallMoleculesConfig(**(config_dict or {}))
+    @staticmethod
+    def from_yaml(file_path: str) -> "SmallMoleculesConfig":
+        """Deserialize a YAML file into a SmallMoleculesConfig object."""
+        with open(file_path, "r", encoding="utf-8") as file:
+            # safe_load returns None for an empty document; from_dict copes with it.
+            config_dict = yaml.safe_load(file)
+        return SmallMoleculesConfig.from_dict(config_dict)
+    def _validate_params(self, params: Dict[str, Any]) -> None:
+        """Validate configuration parameters.
+        Args:
+            params: Mapping that may override ``mol_max_size``
+        Raises:
+            ValueError: If ``mol_max_size`` is not an integer >= 1
+        """
+        mol_max_size = params.get("mol_max_size", self.mol_max_size)
+        if not isinstance(mol_max_size, int) or mol_max_size < 1:
+            # The message now matches the check (1 is accepted).
+            raise ValueError("Invalid 'mol_max_size'; expected a positive integer")
+class SmallMoleculesStandardizer(BaseStandardizer):
+    """Drop small molecules from reactants and products, recording them in meta."""
+    def __init__(self, mol_max_size: int = 6):
+        # Molecules with len(molecule) <= mol_max_size count as "small".
+        self.mol_max_size = mol_max_size
+    @classmethod
+    def from_config(cls, config: SmallMoleculesConfig) -> "SmallMoleculesStandardizer":
+        return cls(config.mol_max_size)
+    def _split_molecules(
+        self, molecules: Iterable, number_of_atoms: int
+    ) -> Tuple[List[MoleculeContainer], List[MoleculeContainer]]:
+        """Partition molecules by ``len(molecule)`` against a threshold.
+        Args:
+            molecules: Iterable of molecules
+            number_of_atoms: Threshold; strictly larger molecules are "big"
+        Returns:
+            Tuple ``(big, small)`` of lists, each keeping the input order
+        """
+        big: List[MoleculeContainer] = []
+        small: List[MoleculeContainer] = []
+        for mol in molecules:
+            bucket = big if len(mol) > number_of_atoms else small
+            bucket.append(mol)
+        return big, small
+    def _run(self, rxn: ReactionContainer) -> ReactionContainer:
+        """Build a reaction that contains only the big reactants and products.
+        The removed molecules are united per side and stored as strings under the
+        meta keys ``small_reactants`` and ``small_products``.
+        Args:
+            rxn: Input reaction
+        Returns:
+            A new reaction with the original reagents, meta and name
+        Raises:
+            StandardizationError: If either side has no big molecule left
+        """
+        threshold = self.mol_max_size
+        big_reactants, tiny_reactants = self._split_molecules(rxn.reactants, threshold)
+        big_products, tiny_products = self._split_molecules(rxn.products, threshold)
+        if not (big_reactants and big_products):
+            raise StandardizationError(
+                "SmallMolecules",
+                str(rxn),
+                ValueError("No molecules left after removing small ones"),
+            )
+        trimmed = ReactionContainer(big_reactants, big_products, rxn.reagents, rxn.meta)
+        trimmed.name = rxn.name
+        # Save small molecules to meta
+        trimmed.meta["small_reactants"] = str(unite_molecules(tiny_reactants))
+        trimmed.meta["small_products"] = str(unite_molecules(tiny_products))
+        return trimmed
+@dataclass
+class RemoveReagentsConfig:
+    """Settings for RemoveReagentsStandardizer.
+    Attributes:
+        reagent_max_size: Size threshold handed to RemoveReagentsStandardizer.
+    """
+    reagent_max_size: int = 7
+    @staticmethod
+    def from_dict(config_dict: Dict[str, Any]) -> "RemoveReagentsConfig":
+        """Create an instance of RemoveReagentsConfig from a dictionary.
+        ``None`` (e.g. an empty YAML section) is treated as "use the defaults".
+        """
+        return RemoveReagentsConfig(**(config_dict or {}))
+    @staticmethod
+    def from_yaml(file_path: str) -> "RemoveReagentsConfig":
+        """Deserialize a YAML file into a RemoveReagentsConfig object."""
+        with open(file_path, "r", encoding="utf-8") as file:
+            # safe_load returns None for an empty document; from_dict copes with it.
+            config_dict = yaml.safe_load(file)
+        return RemoveReagentsConfig.from_dict(config_dict)
+    def _validate_params(self, params: Dict[str, Any]) -> None:
+        """Validate configuration parameters.
+        Args:
+            params: Mapping that may override ``reagent_max_size``
+        Raises:
+            ValueError: If ``reagent_max_size`` is not an integer >= 1
+        """
+        reagent_max_size = params.get("reagent_max_size", self.reagent_max_size)
+        if not isinstance(reagent_max_size, int) or reagent_max_size < 1:
+            # The message now matches the check (1 is accepted).
+            raise ValueError(
+                "Invalid 'reagent_max_size'; expected a positive integer"
+            )
+class RemoveReagentsStandardizer(BaseStandardizer):
+    """Reclassify non-reacting molecules as reagents and drop the large ones."""
+    def __init__(self, reagent_max_size: int = 7):
+        # Reagents with len(molecule) > reagent_max_size are discarded.
+        self.reagent_max_size = reagent_max_size
+    @classmethod
+    def from_config(cls, config: RemoveReagentsConfig) -> "RemoveReagentsStandardizer":
+        return cls(config.reagent_max_size)
+    def _run(self, rxn: ReactionContainer) -> ReactionContainer:
+        """Separate molecules touching the reaction center from spectators.
+        A reactant or product is a spectator when it shares no atom with the CGR
+        center atoms or when it appears on both sides of the reaction. Spectators
+        become the new reagents (as a set, filtered by ``reagent_max_size``); the
+        reagents of the input reaction are not carried over.
+        Args:
+            rxn: Input reaction
+        Returns:
+            A new reaction with the same meta and name
+        Raises:
+            StandardizationError: If no reactant or no product touches the center
+        """
+        unchanged = set(rxn.reactants).intersection(rxn.products)
+        center_atoms = set((~rxn).center_atoms)
+        def _is_spectator(mol) -> bool:
+            return center_atoms.isdisjoint(mol) or mol in unchanged
+        kept_reactants, kept_products, spectators = [], [], []
+        for side, kept in ((rxn.reactants, kept_reactants), (rxn.products, kept_products)):
+            for mol in side:
+                target = spectators if _is_spectator(mol) else kept
+                target.append(mol)
+        if not (kept_reactants and kept_products):
+            raise StandardizationError(
+                "RemoveReagents",
+                str(rxn),
+                ValueError("No molecules left after removing reagents"),
+            )
+        # Filter reagents by size
+        size_limit = self.reagent_max_size
+        small_spectators = {mol for mol in spectators if len(mol) <= size_limit}
+        cleaned = ReactionContainer(
+            kept_reactants, kept_products, small_spectators, rxn.meta
+        )
+        cleaned.name = rxn.name
+        return cleaned
+@dataclass
+class RebalanceReactionConfig:
+    """Parameter-free marker config for RebalanceReactionStandardizer (no settings)."""
+class RebalanceReactionStandardizer(BaseStandardizer):
+    """Rebalance reaction."""
+    @classmethod
+    def from_config(
+        cls, config: RebalanceReactionConfig
+    ) -> "RebalanceReactionStandardizer":
+        # The config has no fields; nothing to pass on.
+        return cls()
+    def _run(self, rxn: ReactionContainer) -> ReactionContainer:
+        """Rebalances the reaction by assembling CGR and then decomposing it. Works for
+        all reactions for which the correct CGR can be assembled.
+        Args:
+            rxn: Input reaction
+        Returns:
+            A new reaction built from the decomposed CGR, carrying the original
+            reagents, meta and name
+        Raises:
+            StandardizationError: If any step of the rebalancing fails; the
+                underlying exception is attached as ``__cause__``
+        """
+        try:
+            # Reagents are left out of the CGR and re-attached afterwards.
+            tmp_rxn = ReactionContainer(rxn.reactants, rxn.products)
+            cgr = ~tmp_rxn
+            reactants, products = ~cgr
+            new_rxn = ReactionContainer(
+                reactants.split(), products.split(), rxn.reagents, rxn.meta
+            )
+            new_rxn.name = rxn.name
+            return new_rxn
+        except Exception as e:
+            logger.debug("Rebalancing attempt failed: %s", e)
+            # Chain explicitly so the root cause stays visible in tracebacks.
+            raise StandardizationError(
+                "RebalanceReaction",
+                str(rxn),
+                ValueError("Failed to rebalance reaction"),
+            ) from e
+@dataclass
+class DuplicateReactionConfig:
+    """Parameter-free marker config for DuplicateReactionStandardizer (no settings)."""
+class DuplicateReactionStandardizer(BaseStandardizer):
+    """Cluster-wide duplicate removal via a Ray actor."""
+    def __init__(self, dedup_actor: "ray.actor.ActorHandle"):
+        self._actor = dedup_actor  # shared actor handle, or None for local-only mode
+        # local fast-path cache to avoid actor call on obvious repeats *in
+        # the same worker*; purely an optimisation, not required.
+        self._local_seen: set[int] = set()
+    @classmethod
+    def from_config(cls, config: DuplicateReactionConfig):
+        """Build the standardizer, attaching the shared actor when it is reachable.
+        Falls back to process-local deduplication when Ray is not initialised or
+        the actor named "duplicate_rxn_actor" does not exist.
+        NOTE(review): the lookup happens once, at construction time, so an actor
+        created afterwards is not picked up - confirm the call order in callers.
+        """
+        dedup_actor = None
+        if ray.is_initialized():
+            try:
+                dedup_actor = ray.get_actor("duplicate_rxn_actor")
+            except ValueError:
+                # get_actor raises ValueError when no actor has that name.
+                logger.warning(
+                    "Actor 'duplicate_rxn_actor' not found; "
+                    "falling back to process-local duplicate detection"
+                )
+        return cls(dedup_actor)
+    # ------------------------------------------------------------------
+    def safe_reaction_smiles(self, reaction: ReactionContainer) -> str:
+        """Return "reactants>>products" built from ``str()`` of each molecule."""
+        reactants_smi = ".".join(str(i) for i in reaction.reactants)
+        products_smi = ".".join(str(i) for i in reaction.products)
+        return f"{reactants_smi}>>{products_smi}"
+    @staticmethod
+    def _stable_hash(text: str) -> int:
+        """Return a 64-bit integer digest that is identical in every process.
+        The built-in ``hash()`` of a str is salted per interpreter process, so its
+        values cannot be compared between Ray workers.
+        """
+        import hashlib
+        digest = hashlib.blake2b(text.encode("utf-8"), digest_size=8).digest()
+        return int.from_bytes(digest, "big")
+    def _run(self, rxn: ReactionContainer) -> ReactionContainer:
+        """Pass the reaction through only the first time it is seen.
+        Args:
+            rxn: Input reaction
+        Returns:
+            The unchanged reaction when it has not been seen before
+        Raises:
+            StandardizationError: If the same reactants>>products string was
+                already registered locally or by the shared actor
+        """
+        h = self._stable_hash(self.safe_reaction_smiles(rxn))
+        # local cache fast-path (helps in large batches processed by same
+        # worker; no correctness impact).
+        if h in self._local_seen:
+            raise StandardizationError(
+                "DuplicateReaction", str(rxn), ValueError("Duplicate reaction found")
+            )
+        # ------------------- cluster-wide check ------------------------
+        if self._actor is None:  # single-CPU fall-back
+            is_new = True  # the local check above already ruled out a repeat
+        else:
+            # synchronous, returns True/False
+            is_new = ray.get(self._actor.check_and_add.remote(h))
+        if is_new:
+            self._local_seen.add(h)
+            return rxn
+        raise StandardizationError(
+            "DuplicateReaction", str(rxn), ValueError("Duplicate reaction found")
+        )
+@ray.remote
+class DedupActor:
+    """Ray actor that owns one shared set of reaction hashes."""
+    def __init__(self):
+        self._seen: set[int] = set()
+    def check_and_add(self, h: int) -> bool:
+        """
+        Register ``h`` and report whether it was new.
+        Returns True **iff** the hash was not present yet and is now stored;
+        returns False, leaving the set untouched, when it was already known.
+        """
+        is_new = h not in self._seen
+        if is_new:
+            self._seen.add(h)
+        return is_new
+# Registry mapping config field names to standardizer classes.
+# The keys match the field names of ReactionStandardizationConfig, and
+# create_standardizers() walks this dict in insertion order, so the order of the
+# entries below is the order in which the enabled standardizers are created.
+STANDARDIZER_REGISTRY = {
+    "reaction_mapping_config": ReactionMappingStandardizer,
+    "functional_groups_config": FunctionalGroupsStandardizer,
+    "kekule_form_config": KekuleFormStandardizer,
+    "check_valence_config": CheckValenceStandardizer,
+    "implicify_hydrogens_config": ImplicifyHydrogensStandardizer,
+    "check_isotopes_config": CheckIsotopesStandardizer,
+    "split_ions_config": SplitIonsStandardizer,
+    "aromatic_form_config": AromaticFormStandardizer,
+    "mapping_fix_config": MappingFixStandardizer,
+    "unchanged_parts_config": UnchangedPartsStandardizer,
+    "small_molecules_config": SmallMoleculesStandardizer,
+    "remove_reagents_config": RemoveReagentsStandardizer,
+    "rebalance_reaction_config": RebalanceReactionStandardizer,
+    "duplicate_reaction_config": DuplicateReactionStandardizer,
+}
+@dataclass
+class ReactionStandardizationConfig(ConfigABC):
+    """Configuration class for reaction standardization. Each field holds the
+    configuration of one standardizer; a field left as ``None`` disables that
+    standardizer.
+    :param reaction_mapping_config: Configuration for reaction mapping.
+    :param functional_groups_config: Configuration for functional groups
+        standardization.
+    :param kekule_form_config: Configuration for reactants/reagents/products
+        kekulization.
+    :param check_valence_config: Configuration for atom valence checking.
+    :param implicify_hydrogens_config: Configuration for hydrogens removal.
+    :param check_isotopes_config: Configuration for isotopes checking and cleaning.
+    :param split_ions_config: Configuration for ion splitting and charge balance
+        checking.
+    :param aromatic_form_config: Configuration for molecules aromatization.
+    :param mapping_fix_config: Configuration for atom-to-atom mapping fixing.
+    :param unchanged_parts_config: Configuration for removal of unchanged parts in
+        reaction.
+    :param small_molecules_config: Configuration for removal of small molecule from
+        reaction.
+    :param remove_reagents_config: Configuration for removal of reagents from reaction.
+    :param rebalance_reaction_config: Configuration for reaction rebalancing.
+    :param duplicate_reaction_config: Configuration for removal of duplicate reactions.
+    """
+    # configuration for reaction standardizers
+    reaction_mapping_config: Optional[ReactionMappingConfig] = None
+    functional_groups_config: Optional[FunctionalGroupsConfig] = None
+    kekule_form_config: Optional[KekuleFormConfig] = None
+    check_valence_config: Optional[CheckValenceConfig] = None
+    implicify_hydrogens_config: Optional[ImplicifyHydrogensConfig] = None
+    check_isotopes_config: Optional[CheckIsotopesConfig] = None
+    split_ions_config: Optional[SplitIonsConfig] = None
+    aromatic_form_config: Optional[AromaticFormConfig] = None
+    mapping_fix_config: Optional[MappingFixConfig] = None
+    unchanged_parts_config: Optional[UnchangedPartsConfig] = None
+    small_molecules_config: Optional[SmallMoleculesConfig] = None
+    remove_reagents_config: Optional[RemoveReagentsConfig] = None
+    rebalance_reaction_config: Optional[RebalanceReactionConfig] = None
+    duplicate_reaction_config: Optional[DuplicateReactionConfig] = None
+    def _validate_params(self, params: Dict[str, Any]) -> None:
+        """Validate configuration parameters.
+        Delegates to every enabled sub-config that defines ``_validate_params``,
+        handing it the matching entry of ``params`` (or an empty dict when that
+        entry is missing or not a dict).
+        """
+        for field_name, config in self.__dict__.items():
+            if config is None or not hasattr(config, "_validate_params"):
+                continue
+            sub_params = params.get(field_name)
+            config._validate_params(sub_params if isinstance(sub_params, dict) else {})
+    def to_dict(self):
+        """Converts the configuration into a dictionary.
+        Only enabled standardizers are listed; each maps to the parameters of its
+        sub-config (an empty dict for parameter-free configs).
+        """
+        from dataclasses import asdict, is_dataclass
+        config_dict = {}
+        for field_name in STANDARDIZER_REGISTRY:
+            config = getattr(self, field_name)
+            if config is not None:
+                # Keep the sub-config parameters so a dump/load round trip is lossless.
+                config_dict[field_name] = asdict(config) if is_dataclass(config) else {}
+        return config_dict
+    @staticmethod
+    def from_dict(config_dict: Dict[str, Any]) -> "ReactionStandardizationConfig":
+        """Create an instance of ReactionStandardizationConfig from a dictionary.
+        A standardizer is enabled when its field name is a key of ``config_dict``.
+        A dict value is forwarded to the sub-config as keyword arguments; any other
+        value (e.g. ``None`` from an empty YAML section) selects the defaults.
+        """
+        config_dict = config_dict or {}
+        config_kwargs = {}
+        for field_name, std_cls in STANDARDIZER_REGISTRY.items():
+            if field_name not in config_dict:
+                continue
+            # Resolve "<Name>Standardizer" -> "<Name>Config" to the class object;
+            # the derived name alone is only a string and cannot be called.
+            config_cls = globals()[std_cls.__name__.replace("Standardizer", "Config")]
+            params = config_dict[field_name]
+            if isinstance(params, config_cls):
+                config_kwargs[field_name] = params
+            elif isinstance(params, dict):
+                config_kwargs[field_name] = config_cls(**params)
+            else:
+                config_kwargs[field_name] = config_cls()
+        return ReactionStandardizationConfig(**config_kwargs)
+    @staticmethod
+    def from_yaml(file_path: str) -> "ReactionStandardizationConfig":
+        """Deserializes a YAML file into a ReactionStandardizationConfig object."""
+        with open(file_path, "r", encoding="utf-8") as file:
+            # safe_load returns None for an empty document; from_dict copes with it.
+            config_dict = yaml.safe_load(file)
+        return ReactionStandardizationConfig.from_dict(config_dict)
+    def create_standardizers(self):
+        """Create standardizer instances based on configuration.
+        Returns:
+            One standardizer per enabled field, in STANDARDIZER_REGISTRY order
+        """
+        standardizers = []
+        for field_name, std_cls in STANDARDIZER_REGISTRY.items():
+            config = getattr(self, field_name)
+            if config is not None:
+                standardizers.append(std_cls.from_config(config))
+        return standardizers
+def standardize_reaction(
+    reaction: ReactionContainer,
+    standardizers: Sequence,
+) -> ReactionContainer | None:
+    """
+    Run the reaction through every standardizer, in the given order.
+    Returns
+    -------
+    ReactionContainer | None
+        - the output of the last standardizer, or
+        - None as soon as one standardizer returns None (soft filter).
+    Raises
+    ------
+    StandardizationError
+        Logged once as a warning, then re-raised unchanged for the caller.
+    """
+    current = reaction
+    for step in standardizers:
+        step_name = step.__class__.__name__
+        logger.debug("  › %s(%s)", step_name, current)
+        try:
+            outcome = step(current)  # may return None
+        except StandardizationError as exc:
+            # `current` still holds the input of the failing step.
+            logger.warning(
+                "%s failed on reaction %s : %s",
+                step_name,
+                current,
+                exc,
+            )
+            raise  # re-raise same object
+        if outcome is None:  # soft filter
+            logger.info("%s filtered out reaction", step_name)
+            return None
+        current = outcome
+    return current
+def safe_standardize(
+    item: str | ReactionContainer,
+    standardizers: Sequence,
+) -> Tuple[ReactionContainer, bool]:
+    """
+    Standardize one item without letting standardization errors escape.
+    Returns ``(reaction, True)`` with the standardized reaction on success, or
+    ``(original, False)`` when the reaction was filtered out or any exception
+    occurred. For string input the original is re-parsed in the failure path;
+    that second parse is unguarded, so an unparsable string still raises.
+    """
+    is_container = isinstance(item, ReactionContainer)
+    try:
+        # Parse only if needed
+        reaction = item if is_container else smiles_cgrtools(item)
+        outcome = standardize_reaction(reaction, standardizers)
+    except Exception:  # noqa: BLE001
+        # keep the original container (parse if it was a string)
+        fallback = item if is_container else smiles_cgrtools(item)
+        return fallback, False
+    if outcome is not None:
+        return outcome, True
+    return reaction, False  # filtered -> keep original
+def _process_batch(
+    batch: Sequence[str | ReactionContainer],
+    standardizers: Sequence,
+) -> Tuple[List[ReactionContainer], int]:
+    """Standardize a batch; return all resulting reactions and the success count.
+    Every item yields exactly one reaction in the output list (the original one
+    when standardization did not succeed), in input order.
+    """
+    outcomes = [safe_standardize(item, standardizers) for item in batch]
+    results: List[ReactionContainer] = [rxn for rxn, _ in outcomes]
+    n_std = sum(ok for _, ok in outcomes)
+    return results, n_std
+@ray.remote
+def process_batch_remote(
+    batch: Sequence[str | ReactionContainer],
+    std_param: ray.ObjectRef,  # <-- receives a ref
+    log_file_path: str | Path | None = None,
+) -> Tuple[List[ReactionContainer], int]:
+    """Ray task: standardize one batch, optionally logging to a shared file.
+    Args:
+        batch: Reactions (or reaction SMILES) to standardize
+        std_param: The standardizer list, or an ObjectRef that resolves to it
+        log_file_path: If given, a FileHandler for this path is attached to the
+            "synplan.chem.data.standardizing" logger (at most once per path)
+    Returns:
+        The ``(results, n_standardized)`` tuple produced by ``_process_batch``
+    """
+    if isinstance(std_param, ray.ObjectRef):  # handle? get it
+        standardizers = ray.get(std_param)
+    else:  # plain list? use as is
+        standardizers = std_param
+    # --- Worker-specific logging setup ---
+    worker_logger = logging.getLogger("synplan.chem.data.standardizing")
+    if log_file_path:
+        # FileHandler.baseFilename is an absolute path, so compare resolved paths;
+        # with a relative log_file_path the plain comparison never matched and a
+        # new handler was attached on every task.
+        log_file_path = Path(log_file_path).resolve()
+        handler_exists = any(
+            isinstance(h, logging.FileHandler)
+            and Path(h.baseFilename).resolve() == log_file_path
+            for h in worker_logger.handlers
+        )
+        if not handler_exists:
+            try:
+                fh = logging.FileHandler(log_file_path, encoding="utf-8")
+                # Use a simple format for worker logs, or match driver's format
+                formatter = logging.Formatter(
+                    "%(asctime)s | %(name)s (worker) | %(levelname)-8s | %(message)s",
+                    datefmt="%Y-%m-%d %H:%M:%S",
+                )
+                fh.setFormatter(formatter)
+                fh.setLevel(logging.INFO)
+                worker_logger.addHandler(fh)
+                # Ensure logger passes messages to handler
+                worker_logger.setLevel(logging.INFO)
+                # Avoid double logging if driver also logs
+                worker_logger.propagate = False
+            except Exception as e:
+                # Log error if handler creation fails (e.g., permissions)
+                logging.error(
+                    f"Worker failed to create file handler {log_file_path}: {e}"
+                )
+    return _process_batch(batch, standardizers)
+def chunked(iterable: Iterable, size: int):
+    """Yield successive lists of at most ``size`` items from ``iterable``.
+    Every chunk but the last holds exactly ``size`` items; an empty input yields
+    nothing. The input is consumed lazily, one chunk at a time.
+    Args:
+        iterable: Source of items
+        size: Chunk length; must be at least 1
+    Raises:
+        ValueError: If ``size`` is smaller than 1 (raised on first iteration).
+            Previously such a size buffered the whole input into a single chunk.
+    """
+    if size < 1:
+        raise ValueError(f"size must be a positive integer, got {size!r}")
+    chunk = []
+    for it in iterable:
+        chunk.append(it)
+        if len(chunk) == size:
+            yield chunk
+            chunk = []
+    if chunk:
+        yield chunk
+def standardize_reactions_from_file(
+    config: "ReactionStandardizationConfig",
+    input_reaction_data_path: str | Path,
+    standardized_reaction_data_path: str | Path = "reaction_data_standardized.smi",
+    *,
+    num_cpus: int = 1,
+    batch_size: int = 1_000,  # larger batches amortise overhead
+    silent: bool = True,
+    max_pending_factor: int = 4,  # tasks in flight = factor × CPUs
+    worker_log_level: int | str = logging.WARNING,
+    log_file_path: str | Path | None = None,
+) -> None:
+    """
+    Reads reactions, standardises them in parallel with Ray, writes results.
+    The function keeps at most `max_pending_factor * num_cpus` Ray tasks in
+    flight to avoid flooding the scheduler and blowing up the object store.
+    Standardisers are broadcast once with `ray.put`, removing per‑task
+    pickling cost.  All other logic is unchanged.
+    Args:
+        config: Configuration object for standardizers.
+        input_reaction_data_path: Path to the input reaction data file.
+        standardized_reaction_data_path: Path to save the standardized reactions.
+        num_cpus: Number of CPU cores to use for parallel processing.
+        batch_size: Number of reactions to process in each batch.
+        silent: If True, suppress the progress bar.
+        max_pending_factor: Controls the number of pending Ray tasks.
+        worker_log_level: Logging level for Ray workers (e.g., logging.INFO, logging.WARNING).
+        log_file_path: Path to the log file for workers to write to.
+    """
+    output_path = Path(standardized_reaction_data_path)
+    standardizers = config.create_standardizers()
+    logger.info(
+        "Standardizers: %s",
+        ", ".join(s.__class__.__name__ for s in standardizers),
+    )
+    # -----------------------  Ray initialisation  -----------------------
+    if num_cpus > 1:
+        if not ray.is_initialized():
+            ray.init(
+                num_cpus=num_cpus,
+                ignore_reinit_error=True,
+                logging_level=worker_log_level,
+                log_to_driver=False,
+            )
+        DEDUP_NAME = "duplicate_rxn_actor"
+        try:
+            dedup_actor = ray.get_actor(DEDUP_NAME)  # already running?
+        except ValueError:
+            dedup_actor = DedupActor.options(
+                name=DEDUP_NAME, lifetime="detached"  # survives driver exit
+            ).remote()
+        std_ref: ray.ObjectRef | None = None
+        if num_cpus > 1 and std_ref is None:  # broadcast once
+            std_ref = ray.put(standardizers)
+    max_pending = max_pending_factor * num_cpus
+    pending: Dict[ray.ObjectRef, None] = {}
+    n_processed = n_std = 0
+    bar = tqdm(
+        total=0,
+        unit="rxn",
+        desc="Standardising",
+        disable=silent,
+        dynamic_ncols=True,
+    )
+    # ------------------------  Helper function  ------------------------
+    def _flush(ref: ray.ObjectRef, write_fn) -> None:
+        """Fetch finished task, write its results, update counters & bar."""
+        nonlocal n_processed, n_std
+        res, ok = ray.get(ref)
+        write_fn(res)
+        bar.update(len(res))
+        n_processed += len(res)
+        n_std += ok
+    # -----------------------------  I/O  -------------------------------
+    with ReactionReader(input_reaction_data_path) as reader, ReactionWriter(
+        output_path
+    ) as writer:
+        write_fn = lambda reactions: [writer.write(r) for r in reactions]
+        # ---------------------  Main read/compute loop  -----------------
+        for chunk in chunked(reader, batch_size):
+            bar.total += len(chunk)
+            bar.refresh()
+            if num_cpus > 1:
+                # ---------- back‑pressure: keep ≤ max_pending ----------
+                while len(pending) >= max_pending:
+                    done, _ = ray.wait(list(pending), num_returns=1)
+                    _flush(done[0], write_fn)
+                    pending.pop(done[0], None)
+                # ----------- schedule new task -------------------------
+                ref = process_batch_remote.remote(chunk, std_ref, log_file_path)
+                pending[ref] = None
+            else:
+                # --------------- serial fall‑back ----------------------
+                res, ok = _process_batch(chunk, standardizers)
+                write_fn(res)
+                bar.update(len(res))
+                n_processed += len(res)
+                n_std += ok
+        # ------------------  Drain remaining Ray tasks  -----------------
+        while pending:
+            done, _ = ray.wait(list(pending), num_returns=1)
+            _flush(done[0], write_fn)
+            pending.pop(done[0], None)
+    bar.close()
+    ray.shutdown()
+    logger.info(
+        "Finished: processed %d, standardised %d, filtered %d",
+        n_processed,
+        n_std,
+        n_processed - n_std,
+    )

synplan/chem/precursor.py ADDED Viewed

	@@ -0,0 +1,100 @@

+"""Module containing the Precursor class, which extends a molecule object with the
+behaviour needed to represent a precursor in the search tree."""
+from typing import Set
+from CGRtools.containers import MoleculeContainer
+from synplan.chem.utils import safe_canonicalization
+class Precursor:
+    """Precursor class is used to extend the molecule behavior needed for interaction with
+    a tree in MCTS."""
+    def __init__(self, molecule: MoleculeContainer, canonicalize: bool = True):
+        """It initializes a Precursor object with a molecule container as a parameter.
+        :param molecule: A molecule.
+        """
+        self.molecule = safe_canonicalization(molecule) if canonicalize else molecule
+        self.prev_precursors = []
+    def __len__(self) -> int:
+        """Return the number of atoms in Precursor."""
+        return len(self.molecule)
+    def __hash__(self) -> hash:
+        """Returns the hash value of Precursor."""
+        return hash(self.molecule)
+    def __str__(self) -> str:
+        """Returns a SMILES of the Precursor."""
+        return str(self.molecule)
+    def __eq__(self, other: "Precursor") -> bool:
+        """Checks if the current Precursor is equal to another Precursor."""
+        return self.molecule == other.molecule
+    def __repr__(self) -> str:
+        """Returns a SMILES of the Precursor."""
+        return str(self.molecule)
+    def is_building_block(self, bb_stock: Set[str], min_mol_size: int = 6) -> bool:
+        """Checks if a Precursor is a building block.
+        :param bb_stock: The list of building blocks. Each building block is represented
+            by a canonical SMILES.
+        :param min_mol_size: If the size of the Precursor is equal or smaller than
+            min_mol_size it is automatically classified as building block.
+        :return: True is Precursor is a building block.
+        """
+        if len(self.molecule) <= min_mol_size:
+            return True
+        return str(self.molecule) in bb_stock
+def compose_precursors(
+    precursors: list = None, exclude_small: bool = True, min_mol_size: int = 6
+) -> MoleculeContainer:
+    """
+    Takes a list of precursors, excludes small precursors if specified, and composes them
+    into a single molecule. The composed molecule then is used for the prediction of
+    synthesisability of the characterizing the possible success of the route including
+    the nodes with the given precursor.
+    :param precursors: The list of precursor to be composed.
+    :param exclude_small: The parameter that determines whether small precursor should be excluded from the composition
+                          process. If `exclude_small` is set to `True`,
+                          only precursor with a length greater than min_mol_size will be composed.
+    :param min_mol_size: The parameter used with exclude_small.
+    :return: A composed precursor as a MoleculeContainer object.
+    """
+    if len(precursors) == 1:
+        return precursors[0].molecule
+    if len(precursors) > 1:
+        if exclude_small:
+            big_precursor = [
+                precursor
+                for precursor in precursors
+                if len(precursor.molecule) > min_mol_size
+            ]
+            if big_precursor:
+                precursors = big_precursor
+        tmp_mol = precursors[0].molecule.copy()
+        transition_mapping = {}
+        for mol in precursors[1:]:
+            for n, atom in mol.molecule.atoms():
+                new_number = tmp_mol.add_atom(atom.atomic_symbol)
+                transition_mapping[n] = new_number
+            for atom, neighbor, bond in mol.molecule.bonds():
+                tmp_mol.add_bond(
+                    transition_mapping[atom], transition_mapping[neighbor], bond
+                )
+            transition_mapping = {}
+        return tmp_mol

synplan/chem/reaction.py ADDED Viewed

	@@ -0,0 +1,125 @@

+"""Module containing classes and functions for manipulating reactions and reaction
+rules."""
+from typing import Any, Iterator, List, Optional
+from CGRtools.containers import MoleculeContainer, ReactionContainer
+from CGRtools.exceptions import InvalidAromaticRing
+from CGRtools.reactor import Reactor
+class Reaction(ReactionContainer):  # adds no behaviour of its own on top of ReactionContainer
+    """Reaction class used for a general representation of reaction."""
+    def __init__(self, *args, **kwargs):  # pass-through: forwards every argument unchanged
+        super().__init__(*args, **kwargs)
+def add_small_mols(
+    big_mol: MoleculeContainer, small_molecules: Optional[Any] = None
+) -> List[MoleculeContainer]:
+    """Takes a molecule and returns a list of modified molecules where each small
+    molecule has been added to the big molecule.
+    :param big_mol: A molecule.
+    :param small_molecules: A list of small molecules that need to be added to the
+        molecule.
+    :return: Returns a list of molecules.
+    """
+    if small_molecules:
+        tmp_mol = big_mol.copy()
+        transition_mapping = {}
+        for small_mol in small_molecules:
+            for n, atom in small_mol.atoms():
+                new_number = tmp_mol.add_atom(atom.atomic_symbol)
+                transition_mapping[n] = new_number
+            for atom, neighbor, bond in small_mol.bonds():
+                tmp_mol.add_bond(
+                    transition_mapping[atom], transition_mapping[neighbor], bond
+                )
+            transition_mapping = {}
+        return tmp_mol.split()
+    return [big_mol]
+def apply_reaction_rule(
+    molecule: MoleculeContainer,
+    reaction_rule: Reactor,
+    sort_reactions: bool = False,
+    top_reactions_num: int = 3,
+    validate_products: bool = True,
+    rebuild_with_cgr: bool = False,
+) -> Iterator[List[MoleculeContainer,]]:
+    """Applies a reaction rule to a given molecule.
+    :param molecule: A molecule to which reaction rule will be applied.
+    :param reaction_rule: A reaction rule to be applied.
+    :param sort_reactions:
+    :param top_reactions_num: The maximum amount of reactions after the application of
+        reaction rule.
+    :param validate_products: If True, validates the final products.
+    :param rebuild_with_cgr: If True, the products are extracted from CGR decomposition.
+    :return: An iterator yielding the products of reaction rule application.
+    """
+    reactants = add_small_mols(molecule, small_molecules=False)
+    try:
+        if sort_reactions:
+            unsorted_reactions = list(reaction_rule(reactants))
+            sorted_reactions = sorted(
+                unsorted_reactions,
+                key=lambda react: len(
+                    list(filter(lambda mol: len(mol) > 6, react.products))
+                ),
+                reverse=True,
+            )
+            # take top-N reactions from reactor
+            reactions = sorted_reactions[:top_reactions_num]
+        else:
+            reactions = []
+            for reaction in reaction_rule(reactants):
+                reactions.append(reaction)
+                if len(reactions) == top_reactions_num:
+                    break
+    except IndexError:
+        reactions = []
+    for reaction in reactions:
+        # temporary solution - incorrect leaving groups
+        reactant_atom_nums = []
+        for i in reaction.reactants:
+            reactant_atom_nums.extend(i.atoms_numbers)
+        product_atom_nums = []
+        for i in reaction.products:
+            product_atom_nums.extend(i.atoms_numbers)
+        leaving_atom_nums = set(reactant_atom_nums) - set(product_atom_nums)
+        if len(leaving_atom_nums) > len(product_atom_nums):
+            continue
+        # check reaction
+        if rebuild_with_cgr:
+            cgr = reaction.compose()
+            reactants = cgr.decompose()[1].split()
+        else:
+            reactants = reaction.products  # reactants are products in retro reaction
+        reactants = [mol for mol in reactants if len(mol) > 0]
+        # validate products
+        if validate_products:
+            for mol in reactants:
+                try:
+                    mol.kekule()
+                    if mol.check_valence():
+                        yield None
+                    mol.thiele()
+                except InvalidAromaticRing:
+                    yield None
+        yield reactants

synplan/chem/reaction_routes/__init__.py ADDED Viewed

File without changes

synplan/chem/reaction_routes/clustering.py ADDED Viewed

	@@ -0,0 +1,857 @@

+from collections import defaultdict
+from pathlib import Path
+import pickle
+import re
+from CGRtools.containers import ReactionContainer, CGRContainer
+from CGRtools.containers.bonds import DynamicBond
+from synplan.chem.reaction_routes.leaving_groups import *
+from synplan.chem.reaction_routes.visualisation import *
+from synplan.chem.reaction_routes.route_cgr import *
+from synplan.chem.reaction_routes.io import (
+    read_routes_csv,
+    read_routes_json,
+    make_dict,
+    make_json,
+)
+from synplan.utils.visualisation import (
+    routes_clustering_report,
+    routes_subclustering_report,
+)
+def run_cluster_cli(
+    routes_file: str,
+    cluster_results_dir: str,
+    perform_subcluster: bool = False,
+    subcluster_results_dir: Path = None,
+):
+    """
+    Read routes from a CSV or JSON file, perform clustering, and optionally subclustering.
+    Args:
+        routes_file: Path to the input routes file (.csv or .json).
+        cluster_results_dir: Directory where clustering results are stored.
+        perform_subcluster: Whether to run subclustering on each cluster.
+        subcluster_results_dir: Subdirectory for subclustering results (if enabled).
+    """
+    import click
+    routes_file = Path(routes_file)
+    match = re.search(r"_(\d+)\.", routes_file.name)
+    if not match:
+        raise ValueError(f"Could not extract index from filename: {routes_file.name}")
+    file_index = int(match.group(1))
+    ext = routes_file.suffix.lower()
+    if ext == ".csv":
+        routes_dict = read_routes_csv(str(routes_file))
+        routes_json = make_json(routes_dict)
+    elif ext == ".json":
+        routes_json = read_routes_json(str(routes_file))
+        routes_dict = make_dict(routes_json)
+    else:
+        raise ValueError(f"Unsupported file type: {ext}")
+    # Compose condensed graph representations
+    route_cgrs = compose_all_route_cgrs(routes_dict)
+    click.echo(f"Generating RouteCGR")
+    reduced_cgrs = compose_all_reduced_route_cgrs(route_cgrs)
+    click.echo(f"Generating ReducedRouteCGR")
+    # Perform clustering
+    click.echo(f"\nClustering")
+    clusters = cluster_routes(reduced_cgrs, use_strat=False)
+    click.echo(f"Total number of routes: {len(routes_dict)}")
+    click.echo(f"Found number of clusters: {len(clusters)} ({list(clusters.keys())})")
+    # Ensure output directory exists
+    cluster_results_dir = Path(cluster_results_dir)
+    cluster_results_dir.mkdir(parents=True, exist_ok=True)
+    # Save clusters to pickle
+    with open(cluster_results_dir / f"clusters_{file_index}.pickle", "wb") as f:
+        pickle.dump(clusters, f)
+    # Generate HTML reports for each cluster
+    for idx in clusters:
+        report_path = cluster_results_dir / f"{file_index}_cluster_{idx}.html"
+        routes_clustering_report(
+            routes_json, clusters, idx, reduced_cgrs, html_path=str(report_path)
+        )
+    # Optional subclustering (Under development)
+    if perform_subcluster and subcluster_results_dir:
+        click.echo("\nSubClustering")
+        sub_dir = cluster_results_dir / subcluster_results_dir
+        sub_dir.mkdir(parents=True, exist_ok=True)
+        subclusters = subcluster_all_clusters(clusters, reduced_cgrs, route_cgrs)
+        for cluster_idx, sub in subclusters.items():
+            click.echo(f"Cluster {cluster_idx} has {len(sub)} subclusters")
+            for sub_idx, subcluster in sub.items():
+                subreport_path = (
+                    sub_dir / f"{file_index}_subcluster_{cluster_idx}.{sub_idx}.html"
+                )
+                routes_subclustering_report(
+                    routes_json,
+                    subcluster,
+                    cluster_idx,
+                    sub_idx,
+                    reduced_cgrs,
+                    aam=False,
+                    html_path=str(subreport_path),
+                )
+def cluster_route_from_csv(routes_file: str):
+    """
+    Reads retrosynthetic routes from a CSV file, processes them, and performs clustering.
+    This function orchestrates the process of loading retrosynthetic route data
+    from a specified CSV file, converting the routes into Condensed Graph of
+    Reactions (CGRs), reducing these CGRs to a simplified form (ReducedRouteCGRs),
+    and finally clustering the routes based on these reduced representations.
+    It uses strategic bonds for clustering by default (as indicated by `use_strat=False`
+    in `cluster_routes`, which implies clustering based on the graph structure
+    derived from the reduced CGRs, which often highlight strategic bonds).
+    Args:
+        routes_file (str): The path to the CSV file containing the retrosynthetic
+                           route data.
+    Returns:
+        object: The result of the clustering process, typically a data structure
+                representing the identified clusters. The exact type depends on
+                the implementation of the `cluster_routes` function.
+    """
+    routes_dict = read_routes_csv(routes_file)
+    route_cgrs_dict = compose_all_route_cgrs(routes_dict)
+    reduced_route_cgrs_dict = compose_all_reduced_route_cgrs(route_cgrs_dict)
+    clusters = cluster_routes(reduced_route_cgrs_dict, use_strat=False)
+    return clusters
+def cluster_route_from_json(routes_file: str):
+    """
+    Reads retrosynthetic routes from a JSON file, processes them, and performs clustering.
+    This function is similar to `cluster_route_from_csv` but loads the
+    retrosynthetic route data from a specified JSON file. It reads the JSON,
+    converts it into a suitable dictionary format, composes and reduces the
+    Condensed Graph of Reactions (CGRs) for each route, and then clusters
+    the routes based on these reduced representations, typically using
+    strategic bonds as the basis for clustering.
+    Args:
+        routes_file (str): The path to the JSON file containing the retrosynthetic
+                           route data.
+    Returns:
+        object: The result of the clustering process, typically a data structure
+                representing the identified clusters. The exact type depends on
+                the implementation of the `cluster_routes` function.
+    """
+    routes_json = read_routes_json(routes_file)
+    routes_dict = make_dict(routes_json)
+    route_cgrs_dict = compose_all_route_cgrs(routes_dict)
+    reduced_route_cgrs_dict = compose_all_reduced_route_cgrs(route_cgrs_dict)
+    clusters = cluster_routes(reduced_route_cgrs_dict, use_strat=False)
+    return clusters
+def extract_strat_bonds(target_cgr: CGRContainer):
+    """
+    Extracts strategic bonds from a CGRContainer object.
+    Strategic bonds are identified as bonds where the original bond order
+    (`bond.order`) is None (indicating a bond that was not present in the
+    reactants) but the primary bond order (`bond.p_order`) is not None
+    (indicating a bond that was formed in the product). This function iterates
+    through all bonds in the input CGR, identifies those matching the criteria
+    for strategic bonds, and returns a sorted list of unique strategic bonds
+    represented as tuples of sorted atom indices.
+    Args:
+        target_cgr (CGRContainer): The CGRContainer object from which to extract
+                                   strategic bonds.
+    Returns:
+        list: A sorted list of tuples, where each tuple represents a strategic
+              bond by the sorted integer indices of the two atoms involved in the bond.
+    """
+    result = []
+    seen = set()
+    for atom1, bond_set in target_cgr._bonds.items():
+        for atom2, bond in bond_set.items():
+            if atom1 >= atom2:
+                continue
+            if bond.order is None and bond.p_order is not None:
+                bond_key = tuple(sorted((atom1, atom2)))
+                if bond_key not in seen:
+                    seen.add(bond_key)
+                    result.append(bond_key)
+    return sorted(result)
+def cluster_routes(r_route_cgrs: dict, use_strat=False):
+    """
+    Cluster routes objects based on their strategic bonds
+      or CGRContainer object signature (not avoid mapping)
+    Args:
+        r_route_cgrs: Dictionary mapping node_id to r_route_cgr objects.
+    Returns:
+        Dictionary with groups keyed by '{length}.{index}' containing
+        'r_route_cgr', 'node_ids', and 'strat_bonds'.
+    """
+    temp_groups = defaultdict(
+        lambda: {"node_ids": [], "r_route_cgr": None, "strat_bonds": None}
+    )
+    # 1. Initial grouping based on the content of strategic bonds
+    for node_id, r_route_cgr in r_route_cgrs.items():
+        strat_bonds_list = extract_strat_bonds(r_route_cgr)
+        if use_strat == True:
+            group_key = tuple(strat_bonds_list)
+        else:
+            group_key = str(r_route_cgr)
+        if not temp_groups[group_key]["node_ids"]:  # First time seeing this group
+            temp_groups[group_key][
+                "r_route_cgr"
+            ] = r_route_cgr  # Store the first CGR as representative
+            temp_groups[group_key][
+                "strat_bonds"
+            ] = strat_bonds_list  # Store the actual list
+        temp_groups[group_key]["node_ids"].append(node_id)
+        temp_groups[group_key][
+            "node_ids"
+        ].sort()  # Keep node_ids sorted for consistency
+    for group_key in temp_groups.keys():
+        temp_groups[group_key]["group_size"] = len(temp_groups[group_key]["node_ids"])
+    # 2. Format the output dictionary with desired keys '{length}.{index}'
+    final_grouped_results = {}
+    group_indices = defaultdict(int)  # To track index for each length
+    # Sort items by length of bonds first, then potentially by bonds themselves for consistent indexing
+    # Sorting by the group_key (tuple of tuples) provides a deterministic order
+    sorted_groups = sorted(
+        temp_groups.items(), key=lambda item: (len(item[0]), item[0])
+    )
+    for group_key, group_data in sorted_groups:
+        num_bonds = len(group_data["strat_bonds"])
+        group_indices[num_bonds] += 1  # Increment index for this length (1-based)
+        final_key = f"{num_bonds}.{group_indices[num_bonds]}"
+        final_grouped_results[final_key] = group_data
+    return final_grouped_results
+def lg_process_reset(lg_cgr: CGRContainer, atom_num: int):
+    """
+    Normalize bonds in an extracted leaving group (X) fragment and flag the attachment atom as a radical.
+    Scans all bonds in `lg_cgr`, converting any bond with undefined `p_order`
+    but defined `order` into a `DynamicBond` of matching integer order. Then sets
+    the atom at `atom_num` to a radical.
+    Parameters
+    ----------
+    target_cgr : CGRContainer
+        The CGR representing the isolated leaving-group fragment.
+    atom_num : int
+        Index of the attachment atom to mark as a radical.
+    Returns
+    -------
+    CGRContainer
+        The modified `lg_cgr` with normalized bonds and the specified atom
+        flagged as a radical.
+    """
+    bond_items = list(lg_cgr._bonds.items())
+    for atom1, bond_set in bond_items:
+        bond_set_items = list(bond_set.items())
+        for atom2, bond in bond_set_items:
+            if bond.p_order is None and bond.order is not None:
+                order = int(bond.order)
+                lg_cgr.delete_bond(atom1, atom2)
+                lg_cgr.add_bond(atom1, atom2, DynamicBond(order, order))
+    lg_cgr._atoms[atom_num].is_radical = True
+    return lg_cgr
+def lg_replacer(route_cgr: CGRContainer):
+    """
+    Extract dynamic leaving-groups from a CGR and mark attachment points.
+    Scans the input CGRContainer for bonds lacking explicit p_order (i.e., leaving-group attachments),
+    severs those bonds, captures each leaving-group as its own CGRContainer, and inserts DynamicX
+    markers at the attachment sites. Finally, reindexes the markers to ensure unique labels.
+    Parameters
+    ----------
+    route_cgr : CGRContainer
+        A CGR representing the full synthethic route.
+    Returns
+    -------
+    synthon_cgr : CGRContainer
+        The core synthon CGR with DynamicX atoms marking each former leaving-group site.
+    lg_groups : dict[int, tuple[CGRContainer, int]]
+        Mapping from each marker label to a tuple of:
+        - the extracted leaving-group CGRContainer
+        - the atom index where it was attached.
+    """
+    lg_groups = {}
+    cgr_prods = [route_cgr.substructure(c) for c in route_cgr.connected_components]
+    target_cgr = cgr_prods[0]  # only the first connected component is processed
+    bond_items = list(target_cgr._bonds.items())  # snapshot: bonds are edited during the scan
+    reaction = ReactionContainer.from_cgr(target_cgr)
+    target_mol = reaction.products[0]
+    max_in_target_mol = max(target_mol._atoms)  # highest atom number of the first product
+    k = 1  # provisional marker label; final labels are reassigned below
+    atom_nums = []  # atom numbers that received a DynamicX placeholder
+    for atom1, bond_set in bond_items:
+        bond_set_items = list(bond_set.items())
+        for atom2, bond in bond_set_items:
+            if bond.p_order is None and bond.order is not None:  # bond has an order but no p_order
+                if atom1 <= max_in_target_mol:  # handled only when atom1 lies within the product's numbering
+                    lg = DynamicX()
+                    lg.mark = k
+                    lg.isotope = k
+                    order = bond.order
+                    p_order = bond.p_order
+                    target_cgr.delete_bond(atom1, atom2)
+                    lg_cgrs = [
+                        target_cgr.substructure(c)
+                        for c in target_cgr.connected_components
+                    ]
+                    if len(lg_cgrs) == 2:  # the cut split the graph in two; the second part is taken as the leaving group
+                        lg_cgr = lg_cgrs[1]
+                        lg_cgr = lg_process_reset(lg_cgr, atom2)
+                        lg_cgr.clean2d()
+                    else:
+                        continue  # NOTE(review): the bond deleted above is not restored on this path - confirm intended
+                    lg_groups[k] = (lg_cgr, atom2)
+                    target_cgr = [
+                        target_cgr.substructure(c)
+                        for c in target_cgr.connected_components
+                    ][0]
+                    target_cgr.add_atom(lg, atom2)  # placeholder reuses the atom number of the removed attachment atom
+                    if order == 4 and p_order == None:  # p_order is always None here; 4 is presumably the aromatic order - TODO confirm
+                        order = 1
+                    target_cgr.add_bond(atom1, atom2, DynamicBond(order, p_order))
+                    target_cgr = [
+                        target_cgr.substructure(c)
+                        for c in target_cgr.connected_components
+                    ][0]
+                    k += 1
+                    atom_nums.append(atom2)
+    synthon_cgr = [target_cgr.substructure(c) for c in target_cgr.connected_components][
+        0
+    ]
+    reaction = ReactionContainer.from_cgr(synthon_cgr)
+    reactants = reaction.reactants
+    atom_mark_map = {}  # To map atom numbers to their new marks
+    g = 1  # new labels are handed out in reactant order, then atom_nums order
+    for n, r in enumerate(reactants):
+        for atom_num in atom_nums:
+            if atom_num in r._atoms:
+                synthon_cgr._atoms[atom_num].mark = g
+                atom_mark_map[atom_num] = g
+                g += 1
+    new_lg_groups = {}
+    for original_mark in lg_groups:
+        cgr_obj, a_num = lg_groups[original_mark]
+        new_mark = atom_mark_map.get(a_num)
+        if new_mark is not None:  # groups whose atom is in no reactant are dropped
+            new_lg_groups[new_mark] = (cgr_obj, a_num)
+    lg_groups = new_lg_groups
+    return synthon_cgr, lg_groups
+def lg_reaction_replacer(
+    synthon_reaction: ReactionContainer, lg_groups: dict, max_in_target_mol: int
+):
+    """
+    Replace marked leaving-groups (X) into synthon reactants.
+    For each reactant in `synthon_reaction`, finds placeholder atoms
+    (indices > `max_in_target_mol`) that match entries in `lg_groups`,
+    replaces them with `MarkedAt` atoms labeled by their leaving-group key (X),
+    and preserves original bond connectivity.
+    Parameters
+    ----------
+    synthon_reaction : ReactionContainer
+        Reaction containing reactants with X placeholders.
+    lg_groups : dict[int, tuple[CGRContainer, int]]
+        Mapping from X label to (X CGR, attachment atom index).
+    max_in_target_mol : int
+        Highest atom index of the core product; any atom_num above this is a placeholder.
+    Returns
+    -------
+    List[Molecule]
+        Reactant molecules with `MarkedAt` atoms reinserted at X attachment sites.
+    """
+    new_reactants = []
+    for reactant in synthon_reaction.reactants:
+        atom_keys = list(reactant._atoms.keys())
+        for atom_num in atom_keys:
+            if atom_num > max_in_target_mol:
+                for k, val in lg_groups.items():
+                    lg = MarkedAt()
+                    if atom_num == val[1]:
+                        lg.mark = k
+                        lg.isotope = k
+                        atom1 = list(reactant._bonds[atom_num].keys())[0]
+                        bond = reactant._bonds[atom_num][atom1]
+                        reactant.delete_bond(atom1, atom_num)
+                        reactant.delete_atom(atom_num)
+                        reactant.add_atom(lg, atom_num)
+                        reactant.add_bond(atom1, atom_num, bond)
+        new_reactants.append(reactant)
+    return new_reactants
+class SubclusterError(Exception):  # no extra state; the message names the failing step and node
+    """Raised when subcluster_one_cluster cannot complete successfully."""
+def subcluster_one_cluster(group, r_route_cgrs_dict, route_cgrs_dict):
+    """
+    Generate synthon data for each route in a single cluster.
+    For each route (node ID) in `group['node_ids']`, replaces RouteCGRs with
+    SynthonCGR, builds ReactionContainers before and after X replacement,
+    and collects relevant data.
+    Parameters
+    ----------
+    group : dict
+        Must include `'node_ids'`, a list of node identifiers.
+    r_route_cgrs_dict : dict
+        Maps node IDs to their ReducedRouteCGR.
+    route_cgrs_dict : dict
+        Maps node IDs to their RouteCGR.
+    Returns
+    -------
+    dict or None
+        If successful, returns a dict mapping each `node_id` to a tuple:
+        `(r_route_cgr, original_reaction, synthon_cgr, new_reaction, lg_groups)`.
+        Or raises SubclusterError on any failure: if any step (X replacement or reaction
+        parsing) fails for a node.
+    """
+    node_ids = group.get("node_ids")
+    if not isinstance(node_ids, (list, tuple)):
+        raise SubclusterError(
+            f"'node_ids' must be a list or tuple, got {type(node_ids).__name__}"
+        )
+    result = {}
+    for node_id in node_ids:
+        r_route_cgr = r_route_cgrs_dict[node_id]
+        route_cgr = route_cgrs_dict[node_id]
+        # 1) Replace leaving groups in RouteCGR
+        try:
+            synthon_cgr, lg_groups = lg_replacer(route_cgr)
+        except (KeyError, ValueError) as e:
+            raise SubclusterError(f"LG replacement failed for node {node_id}") from e
+        # 2) Build ReactionContainer for Abstracted RouteCGR
+        try:
+            synthon_rxn = ReactionContainer.from_cgr(synthon_cgr)
+        except:  # replace with the actual exception class
+            raise SubclusterError(
+                f"Failed to parse synthon CGR for node {node_id}"
+            ) from e
+        # 3) Prepare for X-based reaction replacement
+        try:
+            old_reactants = synthon_rxn.reactants
+            target_mol = synthon_rxn.products[0]
+            max_atom_idx = max(target_mol._atoms)
+            new_reactants = lg_reaction_replacer(synthon_rxn, lg_groups, max_atom_idx)
+            new_rxn = ReactionContainer(reactants=new_reactants, products=[target_mol])
+        except (IndexError, TypeError) as e:
+            raise SubclusterError(
+                f"Leaving group (X) reaction replacement failed for node {node_id}"
+            ) from e
+        result[node_id] = (
+            r_route_cgr,
+            ReactionContainer(reactants=old_reactants, products=[target_mol]),
+            synthon_cgr,
+            new_rxn,
+            lg_groups,
+        )
+    return result
+def group_nodes_by_synthon_detail(data_dict: dict):
+    """
+    Group nodes whose first four result fields are identical.
+    Each value of `data_dict` is read positionally as
+    (r_route_cgr, unlabeled_reaction, synthon_cgr, synthon_reaction, node_data).
+    Nodes that share the first four fields form one group; the fifth field is
+    kept per node under 'nodes_data'.
+    Args:
+        data_dict: Dictionary {node_id: sequence laid out as above}. Records with
+            fewer than four fields are padded with None. Records whose key
+            fields are not hashable are skipped with a printed warning.
+    Returns:
+        Dictionary {group_index: {'r_route_cgr': ..., 'unlabeled_reaction': ...,
+                                  'synthon_cgr': ..., 'synthon_reaction': ...,
+                                  'nodes_data': {node_id: node_data or None, ...},
+                                  'post_processed': False}},
+        with group_index counting from 1 in order of the groups' node-id lists.
+    """
+    temp_groups = defaultdict(list)
+    for node_id, result_list in data_dict.items():
+        # Always build a 4-field key so the unpacking further down cannot fail
+        # on short records.
+        key_fields = tuple(result_list[:4])
+        group_key = key_fields + (None,) * (4 - len(key_fields))
+        try:
+            # Hashing happens on the dict access, so that is the statement
+            # that has to be guarded against unhashable key fields.
+            temp_groups[group_key].append(node_id)
+        except TypeError:
+            print(
+                f"Warning: Skipping node {node_id} because reaction data is not hashable: {type(group_key[1])}"
+            )
+            continue
+    # Format the output with sequential integer keys; groups are ordered by
+    # their list of node ids so the numbering is deterministic.
+    final_grouped_results = {}
+    sorted_temp_groups = sorted(temp_groups.items(), key=lambda item: item[1])
+    for group_index, (group_key, node_ids) in enumerate(sorted_temp_groups, start=1):
+        r_route_cgr, unlabeled_reaction, synthon_cgr, synthon_reaction = group_key
+        nodes_data_dict = {}
+        for node_id in sorted(node_ids):  # Sorted for a consistent dict order
+            original_result = data_dict[node_id]
+            # The node-specific payload is the fifth field (index 4), if present.
+            nodes_data_dict[node_id] = (
+                original_result[4] if len(original_result) > 4 else None
+            )
+        final_grouped_results[group_index] = {
+            "r_route_cgr": r_route_cgr,
+            "unlabeled_reaction": unlabeled_reaction,
+            "synthon_cgr": synthon_cgr,
+            "synthon_reaction": synthon_reaction,
+            "nodes_data": nodes_data_dict,
+            "post_processed": False,
+        }
+    return final_grouped_results
+def subcluster_all_clusters(groups, r_route_cgrs_dict, route_cgrs_dict):
+    """
+    Split every cluster into synthon-based subgroups.
+    For each entry of `groups`, `subcluster_one_cluster` is called with the
+    cluster and both CGR dictionaries, and its result is regrouped with
+    `group_nodes_by_synthon_detail`.
+    Parameters
+    ----------
+    groups : dict
+        Mapping of cluster indices to cluster data.
+    r_route_cgrs_dict : dict
+        Dictionary of reduced route CGRs, passed through unchanged.
+    route_cgrs_dict : dict
+        Dictionary of route CGRs, passed through unchanged.
+    Returns
+    -------
+    dict or None
+        {cluster index: subgroups dict}; None as soon as
+        `subcluster_one_cluster` returns None for any cluster.
+    """
+    subgroups_by_cluster = {}
+    for cluster_idx in groups:
+        cluster_synthons = subcluster_one_cluster(
+            groups[cluster_idx], r_route_cgrs_dict, route_cgrs_dict
+        )
+        # One failing cluster aborts the whole run.
+        if cluster_synthons is None:
+            return None
+        subgroups_by_cluster[cluster_idx] = group_nodes_by_synthon_detail(
+            cluster_synthons
+        )
+    return subgroups_by_cluster
+def all_lg_collect(subgroup):
+    """
+    Collect the distinct leaving-group objects seen at every node index.
+    Walks all inner dicts of `subgroup['nodes_data']` and, per index, keeps the
+    first element of each stored entry. Two objects count as the same when
+    their `str()` is equal; only the first one encountered is kept.
+    Parameters
+    ----------
+    subgroup : dict
+        Must contain 'nodes_data', a dict mapping pathway keys to
+        dicts of {node_index: (leaving_group_object, ...)}.
+    Returns
+    -------
+    dict
+        {node_index: [distinct leaving-group objects in first-seen order]}.
+    """
+    pathways = subgroup["nodes_data"]
+    # Union of every index used by any pathway.
+    indices = set().union(*pathways.values())
+    collected = {idx: [] for idx in indices}
+    fingerprints = {idx: set() for idx in indices}
+    for per_node in pathways.values():
+        for idx, entry in per_node.items():
+            container = entry[0]
+            label = str(container)
+            if label in fingerprints[idx]:
+                continue  # already have an equivalent object for this index
+            fingerprints[idx].add(label)
+            collected[idx].append(container)
+    return collected
+def replace_leaving_groups_in_synthon(subgroup, to_remove):  # Under development
+    """
+    Replace selected leaving-group placeholders (X) in the synthon CGR by concrete fragments.
+    Every atom whose class is named "DynamicX" is inspected. If its mark is in
+    `to_remove`, the placeholder is deleted and the fragment stored for that mark is
+    merged in; otherwise its mark is decreased by the number of placeholders handled
+    so far and recorded in the returned mapping.
+    Parameters:
+        subgroup (dict): Must contain:
+            - 'synthon_cgr': the CGR object representing the synthon graph
+            - 'nodes_data': mapping whose values are tables {mark: (fragment, ...)};
+              only the FIRST value is used as the replacement table
+        to_remove (List[int]): List of LG marks to remove and replace.
+    Returns:
+        Tuple[CGR, Dict[int, int]]:
+            - The updated CGR with replacements
+            - A dict mapping the adjusted marks of the remaining placeholders to
+              their atom indices
+    Note:
+        `subgroup['synthon_cgr']` is not copied before bonds/atoms are deleted and
+        marks are rewritten, so the input CGR is modified by this call.
+    """
+    # Extract the original CGR and leaving group replacement table
+    original_cgr = subgroup["synthon_cgr"]
+    # The table comes from whichever nodes_data entry is first in dict order.
+    lg_table = next(iter(subgroup["nodes_data"].values()))
+    # Alias, not a copy: edits below act on the caller's CGR until union() rebinds it.
+    updated_cgr = original_cgr
+    removed_count = 0
+    new_lgs = {}
+    # Iterate over a snapshot of (index, atom_obj) pairs, since atoms are deleted/added below
+    for atom_idx, atom_obj in list(updated_cgr.atoms()):
+        # Skip non-X atoms (identified by class name rather than isinstance)
+        if atom_obj.__class__.__name__ != "DynamicX":
+            continue
+        current_mark = atom_obj.mark
+        if current_mark in to_remove:
+            # Remove old LG (X): delete bond and atom
+            neighbors = list(updated_cgr._bonds[atom_idx].keys())
+            if neighbors:
+                # Only the first neighbour is considered as the attachment point.
+                neighbor_idx = neighbors[0]
+                bond = updated_cgr._bonds[atom_idx][neighbor_idx]
+                updated_cgr.delete_bond(atom_idx, neighbor_idx)
+                updated_cgr.delete_atom(atom_idx)
+                # Attach new LG(X) fragment from the table
+                lg_fragment = lg_table[current_mark][0]
+                # NOTE(review): presumably the fragment reuses atom_idx for its
+                # attachment atom, as the next two lines index it - confirm.
+                updated_cgr = updated_cgr.union(lg_fragment)
+                # Reset radical flag on the new atom and restore the bond
+                updated_cgr._atoms[atom_idx].is_radical = False
+                updated_cgr.add_bond(atom_idx, neighbor_idx, bond)
+            # Counted even when the placeholder had no neighbours and was left in place.
+            removed_count += 1
+        else:
+            # Adjust the marks of remaining LGs to account for removed ones
+            atom_obj.mark -= removed_count
+            new_lgs[atom_obj.mark] = atom_idx
+    # Re-sort the atoms dict by atom index (no 2D coordinates are recomputed here)
+    updated_cgr._atoms = dict(sorted(updated_cgr._atoms.items()))
+    return updated_cgr, new_lgs
+def new_lg_reaction_replacer(synthon_reaction, new_lgs, max_in_target_mol):
+    """
+    Swap leaving-group placeholder atoms in the reactants for marked X atoms.
+    For every reactant atom whose index is greater than `max_in_target_mol`
+    and is listed as a value in `new_lgs`, the atom is deleted and a `MarkedAt`
+    atom carrying the label as both `.mark` and `.isotope` is inserted under
+    the same index. The bond to the atom's first neighbour is re-created.
+    Parameters
+    ----------
+    synthon_reaction : ReactionContainer
+        Reaction whose `reactants` are scanned.
+    new_lgs : dict[int, int]
+        Mapping from leaving-group label to the atom index to replace.
+    max_in_target_mol : int
+        Highest atom index of the core product; atoms with an index at or
+        below it are never touched.
+    Returns
+    -------
+    list
+        The reactant objects of `synthon_reaction`, in order. They are edited
+        in place, not copied.
+    """
+    new_reactants = []
+    for reactant in synthon_reaction.reactants:
+        # Snapshot the keys: atoms are deleted and re-added while looping.
+        for atom_num in list(reactant._atoms.keys()):
+            if atom_num <= max_in_target_mol:
+                continue
+            for k, val in new_lgs.items():
+                if atom_num != val:
+                    continue
+                # Build the placeholder only on a match instead of once per
+                # (atom, label) pair.
+                lg = MarkedAt()
+                lg.mark = k
+                lg.isotope = k
+                # Only the first neighbour is treated as the attachment point.
+                atom1 = list(reactant._bonds[atom_num].keys())[0]
+                bond = reactant._bonds[atom_num][atom1]
+                reactant.delete_bond(atom1, atom_num)
+                reactant.delete_atom(atom_num)
+                reactant.add_atom(lg, atom_num)
+                reactant.add_bond(atom1, atom_num, bond)
+        new_reactants.append(reactant)
+    return new_reactants
+def post_process_subgroup(
+    subgroup,
+):  # Under development: Error in replace_leaving_groups_in_synthon , 'cuz synthon_reaction.clean2d crashes
+    """
+    Fold leaving groups that are identical in all pathways into the synthon.
+    A node index whose collected leaving groups reduce to a single distinct
+    object is treated as constant. Those indices are passed to
+    `replace_leaving_groups_in_synthon`, the synthon reaction is rebuilt from
+    the resulting CGR, and the subgroup dict is updated in place.
+    Parameters
+    ----------
+    subgroup : dict
+        Needs 'nodes_data' and 'synthon_cgr'. If 'post_processed' is already
+        truthy the dict is returned untouched.
+    Returns
+    -------
+    dict
+        The same dict object, now with:
+        - `'synthon_reaction'`: rebuilt ReactionContainer
+        - `'nodes_data'`: result of `remove_and_shift` for the constant indices
+        - `'post_processed'`: True
+        - `'group_lgs'`: result of `group_by_identical_values` on the new nodes_data
+    """
+    if subgroup.get("post_processed"):
+        return subgroup
+    result = all_lg_collect(subgroup)
+    # Indices with exactly one distinct leaving group are constant across pathways.
+    to_remove = [ind for ind, cgr_set in result.items() if len(cgr_set) == 1]
+    new_synthon_cgr, new_lgs = replace_leaving_groups_in_synthon(subgroup, to_remove)
+    synthon_reaction = ReactionContainer.from_cgr(new_synthon_cgr)
+    synthon_reaction.clean2d()
+    target_mol = synthon_reaction.products[0]  # TO DO: target_mol might be non 0
+    max_in_target_mol = max(target_mol._atoms)
+    new_reactants = new_lg_reaction_replacer(
+        synthon_reaction, new_lgs, max_in_target_mol
+    )
+    new_synthon_reaction = ReactionContainer(
+        reactants=new_reactants, products=[target_mol]
+    )
+    new_synthon_reaction.clean2d()
+    subgroup["synthon_reaction"] = new_synthon_reaction
+    subgroup["nodes_data"] = remove_and_shift(subgroup["nodes_data"], to_remove)
+    subgroup["post_processed"] = True
+    subgroup["group_lgs"] = group_by_identical_values(subgroup["nodes_data"])
+    return subgroup
+def group_by_identical_values(nodes_data):  # Under development
+    """
+    Collapse outer keys whose inner dictionaries carry the same values.
+    The signature of an outer key is the tuple of first elements of its inner
+    entries, taken in sorted subkey order. Outer keys with equal signatures are
+    merged into one entry.
+    Args:
+        nodes_data (dict): {outer_key: {subkey: (value_obj, other_info, ...)}}.
+            Only `value_obj` takes part in the comparison.
+            Example: {'route_1': {'pos_a': (1, 'infoA'), 'pos_b': (2, 'infoB')}, ...}
+    Returns:
+        dict: {tuple of merged outer keys: {subkey: value_obj}}, where the inner
+            dict is taken from the first outer key of the group. Entries are
+            ordered by group size, largest first; ties keep first-seen order.
+            Example: {('route_1', 'route_2'): {'pos_a': 1, 'pos_b': 2}, ...}
+    """
+    routes_by_signature = defaultdict(list)
+    for route_id, lg_table in nodes_data.items():
+        # Sorted subkeys make the signature independent of insertion order.
+        signature = tuple(lg_table[subkey][0] for subkey in sorted(lg_table))
+        routes_by_signature[signature].append(route_id)
+    collapsed = {}
+    for route_ids in routes_by_signature.values():
+        # The first member of the group supplies the representative values.
+        first_table = nodes_data[route_ids[0]]
+        collapsed[tuple(route_ids)] = {
+            subkey: entry[0] for subkey, entry in first_table.items()
+        }
+    by_size = sorted(collapsed.items(), key=lambda pair: len(pair[0]), reverse=True)
+    return dict(by_size)

synplan/chem/reaction_routes/io.py ADDED Viewed

	@@ -0,0 +1,286 @@

+import csv
+import json
+import pickle
+import os
+from CGRtools import smiles as read_smiles
+from synplan.mcts.tree import Tree
+def _collect_reactions_postorder(node, rxn_list):
+    """Append every reaction at or below `node` to `rxn_list`, children before parents."""
+    for child in node.get("children", []):
+        _collect_reactions_postorder(child, rxn_list)
+    # Molecule nodes are only traversed; reaction nodes are parsed and recorded.
+    if node["type"] == "reaction":
+        rxn_list.append(read_smiles(node["smiles"]))
+def make_dict(routes_json):
+    """
+    Flatten route trees into per-route dictionaries of reactions.
+    routes_json : either a list of tree-dicts or a dict {route_id: tree-dict},
+    i.e. both shapes produced by make_json(). For a list, the position in the
+    list is used as the route id; for a dict, the key is converted with int().
+    Returns a dict mapping each route id to a sub-dict
+    {step_id: parsed reaction}, where step ids 0, 1, ... follow a post-order
+    walk of the tree (deepest reactions first, the root's reaction last).
+    """
+    if isinstance(routes_json, dict):
+        indexed_trees = routes_json.items()
+    else:
+        indexed_trees = enumerate(routes_json)
+    routes_dict = {}
+    for route_idx, tree in indexed_trees:
+        rxn_list = []
+        _collect_reactions_postorder(tree, rxn_list)
+        routes_dict[int(route_idx)] = dict(enumerate(rxn_list))
+    return routes_dict
+def read_routes_json(file_path="routes.csv", to_dict=False):
+    """
+    Load routes from a JSON file.
+    Returns the parsed JSON as is, or the result of make_dict() on it when
+    `to_dict` is true.
+    """
+    with open(file_path, "r") as handle:
+        loaded = json.load(handle)
+    return make_dict(loaded) if to_dict else loaded
+def read_routes_csv(file_path="routes.csv"):
+    """
+    Load routes from a CSV file with the columns route_id, step_id, smiles, meta.
+    Returns a nested dict
+        route_id (int) -> step_id (int) -> reaction parsed from the smiles column
+    The meta column is not read.
+    """
+    routes = {}
+    with open(file_path, newline="") as handle:
+        for record in csv.DictReader(handle):
+            route_key = int(record["route_id"])
+            step_key = int(record["step_id"])
+            parsed = read_smiles(record["smiles"])
+            # Create the per-route dict lazily, on its first step.
+            if route_key not in routes:
+                routes[route_key] = {}
+            routes[route_key][step_key] = parsed
+    return routes
+def make_json(routes_dict, keep_ids=True):
+    """
+    Convert routes into a nested JSON tree of reaction and molecule nodes.
+    Args:
+        routes_dict (dict[int, dict[int, Reaction]]): Mapping route IDs to steps (step_id -> Reaction).
+        keep_ids (bool): If True, returns a dict mapping int(route_id) to its tree;
+            otherwise returns a list of route trees in iteration order.
+    Returns:
+        dict or list: JSON-like tree(s) of routes. Routes without steps are skipped.
+        A tree may be None when no product of the final step shares an atom
+        number with that step's first product.
+    """
+    # Prepare output
+    all_routes = {} if keep_ids else []
+    for route_id, steps in routes_dict.items():
+        if not steps:
+            continue
+        # Determine target molecule atoms from the final step of this route
+        # (the step with the largest id; its first product is taken as the target)
+        final_step = max(steps)
+        target = steps[final_step].products[0]
+        atom_nums = set(target._atoms.keys())
+        # Precompute canonical SMILES and producer mapping for all products
+        prod_map = {}  # smiles -> list of step_ids
+        for sid, rxn in steps.items():
+            for prod in rxn.products:
+                # These calls are made on the route's own product objects,
+                # so the input molecules are touched by make_json.
+                prod.kekule()
+                prod.implicify_hydrogens()
+                prod.thiele()
+                s = str(prod)
+                prod_map.setdefault(s, []).append(sid)
+        def transform(mol):
+            # Same normalisation sequence as for products above, so the
+            # resulting strings can be looked up in prod_map.
+            mol.kekule()
+            mol.implicify_hydrogens()
+            mol.thiele()
+            return str(mol)
+        def build_mol_node(sid):
+            """Find the product with any overlap to target atoms and recurse into its reaction."""
+            rxn = steps[sid]
+            for p in rxn.products:
+                # First product sharing at least one atom number with the target wins.
+                if atom_nums & set(p._atoms.keys()):
+                    smiles = str(p)
+                    return {
+                        "type": "mol",
+                        "smiles": smiles,
+                        "children": [build_reaction_node(sid)],
+                        "in_stock": False,
+                    }
+            # Shouldn't reach here if tree is consistent
+            return None
+        def build_reaction_node(sid):
+            """Build reaction node and recurse into reactant molecule nodes."""
+            rxn = steps[sid]
+            node = {"type": "reaction", "smiles": format(rxn, "m"), "children": []}
+            for react in rxn.reactants:
+                r_smi = transform(react)
+                # Look up any prior step producing this reactant
+                # (only steps with a smaller id count as "prior")
+                prior = [ps for ps in prod_map.get(r_smi, []) if ps < sid]
+                if prior:
+                    # Several candidates: continue with the latest one.
+                    node["children"].append(build_mol_node(max(prior)))
+                else:
+                    # No producing step: emitted as an in-stock leaf.
+                    node["children"].append(
+                        {"type": "mol", "smiles": r_smi, "in_stock": True}
+                    )
+            return node
+        # Build route tree and store
+        tree = build_mol_node(final_step)
+        if keep_ids:
+            all_routes[int(route_id)] = tree
+        else:
+            all_routes.append(tree)
+    return all_routes
+def write_routes_json(routes_dict, file_path):
+    """Convert the routes with make_json() and dump the result to `file_path` as indented JSON."""
+    # Build the payload first so a conversion error does not truncate the file.
+    tree_payload = make_json(routes_dict)
+    with open(file_path, "w") as out_file:
+        json.dump(tree_payload, out_file, indent=2)
+def write_routes_csv(routes_dict, file_path="routes.csv"):
+    """
+    Dump a nested routes dict
+        { route_id: { step_id: reaction_obj, ... }, ... }
+    to a CSV file with the columns route_id, step_id, smiles, meta.
+    The smiles column holds format(reaction, 'm'); meta is always empty.
+    Rows are ordered by route id, then by step id.
+    """
+    header = ["route_id", "step_id", "smiles", "meta"]
+    with open(file_path, "w", newline="") as out_file:
+        sink = csv.writer(out_file)
+        sink.writerow(header)
+        # Sorted ids give a deterministic file.
+        for route_key in sorted(routes_dict):
+            route_steps = routes_dict[route_key]
+            sink.writerows(
+                [route_key, step_key, format(route_steps[step_key], "m"), ""]
+                for step_key in sorted(route_steps)
+            )
+class TreeWrapper:
+    """
+    Pickle helper around a Tree object.
+    The wrapper stores a copy of the tree's attribute dict in which the
+    'policy_network' and 'value_network' entries are set to None and '_tqdm'
+    is replaced by True, so these are not restored when the tree is loaded.
+    """
+    def __init__(self, tree, mol_id=1, config=1, path="planning_results/forest"):
+        """Initializes the TreeWrapper and creates the output directory `path` if needed."""
+        self.tree = tree
+        self.mol_id = mol_id
+        self.config = config
+        self.path = path
+        # Ensure the directory exists before creating the filename
+        os.makedirs(self.path, exist_ok=True)
+        # File name pattern shared with load_tree_from_id().
+        self.filename = os.path.join(self.path, f"tree_{mol_id}_{config}.pkl")
+    def __getstate__(self):
+        """Return the wrapper state with the tree replaced by a sanitised copy of its __dict__."""
+        state = self.__dict__.copy()
+        # Shallow copy: the live tree keeps its own attributes untouched.
+        tree_state = self.tree.__dict__.copy()
+        # Reset or remove non-pickleable attributes (e.g., _tqdm, policy_network, value_network)
+        if "_tqdm" in tree_state:
+            tree_state["_tqdm"] = True  # Reset to a simple flag
+        for attr in ["policy_network", "value_network"]:
+            if attr in tree_state:
+                tree_state[attr] = None
+        state["tree_state"] = tree_state
+        del state["tree"]
+        return state
+    def __setstate__(self, state):
+        """Restore the wrapper and rebuild the tree from the stored attribute dict."""
+        tree_state = state.pop("tree_state")
+        self.__dict__.update(state)
+        # Tree.__init__ is bypassed on purpose; attributes come from tree_state.
+        new_tree = Tree.__new__(Tree)
+        new_tree.__dict__.update(tree_state)
+        self.tree = new_tree
+    def save_tree(self):
+        """Saves the TreeWrapper instance (including the tree state) to a file.
+        Any exception is caught and reported with print(); nothing is raised.
+        """
+        try:
+            with open(self.filename, "wb") as f:
+                pickle.dump(self, f)
+            print(
+                f"Tree wrapper for mol_id '{self.mol_id}', config '{self.config}' saved to '{self.filename}'."
+            )
+        except Exception as e:
+            print(f"Error saving tree to {self.filename}: {e}")
+    @classmethod
+    def load_tree_from_id(cls, mol_id, config=1, path="planning_results/forest"):
+        """
+        Loads a Tree object from a saved file using mol_id and config.
+        Args:
+            mol_id: The molecule ID used for saving.
+            config: The configuration used for saving.
+            path: The directory where the file is located
+        Returns:
+            The loaded Tree object, or None if loading fails.
+            Failures are reported with print() instead of being raised.
+        """
+        filename = os.path.join(path, f"tree_{mol_id}_{config}.pkl")
+        print(f"Attempting to load tree from: {filename}")
+        try:
+            # Ensure the 'Tree' class is defined in the current scope
+            if "Tree" not in globals() and "Tree" not in locals():
+                raise NameError(
+                    "The 'Tree' class definition is required to load the object."
+                )
+            # NOTE(review): pickle.load can execute arbitrary code; only load
+            # files written by save_tree() from a trusted location.
+            with open(filename, "rb") as f:
+                loaded_wrapper = pickle.load(f)  # This implicitly calls __setstate__
+            print(
+                f"Tree object for mol_id '{mol_id}', config '{config}' successfully loaded from '{filename}'."
+            )
+            # The __setstate__ method already reconstructed the tree inside the wrapper
+            return loaded_wrapper.tree
+        except FileNotFoundError:
+            print(f"Error: File not found at {filename}")
+            return None
+        except (pickle.UnpicklingError, EOFError) as e:
+            print(
+                f"Error: Could not unpickle file {filename}. It might be corrupted or empty. Details: {e}"
+            )
+            return None
+        except NameError as e:
+            print(f"Error during loading: {e}. Ensure 'Tree' class is defined.")
+            return None
+        except Exception as e:
+            print(f"An unexpected error occurred loading tree from {filename}: {e}")
+            return None

synplan/chem/reaction_routes/leaving_groups.py ADDED Viewed

	@@ -0,0 +1,131 @@

+from CGRtools.periodictable import Core, At, DynamicElement
+from typing import Optional
+class Marked(Core):
+    """
+    Atom base class that adds a free-form `mark` and an integer `isotope`.
+    Its `symbol` is always "X"; `atomic_symbol` is derived from the class name.
+    """
+    # "__mark" is name-mangled inside the class body (to _Marked__mark).
+    __slots__ = "__mark", "_isotope"
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.__mark = None
+        self._isotope = 0  # Make sure this exists
+    @property
+    def mark(self):
+        """Label attached to this atom; None until set."""
+        return self.__mark
+    @mark.setter
+    def mark(self, mark):
+        self.__mark = mark
+    @property
+    def isotope(self):
+        """Stored isotope value; falls back to 0 when the slot was never filled."""
+        return getattr(self, "_isotope", 0)  # Always returns int
+    @isotope.setter
+    def isotope(self, value):
+        # Coerced to int; a value int() cannot convert raises here.
+        self._isotope = int(value)
+    def __repr__(self):
+        return f"{self.symbol}({self.isotope})"
+    @property
+    def atomic_symbol(self) -> str:
+        # Class name without its first six characters (the length of "Marked").
+        return self.__class__.__name__[6:]
+    @property
+    def symbol(self) -> str:
+        return "X"  # For human-readable representation
+    def __len__(self):
+        # Pure delegation to the parent implementation.
+        return super().__len__()
+class MarkedAt(Marked, At):
+    """
+    Marked atom built on astatine (At) and displayed as "X".
+    Printed as X(<isotope>).
+    """
+    atomic_number = At.atomic_number
+    @property
+    def atomic_symbol(self):
+        # Fixed value; overrides the class-name based lookup in Marked.
+        return "At"
+    @property
+    def symbol(self):
+        return "X"
+    def __repr__(self):
+        return f"X({self.isotope})"
+    def __str__(self):
+        return f"X({self.isotope})"
+    def __hash__(self):
+        # Hash over isotope, atomic number, charge and radical flag, with
+        # defaults for attributes that are not set. The mark is not included.
+        # NOTE(review): no matching __eq__ is defined in this class - confirm
+        # the inherited equality agrees with this hash.
+        return hash(
+            (
+                self.isotope,
+                getattr(self, "atomic_number", 0),
+                getattr(self, "charge", 0),
+                getattr(self, "is_radical", False),
+            )
+        )
+class DynamicX(DynamicElement):
+    """
+    Dynamic (CGR) element displayed as "X", carrying a `mark` and an `isotope`.
+    Instances are recognised elsewhere by the class name "DynamicX".
+    """
+    __slots__ = ("_mark", "_isotope")
+    # Same atomic number as astatine (85), which MarkedAt is built on.
+    atomic_number = 85
+    # NOTE(review): the values below look like placeholders needed by the
+    # element machinery rather than physical data - confirm.
+    mass = 0.0
+    group = 0
+    period = 0
+    isotopes_distribution = list(range(20))
+    atomic_radius = 0.5
+    isotopes_masses = 0
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._isotope = None
+        self._mark = None
+    @property
+    def mark(self):
+        """Label of this placeholder; None when unset."""
+        return getattr(self, "_mark", None)
+    @mark.setter
+    def mark(self, value):
+        self._mark = value
+    @property
+    def isotope(self):
+        """Stored isotope value; None when unset (no int coercion here)."""
+        return getattr(self, "_isotope", None)
+    @isotope.setter
+    def isotope(self, value):
+        self._isotope = value
+    @property
+    def symbol(self) -> str:
+        return "X"
+    def valence_rules(
+        self, charge: int = 0, is_radical: bool = False, valence: int = 0
+    ) -> tuple:
+        # Every branch returns an empty tuple, whatever the arguments are.
+        if charge == 0 and not is_radical and (valence == 1):
+            return tuple()
+        elif charge == 0 and not is_radical and valence == 0:
+            return tuple()
+        else:
+            return tuple()
+    def __repr__(self):
+        return f"Dynamic{self.symbol}()"
+    # The p_* properties return the same values as their non-p_ counterparts.
+    @property
+    def p_charge(self) -> int:
+        return self.charge
+    @property
+    def p_is_radical(self) -> bool:
+        return self.is_radical
+    @property
+    def p_hybridization(self) -> Optional[int]:
+        return self.hybridization

synplan/chem/reaction_routes/route_cgr.py ADDED Viewed

	@@ -0,0 +1,570 @@

+from CGRtools.containers.bonds import DynamicBond
+from CGRtools.containers import ReactionContainer, CGRContainer, MoleculeContainer
+from synplan.mcts.tree import Tree
+def find_next_atom_num(reactions: list):
+    """
+    Return the first atom number not used by any of the given reactions.
+    Each reaction is composed into its CGR and the largest atom index of that
+    CGR is looked up; the result is one more than the largest index over all
+    reactions.
+    Args:
+        reactions (list): Objects offering `compose()`, whose result exposes
+            the atom indices as keys of `_atoms`.
+    Returns:
+        int: Largest atom index found plus one; 1 for an empty list.
+    """
+    highest = 0
+    for rxn in reactions:
+        highest_in_cgr = max(rxn.compose()._atoms.keys())
+        if highest_in_cgr > highest:
+            highest = highest_in_cgr
+    return highest + 1
+def get_clean_mapping(
+    curr_prod: MoleculeContainer, prod: MoleculeContainer, reverse: bool = False
+):
+    """
+    Build a conflict-free atom renumbering from the first mapping between two molecules.
+    The first mapping yielded by `curr_prod.get_mapping(prod)` is filtered:
+    - pairs where key and value are equal are dropped;
+    - a pair is dropped when its value is itself a key of the mapping that
+      does not map back to the pair's key;
+    - a pair is dropped when its target number is already an atom number of
+      the molecule checked for that direction (`prod` when `reverse` is False,
+      `curr_prod` when it is True).
+    Args:
+        curr_prod (MoleculeContainer): Molecule the mapping is requested from.
+        prod (MoleculeContainer): Molecule passed to `get_mapping`.
+        reverse (bool, optional): If True the result maps mapping values to
+            mapping keys instead of keys to values. Defaults to False.
+    Returns:
+        dict: {source atom number: target atom number}; empty when no mapping
+              exists or every pair was filtered out.
+    """
+    # Only the first mapping is used, so stop after it instead of listing
+    # every mapping the matcher can produce.
+    rr = next(iter(curr_prod.get_mapping(prod)), None)
+    if rr is None:
+        return {}
+    curr_atoms = set(curr_prod._atoms.keys())
+    prod_atoms = set(prod._atoms.keys())
+    dict_map = {}
+    for key, value in rr.items():
+        if key == value:
+            continue
+        # The value is remapped elsewhere and the two pairs are not a plain swap.
+        if value in rr and rr[value] != key:
+            continue
+        if reverse:
+            source, target, occupied = value, key, curr_atoms
+        else:
+            source, target, occupied = key, value, prod_atoms
+        # Never renumber onto an atom number that is already in use.
+        if target in occupied:
+            continue
+        dict_map[source] = target
+    return dict_map
+def validate_molecule_components(curr_mol: MoleculeContainer, node_id: int):
+    """
+    Report molecules that consist of more than one connected component.
+    Counts the entries of `curr_mol.connected_components` and prints an error
+    line when there is more than one. Nothing is returned or raised.
+    Args:
+        curr_mol (MoleculeContainer): The molecule to check.
+        node_id (int): Tree node id used in the printed message.
+    """
+    # Counting is enough here; there is no need to build a substructure
+    # for every component.
+    component_count = sum(1 for _ in curr_mol.connected_components)
+    if component_count > 1:
+        print(f"Error tree {node_id}: We have more than one molecule in one node")
+def get_leaving_groups(products: list):
+    """
+    List the atom numbers of every product except the first one.
+    The first product is treated as the main product and skipped; all atom
+    indices (keys of `_atoms`) of the remaining products are gathered in
+    product order.
+    Args:
+        products (list): Product molecules of one reaction, main product first.
+    Returns:
+        list: Atom indices of all products after the first.
+    """
+    return [
+        atom_num
+        for position, mol in enumerate(products)
+        if position != 0  # position 0 is the main product
+        for atom_num in mol._atoms.keys()
+    ]
+def process_first_reaction(first_react: ReactionContainer, tree: Tree, node_id: int):
+    """
+    Collect building-block atom numbers from the reactants of the first reaction.
+    Every reactant of `first_react` is checked with
+    `validate_molecule_components`. A reactant counts as a building block when
+    its length is at most `tree.config.min_mol_size` or when its string form
+    is contained in `tree.building_blocks`. The atom numbers of all such
+    reactants are accumulated into one set.
+    Args:
+        first_react (ReactionContainer): The first reaction of the route.
+        tree (Tree): The search tree; `config.min_mol_size` and
+                     `building_blocks` are read from it.
+        node_id (int): The ID of the tree node associated with this reaction,
+                       used for validation reporting.
+    Returns:
+        set: Atom numbers of every reactant identified as a building block.
+    """
+    bb_set = set()
+    for curr_mol in first_react.reactants:
+        if (
+            len(curr_mol) <= tree.config.min_mol_size
+            or str(curr_mol) in tree.building_blocks
+        ):
+            # Accumulate instead of overwriting: with several building-block
+            # reactants, assigning the set would keep only the last one.
+            bb_set.update(curr_mol._atoms)
+        validate_molecule_components(curr_mol, node_id)
+    return bb_set
+def update_reaction_dict(
+    reaction: ReactionContainer,
+    node_id: int,
+    mapping: dict,
+    react_dict: dict,
+    tree: Tree,
+    bb_set: set,
+    prev_remap: dict = None,
+):
+    """
+    Record a per-reactant atom mapping and extend the building-block atom set.
+    For each reactant of `reaction` the function:
+    1. validates it with `validate_molecule_components`;
+    2. adds its atom numbers to the building-block set when its length is at
+       most `tree.config.min_mol_size` or its string form is in
+       `tree.building_blocks`;
+    3. stores in `react_dict`, under the tuple of the reactant's atom numbers,
+       the entries of `mapping` whose keys belong to that reactant, overlaid
+       with the matching entries of `prev_remap` when one is given.
+    Args:
+        reaction (ReactionContainer): The reaction whose reactants are processed.
+        node_id (int): The ID of the tree node, used for validation reporting.
+        mapping (dict): Atom mapping to restrict to each reactant.
+        react_dict (dict): Dictionary updated in place; keys are tuples of
+                           reactant atom numbers.
+        tree (Tree): The search tree; `config.min_mol_size` and
+                     `building_blocks` are read from it.
+        bb_set (set): Building-block atom numbers collected so far. It is not
+                      modified in place; a new set is returned.
+        prev_remap (dict, optional): Earlier remapping whose matching entries
+                                     take precedence over `mapping`.
+                                     Defaults to None.
+    Returns:
+        tuple: `(react_dict, bb_set)`: the same dictionary object that was
+               passed in and the extended building-block set.
+    """
+    for reactant in reaction.reactants:
+        atom_key = tuple(reactant._atoms)
+        atom_ids = set(atom_key)
+        validate_molecule_components(reactant, node_id)
+        is_small = len(reactant) <= tree.config.min_mol_size
+        if is_small or str(reactant) in tree.building_blocks:
+            bb_set = bb_set | atom_ids
+        # Keep only the mapping entries whose source atom is in this reactant.
+        reactant_map = {src: dst for src, dst in mapping.items() if src in atom_ids}
+        if prev_remap:
+            # Entries from the previous remap override the fresh mapping.
+            reactant_map.update(
+                (src, dst) for src, dst in prev_remap.items() if src in atom_ids
+            )
+        react_dict[atom_key] = reactant_map
+    return react_dict, bb_set
+def process_target_blocks(
+    curr_products: list,
+    curr_prod: MoleculeContainer,
+    lg_atom_nums: list,
+    curr_lg_atom_nums: list,
+    bb_set: set,
+):
+    """
+    Collect atom numbers that belong to leaving groups or building blocks.
+    Nothing is collected unless `curr_products` holds more than one molecule.
+    Otherwise every product whose atom-number set differs from that of
+    `curr_prod` is scanned: an atom number is appended once if it occurs in
+    `lg_atom_nums` or `curr_lg_atom_nums`, and once more if it occurs in
+    `bb_set` (so a number can appear twice in the result).
+    Args:
+        curr_products (list): Product molecules of the current step.
+        curr_prod (MoleculeContainer): Reference product; products with the
+                                       same atom numbers are skipped.
+        lg_atom_nums (list): Atom numbers regarded as leaving-group atoms.
+        curr_lg_atom_nums (list): A second collection of leaving-group atom
+                                   numbers, checked the same way.
+        bb_set (set): Atom numbers regarded as building-block atoms.
+    Returns:
+        list: The collected atom numbers, in scan order, possibly with duplicates.
+    """
+    target_block = []
+    if len(curr_products) <= 1:
+        return target_block
+    reference_keys = curr_prod._atoms.keys()
+    for candidate in curr_products:
+        # NOTE(review): the result of this call is never used here; it is kept
+        # only because side effects of get_clean_mapping cannot be ruled out
+        # from this file section. Confirm and drop if it is pure.
+        get_clean_mapping(curr_prod, candidate)
+        if candidate._atoms.keys() == reference_keys:
+            continue
+        for atom_num in list(candidate._atoms.keys()):
+            is_leaving = atom_num in lg_atom_nums or atom_num in curr_lg_atom_nums
+            if is_leaving:
+                target_block.append(atom_num)
+            if atom_num in bb_set:
+                target_block.append(atom_num)
+    return target_block
+def compose_route_cgr(tree_or_routes, node_id):
+    """
+    Compose the CGR of a single synthesis route.
+    Two kinds of input are supported. For a dict, the reactions of the route
+    are composed as they are. For a tree, the reactions returned by
+    `tree.synthesis_route(node_id)` are folded from the last one backwards,
+    and atoms of leaving groups and building blocks that collide with atoms
+    already present in the accumulated CGR receive fresh atom numbers.
+    Parameters
+    ----------
+    tree_or_routes : synplan.mcts.tree.Tree
+        or dict mapping route_id -> {step_id: ReactionContainer}
+    node_id : int
+        the route index (in the Tree's winning_nodes, or the dict's keys)
+    Returns
+    -------
+    dict or None
+      - if successful: { 'cgr': <composed CGR>, 'reactions_dict': {step: ReactionContainer,…} }
+      - on error in the tree-based branch: None (the error is printed)
+    Raises
+    ------
+    KeyError
+        in the dict-based branch, when `node_id` is not a key of the dict.
+    """
+    # ----------- dict-based branch ------------
+    if isinstance(tree_or_routes, dict):
+        routes_dict = tree_or_routes
+        if node_id not in routes_dict:
+            raise KeyError(f"Route {node_id} not in provided dict.")
+        # Order the route's reactions by their step id.
+        step_map = routes_dict[node_id]
+        sorted_ids = sorted(step_map)
+        reactions = [step_map[i] for i in sorted_ids]
+        # Start from the last reaction of the ordered list ...
+        accum_cgr = reactions[-1].compose()
+        reactions_dict = {len(reactions) - 1: reactions[-1]}
+        # ... and fold the earlier steps onto it, going backwards.
+        for idx in range(len(reactions) - 2, -1, -1):
+            rxn = reactions[idx]
+            curr_cgr = rxn.compose()
+            accum_cgr = curr_cgr.compose(accum_cgr)
+            reactions_dict[idx] = rxn
+        return {"cgr": accum_cgr, "reactions_dict": reactions_dict}
+    # ----------- tree-based branch ------------
+    tree = tree_or_routes
+    try:
+        reactions = tree.synthesis_route(node_id)
+        first_react = reactions[-1]
+        reactions_dict = {len(reactions) - 1: first_react}
+        accum_cgr = first_react.compose()
+        bb_set = process_first_reaction(first_react, tree, node_id)
+        react_dict = {}
+        # Next free atom number, used when colliding atoms are renumbered.
+        max_num = find_next_atom_num(reactions)
+        for step in range(len(reactions) - 2, -1, -1):
+            reaction = reactions[step]
+            curr_cgr = reaction.compose()
+            curr_prod = reaction.products[0]
+            accum_products = accum_cgr.decompose()[1].split()
+            lg_atom_nums = get_leaving_groups(accum_products)
+            curr_products = curr_cgr.decompose()[1].split()
+            # Flat list of atom numbers. process_target_blocks tests integer
+            # keys for membership, so a list of per-molecule lists would
+            # never match anything.
+            curr_lg_atom_nums = get_leaving_groups(curr_products)
+            tuple_atoms = tuple(curr_prod._atoms)
+            prev_remap = react_dict.get(tuple_atoms, {})
+            if prev_remap:
+                curr_cgr = curr_cgr.remap(prev_remap, copy=True)
+            # Identify atoms that need new atom numbers because of an overlap.
+            target_block = process_target_blocks(
+                curr_products,
+                curr_prod,
+                lg_atom_nums,
+                curr_lg_atom_nums,
+                bb_set,
+            )
+            mapping = {}
+            for atom_num in sorted(target_block):
+                if atom_num in accum_cgr._atoms and atom_num not in mapping:
+                    mapping[atom_num] = max_num
+                    max_num += 1
+            # Carry forward the first non-empty clean remap found between the
+            # current product and the accumulated products.
+            dict_map = {}
+            for ap in accum_products:
+                clean_map = get_clean_mapping(curr_prod, ap, reverse=True)
+                if clean_map:
+                    dict_map = clean_map
+                    break
+            if dict_map:
+                curr_cgr = curr_cgr.remap(dict_map, copy=False)
+            # Update the per-reactant mappings and the building-block set.
+            react_dict, bb_set = update_reaction_dict(
+                reaction, node_id, mapping, react_dict, tree, bb_set, prev_remap
+            )
+            # Apply the new overlap mapping.
+            if mapping:
+                curr_cgr = curr_cgr.remap(mapping, copy=False)
+            reactions_dict[step] = ReactionContainer.from_cgr(curr_cgr)
+            accum_cgr = curr_cgr.compose(accum_cgr)
+        return {"cgr": accum_cgr, "reactions_dict": reactions_dict}
+    except Exception as e:
+        # Best-effort: a failing route is reported and skipped by callers.
+        print(f"Error processing node {node_id}: {e}")
+        return None
+def compose_all_route_cgrs(tree_or_routes, node_id=None):
+    """
+    Compose RouteCGRs for one route or for all routes.
+    Each route is handled by `compose_route_cgr`; only its composed CGR is kept.
+    Parameters
+    ----------
+    tree_or_routes : synplan.mcts.tree.Tree
+        or dict mapping route_id -> {step_id: ReactionContainer}
+    node_id : int or None
+        if None, process all winning routes of the tree (or all keys of the
+        dict); otherwise only that specific route.
+    Returns
+    -------
+    dict or None
+      - dict input: {route_id: CGR or None, …} for the requested route(s)
+      - tree input, node_id is None: {route_id: CGR, …}, failed routes omitted
+      - tree input, node_id given: {node_id: CGR}, or None if the route failed
+    Raises
+    ------
+    KeyError
+        for dict input, when `node_id` is given but is not a key of the dict.
+    """
+    if isinstance(tree_or_routes, dict):
+        # Decide which route ids to process before composing anything.
+        if node_id is None:
+            route_ids = sorted(tree_or_routes)
+        elif node_id in tree_or_routes:
+            route_ids = [node_id]
+        else:
+            raise KeyError(f"Route {node_id} not in provided dict.")
+        composed = {}
+        for route_id in route_ids:
+            outcome = compose_route_cgr(tree_or_routes, route_id)
+            composed[route_id] = outcome["cgr"] if outcome else None
+        return composed
+    # Tree input, single route: a failure is reported to the caller as None.
+    if node_id is not None:
+        outcome = compose_route_cgr(tree_or_routes, node_id)
+        return {node_id: outcome["cgr"]} if outcome else None
+    # Tree input, all winning routes: failed routes are left out.
+    composed = {}
+    for route_id in sorted(set(tree_or_routes.winning_nodes)):
+        outcome = compose_route_cgr(tree_or_routes, route_id)
+        if outcome:
+            composed[route_id] = outcome["cgr"]
+    return composed
+def extract_reactions(tree: Tree, node_id=None):
+    """
+    Collect the per-step reactions of one route or of all winning routes.
+    Each route is processed with `compose_route_cgr` and its
+    `reactions_dict` is kept.
+    Parameters
+    ----------
+    tree : Tree
+        A retrosynthetic tree object with a `.winning_nodes` attribute that
+        is accepted by `compose_route_cgr(...)`.
+    node_id : hashable, optional
+        If provided, only extract reactions for this specific node/route.
+    Returns
+    -------
+    dict[node_id, dict] or None
+        Maps each route node ID to its `reactions_dict` (as returned by
+        `compose_route_cgr`). With `node_id` given, `None` is returned when
+        that route fails. Without it, failed routes are omitted and the
+        result is ordered by node ID.
+    """
+    if node_id is not None:
+        outcome = compose_route_cgr(tree, node_id)
+        return {node_id: outcome["reactions_dict"]} if outcome else None
+    collected = {}
+    for route_id in set(tree.winning_nodes):
+        outcome = compose_route_cgr(tree, route_id)
+        if outcome:
+            collected[route_id] = outcome["reactions_dict"]
+    # Rebuild the dict so that its insertion order follows the sorted route ids.
+    return {route_id: collected[route_id] for route_id in sorted(collected)}
+def compose_reduced_route_cgr(route_cgr: CGRContainer):
+    """
+    Build the reduced form (ReducedRouteCGR) of a RouteCGR.
+    Steps:
+    1. Take the substructure of the first connected component of `route_cgr`
+       as the working graph.
+    2. Visit every bond of the working graph:
+       - a bond whose `p_order` is None while its `order` is set is deleted;
+       - a bond whose `order` and `p_order` are both ints and differ is
+         replaced by a DynamicBond that uses `p_order` for both orders.
+    3. Take the substructure of the first connected component of the modified
+       working graph.
+    4. If `_p_charges` and `_charges` of that result differ, set the charge of
+       every atom with a non-zero entry in `_charges` to zero.
+    Args:
+        route_cgr: The input RouteCGR object to be reduced.
+    Returns:
+        The reduced RouteCGR object.
+    """
+    # Work on the first connected component only.
+    target_cgr = route_cgr.substructure(route_cgr.connected_components[0])
+    # Bonds are deleted and re-added inside the loops, so iterate over
+    # snapshots of the adjacency mapping rather than over the live dicts.
+    for atom1, neighbours in list(target_cgr._bonds.items()):
+        for atom2, bond in list(neighbours.items()):
+            order, p_order = bond.order, bond.p_order
+            if p_order is None:
+                # Present only on the `order` side: drop the bond.
+                if order is not None:
+                    target_cgr.delete_bond(atom1, atom2)
+                continue
+            if type(p_order) is int and type(order) is int and p_order != order:
+                # Changed bond order: keep the bond with `p_order` on both sides.
+                target_cgr.delete_bond(atom1, atom2)
+                target_cgr.add_bond(atom1, atom2, DynamicBond(p_order, p_order))
+    # Deleting bonds may have split the graph; keep its first component.
+    reduced_route_cgr = target_cgr.substructure(target_cgr.connected_components[0])
+    if reduced_route_cgr._p_charges != reduced_route_cgr._charges:
+        charged_atoms = [
+            num for num, charge in reduced_route_cgr._charges.items() if charge != 0
+        ]
+        for num in charged_atoms:
+            reduced_route_cgr._atoms[num].charge = 0
+    return reduced_route_cgr
+def compose_all_reduced_route_cgrs(route_cgrs_dict: dict):
+    """
+    Reduce every RouteCGR of a dictionary with `compose_reduced_route_cgr`.
+    Args:
+        route_cgrs_dict (dict): Maps identifiers (e.g., route numbers) to
+                                RouteCGR objects.
+    Returns:
+        dict: Maps each identifier of `route_cgrs_dict`, in the same order,
+              to the corresponding ReducedRouteCGR object.
+    """
+    return {
+        route_id: compose_reduced_route_cgr(route_cgr)
+        for route_id, route_cgr in route_cgrs_dict.items()
+    }

synplan/chem/reaction_routes/visualisation.py ADDED Viewed

	@@ -0,0 +1,903 @@

+from CGRtools.algorithms.depict import (
+    Depict,
+    DepictMolecule,
+    DepictCGR,
+    rotate_vector,
+    _render_charge,
+)
+from CGRtools.containers import ReactionContainer, MoleculeContainer, CGRContainer
+from collections import defaultdict
+from uuid import uuid4
+from math import hypot
+from functools import partial
+class WideBondDepictCGR(DepictCGR):
+    """
+    Like DepictCGR, but all DynamicBonds
+    are drawn 2.5× wider than the standard bond width.
+    """
+    __slots__ = ()
+    def _render_bonds(self):
+        """
+        Renders the bonds of the CGR as SVG lines, with DynamicBonds drawn wider.
+        This method overrides the base `_render_bonds` to apply a wider stroke
+        to DynamicBonds, highlighting changes in bond order during a reaction.
+        It iterates through all bonds, calculates their positions based on
+        2D coordinates, and generates SVG `<line>` elements with appropriate
+        styles (color, width, dash array) based on the bond's original (`order`)
+        and primary (`p_order`) states. Aromatic bonds are handled separately
+        using a helper method.
+        Returns:
+            list: A list of strings, where each string is an SVG element
+                  representing a bond.
+        """
+        plane = self._plane
+        config = self._render_config
+        # get the normal width (default 1.0) and compute a 4× wide stroke
+        normal_width = config.get("bond_width", 0.02)
+        wide_width = normal_width * 2.5
+        broken = config["broken_color"]
+        formed = config["formed_color"]
+        dash1, dash2 = config["dashes"]
+        double_space = config["double_space"]
+        triple_space = config["triple_space"]
+        svg = []
+        ar_bond_colors = defaultdict(dict)
+        for n, m, bond in self.bonds():
+            order, p_order = bond.order, bond.p_order
+            nx, ny = plane[n]
+            mx, my = plane[m]
+            # invert Y for SVG
+            ny, my = -ny, -my
+            rv = partial(rotate_vector, 0, x2=mx - nx, y2=ny - my)
+            if order == 1:
+                if p_order == 1:
+                    svg.append(
+                        f'      <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"/>'
+                    )
+                elif p_order == 4:
+                    ar_bond_colors[n][m] = ar_bond_colors[m][n] = formed
+                    svg.append(
+                        f'      <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"/>'
+                    )
+                elif p_order == 2:
+                    dx, dy = rv(double_space)
+                    svg.append(
+                        f'      <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
+                        f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
+                        f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                elif p_order == 3:
+                    dx, dy = rv(triple_space)
+                    svg.append(
+                        f'      <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"'
+                        f' stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
+                        f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
+                        f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                elif p_order is None:
+                    svg.append(
+                        f'      <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"'
+                        f' stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                else:
+                    dx, dy = rv(double_space)
+                    svg.append(
+                        f'      <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" x2="{mx + dx:.2f}"'
+                        f' y2="{my - dy:.2f}" stroke-dasharray="{dash1:.2f} {dash2:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
+                        f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
+                    )
+            elif order == 4:
+                if p_order == 4:
+                    svg.append(
+                        f'      <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"/>'
+                    )
+                elif p_order == 1:
+                    ar_bond_colors[n][m] = ar_bond_colors[m][n] = broken
+                    svg.append(
+                        f'      <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"/>'
+                    )
+                elif p_order == 2:
+                    ar_bond_colors[n][m] = ar_bond_colors[m][n] = broken
+                    dx, dy = rv(double_space)
+                    svg.append(
+                        f'      <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
+                        f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
+                        f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                elif p_order == 3:
+                    ar_bond_colors[n][m] = ar_bond_colors[m][n] = broken
+                    dx, dy = rv(triple_space)
+                    svg.append(
+                        f'      <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
+                        f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}"  stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
+                        f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                elif p_order is None:
+                    ar_bond_colors[n][m] = ar_bond_colors[m][n] = broken
+                    svg.append(
+                        f'      <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"'
+                        f' stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                else:
+                    ar_bond_colors[n][m] = ar_bond_colors[m][n] = None
+                    svg.append(
+                        f'      <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"'
+                        f' stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
+                    )
+            elif order == 2:
+                if p_order == 2:
+                    dx, dy = rv(double_space)
+                    svg.append(
+                        f'      <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
+                        f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
+                        f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}"/>'
+                    )
+                elif p_order == 1:
+                    dx, dy = rv(double_space)
+                    svg.append(
+                        f'      <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
+                        f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
+                        f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                elif p_order == 4:
+                    ar_bond_colors[n][m] = ar_bond_colors[m][n] = formed
+                    dx, dy = rv(double_space)
+                    svg.append(
+                        f'      <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
+                        f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
+                        f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                elif p_order == 3:
+                    dx, dy = rv(triple_space)
+                    svg.append(
+                        f'      <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
+                        f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
+                        f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{formed} stroke-width="{wide_width:.2f}""/>'
+                    )
+                elif p_order is None:
+                    dx, dy = rv(double_space)
+                    svg.append(
+                        f'      <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
+                        f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
+                        f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                else:
+                    dx, dy = rv(triple_space)
+                    svg.append(
+                        f'      <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" x2="{mx + dx:.2f}"'
+                        f' y2="{my - dy:.2f}" stroke-dasharray="{dash1:.2f} {dash2:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"'
+                        f' stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
+                        f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
+                    )
+            elif order == 3:
+                if p_order == 3:
+                    dx, dy = rv(triple_space)
+                    svg.append(
+                        f'      <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
+                        f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
+                        f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}"/>'
+                    )
+                elif p_order == 1:
+                    dx, dy = rv(triple_space)
+                    svg.append(
+                        f'      <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
+                        f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"'
+                        f' stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
+                        f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" '
+                        f'stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                elif p_order == 4:
+                    ar_bond_colors[n][m] = ar_bond_colors[m][n] = formed
+                    dx, dy = rv(triple_space)
+                    svg.append(
+                        f'      <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" x2="{mx + dx:.2f}" '
+                        f'y2="{my - dy:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" x2="{mx - dx:.2f}" '
+                        f'y2="{my + dy:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                elif p_order == 2:
+                    dx, dy = rv(triple_space)
+                    svg.append(
+                        f'      <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
+                        f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
+                        f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                elif p_order is None:
+                    dx, dy = rv(triple_space)
+                    svg.append(
+                        f'      <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
+                        f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx:.2f}" y1="{ny:.2f}" '
+                        f'x2="{mx:.2f}" y2="{my:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
+                        f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                else:
+                    dx, dy = rv(double_space)
+                    dx3 = 3 * dx
+                    dy3 = 3 * dy
+                    svg.append(
+                        f'      <line x1="{nx + dx3:.2f}" y1="{ny - dy3:.2f}" x2="{mx + dx3:.2f}" '
+                        f'y2="{my - dy3:.2f}" stroke-dasharray="{dash1:.2f} {dash2:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
+                        f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
+                        f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx - dx3:.2f}" y1="{ny + dy3:.2f}" x2="{mx - dx3:.2f}" '
+                        f'y2="{my + dy3:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
+                    )
+            elif order is None:
+                if p_order == 1:
+                    svg.append(
+                        f'      <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"'
+                        f' stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                elif p_order == 4:
+                    ar_bond_colors[n][m] = ar_bond_colors[m][n] = formed
+                    svg.append(
+                        f'      <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"'
+                        f' stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                elif p_order == 2:
+                    dx, dy = rv(double_space)
+                    # dx = dx // 1.4
+                    # dy = dy // 1.4
+                    svg.append(
+                        f'      <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" x2="{mx + dx:.2f}" '
+                        f'y2="{my - dy:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" x2="{mx - dx:.2f}" '
+                        f'y2="{my + dy:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                elif p_order == 3:
+                    dx, dy = rv(triple_space)
+                    svg.append(
+                        f'      <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
+                        f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"'
+                        f' stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
+                        f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                else:
+                    svg.append(
+                        f'      <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}" '
+                        f'stroke-dasharray="{dash1:.2f} {dash2:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
+                    )
+            else:
+                if p_order == 8:
+                    svg.append(
+                        f'        <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}" '
+                        f'stroke-dasharray="{dash1:.2f} {dash2:.2f}"/>'
+                    )
+                elif p_order == 1:
+                    dx, dy = rv(double_space)
+                    svg.append(
+                        f'      <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" x2="{mx + dx:.2f}"'
+                        f' y2="{my - dy:.2f}" stroke-dasharray="{dash1:.2f} {dash2:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
+                        f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                elif p_order == 4:
+                    ar_bond_colors[n][m] = ar_bond_colors[m][n] = None
+                    svg.append(
+                        f'      <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"'
+                        f' stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                elif p_order == 2:
+                    dx, dy = rv(triple_space)
+                    svg.append(
+                        f'      <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" x2="{mx + dx:.2f}"'
+                        f' y2="{my - dy:.2f}" stroke-dasharray="{dash1:.2f} {dash2:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}"'
+                        f' stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
+                        f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                elif p_order == 3:
+                    dx, dy = rv(double_space)
+                    dx3 = 3 * dx
+                    dy3 = 3 * dy
+                    svg.append(
+                        f'      <line x1="{nx + dx3:.2f}" y1="{ny - dy3:.2f}" x2="{mx + dx3:.2f}" '
+                        f'y2="{my - dy3:.2f}" stroke-dasharray="{dash1:.2f} {dash2:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx + dx:.2f}" y1="{ny - dy:.2f}" '
+                        f'x2="{mx + dx:.2f}" y2="{my - dy:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx - dx:.2f}" y1="{ny + dy:.2f}" '
+                        f'x2="{mx - dx:.2f}" y2="{my + dy:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                    svg.append(
+                        f'      <line x1="{nx - dx3:.2f}" y1="{ny + dy3:.2f}" '
+                        f'x2="{mx - dx3:.2f}" y2="{my + dy3:.2f}" stroke="{formed}" stroke-width="{wide_width:.2f}"/>'
+                    )
+                else:
+                    svg.append(
+                        f'      <line x1="{nx:.2f}" y1="{ny:.2f}" x2="{mx:.2f}" y2="{my:.2f}" '
+                        f'stroke-dasharray="{dash1:.2f} {dash2:.2f}" stroke="{broken}" stroke-width="{wide_width:.2f}"/>'
+                    )
+        # aromatic rings - unchanged
+        for ring in self.aromatic_rings:
+            cx = sum(plane[x][0] for x in ring) / len(ring)
+            cy = sum(plane[x][1] for x in ring) / len(ring)
+            for n, m in zip(ring, ring[1:]):
+                nx, ny = plane[n]
+                mx, my = plane[m]
+                aromatic = self.__render_aromatic_bond(
+                    nx, ny, mx, my, cx, cy, ar_bond_colors[n].get(m)
+                )
+                if aromatic:
+                    svg.append(aromatic)
+            n, m = ring[-1], ring[0]
+            nx, ny = plane[n]
+            mx, my = plane[m]
+            aromatic = self.__render_aromatic_bond(
+                nx, ny, mx, my, cx, cy, ar_bond_colors[n].get(m)
+            )
+            if aromatic:
+                svg.append(aromatic)
+        return svg
+    def __render_aromatic_bond(self, n_x, n_y, m_x, m_y, c_x, c_y, color):
+        """
+        Builds the SVG ``<line>`` for the dashed stroke of one aromatic ring bond.
+        The stroke is drawn parallel to the n-m bond, shifted by the
+        ``cgr_aromatic_space`` config value towards the side on which the
+        point ``(c_x, c_y)`` lies.
+        Args:
+            n_x, n_y: Plane coordinates of the first bond atom.
+            m_x, m_y: Plane coordinates of the second bond atom.
+            c_x, c_y: Plane coordinates of the ring center (callers pass the
+                mean of the ring atoms' coordinates).
+            color: Stroke color. If truthy, the line is drawn with this color,
+                the ``aromatic_dashes`` pattern and twice the ``bond_width``.
+                If ``None``, the regular ``dashes`` pattern is used without
+                stroke attributes. Any other falsy value keeps the
+                ``aromatic_dashes`` pattern without stroke attributes.
+        Returns:
+            str | None: The SVG line element, or ``None`` (implicit) when the
+            center lies on the bond axis (``cr_y == 0``) or
+            ``aromatic_space / cr_y`` is not below 0.65.
+        """
+        config = self._render_config
+        dash1, dash2 = config["dashes"]
+        dash3, dash4 = config["aromatic_dashes"]
+        aromatic_space = config["cgr_aromatic_space"]
+        normal_width = config.get("bond_width", 0.02)
+        wide_width = normal_width * 2
+        # n aligned xy
+        mn_x, mn_y, cn_x, cn_y = m_x - n_x, m_y - n_y, c_x - n_x, c_y - n_y
+        # nm reoriented xy
+        # NOTE(review): presumably rotate_vector turns the center into a frame
+        # where the n->m bond lies on the x axis - confirm against rotate_vector.
+        mr_x, mr_y = hypot(mn_x, mn_y), 0
+        cr_x, cr_y = rotate_vector(cn_x, cn_y, mn_x, -mn_y)
+        # A negative cr_y always satisfies the ratio test; a positive one only
+        # when the center is far enough from the bond.
+        if cr_y and aromatic_space / cr_y < 0.65:
+            if cr_y > 0:
+                r_y = aromatic_space
+            else:
+                # Offset to the negative side and continue with |cr_y| so the
+                # ratios below stay positive.
+                r_y = -aromatic_space
+                cr_y = -cr_y
+            # End points of the shifted stroke in the reoriented frame.
+            ar_x = aromatic_space * cr_x / cr_y
+            br_x = mr_x - aromatic_space * (mr_x - cr_x) / cr_y
+            # backward reorienting
+            an_x, an_y = rotate_vector(ar_x, r_y, mn_x, mn_y)
+            bn_x, bn_y = rotate_vector(br_x, r_y, mn_x, mn_y)
+            if color:
+                # print('color')
+                # Colored (changed) aromatic bond: wide stroke in the given color.
+                return (
+                    f'      <line x1="{an_x + n_x:.2f}" y1="{-an_y - n_y:.2f}" x2="{bn_x + n_x:.2f}" '
+                    f'y2="{-bn_y - n_y:.2f}" stroke-dasharray="{dash3:.2f} {dash4:.2f}" stroke="{color}" stroke-width="{wide_width:.2f}"/>'
+                )
+            elif color is None:
+                # Explicit None switches to the regular dash pattern.
+                dash3, dash4 = dash1, dash2
+            return (
+                f'      <line x1="{an_x + n_x:.2f}" y1="{-an_y - n_y:.2f}"'
+                f' x2="{bn_x + n_x:.2f}" y2="{-bn_y - n_y:.2f}" stroke-dasharray="{dash3:.2f} {dash4:.2f}"/>'
+            )
+def cgr_display(cgr: CGRContainer) -> str:
+    """
+    Generates an SVG string for displaying a CGR with wider DynamicBonds.
+    This function temporarily modifies the rendering methods of the
+    `CGRContainer` class to use the bond rendering logic from
+    `WideBondDepictCGR`, which draws DynamicBonds with a wider stroke.
+    It cleans the 2D coordinates of the input CGR and then calls its
+    `depict()` method to generate the SVG string using the modified
+    rendering behavior.
+    Args:
+        cgr (CGRContainer): The CGRContainer object to be depicted.
+    Returns:
+        str: An SVG string representing the depiction of the CGR
+             with wider DynamicBonds.
+    """
+    CGRContainer._CGRContainer__render_aromatic_bond = (
+        WideBondDepictCGR._WideBondDepictCGR__render_aromatic_bond
+    )
+    CGRContainer._render_bonds = WideBondDepictCGR._render_bonds
+    CGRContainer._WideBondDepictCGR__render_aromatic_bond = (
+        WideBondDepictCGR._WideBondDepictCGR__render_aromatic_bond
+    )
+    cgr.clean2d()
+    return cgr.depict()
+class CustomDepictMolecule(DepictMolecule):
+    """
+    Custom molecule depiction class that uses atom.symbol for rendering.
+    Only `_render_atoms` is overridden; it prefers `atom.symbol` and falls
+    back to `atom.atomic_symbol` when the atom has no `symbol` attribute.
+    """
+    def _render_atoms(self):
+        """
+        Builds the SVG fragments for atom labels, charges, radicals, isotopes
+        and atom-to-atom mapping numbers.
+        An atom label is drawn when the atom has no bonds, its symbol is not
+        "C", the `carbon` config flag is set, it carries a charge, is a
+        radical, has an isotope, or is an inner atom of a cumulene chain.
+        Otherwise only its mapping number is emitted (if `mapping` is enabled).
+        Returns:
+            tuple: `(svg, mask)` where `svg` is a list of SVG strings and
+            `mask` is a `defaultdict(list)` of SVG strings collected under the
+            keys "other", "center", "symbols", "span" and "aam".
+        """
+        bonds = self._bonds
+        plane = self._plane
+        charges = self._charges
+        radicals = self._radicals
+        hydrogens = self._hydrogens
+        config = self._render_config
+        carbon = config["carbon"]
+        mapping = config["mapping"]
+        span_size = config["span_size"]
+        font_size = config["font_size"]
+        monochrome = config["monochrome"]
+        other_size = config["other_size"]
+        atoms_colors = config["atoms_colors"]
+        mapping_font = config["mapping_size"]
+        dx_m, dy_m = config["dx_m"], config["dy_m"]
+        dx_ci, dy_ci = config["dx_ci"], config["dy_ci"]
+        symbols_font_style = config["symbols_font_style"]
+        # for cumulenes
+        try:
+            # Check if _cumulenes method exists and handle potential errors
+            # Only the inner atoms (x[1:-1]) of chains longer than 2 are kept.
+            cumulenes = {
+                y
+                for x in self._cumulenes(heteroatoms=True)
+                if len(x) > 2
+                for y in x[1:-1]
+            }
+        except AttributeError:
+            cumulenes = set()  # Fallback if _cumulenes is not available or fails
+        if monochrome:
+            map_fill = other_fill = "black"
+        else:
+            map_fill = config["mapping_color"]
+            other_fill = config["other_color"]
+        svg = []
+        maps = []
+        others = []
+        # Offsets and radii expressed as fractions of the font size.
+        font2 = 0.2 * font_size
+        font3 = 0.3 * font_size
+        font4 = 0.4 * font_size
+        font5 = 0.5 * font_size
+        font6 = 0.6 * font_size
+        font7 = 0.7 * font_size
+        font15 = 0.15 * font_size
+        font25 = 0.25 * font_size
+        mask = defaultdict(list)
+        for n, atom in self._atoms.items():
+            x, y = plane[n]
+            # The y coordinate is negated before being written to the SVG.
+            y = -y
+            # --- KEY CHANGE HERE ---
+            # Use atom.symbol if it exists, otherwise fallback to atomic_symbol
+            try:
+                symbol = atom.symbol
+            except AttributeError:
+                symbol = atom.atomic_symbol  # Fallback if .symbol doesn't exist
+            # --- END KEY CHANGE ---
+            if (
+                not bonds.get(n)
+                or symbol != "C"
+                or carbon
+                or atom.charge
+                or atom.is_radical
+                or atom.isotope
+                or n in cumulenes
+            ):  # Added bonds.get(n) check for single atoms
+                # Calculate hydrogens if the attribute exists, otherwise default to 0
+                try:
+                    h = hydrogens[n]
+                except (KeyError, AttributeError):
+                    h = 0  # Default if _hydrogens is missing or key n is not present
+                if h == 1:
+                    h_str = "H"
+                    span = ""
+                elif h and h > 1:  # Check if h is not None and greater than 1
+                    span = f'<tspan  dy="{config["span_dy"]:.2f}" font-size="{span_size:.2f}">{h}</tspan>'
+                    h_str = "H"
+                else:
+                    h_str = ""
+                    span = ""
+                # Handle charges and radicals safely
+                charge_val = charges.get(n, 0)
+                is_radical = radicals.get(n, False)
+                if charge_val:
+                    t = f'{_render_charge.get(charge_val, "")}{"↑" if is_radical else ""}'  # Use .get for safety
+                    if t:  # Only add if charge text is generated
+                        others.append(
+                            f'        <text x="{x:.2f}" y="{y:.2f}" dx="{dx_ci:.2f}" dy="-{dy_ci:.2f}">'
+                            f"{t}</text>"
+                        )
+                        mask["other"].append(
+                            f'           <text x="{x:.2f}" y="{y:.2f}" dx="{dx_ci:.2f}" dy="-{dy_ci:.2f}">'
+                            f"{t}</text>"
+                        )
+                elif is_radical:
+                    others.append(
+                        f'        <text x="{x:.2f}" y="{y:.2f}" dx="{dx_ci:.2f}" dy="-{dy_ci:.2f}">↑</text>'
+                    )
+                    mask["other"].append(
+                        f'            <text x="{x:.2f}" y="{y:.2f}" dx="{dx_ci:.2f}"'
+                        f' dy="-{dy_ci:.2f}">↑</text>'
+                    )
+                # Handle isotope safely
+                try:
+                    iso = atom.isotope
+                    if iso:
+                        t = iso
+                        others.append(
+                            f'        <text x="{x:.2f}" y="{y:.2f}" dx="-{dx_ci:.2f}" dy="-{dy_ci:.2f}" '
+                            f'text-anchor="end">{t}</text>'
+                        )
+                        mask["other"].append(
+                            f'            <text x="{x:.2f}" y="{y:.2f}" dx="-{dx_ci:.2f}"'
+                            f' dy="-{dy_ci:.2f}" text-anchor="end">{t}</text>'
+                        )
+                except AttributeError:
+                    pass  # Atom might not have isotope attribute
+                # Determine atom color based on atomic_number, default to black if monochrome or not found
+                atom_color = "black"
+                if not monochrome:
+                    try:
+                        an = atom.atomic_number
+                        # atoms_colors is indexed by atomic number minus one.
+                        if 0 < an <= len(atoms_colors):
+                            atom_color = atoms_colors[an - 1]
+                        else:
+                            atom_color = atoms_colors[
+                                5
+                            ]  # Default to Carbon color if out of range
+                    except AttributeError:
+                        atom_color = atoms_colors[
+                            5
+                        ]  # Default to Carbon color if no atomic_number
+                svg.append(
+                    f'      <g fill="{atom_color}" '
+                    f'font-family="{symbols_font_style }">'
+                )
+                # Adjust dx based on symbol length for better centering
+                if len(symbol) > 1:
+                    dx = font7
+                    dx_mm = dx_m + font5
+                    if symbol[-1].lower() in (
+                        "l",
+                        "i",
+                        "r",
+                        "t",
+                    ):  # Heuristic for narrow last letters
+                        rx = font6
+                        ax = font25
+                    else:
+                        rx = font7
+                        ax = font15
+                    # Multi-character symbols get an elliptical "center" mask entry.
+                    mask["center"].append(
+                        f'          <ellipse cx="{x - ax:.2f}" cy="{y:.2f}" rx="{rx}" ry="{font4}"/>'
+                    )
+                else:
+                    if symbol == "I":  # Special case for 'I'
+                        dx = font15
+                        dx_mm = dx_m
+                    else:  # Single character
+                        dx = font4
+                        dx_mm = dx_m + font2
+                    mask["center"].append(
+                        f'          <circle cx="{x:.2f}" cy="{y:.2f}" r="{font4:.2f}"/>'
+                    )
+                svg.append(
+                    f'        <text x="{x:.2f}" y="{y:.2f}" dx="-{dx:.2f}" dy="{font4:.2f}" '
+                    f'font-size="{font_size:.2f}">{symbol}{h_str}{span}</text>'
+                )
+                mask["symbols"].append(
+                    f'            <text x="{x:.2f}" y="{y:.2f}" dx="-{dx:.2f}" '
+                    f'dy="{font4:.2f}">{symbol}{h_str}</text>'
+                )
+                if span:
+                    mask["span"].append(
+                        f'            <text x="{x:.2f}" y="{y:.2f}" dx="-{dx:.2f}" dy="{font4:.2f}">'
+                        f"{symbol}{h_str}{span}</text>"
+                    )
+                svg.append("      </g>")
+                if mapping:
+                    maps.append(
+                        f'        <text x="{x:.2f}" y="{y:.2f}" dx="-{dx_mm:.2f}" dy="{dy_m + font3:.2f}" '
+                        f'text-anchor="end">{n}</text>'
+                    )
+                    mask["aam"].append(
+                        f'            <text x="{x:.2f}" y="{y:.2f}" dx="-{dx_mm:.2f}" '
+                        f'dy="{dy_m + font3:.2f}" text-anchor="end">{n}</text>'
+                    )
+            elif mapping:
+                # Determine dx_mm for mapping based on symbol length even if atom itself isn't drawn
+                if len(symbol) > 1:
+                    dx_mm = dx_m + font5
+                else:
+                    dx_mm = dx_m + font2 if symbol != "I" else dx_m
+                maps.append(
+                    f'        <text x="{x:.2f}" y="{y:.2f}" dx="-{dx_mm:.2f}" dy="{dy_m:.2f}" '
+                    f'text-anchor="end">{n}</text>'
+                )
+                mask["aam"].append(
+                    f'            <text x="{x:.2f}" y="{y:.2f}" dx="-{dx_mm:.2f}" dy="{dy_m:.2f}" '
+                    f'text-anchor="end">{n}</text>'
+                )
+        # Charge/radical/isotope labels are emitted together in one group.
+        if others:
+            svg.append(
+                f'      <g font-family="{config["other_font_style"]}" fill="{other_fill}" '
+                f'font-size="{other_size:.2f}">'
+            )
+            svg.extend(others)
+            svg.append("      </g>")
+        # Mapping numbers are emitted in their own group after the other labels.
+        if mapping:
+            svg.append(f'      <g fill="{map_fill}" font-size="{mapping_font:.2f}">')
+            svg.extend(maps)
+            svg.append("      </g>")
+        return svg, mask
+def depict_custom_reaction(reaction: ReactionContainer):
+    """
+    Depicts a ReactionContainer using custom atom rendering logic (replace At to X).
+    This function generates an SVG string representing a reaction. It
+    temporarily modifies the classes of the molecules within the reaction
+    to use a custom depiction logic (`CustomDepictMolecule`) that alters
+    how atoms are rendered (specifically, it seems to use `atom.symbol`
+    instead of `atom.atomic_symbol`, potentially for replacing 'At' with 'X'
+    as mentioned in the original comment). After depicting each molecule
+    with the temporary class, it restores the original classes. The function
+    then combines the individual molecule depictions, reaction arrow, and
+    reaction signs into a single SVG.
+    Args:
+        reaction (ReactionContainer): The ReactionContainer object to be depicted.
+    Returns:
+        str: An SVG string representing the depiction of the reaction
+             with custom atom rendering.
+    """
+    if not reaction._arrow:
+        reaction.fix_positions()  # Ensure positions are calculated
+    r_atoms = []
+    r_bonds = []
+    r_masks = []
+    r_max_x = r_max_y = r_min_y = 0
+    original_classes = {}  # Store original classes to restore later
+    try:
+        # Temporarily change the class of molecules to use the custom depiction
+        for mol in reaction.molecules():
+            if isinstance(mol, (MoleculeContainer, CGRContainer)):
+                original_classes[mol] = mol.__class__
+                custom_class_name = (
+                    f"TempCustom_{mol.__class__.__name__}_{uuid4().hex}"  # Unique name
+                )
+                # Combine custom depiction with original class methods
+                # Ensure the custom _render_atoms takes precedence
+                new_bases = (CustomDepictMolecule,) + original_classes[mol].__bases__
+                # Filter out DepictMolecule if it's already a base to avoid MRO issues
+                new_bases = tuple(b for b in new_bases if b is not DepictMolecule)
+                # If DepictMolecule wasn't a direct base, ensure its methods are accessible
+                if CustomDepictMolecule not in original_classes[mol].__mro__:
+                    # Prioritize CustomDepictMolecule's methods
+                    new_bases = (CustomDepictMolecule, original_classes[mol])
+                else:
+                    # If DepictMolecule was a base, CustomDepictMolecule is already first
+                    new_bases = (CustomDepictMolecule,) + tuple(
+                        b
+                        for b in original_classes[mol].__bases__
+                        if b is not DepictMolecule
+                    )
+                # Create the temporary class
+                mol.__class__ = type(custom_class_name, new_bases, {})
+            # Depict using the (potentially) modified class
+            atoms, bonds, masks, min_x, min_y, max_x, max_y = mol.depict(embedding=True)
+            r_atoms.append(atoms)
+            r_bonds.append(bonds)
+            r_masks.append(masks)
+            if max_x > r_max_x:
+                r_max_x = max_x
+            if max_y > r_max_y:
+                r_max_y = max_y
+            if min_y < r_min_y:
+                r_min_y = min_y
+    finally:
+        # Restore original classes
+        for mol, original_class in original_classes.items():
+            mol.__class__ = original_class
+    config = DepictMolecule._render_config  # Access via the imported class
+    font_size = config["font_size"]
+    font125 = 1.25 * font_size
+    width = r_max_x + 3.0 * font_size
+    height = r_max_y - r_min_y + 2.5 * font_size
+    viewbox_x = -font125
+    viewbox_y = -r_max_y - font125
+    svg = [
+        f'<svg width="{width:.2f}cm" height="{height:.2f}cm" '
+        f'viewBox="{viewbox_x:.2f} {viewbox_y:.2f} {width:.2f} '
+        f'{height:.2f}" xmlns="http://www.w3.org/2000/svg" version="1.1">\n'
+        '  <defs>\n    <marker id="arrow" markerWidth="10" markerHeight="10" '
+        'refX="0" refY="3" orient="auto">\n      <path d="M0,0 L0,6 L9,3"/>\n    </marker>\n  </defs>\n'
+        f'  <line x1="{reaction._arrow[0]:.2f}" y1="0" x2="{reaction._arrow[1]:.2f}" y2="0" '
+        'fill="none" stroke="black" stroke-width=".04" marker-end="url(#arrow)"/>'
+    ]
+    sings_plus = reaction._signs
+    if sings_plus:
+        svg.append(f'  <g fill="none" stroke="black" stroke-width=".04">')
+        for x in sings_plus:
+            svg.append(
+                f'    <line x1="{x + .35:.2f}" y1="0" x2="{x + .65:.2f}" y2="0"/>'
+            )
+            svg.append(
+                f'    <line x1="{x + .5:.2f}" y1="0.15" x2="{x + .5:.2f}" y2="-0.15"/>'
+            )
+        svg.append("  </g>")
+    for atoms, bonds, masks in zip(r_atoms, r_bonds, r_masks):
+        # Use the static method from Depict directly
+        svg.extend(
+            Depict._graph_svg(atoms, bonds, masks, viewbox_x, viewbox_y, width, height)
+        )
+    svg.append("</svg>")
+    return "\n".join(svg)
+def remove_and_shift(nested_dict, to_remove):  # Under development
+    """
+    Removes specified inner keys from a nested dictionary and renumbers the remaining keys.
+    Given a dictionary where values are themselves dictionaries, this function
+    iterates through each inner dictionary. For each inner dictionary, it
+    creates a new dictionary containing only the key-value pairs where the
+    inner key is NOT present in the `to_remove` list. The keys of the remaining
+    elements in the new inner dictionary are then renumbered sequentially
+    starting from 0, effectively removing gaps left by the removed keys.
+    Args:
+        nested_dict (dict): The input nested dictionary (dict of dicts).
+        to_remove (list): A list of keys to remove from the inner dictionaries.
+    Returns:
+        dict: A new nested dictionary with the specified keys removed from
+              inner dictionaries and the remaining inner keys renumbered.
+    """
+    rem_set = set(to_remove)
+    result = {}
+    for outer_k, inner in nested_dict.items():
+        new_inner = {}
+        for old_k, v in inner.items():
+            if old_k in rem_set:
+                continue
+            shift = sum(1 for r in rem_set if r < old_k)
+            new_k = old_k - shift
+            new_inner[new_k] = v
+        result[outer_k] = new_inner
+    return result

synplan/chem/reaction_rules/__init__.py ADDED Viewed

File without changes

synplan/chem/reaction_rules/extraction.py ADDED Viewed

	@@ -0,0 +1,744 @@

+"""Module containing functions for protocol of reaction rules extraction."""
+import logging
+import pickle
+from collections import defaultdict
+from itertools import islice
+from os.path import splitext
+from typing import Dict, List, Set, Tuple
+import ray
+from chython import smarts
+from chython import QueryContainer as QueryContainerChython
+from CGRtools.containers.cgr import CGRContainer
+from CGRtools.containers.molecule import MoleculeContainer
+from CGRtools.containers.query import QueryContainer
+from CGRtools.containers.reaction import ReactionContainer
+from CGRtools.exceptions import InvalidAromaticRing
+from CGRtools.reactor import Reactor
+from tqdm import tqdm
+from synplan.chem.data.standardizing import RemoveReagentsStandardizer
+from synplan.chem.utils import (
+    reverse_reaction,
+    cgrtools_to_chython_molecule,
+    chython_query_to_cgrtools,
+)
+from synplan.utils.config import RuleExtractionConfig
+from synplan.utils.files import ReactionReader
+def add_environment_atoms(
+    cgr: CGRContainer, center_atoms: Set[int], environment_atom_count: int
+) -> Set[int]:
+    """
+    Adds environment atoms to the set of center atoms based on the specified depth.
+    :param cgr: A complete graph representation of a reaction (ReactionContainer
+        object).
+    :param center_atoms: A set of atom id corresponding to the center atoms of the
+        reaction.
+    :param environment_atom_count: An integer specifying the depth of the environment
+        around the reaction center to be included. If it's 0, only the reaction center
+        is included. If it's 1, the first layer of surrounding atoms is included, and so
+        on.
+    :return: A set of atom id including the center atoms and their environment atoms up
+        to the specified depth. If environment_atom_count is 0, the original set of
+        center atoms is returned unchanged.
+    """
+    if environment_atom_count:
+        env_cgr = cgr.augmented_substructure(center_atoms, deep=environment_atom_count)
+        # combine the original center atoms with the new environment atoms
+        return center_atoms | set(env_cgr)
+    # if no environment is to be included, return the original center atoms
+    return center_atoms
+def add_functional_groups(
+    reaction: ReactionContainer,
+    center_atoms: Set[int],
+    func_groups_list: List[QueryContainerChython],
+) -> Set[int]:
+    """
+    Augments the set of reaction rule atoms with functional groups if specified.
+    :param reaction: The reaction object (ReactionContainer) from which molecules are
+        extracted.
+    :param center_atoms: A set of atom id corresponding to the center atoms of the
+        reaction.
+    :param func_groups_list: A list of functional group objects (MoleculeContainer or
+        QueryContainer) to be considered when including functional groups. These objects
+        define the structure of the functional groups to be included.
+    :return: A set of atom id corresponding to the rule atoms, including atoms from the
+        specified functional groups if include_func_groups is True. If
+        include_func_groups is False, the original set of center atoms is returned.
+    """
+    rule_atoms = center_atoms.copy()
+    # iterate over each molecule in the reaction
+    for molecule in reaction.molecules():
+        molecule_chython = cgrtools_to_chython_molecule(molecule)
+        # for each functional group specified in the list
+        for func_group in func_groups_list:
+            # find mappings of the functional group in the molecule
+            for mapping in func_group.get_mapping(molecule_chython):
+                # remap the functional group based on the found mapping
+                func_group.remap(mapping)
+                # if the functional group intersects with center atoms, include it
+                if set(func_group.atoms_numbers) & center_atoms:
+                    rule_atoms |= set(func_group.atoms_numbers)
+                # reset the mapping to its original state for the next iteration
+                func_group.remap({v: k for k, v in mapping.items()})
+    return rule_atoms
+def add_ring_structures(cgr: CGRContainer, rule_atoms: Set[int]) -> Set[int]:
+    """
+    Adds ring structures to the set of rule atoms if they intersect with the reaction
+    center atoms.
+    :param cgr: A condensed graph representation of a reaction (CGRContainer object).
+    :param rule_atoms: A set of atom id corresponding to the center atoms of the
+        reaction.
+    :return: A set of atom id corresponding to the original rule atoms and the included
+        ring structures.
+    """
+    for ring in cgr.sssr:
+        # check if the current ring intersects with the set of rule atoms
+        if set(ring) & rule_atoms:
+            # if the intersection exists, include all atoms in the ring to the rule atoms
+            rule_atoms |= set(ring)
+    return rule_atoms
+def add_leaving_incoming_groups(
+    reaction: ReactionContainer,
+    rule_atoms: Set[int],
+    keep_leaving_groups: bool,
+    keep_incoming_groups: bool,
+) -> Tuple[Set[int], Dict[str, Set]]:
+    """
+    Identifies and includes leaving and incoming groups to the rule atoms based on
+    specified flags.
+    :param reaction: The reaction object (ReactionContainer) from which leaving and
+        incoming groups are extracted.
+    :param rule_atoms: A set of atom id corresponding to the center atoms of the
+        reaction.
+    :param keep_leaving_groups: A boolean flag indicating whether to include leaving
+        groups in the rule.
+    :param keep_incoming_groups: A boolean flag indicating whether to include incoming
+        groups in the rule.
+    :return: Updated set of rule atoms including leaving and incoming groups if
+        specified, and metadata about added groups.
+    """
+    meta_debug = {"leaving": set(), "incoming": set()}
+    # extract atoms from reactants and products
+    reactant_atoms = {atom for reactant in reaction.reactants for atom in reactant}
+    product_atoms = {atom for product in reaction.products for atom in product}
+    # identify leaving groups (reactant atoms not in products)
+    if keep_leaving_groups:
+        leaving_atoms = reactant_atoms - product_atoms
+        new_leaving_atoms = leaving_atoms - rule_atoms
+        # include leaving atoms in the rule atoms
+        rule_atoms |= leaving_atoms
+        # add leaving atoms to metadata
+        meta_debug["leaving"] |= new_leaving_atoms
+    # identify incoming groups (product atoms not in reactants)
+    if keep_incoming_groups:
+        incoming_atoms = product_atoms - reactant_atoms
+        new_incoming_atoms = incoming_atoms - rule_atoms
+        # Include incoming atoms in the rule atoms
+        rule_atoms |= incoming_atoms
+        # Add incoming atoms to metadata
+        meta_debug["incoming"] |= new_incoming_atoms
+    return rule_atoms, meta_debug
+def clean_molecules(
+    rule_molecules: List[MoleculeContainer],
+    reaction_molecules: Tuple[MoleculeContainer],
+    reaction_center_atoms: Set[int],
+    atom_retention_details: Dict[str, Dict[str, bool]],
+) -> List[QueryContainer]:
+    """
+    Cleans rule molecules by removing specified information about atoms based on
+    retention details provided.
+    :param rule_molecules: A list of query container objects representing the rule molecules.
+    :param reaction_molecules: A list of molecule container objects involved in the reaction.
+    :param reaction_center_atoms: A set of id corresponding to the atom numbers in the reaction center.
+    :param atom_retention_details: A dictionary specifying what atom information to retain or remove.
+                                   This dictionary should have two keys: "reaction_center" and "environment",
+                                   each mapping to another dictionary. The nested dictionaries should have
+                                   keys representing atom attributes (like "neighbors", "hybridization",
+                                   "implicit_hydrogens", "ring_sizes") and boolean values.
+                                   A value of True indicates that the corresponding attribute
+                                   should be retained, while False indicates it should be removed from the atom.
+    :return: A list of QueryContainer objects representing the cleaned rule molecules.
+    """
+    cleaned_rule_molecules = []
+    for rule_molecule in rule_molecules:
+        for reaction_molecule in reaction_molecules:
+            if set(rule_molecule.atoms_numbers) <= set(reaction_molecule.atoms_numbers):
+                query_reaction_molecule = reaction_molecule.substructure(
+                    reaction_molecule, as_query=True
+                )
+                query_rule_molecule = query_reaction_molecule.substructure(
+                    rule_molecule
+                )
+                # clean reaction center atoms
+                if not all(
+                    atom_retention_details["reaction_center"].values()
+                ):  # if everything True, we keep all marks
+                    local_reaction_center_atoms = (
+                        set(rule_molecule.atoms_numbers) & reaction_center_atoms
+                    )
+                    for atom_number in local_reaction_center_atoms:
+                        query_rule_molecule = clean_atom(
+                            query_rule_molecule,
+                            atom_retention_details["reaction_center"],
+                            atom_number,
+                        )
+                # clean environment atoms
+                if not all(
+                    atom_retention_details["environment"].values()
+                ):  # if everything True, we keep all marks
+                    local_environment_atoms = (
+                        set(rule_molecule.atoms_numbers) - reaction_center_atoms
+                    )
+                    for atom_number in local_environment_atoms:
+                        query_rule_molecule = clean_atom(
+                            query_rule_molecule,
+                            atom_retention_details["environment"],
+                            atom_number,
+                        )
+                cleaned_rule_molecules.append(query_rule_molecule)
+                break
+    return cleaned_rule_molecules
+def clean_atom(
+    query_molecule: QueryContainer,
+    attributes_to_keep: Dict[str, bool],
+    atom_number: int,
+) -> QueryContainer:
+    """
+    Removes specified information from a given atom in a query molecule.
+    :param query_molecule: The QueryContainer of molecule.
+    :param attributes_to_keep: Dictionary indicating which attributes to keep in the atom. The keys should be strings
+                               representing the attribute names, and the values should be booleans indicating whether
+                               to retain (True) or remove(False) that attribute. Expected keys are:
+                               - "neighbors": Indicates if neighbors of the atom should be removed.
+                               - "hybridization": Indicates if hybridization information of the atom should be removed.
+                               - "implicit_hydrogens": Indicates if implicit hydrogen information of the atom should be removed.
+                               - "ring_sizes": Indicates if ring size information of the atom should be removed.
+    :param atom_number: The number of the atom to be modified in the query molecule.
+    """
+    target_atom = query_molecule.atom(atom_number)
+    if not attributes_to_keep["neighbors"]:
+        target_atom.neighbors = None
+    if not attributes_to_keep["hybridization"]:
+        target_atom.hybridization = None
+    if not attributes_to_keep["implicit_hydrogens"]:
+        target_atom.implicit_hydrogens = None
+    if not attributes_to_keep["ring_sizes"]:
+        target_atom.ring_sizes = None
+    return query_molecule
+def create_substructures_and_reagents(
+    reaction: ReactionContainer,
+    rule_atoms: Set[int],
+    as_query_container: bool,
+    keep_reagents: bool,
+) -> Tuple[List[MoleculeContainer], List[MoleculeContainer], List]:
+    """
+    Creates substructures for reactants and products, and optionally includes
+    reagents, based on specified parameters. The function processes the reaction to
+    create substructures for reactants and products based on the rule atoms. It also
+    handles the inclusion of reagents based on the keep_reagents flag and converts these
+    structures to query containers if required.
+    :param reaction: The reaction object (ReactionContainer) from which to extract substructures.
+                     This object  represents a chemical reaction with specified reactants, products, and possibly reagents.
+    :param rule_atoms: A set of atom id corresponding to the rule atoms. These are used to identify relevant
+                       substructures in reactants and products.
+    :param as_query_container: A boolean flag indicating whether the substructures should be converted to query containers.
+                               Query containers are used for pattern matching in chemical structures.
+    :param keep_reagents: A boolean flag indicating whether reagents should be included in the resulting structures.
+                          Reagents are additional substances that are present in the reaction but are not reactants or products.
+    :return: A tuple containing three elements:
+             - A list of reactant substructures, each corresponding to a part of the reactants that matches the rule atoms.
+             - A list of product substructures, each corresponding to a part of the products that matches the rule atoms.
+             - A list of reagents, included as is or as substructures, depending on the as_query_container flag.
+    """
+    reactant_substructures = [
+        reactant.substructure(rule_atoms.intersection(reactant.atoms_numbers))
+        for reactant in reaction.reactants
+        if rule_atoms.intersection(reactant.atoms_numbers)
+    ]
+    product_substructures = [
+        product.substructure(rule_atoms.intersection(product.atoms_numbers))
+        for product in reaction.products
+        if rule_atoms.intersection(product.atoms_numbers)
+    ]
+    reagents = []
+    if keep_reagents:
+        if as_query_container:
+            reagents = [
+                reagent.substructure(reagent, as_query=True)
+                for reagent in reaction.reagents
+            ]
+        else:
+            reagents = reaction.reagents
+    return reactant_substructures, product_substructures, reagents
+def assemble_final_rule(
+    reactant_substructures: List[QueryContainer],
+    product_substructures: List[QueryContainer],
+    reagents: List,
+    meta_debug: Dict[str, Set],
+    keep_metadata: bool,
+    reaction: ReactionContainer,
+) -> ReactionContainer:
+    """
+    Assembles the final reaction rule from the provided substructures and metadata.
+    This function brings together the various components of a reaction rule, including
+    reactant and product substructures, reagents, and metadata. It creates a
+    comprehensive representation of the reaction rule, which can be used for further
+    processing or analysis.
+    :param reactant_substructures: A list of substructures derived from the reactants of
+        the reaction. These substructures represent parts of reactants that are relevant
+        to the rule.
+    :param product_substructures: A list of substructures derived from the products of
+        the reaction. These substructures represent parts of products that are relevant
+        to the rule.
+    :param reagents: A list of reagents involved in the reaction. These may be included
+        as-is or as substructures, depending on earlier processing steps.
+    :param meta_debug: A dictionary containing additional metadata about the reaction,
+        such as leaving and incoming groups.
+    :param keep_metadata: A boolean flag indicating whether to retain the metadata
+        associated with the reaction in the rule.
+    :param reaction: The original reaction object (ReactionContainer) from which the
+        rule is being created.
+    :return: A ReactionContainer object representing the assembled reaction rule. This
+        container includes the reactant and product substructures, reagents, and any
+        additional metadata if keep_metadata is True.
+    """
+    rule_metadata = meta_debug if keep_metadata else {}
+    rule_metadata.update(reaction.meta if keep_metadata else {})
+    rule = ReactionContainer(
+        reactant_substructures, product_substructures, reagents, rule_metadata
+    )
+    if keep_metadata:
+        rule.name = reaction.name
+    rule.flush_cache()
+    return rule
+def validate_rule(rule: ReactionContainer, reaction: ReactionContainer) -> bool:
+    """
+    Validates a reaction rule by ensuring it can correctly generate the products from
+    the reactants. The function uses a chemical reactor to simulate the reaction based
+    on the provided rule. It then compares the products generated by the simulation with
+    the actual products of the reaction. If they match, the rule is considered valid. If
+    not, a ValueError is raised, indicating an issue with the rule.
+    :param rule: The reaction rule to be validated. This is a ReactionContainer object
+        representing a chemical reaction rule, which includes the necessary information
+        to perform a reaction.
+    :param reaction: The original reaction object (ReactionContainer) against which the
+        rule is to be validated. This object contains the actual reactants and products
+        of the reaction.
+    :return: The validated rule if the rule correctly generates the products from the
+        reactants.
+    :raises ValueError: If the rule does not correctly generate the products from the
+        reactants, indicating an incorrect or incomplete rule.
+    """
+    # create a reactor with the given rule
+    reactor = Reactor(rule)
+    try:
+        for result_reaction in reactor(reaction.reactants):
+            result_products = []
+            for result_product in result_reaction.products:
+                tmp = result_product.copy()
+                try:
+                    tmp.kekule()
+                    if tmp.check_valence():
+                        continue
+                except InvalidAromaticRing:
+                    continue
+                result_products.append(result_product)
+            if set(reaction.products) == set(result_products) and len(
+                reaction.products
+            ) == len(result_products):
+                return True
+    except (KeyError, IndexError):
+        # KeyError - iteration over reactor is finished and products are different from the original reaction
+        # IndexError - mistake in __contract_ions, possibly problems with charges in reaction rule
+        return False
+    return False
+def create_rule(
+    config: RuleExtractionConfig, reaction: ReactionContainer
+) -> ReactionContainer:
+    """
+    Creates a reaction rule from a given reaction based on the specified
+    configuration. The function processes the reaction to create a rule that matches the
+    configuration settings. It handles the inclusion of environmental atoms, functional
+    groups, ring structures, and leaving and incoming groups. It also constructs
+    substructures for reactants, products, and reagents, and cleans molecule
+    representations if required. Optionally, it validates the rule using a reactor.
+    :param config: An instance of ExtractRuleConfig, containing various settings that
+                   determine how the rule is created, such as environmental atom count, inclusion
+                   of functional groups, rings, leaving and incoming groups, and other parameters.
+    :param reaction: The reaction object (ReactionContainer) from which to create the
+                     rule. This object represents a chemical reaction with specified reactants,
+                     products, and possibly reagents.
+    :return: A ReactionContainer object representing the extracted reaction rule. This
+             rule includes various elements of the reaction as specified by the
+             configuration, such as reaction centers, environmental atoms, functional groups,
+             and others.
+    """
+    # 1. create reaction CGR
+    cgr = ~reaction
+    center_atoms = set(cgr.center_atoms)
+    # 2. add atoms of reaction environment based on config settings
+    center_atoms = add_environment_atoms(
+        cgr, center_atoms, config.environment_atom_count
+    )
+    # 3. include functional groups in the rule if specified in config
+    if config.include_func_groups and config.func_groups_list:
+        rule_atoms = add_functional_groups(
+            reaction, center_atoms, config.func_groups_list
+        )
+    else:
+        rule_atoms = center_atoms.copy()
+    # 4. include ring structures in the rule if specified in config
+    if config.include_rings:
+        rule_atoms = add_ring_structures(cgr, rule_atoms)
+    # 5. add leaving and incoming groups to the rule based on config settings
+    rule_atoms, meta_debug = add_leaving_incoming_groups(
+        reaction, rule_atoms, config.keep_leaving_groups, config.keep_incoming_groups
+    )
+    # 6. create substructures for reactants, products, and reagents
+    reactant_substructures, product_substructures, reagents = (
+        create_substructures_and_reagents(
+            reaction, rule_atoms, config.as_query_container, config.keep_reagents
+        )
+    )
+    # 7. clean atom marks in the molecules if they are being converted to query containers
+    if config.as_query_container:
+        reactant_substructures = clean_molecules(
+            reactant_substructures,
+            reaction.reactants,
+            center_atoms,
+            config.atom_info_retention,
+        )
+        product_substructures = clean_molecules(
+            product_substructures,
+            reaction.products,
+            center_atoms,
+            config.atom_info_retention,
+        )
+    # 8. assemble the final rule including metadata if specified
+    rule = assemble_final_rule(
+        reactant_substructures,
+        product_substructures,
+        reagents,
+        meta_debug,
+        config.keep_metadata,
+        reaction,
+    )
+    # 9. reverse extracted reaction rule and reaction
+    if config.reverse_rule:
+        rule = reverse_reaction(rule)
+        reaction = reverse_reaction(reaction)
+    # 10. validate the rule using a reactor if validation is enabled in config
+    if config.reactor_validation:
+        if validate_rule(rule, reaction):
+            rule.meta["reactor_validation"] = "passed"
+        else:
+            rule.meta["reactor_validation"] = "failed"
+    return rule
+def extract_rules(
+    config: RuleExtractionConfig, reaction: ReactionContainer
+) -> List[ReactionContainer]:
+    """
+    Extracts reaction rules from a given reaction based on the specified
+    configuration.
+    :param config: An instance of ExtractRuleConfig, which contains various
+        configuration settings for rule extraction, such as whether to include
+        multicenter rules, functional groups, ring structures, leaving and incoming
+        groups, etc.
+    :param reaction: The reaction object (ReactionContainer) from which to extract
+        rules. The reaction object represents a chemical reaction with specified
+        reactants, products, and possibly reagents.
+    :return: A list of ReactionContainer objects, each representing a distinct reaction
+        rule. If config.multicenter_rules is True, a single rule encompassing all
+        reaction centers is returned. Otherwise, separate rules for each reaction center
+        are extracted, up to a maximum of 15 distinct centers.
+    """
+    standardizer = (
+        RemoveReagentsStandardizer()
+    )  # reagents are needed if they are the part of reaction rule specification
+    reaction = standardizer(reaction)
+    if config.multicenter_rules:
+        # extract a single rule encompassing all reaction centers
+        return [create_rule(config, reaction)]
+    # extract separate rules for each distinct reaction center
+    distinct_rules = set()
+    for center_reaction in islice(reaction.enumerate_centers(), 15):
+        single_rule = create_rule(config, center_reaction)
+        distinct_rules.add(single_rule)
+    return list(distinct_rules)
+@ray.remote
+def process_reaction_batch(
+    batch: List[Tuple[int, ReactionContainer]], config: RuleExtractionConfig
+) -> List[Tuple[int, List[ReactionContainer]]]:
+    """
+    Processes a batch of reactions to extract reaction rules based on the given
+    configuration. This function operates as a remote task in a distributed system using
+    Ray. It takes a batch of reactions, where each reaction is paired with an index. For
+    each reaction in the batch, it extracts reaction rules as specified by the
+    configuration object. The extracted rules for each reaction are then returned along
+    with the corresponding index. This function is intended to be used in a distributed
+    manner with Ray to parallelize the rule extraction process across multiple
+    reactions.
+    :param batch: A list where each element is a tuple containing an index (int) and a
+        ReactionContainer object. The index is typically used to keep track of the
+        reaction's position in a larger dataset.
+    :param config: An instance of ExtractRuleConfig that provides settings and
+        parameters for the rule extraction process.
+    :return: A list where each element is a tuple. The first element of the tuple is an
+        index (int), and the second is a list of ReactionContainer objects representing
+        the extracted rules for the corresponding reaction.
+    """
+    extracted_rules_list = []
+    for index, reaction in batch:
+        try:
+            extracted_rules = extract_rules(config, reaction)
+            extracted_rules_list.append((index, extracted_rules))
+        except Exception as e:
+            logging.debug(e)
+            continue
+    return extracted_rules_list
+def process_completed_batch(
+    futures: Dict,
+    rules_statistics: Dict,
+) -> None:
+    """
+    Processes completed batches of reactions, updating the rules statistics and
+    writing rules to a file. This function waits for the completion of a batch of
+    reactions processed in parallel (using Ray), updates the statistics for each
+    extracted rule, and writes the rules to a result file if they are new. It also
+    updates the progress bar with the size of the processed batch.
+    :param futures: A dictionary of futures representing ongoing batch processing tasks.
+    :param rules_statistics: A dictionary to keep track of statistics for each rule.
+    :return: None
+    """
+    ready_id, running_id = ray.wait(list(futures.keys()), num_returns=1)
+    completed_batch = ray.get(ready_id[0])
+    for index, extracted_rules in completed_batch:
+        for rule in extracted_rules:
+            prev_stats_len = len(rules_statistics)
+            rules_statistics[rule].append(index)
+            if len(rules_statistics) != prev_stats_len:
+                rule.meta["first_reaction_index"] = index
+    del futures[ready_id[0]]
+def sort_rules(
+    rules_stats: Dict, min_popularity: int, single_reactant_only: bool
+) -> List[Tuple[ReactionContainer, List[int]]]:
+    """
+    Sorts reaction rules based on their popularity and validation status. This
+    function sorts the given rules according to their popularity (i.e., the number of
+    times they have been applied) and filters out rules that haven't passed reactor
+    validation or are less popular than the specified minimum popularity threshold.
+    :param rules_stats: A dictionary where each key is a reaction rule and the value is
+        a list of integers. Each integer represents an index where the rule was applied.
+    :type rules_stats: The number of occurrence of the reaction rules.
+    :param min_popularity: The minimum number of times a rule must be applied to be
+        considered. Default is 3.
+    :type min_popularity: The minimum number of occurrence of the reaction rule to be
+        selected.
+    :param single_reactant_only: Whether to keep only reaction rules with a single
+        molecule on the right side of reaction arrow. Default is True.
+    :return: A list of tuples, where each tuple contains a reaction rule and a list of
+        indices representing the rule's applications. The list is sorted in descending
+        order of the rule's popularity.
+    """
+    return sorted(
+        (
+            (rule, indices)
+            for rule, indices in rules_stats.items()
+            if len(indices) >= min_popularity
+            and rule.meta["reactor_validation"] == "passed"
+            and (not single_reactant_only or len(rule.reactants) == 1)
+        ),
+        key=lambda x: -len(x[1]),
+    )
+def extract_rules_from_reactions(
+    config: RuleExtractionConfig,
+    reaction_data_path: str,
+    reaction_rules_path: str,
+    num_cpus: int,
+    batch_size: int,
+) -> None:
+    """
+    Extracts reaction rules from a set of reactions based on the given configuration.
+    This function initializes a Ray environment for distributed computing and processes
+    each reaction in the provided reaction database to extract reaction rules. It
+    handles the reactions in batches, parallelize the rule extraction process. Extracted
+    rules are written to RDF files and their statistics are recorded. The function also
+    sorts the rules based on their popularity and saves the sorted rules.
+    :param config: Configuration settings for rule extraction, including file paths,
+        batch size, and other parameters.
+    :param reaction_data_path: Path to the file containing reaction database.
+    :param reaction_rules_path: Name of the file to store the extracted rules.
+    :param num_cpus: Number of CPU cores to use for processing. Defaults to 1.
+    :param batch_size: Number of reactions to process in each batch. Defaults to 10.
+    :return: None
+    """
+    ray.init(num_cpus=num_cpus, ignore_reinit_error=True, logging_level=logging.ERROR)
+    reaction_rules_path, _ = splitext(reaction_rules_path)
+    with ReactionReader(reaction_data_path) as reactions:
+        futures = {}
+        batch = []
+        max_concurrent_batches = num_cpus
+        extracted_rules_and_statistics = defaultdict(list)
+        for index, reaction in tqdm(
+            enumerate(reactions),
+            desc="Number of reactions processed: ",
+            bar_format="{desc}{n} [{elapsed}]",
+        ):
+            # reaction ready to use
+            batch.append((index, reaction))
+            if len(batch) == batch_size:
+                future = process_reaction_batch.remote(batch, config)
+                futures[future] = None
+                batch = []
+                while len(futures) >= max_concurrent_batches:
+                    process_completed_batch(
+                        futures,
+                        extracted_rules_and_statistics,
+                    )
+        if batch:
+            future = process_reaction_batch.remote(batch, config)
+            futures[future] = None
+        while futures:
+            process_completed_batch(
+                futures,
+                extracted_rules_and_statistics,
+            )
+        sorted_rules = sort_rules(
+            extracted_rules_and_statistics,
+            min_popularity=config.min_popularity,
+            single_reactant_only=config.single_reactant_only,
+        )
+        ray.shutdown()
+        with open(f"{reaction_rules_path}.pickle", "wb") as statistics_file:
+            pickle.dump(sorted_rules, statistics_file)
+        print(f"Number of extracted reaction rules: {len(sorted_rules)}")

synplan/chem/reaction_rules/manual/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+from .decompositions import rules as d_rules
+from .transformations import rules as t_rules
+# all manually written rules in one list: transformation rules first, then decomposition rules
+hardcoded_rules = t_rules + d_rules
+# only the combined list is exported from this package
+__all__ = ["hardcoded_rules"]

synplan/chem/reaction_rules/manual/decompositions.py ADDED Viewed

	@@ -0,0 +1,413 @@

+"""Module containing hardcoded decomposition reaction rules."""
+from CGRtools import QueryContainer, ReactionContainer
+from CGRtools.periodictable import ListElement
+rules = []
+def prepare():
+    """Creates and returns three query containers and appends a reaction container to
+    the "rules" list."""
+    q_ = QueryContainer()
+    p1_ = QueryContainer()
+    p2_ = QueryContainer()
+    rules.append(ReactionContainer((q_,), (p1_, p2_)))
+    return q_, p1_, p2_
+# R-amide/ester formation
+# [C](-[N,O;D23;Zs])(-[C])=[O]>>[A].[C]-[C](-[O])=[O]
+# prepare() appends the new rule to "rules"; its three containers are filled below
+q, p1, p2 = prepare()
+# atoms added without _map are presumably numbered 1, 2, 3, ... in order of addition
+# (the bonds below refer to these numbers) - TODO confirm against CGRtools
+q.add_atom("C")
+q.add_atom("C")
+q.add_atom("O")
+# atom 4: N or O bound to atom 2 by a single bond
+q.add_atom(ListElement(["N", "O"]), hybridization=1, neighbors=(2, 3))
+q.add_bond(1, 2, 1)
+q.add_bond(2, 3, 2)
+q.add_bond(2, 4, 1)
+# first product: atoms 1-3 with an oxygen (map 5) bound to atom 2 instead of atom 4
+p1.add_atom("C")
+p1.add_atom("C")
+p1.add_atom("O")
+p1.add_atom("O", _map=5)
+p1.add_bond(1, 2, 1)
+p1.add_bond(2, 3, 2)
+p1.add_bond(2, 5, 1)
+# second product: atom 4 alone, as any element ("A")
+p2.add_atom("A", _map=4)
+# acyl group addition with aromatic carbon's case (Friedel-Crafts)
+# [C;Za]-[C](-[C])=[O]>>[C].[C]-[C](-[Cl])=[O]
+# prepare() appends the new rule to "rules"; its three containers are filled below
+q, p1, p2 = prepare()
+# atoms added without _map are presumably numbered 1, 2, 3, ... in order of addition
+# (the bonds below refer to these numbers) - TODO confirm against CGRtools
+q.add_atom("C")
+q.add_atom("C")
+q.add_atom("O")
+# atom 4: carbon with hybridization=4, presumably the aromatic [C;Za] of the pattern above
+q.add_atom("C", hybridization=4)
+q.add_bond(1, 2, 1)
+q.add_bond(2, 3, 2)
+q.add_bond(2, 4, 1)
+# first product: atoms 1-3 with a chlorine (map 5) bound to atom 2 instead of atom 4
+p1.add_atom("C")
+p1.add_atom("C")
+p1.add_atom("O")
+p1.add_atom("Cl", _map=5)
+p1.add_bond(1, 2, 1)
+p1.add_bond(2, 3, 2)
+p1.add_bond(2, 5, 1)
+# second product: atom 4 alone
+p2.add_atom("C", _map=4)
+# Williamson reaction
+# [C;Za]-[O]-[C;Zs;W0]>>[C]-[Br].[C]-[O]
+# NOTE(review): the pattern above says W0 for the third atom while the code below
+# sets heteroatoms=1 - confirm which one is intended
+# prepare() appends the new rule to "rules"; its three containers are filled below
+q, p1, p2 = prepare()
+# atoms added without _map are presumably numbered 1, 2, 3, ... in order of addition
+# (the bonds below refer to these numbers) - TODO confirm against CGRtools
+q.add_atom("C", hybridization=4)
+q.add_atom("O")
+q.add_atom("C", hybridization=1, heteroatoms=1)
+q.add_bond(1, 2, 1)
+q.add_bond(2, 3, 1)
+# first product: atoms 1 and 2 (C-O)
+p1.add_atom("C")
+p1.add_atom("O")
+p1.add_bond(1, 2, 1)
+# second product: atom 3 with a bromine, which is referred to as atom 4 in the bond
+p2.add_atom("C", _map=3)
+p2.add_atom("Br")
+p2.add_bond(3, 4, 1)
+# Buchwald-Hartwig amination
+# [N;D23;Zs;W0]-[C;Za]>>[C]-[Br].[N]
+# prepare() appends the new rule to "rules"; its three containers are filled below
+q, p1, p2 = prepare()
+# atoms added without _map are presumably numbered 1, 2, 3, ... in order of addition
+# (the bonds below refer to these numbers) - TODO confirm against CGRtools
+q.add_atom("N", heteroatoms=0, hybridization=1, neighbors=(2, 3))
+q.add_atom("C", hybridization=4)
+q.add_bond(1, 2, 1)
+# first product: atom 2 with a bromine, which is referred to as atom 3 in the bond
+p1.add_atom("C", _map=2)
+p1.add_atom("Br")
+p1.add_bond(2, 3, 1)
+# second product: the nitrogen alone
+p2.add_atom("N")
+# imidazole imine atom's alkylation
+# [C;r5](:[N;r5]-[C;Zs;W1]):[N;D2;r5]>>[C]-[Br].[N]:[C]:[N]
+# NOTE(review): the pattern above says W1 for the alkyl carbon while the code below
+# sets heteroatoms=(1, 2) - confirm which one is intended
+# prepare() appends the new rule to "rules"; its three containers are filled below
+q, p1, p2 = prepare()
+# atoms added without _map are presumably numbered 1, 2, 3, ... in order of addition
+# (the bonds below refer to these numbers) - TODO confirm against CGRtools
+q.add_atom("N", rings_sizes=5)
+q.add_atom("C", rings_sizes=5)
+q.add_atom("N", rings_sizes=5, neighbors=2)
+# atom 4: the carbon bound to atom 1 by a single bond
+q.add_atom("C", hybridization=1, heteroatoms=(1, 2))
+# bond order 4 is used for the N:C:N part written with ":" in the pattern above
+q.add_bond(1, 2, 4)
+q.add_bond(2, 3, 4)
+q.add_bond(1, 4, 1)
+# first product: the N:C:N part without atom 4
+p1.add_atom("N")
+p1.add_atom("C")
+p1.add_atom("N")
+p1.add_bond(1, 2, 4)
+p1.add_bond(2, 3, 4)
+# second product: atom 4 with a bromine, which is referred to as atom 5 in the bond
+p2.add_atom("C", _map=4)
+p2.add_atom("Br")
+p2.add_bond(4, 5, 1)
+# Knoevenagel condensation (nitrile and carboxyl case)
+# [C]=[C](-[C]#[N])-[C](-[O])=[O]>>[C]=[O].[C](-[C]#[N])-[C](-[O])=[O]
+# prepare() appends the new rule to "rules"; its three containers are filled below
+q, p1, p2 = prepare()
+# atoms added without _map are presumably numbered 1, 2, 3, ... in order of addition
+# (the bonds below refer to these numbers) - TODO confirm against CGRtools
+q.add_atom("C")
+q.add_atom("C")
+q.add_atom("C")
+q.add_atom("N")
+q.add_atom("C")
+q.add_atom("O")
+q.add_atom("O")
+# atoms 1 and 2 are joined by the double bond, atom 2 carries C#N (3, 4) and C(=O)-O (5, 6, 7)
+q.add_bond(1, 2, 2)
+q.add_bond(2, 3, 1)
+q.add_bond(3, 4, 3)
+q.add_bond(2, 5, 1)
+q.add_bond(5, 6, 2)
+q.add_bond(5, 7, 1)
+# first product: atom 2 with both substituents, referred to as atoms 3-7 in the bonds
+p1.add_atom("C", _map=2)
+p1.add_atom("C")
+p1.add_atom("N")
+p1.add_atom("C")
+p1.add_atom("O")
+p1.add_atom("O")
+p1.add_bond(2, 3, 1)
+p1.add_bond(3, 4, 3)
+p1.add_bond(2, 5, 1)
+p1.add_bond(5, 6, 2)
+p1.add_bond(5, 7, 1)
+# second product: atom 1 with a double bonded oxygen (map 8)
+p2.add_atom("C", _map=1)
+p2.add_atom("O", _map=8)
+p2.add_bond(1, 8, 2)
+# Knoevenagel condensation (double nitrile case)
+# [C]=[C](-[C]#[N])-[C]#[N]>>[C]=[O].[C](-[C]#[N])-[C]#[N]
+q, p1, p2 = prepare()
+q.add_atom("C")
+q.add_atom("C")
+q.add_atom("C")
+q.add_atom("N")
+q.add_atom("C")
+q.add_atom("N")
+q.add_bond(1, 2, 2)
+q.add_bond(2, 3, 1)
+q.add_bond(3, 4, 3)
+q.add_bond(2, 5, 1)
+q.add_bond(5, 6, 3)
+p1.add_atom("C", _map=2)
+p1.add_atom("C")
+p1.add_atom("N")
+p1.add_atom("C")
+p1.add_atom("N")
+p1.add_bond(2, 3, 1)
+p1.add_bond(3, 4, 3)
+p1.add_bond(2, 5, 1)
+p1.add_bond(5, 6, 3)
+p2.add_atom("C", _map=1)
+p2.add_atom("O", _map=8)
+p2.add_bond(1, 8, 2)
+# Knoevenagel condensation (double carboxyl case)
+# [C]=[C](-[C](-[O])=[O])-[C](-[O])=[O]>>[C]=[O].[C](-[C](-[O])=[O])-[C](-[O])=[O]
+q, p1, p2 = prepare()
+q.add_atom("C")
+q.add_atom("C")
+q.add_atom("C")
+q.add_atom("O")
+q.add_atom("O")
+q.add_atom("C")
+q.add_atom("O")
+q.add_atom("O")
+q.add_bond(1, 2, 2)
+q.add_bond(2, 3, 1)
+q.add_bond(3, 4, 2)
+q.add_bond(3, 5, 1)
+q.add_bond(2, 6, 1)
+q.add_bond(6, 7, 2)
+q.add_bond(6, 8, 1)
+p1.add_atom("C", _map=2)
+p1.add_atom("C")
+p1.add_atom("O")
+p1.add_atom("O")
+p1.add_atom("C")
+p1.add_atom("O")
+p1.add_atom("O")
+p1.add_bond(2, 3, 1)
+p1.add_bond(3, 4, 2)
+p1.add_bond(3, 5, 1)
+p1.add_bond(2, 6, 1)
+p1.add_bond(6, 7, 2)
+p1.add_bond(6, 8, 1)
+p2.add_atom("C", _map=1)
+p2.add_atom("O", _map=9)
+p2.add_bond(1, 9, 2)
+# heterocyclization with guanidine
+# [c]((-[N;W0;Zs])@[n]@[c](-[N;D1])@[c;W0])@[n]@[c]-[O; D1]>>[C](-[N])(=[N])-[N].[C](#[N])-[C]-[C](-[O])=[O]
+q, p1, p2 = prepare()
+q.add_atom("C")
+q.add_atom("N", heteroatoms=0, hybridization=1)
+q.add_atom("N")
+q.add_atom("C")
+q.add_atom("N", neighbors=1)
+q.add_atom("C", heteroatoms=0)
+q.add_atom("N")
+q.add_atom("C")
+q.add_atom("O", neighbors=1)
+q.add_bond(1, 2, 1)
+q.add_bond(1, 3, 4)
+q.add_bond(3, 4, 4)
+q.add_bond(4, 5, 1)
+q.add_bond(4, 6, 4)
+q.add_bond(1, 7, 4)
+q.add_bond(7, 8, 4)
+q.add_bond(8, 9, 1)
+p1.add_atom("C")
+p1.add_atom("N")
+p1.add_atom("N")
+p1.add_atom("N", _map=7)
+p1.add_bond(1, 2, 1)
+p1.add_bond(1, 3, 2)
+p1.add_bond(1, 7, 1)
+p2.add_atom("C", _map=4)
+p2.add_atom("N")
+p2.add_atom("C")
+p2.add_atom("C", _map=8)
+p2.add_atom("O", _map=9)
+p2.add_atom("O")
+p2.add_bond(4, 5, 3)
+p2.add_bond(4, 6, 1)
+p2.add_bond(6, 8, 1)
+p2.add_bond(8, 9, 2)
+p2.add_bond(8, 10, 1)
+# alkylation of amine
+# [C]-[N](-[C])-[C]>>[C]-[N]-[C].[C]-[Cl]
+q, p1, p2 = prepare()
+q.add_atom("C")
+q.add_atom("N")
+q.add_atom("C")
+q.add_atom("C")
+q.add_bond(1, 2, 1)
+q.add_bond(2, 3, 1)
+q.add_bond(2, 4, 1)
+p1.add_atom("C")
+p1.add_atom("N")
+p1.add_atom("C")
+p1.add_bond(1, 2, 1)
+p1.add_bond(2, 3, 1)
+p2.add_atom("C", _map=4)
+p2.add_atom("Cl")
+p2.add_bond(4, 5, 1)
+# Synthesis of guanidines
+#
+q, p1, p2 = prepare()
+q.add_atom("N")
+q.add_atom("C")
+q.add_atom("N", hybridization=1)
+q.add_atom("N", hybridization=1)
+q.add_bond(1, 2, 2)
+q.add_bond(2, 3, 1)
+q.add_bond(2, 4, 1)
+p1.add_atom("N")
+p1.add_atom("C")
+p1.add_atom("N")
+p1.add_bond(1, 2, 3)
+p1.add_bond(2, 3, 1)
+p2.add_atom("N", _map=4)
+# Grignard reaction with nitrile
+#
+q, p1, p2 = prepare()
+q.add_atom("C")
+q.add_atom("C")
+q.add_atom("O")
+q.add_atom("C")
+q.add_bond(1, 2, 1)
+q.add_bond(2, 3, 2)
+q.add_bond(2, 4, 1)
+p1.add_atom("C")
+p1.add_atom("C")
+p1.add_atom("N")
+p1.add_bond(1, 2, 1)
+p1.add_bond(2, 3, 3)
+p2.add_atom("C", _map=4)
+p2.add_atom("Br")
+p2.add_bond(4, 5, 1)
+# Alkylation of alpha-carbon atom of nitrile
+#
+q, p1, p2 = prepare()
+q.add_atom("N")
+q.add_atom("C")
+q.add_atom("C", neighbors=(3, 4))
+q.add_atom("C", hybridization=1)
+q.add_bond(1, 2, 3)
+q.add_bond(2, 3, 1)
+q.add_bond(3, 4, 1)
+p1.add_atom("N")
+p1.add_atom("C")
+p1.add_atom("C")
+p1.add_bond(1, 2, 3)
+p1.add_bond(2, 3, 1)
+p2.add_atom("C", _map=4)
+p2.add_atom("Cl")
+p2.add_bond(4, 5, 1)
+# Gomberg-Bachmann reaction
+#
+q, p1, p2 = prepare()
+q.add_atom("C", hybridization=4, heteroatoms=0)
+q.add_atom("C", hybridization=4, heteroatoms=0)
+q.add_bond(1, 2, 1)
+p1.add_atom("C")
+p1.add_atom("N", _map=3)
+p1.add_bond(1, 3, 1)
+p2.add_atom("C", _map=2)
+# Cyclocondensation
+#
+q, p1, p2 = prepare()
+q.add_atom("N", neighbors=2)
+q.add_atom("C")
+q.add_atom("C")
+q.add_atom("C")
+q.add_atom("N")
+q.add_atom("C")
+q.add_atom("C")
+q.add_atom("O", neighbors=1)
+q.add_bond(1, 2, 1)
+q.add_bond(2, 3, 1)
+q.add_bond(3, 4, 1)
+q.add_bond(4, 5, 2)
+q.add_bond(5, 6, 1)
+q.add_bond(6, 7, 1)
+q.add_bond(7, 8, 2)
+q.add_bond(1, 7, 1)
+p1.add_atom("N")
+p1.add_atom("C")
+p1.add_atom("C")
+p1.add_atom("C")
+p1.add_atom("O", _map=9)
+p1.add_bond(1, 2, 1)
+p1.add_bond(2, 3, 1)
+p1.add_bond(3, 4, 1)
+p1.add_bond(4, 9, 2)
+p2.add_atom("N", _map=5)
+p2.add_atom("C")
+p2.add_atom("C")
+p2.add_atom("O")
+p2.add_atom("O", _map=10)
+p2.add_bond(5, 6, 1)
+p2.add_bond(6, 7, 1)
+p2.add_bond(7, 8, 2)
+p2.add_bond(7, 10, 1)
+# heterocyclization dicarboxylic acids
+#
+q, p1, p2 = prepare()
+q.add_atom("C", rings_sizes=(5, 6))
+q.add_atom("O")
+q.add_atom(ListElement(["O", "N"]))
+q.add_atom("C", rings_sizes=(5, 6))
+q.add_atom("O")
+q.add_bond(1, 2, 2)
+q.add_bond(1, 3, 1)
+q.add_bond(3, 4, 1)
+q.add_bond(4, 5, 2)
+p1.add_atom("C")
+p1.add_atom("O")
+p1.add_atom("O", _map=6)
+p1.add_bond(1, 2, 2)
+p1.add_bond(1, 6, 1)
+p2.add_atom("C", _map=4)
+p2.add_atom("O")
+p2.add_atom("O", _map=7)
+p2.add_bond(4, 5, 2)
+p2.add_bond(4, 7, 1)
+__all__ = ["rules"]

synplan/chem/reaction_rules/manual/transformations.py ADDED Viewed

	@@ -0,0 +1,532 @@

+"""Module containing hardcoded transformation reaction rules."""
+from CGRtools import QueryContainer, ReactionContainer
+from CGRtools.periodictable import ListElement
+rules = []
+def prepare():
+    """Creates two empty query containers, wraps them into a reaction container
+    that is appended to the module-level "rules" list, and returns the pair.
+    :return: The two query containers, in the order they were passed to the
+        reaction container; the caller fills them in after this call.
+    """
+    q_ = QueryContainer()
+    p_ = QueryContainer()
+    # The rule is registered while both containers are still empty; later
+    # add_atom/add_bond calls on the returned objects populate the stored rule.
+    rules.append(ReactionContainer((q_,), (p_,)))
+    return q_, p_
+# aryl nitro reduction
+# [C;Za;W1]-[N;D1]>>[O-]-[N+](-[C])=[O]
+q, p = prepare()
+q.add_atom("N", neighbors=1)
+q.add_atom("C", hybridization=4, heteroatoms=1)
+q.add_bond(1, 2, 1)
+p.add_atom("N", charge=1)
+p.add_atom("C")
+p.add_atom("O", charge=-1)
+p.add_atom("O")
+p.add_bond(1, 2, 1)
+p.add_bond(1, 3, 1)
+p.add_bond(1, 4, 2)
+# aryl nitration
+# [O-]-[N+](=[O])-[C;Za;W12]>>[C]
+q, p = prepare()
+q.add_atom("N", charge=1)
+q.add_atom("C", hybridization=4, heteroatoms=(1, 2))
+q.add_atom("O", charge=-1)
+q.add_atom("O")
+q.add_bond(1, 2, 1)
+q.add_bond(1, 3, 1)
+q.add_bond(1, 4, 2)
+p.add_atom("C", _map=2)
+# Beckmann rearrangement (oxime -> amide)
+# [C]-[N;D2]-[C]=[O]>>[O]-[N]=[C]-[C]
+q, p = prepare()
+q.add_atom("C")
+q.add_atom("N", neighbors=2)
+q.add_atom("O")
+q.add_atom("C")
+q.add_bond(1, 2, 1)
+q.add_bond(1, 3, 2)
+q.add_bond(2, 4, 1)
+p.add_atom("C")
+p.add_atom("N")
+p.add_atom("O")
+p.add_atom("C")
+p.add_bond(1, 2, 2)
+p.add_bond(2, 3, 1)
+p.add_bond(1, 4, 1)
+# aldehydes or ketones into oxime/imine reaction
+# [C;Zd;W1]=[N]>>[C]=[O]
+q, p = prepare()
+q.add_atom("C", hybridization=2, heteroatoms=1)
+q.add_atom("N")
+q.add_bond(1, 2, 2)
+p.add_atom("C")
+p.add_atom("O", _map=3)
+p.add_bond(1, 3, 2)
+# addition of halogen atom into phenol ring (ortho)
+# [C](-[Cl,F,Br,I;D1]):[C]-[O,N;Zs]>>[C](-[A]):[C]
+q, p = prepare()
+q.add_atom(ListElement(["O", "N"]), hybridization=1)
+q.add_atom("C")
+q.add_atom("C")
+q.add_atom(ListElement(["Cl", "F", "Br", "I"]), neighbors=1)
+q.add_bond(1, 2, 1)
+q.add_bond(2, 3, 4)
+q.add_bond(3, 4, 1)
+p.add_atom("A")
+p.add_atom("C")
+p.add_atom("C")
+p.add_bond(1, 2, 1)
+p.add_bond(2, 3, 4)
+# addition of halogen atom into phenol ring (para)
+# [C](:[C]:[C]:[C]-[O,N;Zs])-[Cl,F,Br,I;D1]>>[A]-[C]:[C]:[C]:[C]
+q, p = prepare()
+q.add_atom(ListElement(["O", "N"]), hybridization=1)
+q.add_atom("C")
+q.add_atom("C")
+q.add_atom("C")
+q.add_atom("C")
+q.add_atom(ListElement(["Cl", "F", "Br", "I"]), neighbors=1)
+q.add_bond(1, 2, 1)
+q.add_bond(2, 3, 4)
+q.add_bond(3, 4, 4)
+q.add_bond(4, 5, 4)
+q.add_bond(5, 6, 1)
+p.add_atom("A")
+p.add_atom("C")
+p.add_atom("C")
+p.add_atom("C")
+p.add_atom("C")
+p.add_bond(1, 2, 1)
+p.add_bond(2, 3, 4)
+p.add_bond(3, 4, 4)
+p.add_bond(4, 5, 4)
+# hard reduction of Ar-ketones
+# [C;Za]-[C;D2;Zs;W0]>>[C]-[C]=[O]
+q, p = prepare()
+q.add_atom("C", hybridization=4)
+q.add_atom("C", hybridization=1, neighbors=2, heteroatoms=0)
+q.add_bond(1, 2, 1)
+p.add_atom("C")
+p.add_atom("C")
+p.add_atom("O")
+p.add_bond(1, 2, 1)
+p.add_bond(2, 3, 2)
+# reduction of alpha-hydroxy pyridine
+# [C;W1]:[N;H0;r6]>>[C](:[N])-[O]
+q, p = prepare()
+q.add_atom("C", heteroatoms=1)
+q.add_atom("N", rings_sizes=6, hydrogens=0)
+q.add_bond(1, 2, 4)
+p.add_atom("C")
+p.add_atom("N")
+p.add_atom("O")
+p.add_bond(1, 2, 4)
+p.add_bond(1, 3, 1)
+# Reduction of alkene
+# [C]-[C;D23;Zs;W0]-[C;D123;Zs;W0]>>[C](-[C])=[C]
+q, p = prepare()
+q.add_atom("C")
+q.add_atom("C", heteroatoms=0, neighbors=(2, 3), hybridization=1)
+q.add_atom("C", heteroatoms=0, neighbors=(1, 2, 3), hybridization=1)
+q.add_bond(1, 2, 1)
+q.add_bond(2, 3, 1)
+p.add_atom("C")
+p.add_atom("C")
+p.add_atom("C")
+p.add_bond(1, 2, 1)
+p.add_bond(2, 3, 2)
+# Kolbe-Schmitt reaction
+# [C](:[C]-[O;D1])-[C](=[O])-[O;D1]>>[C](-[O]):[C]
+q, p = prepare()
+q.add_atom("O", neighbors=1)
+q.add_atom("C")
+q.add_atom("C")
+q.add_atom("C")
+q.add_atom("O", neighbors=1)
+q.add_atom("O")
+q.add_bond(1, 2, 1)
+q.add_bond(2, 3, 4)
+q.add_bond(3, 4, 1)
+q.add_bond(4, 5, 1)
+q.add_bond(4, 6, 2)
+p.add_atom("O")
+p.add_atom("C")
+p.add_atom("C")
+p.add_bond(1, 2, 1)
+p.add_bond(2, 3, 4)
+# reduction of carboxylic acid
+# [O;D1]-[C;D2]-[C]>>[C]-[C](-[O])=[O]
+q, p = prepare()
+q.add_atom("C")
+q.add_atom("C", neighbors=2)
+q.add_atom("O", neighbors=1)
+q.add_bond(1, 2, 1)
+q.add_bond(2, 3, 1)
+p.add_atom("C")
+p.add_atom("C")
+p.add_atom("O")
+p.add_atom("O")
+p.add_bond(1, 2, 1)
+p.add_bond(2, 3, 1)
+p.add_bond(2, 4, 2)
+# halogenation of alcohols
+# [C;Zs]-[Cl,Br;D1]>>[C]-[O]
+q, p = prepare()
+q.add_atom("C", hybridization=1, heteroatoms=1)
+q.add_atom(ListElement(["Cl", "Br"]), neighbors=1)
+q.add_bond(1, 2, 1)
+p.add_atom("C")
+p.add_atom("O", _map=3)
+p.add_bond(1, 3, 1)
+# Kolbe nitrilation
+# [N]#[C]-[C;Zs;W0]>>[Br]-[C]
+q, p = prepare()
+q.add_atom("C", heteroatoms=0, hybridization=1)
+q.add_atom("C")
+q.add_atom("N")
+q.add_bond(1, 2, 1)
+q.add_bond(2, 3, 3)
+p.add_atom("C")
+p.add_atom("Br", _map=4)
+p.add_bond(1, 4, 1)
+# Nitrile hydrolysis
+# [O;D1]-[C]=[O]>>[N]#[C]
+q, p = prepare()
+q.add_atom("C")
+q.add_atom("O", neighbors=1)
+q.add_atom("O")
+q.add_bond(1, 2, 1)
+q.add_bond(1, 3, 2)
+p.add_atom("C")
+p.add_atom("N", _map=4)
+p.add_bond(1, 4, 3)
+# sulfamidation
+# [c]-[S](=[O])(=[O])-[N]>>[c]
+q, p = prepare()
+q.add_atom("C", hybridization=4)
+q.add_atom("S")
+q.add_atom("O")
+q.add_atom("O")
+q.add_atom("N", neighbors=1)
+q.add_bond(1, 2, 1)
+q.add_bond(2, 3, 2)
+q.add_bond(2, 4, 2)
+q.add_bond(2, 5, 1)
+p.add_atom("C")
+# Ring expansion rearrangement
+#
+q, p = prepare()
+q.add_atom("C")
+q.add_atom("N")
+q.add_atom("C", rings_sizes=6)
+q.add_atom("C")
+q.add_atom("O")
+q.add_atom("C")
+q.add_atom("C")
+q.add_bond(1, 2, 1)
+q.add_bond(2, 3, 1)
+q.add_bond(3, 4, 1)
+q.add_bond(4, 5, 2)
+q.add_bond(3, 6, 1)
+q.add_bond(4, 7, 1)
+p.add_atom("C")
+p.add_atom("N")
+p.add_atom("C")
+p.add_atom("C")
+p.add_atom("O")
+p.add_atom("C")
+p.add_atom("C")
+p.add_bond(1, 2, 1)
+p.add_bond(2, 3, 2)
+p.add_bond(3, 4, 1)
+p.add_bond(4, 5, 1)
+p.add_bond(4, 6, 1)
+p.add_bond(4, 7, 1)
+# hydrolysis of bromide alkyl
+#
+q, p = prepare()
+q.add_atom("C", hybridization=1)
+q.add_atom("O", neighbors=1)
+q.add_bond(1, 2, 1)
+p.add_atom("C")
+p.add_atom("Br")
+p.add_bond(1, 2, 1)
+# Condensation of ketones/aldehydes and amines into imines
+#
+q, p = prepare()
+q.add_atom("N", neighbors=(1, 2))
+q.add_atom("C", neighbors=(2, 3), heteroatoms=1)
+q.add_bond(1, 2, 2)
+p.add_atom("C", _map=2)
+p.add_atom("O")
+p.add_bond(2, 3, 2)
+# Halogenation of alkanes
+#
+q, p = prepare()
+q.add_atom("C", hybridization=1)
+q.add_atom(ListElement(["F", "Cl", "Br"]))
+q.add_bond(1, 2, 1)
+p.add_atom("C")
+# heterocyclization
+#
+q, p = prepare()
+q.add_atom("N", heteroatoms=0, hybridization=1, neighbors=(2, 3))
+q.add_atom("C", heteroatoms=2)
+q.add_atom("N", heteroatoms=0, neighbors=2)
+q.add_bond(1, 2, 1)
+q.add_bond(2, 3, 2)
+p.add_atom("N")
+p.add_atom("C")
+p.add_atom("N")
+p.add_atom("O")
+p.add_bond(1, 2, 1)
+p.add_bond(2, 4, 2)
+# Reduction of nitrile
+#
+q, p = prepare()
+q.add_atom("N", neighbors=1)
+q.add_atom("C")
+q.add_atom("C", hybridization=1)
+q.add_bond(1, 2, 1)
+q.add_bond(2, 3, 1)
+p.add_atom("N")
+p.add_atom("C")
+p.add_atom("C")
+p.add_bond(1, 2, 3)
+p.add_bond(2, 3, 1)
+# SPECIAL CASE
+# Reduction of nitrile into methylamine
+#
+q, p = prepare()
+q.add_atom("C", neighbors=1)
+q.add_atom("N", neighbors=2)
+q.add_atom("C")
+q.add_atom("C", hybridization=1)
+q.add_bond(1, 2, 1)
+q.add_bond(2, 3, 1)
+q.add_bond(3, 4, 1)
+p.add_atom("N", _map=2)
+p.add_atom("C")
+p.add_atom("C")
+p.add_bond(2, 3, 3)
+p.add_bond(3, 4, 1)
+# methylation of amides
+#
+q, p = prepare()
+q.add_atom("O")
+q.add_atom("C")
+q.add_atom("N")
+q.add_atom("C", neighbors=1)
+q.add_bond(1, 2, 2)
+q.add_bond(2, 3, 1)
+q.add_bond(3, 4, 1)
+p.add_atom("O")
+p.add_atom("C")
+p.add_atom("N")
+p.add_bond(1, 2, 2)
+p.add_bond(2, 3, 1)
+# hydrocyanation of alkenes
+#
+q, p = prepare()
+q.add_atom("C", hybridization=1)
+q.add_atom("C")
+q.add_atom("C")
+q.add_atom("N")
+q.add_bond(1, 2, 1)
+q.add_bond(2, 3, 1)
+q.add_bond(3, 4, 3)
+p.add_atom("C")
+p.add_atom("C")
+p.add_bond(1, 2, 2)
+# decarboxylation (alpha atom of nitrile)
+#
+q, p = prepare()
+q.add_atom("N")
+q.add_atom("C")
+q.add_atom("C", neighbors=2)
+q.add_bond(1, 2, 3)
+q.add_bond(2, 3, 1)
+p.add_atom("N")
+p.add_atom("C")
+p.add_atom("C")
+p.add_atom("C")
+p.add_atom("O")
+p.add_atom("O")
+p.add_bond(1, 2, 3)
+p.add_bond(2, 3, 1)
+p.add_bond(3, 4, 1)
+p.add_bond(4, 5, 2)
+p.add_bond(4, 6, 1)
+# Bischler-Napieralski reaction
+#
+q, p = prepare()
+q.add_atom("C", rings_sizes=(6,))
+q.add_atom("C", rings_sizes=(6,))
+q.add_atom("N", rings_sizes=(6,), neighbors=2)
+q.add_atom("C")
+q.add_atom("C")
+q.add_atom("C")
+q.add_atom("O")
+q.add_atom("O")
+q.add_atom("C")
+q.add_atom("O", neighbors=1)
+q.add_bond(1, 2, 4)
+q.add_bond(2, 3, 1)
+q.add_bond(3, 4, 1)
+q.add_bond(4, 5, 2)
+q.add_bond(5, 6, 1)
+q.add_bond(6, 7, 2)
+q.add_bond(6, 8, 1)
+q.add_bond(5, 9, 4)
+q.add_bond(9, 10, 1)
+q.add_bond(1, 9, 1)
+p.add_atom("C")
+p.add_atom("C")
+p.add_atom("N")
+p.add_atom("C")
+p.add_atom("C")
+p.add_atom("C")
+p.add_atom("O")
+p.add_atom("O")
+p.add_atom("C")
+p.add_atom("O")
+p.add_atom("O")
+p.add_bond(1, 2, 4)
+p.add_bond(2, 3, 1)
+p.add_bond(3, 4, 1)
+p.add_bond(4, 5, 2)
+p.add_bond(5, 6, 1)
+p.add_bond(6, 7, 2)
+p.add_bond(6, 8, 1)
+p.add_bond(5, 9, 1)
+p.add_bond(9, 10, 2)
+p.add_bond(9, 11, 1)
+# heterocyclization in Prins reaction
+#
+q, p = prepare()
+q.add_atom("C")
+q.add_atom("O")
+q.add_atom("C")
+q.add_atom(ListElement(["N", "O"]), neighbors=2)
+q.add_atom("C")
+q.add_atom("C")
+q.add_bond(1, 2, 1)
+q.add_bond(2, 3, 1)
+q.add_bond(3, 4, 1)
+q.add_bond(4, 5, 1)
+q.add_bond(5, 6, 1)
+q.add_bond(1, 6, 1)
+p.add_atom("C")
+p.add_atom("C", _map=5)
+p.add_bond(1, 5, 2)
+# recyclization of tetrahydropyran through ring opening and dehydration
+#
+q, p = prepare()
+q.add_atom("C")
+q.add_atom("C")
+q.add_atom("C")
+q.add_atom(ListElement(["N", "O"]))
+q.add_atom("C")
+q.add_atom("C")
+q.add_bond(1, 2, 1)
+q.add_bond(2, 3, 1)
+q.add_bond(3, 4, 1)
+q.add_bond(4, 5, 1)
+q.add_bond(5, 6, 1)
+q.add_bond(1, 6, 2)
+p.add_atom("C")
+p.add_atom("C")
+p.add_atom("C")
+p.add_atom("A")
+p.add_atom("C")
+p.add_atom("C")
+p.add_atom("O")
+p.add_bond(1, 2, 1)
+p.add_bond(1, 7, 1)
+p.add_bond(3, 7, 1)
+p.add_bond(3, 4, 1)
+p.add_bond(4, 5, 1)
+p.add_bond(5, 6, 1)
+p.add_bond(1, 6, 1)
+# alkenes + H2O/HHal (hydration / hydrohalogenation)
+#
+q, p = prepare()
+q.add_atom("C", hybridization=1)
+q.add_atom("C", hybridization=1)
+q.add_atom(ListElement(["O", "F", "Cl", "Br", "I"]), neighbors=1)
+q.add_bond(1, 2, 1)
+q.add_bond(2, 3, 1)
+p.add_atom("C")
+p.add_atom("C")
+p.add_bond(1, 2, 2)
+# methylation of dimethylamines
+#
+q, p = prepare()
+q.add_atom("C", neighbors=1)
+q.add_atom("N", neighbors=3)
+q.add_bond(1, 2, 1)
+p.add_atom("N", _map=2)
+__all__ = ["rules"]

synplan/chem/utils.py ADDED Viewed

	@@ -0,0 +1,225 @@

+"""Module containing additional functions needed in different reaction data processing
+protocols."""
+import logging
+from typing import Iterable
+from CGRtools.containers import (
+    CGRContainer,
+    MoleculeContainer,
+    QueryContainer,
+    ReactionContainer,
+)
+from CGRtools.exceptions import InvalidAromaticRing
+from tqdm import tqdm
+from synplan.chem import smiles_parser
+from synplan.utils.files import MoleculeReader, MoleculeWriter
+from chython import MoleculeContainer as MoleculeContainerChython
+def mol_from_smiles(
+    smiles: str,
+    standardize: bool = True,
+    clean_stereo: bool = True,
+    clean2d: bool = True,
+) -> MoleculeContainer:
+    """Converts a SMILES string to a `MoleculeContainer` object and optionally
+    standardizes, cleans stereochemistry, and cleans 2D coordinates.
+    :param smiles: The SMILES string representing the molecule.
+    :param standardize: Whether to standardize the molecule (default is True).
+    :param clean_stereo: Whether to remove the stereo marks on atoms of the molecule (default is True).
+    :param clean2d: Whether to clean the 2D coordinates of the molecule (default is True).
+    :return: The processed molecule object.
+    :raises ValueError: If the SMILES string could not be processed by CGRtools.
+    """
+    molecule = smiles_parser(smiles)
+    if not isinstance(molecule, MoleculeContainer):
+        raise ValueError("SMILES string was not processed by CGRtools")
+    tmp = molecule.copy()
+    try:
+        if standardize:
+            tmp.canonicalize()
+        if clean_stereo:
+            tmp.clean_stereo()
+        if clean2d:
+            tmp.clean2d()
+        molecule = tmp
+    except InvalidAromaticRing:
+        logging.warning(
+            "CGRtools was not able to standardize molecule due to invalid aromatic ring"
+        )
+    return molecule
+def query_to_mol(query: QueryContainer) -> MoleculeContainer:
+    """Converts a QueryContainer object into a MoleculeContainer object.
+    :param query: A QueryContainer object representing the query structure.
+    :return: A MoleculeContainer object that replicates the structure of the query.
+    """
+    new_mol = MoleculeContainer()
+    for n, atom in query.atoms():
+        new_mol.add_atom(
+            atom.atomic_symbol, n, charge=atom.charge, is_radical=atom.is_radical
+        )
+    for i, j, bond in query.bonds():
+        new_mol.add_bond(i, j, int(bond))
+    return new_mol
+def reaction_query_to_reaction(reaction_rule: ReactionContainer) -> ReactionContainer:
+    """Converts a ReactionContainer object with query structures into a
+    ReactionContainer with molecular structures.
+    :param reaction_rule: A ReactionContainer object where reactants and products are
+        QueryContainer objects.
+    :return: A new ReactionContainer object where reactants and products are
+        MoleculeContainer objects.
+    """
+    reactants = [query_to_mol(q) for q in reaction_rule.reactants]
+    products = [query_to_mol(q) for q in reaction_rule.products]
+    reagents = [
+        query_to_mol(q) for q in reaction_rule.reagents
+    ]  # Assuming reagents are also part of the rule
+    reaction = ReactionContainer(reactants, products, reagents, reaction_rule.meta)
+    reaction.name = reaction_rule.name
+    return reaction
+def unite_molecules(molecules: Iterable[MoleculeContainer]) -> MoleculeContainer:
+    """Unites a list of MoleculeContainer objects into a single MoleculeContainer. This
+    function takes multiple molecules and combines them into one larger molecule. The
+    first molecule in the list is taken as the base, and subsequent molecules are united
+    with it sequentially.
+    :param molecules: A list of MoleculeContainer objects to be united.
+    :return: A single MoleculeContainer object representing the union of all input
+        molecules.
+    """
+    new_mol = MoleculeContainer()
+    for mol in molecules:
+        new_mol = new_mol.union(mol)
+    return new_mol
+def safe_canonicalization(molecule: MoleculeContainer) -> MoleculeContainer:
+    """Attempts to canonicalize a molecule, handling any exceptions. If the
+    canonicalization process fails due to an InvalidAromaticRing exception, it safely
+    returns the original molecule.
+    :param molecule: The given molecule to be canonicalized.
+    :return: The canonicalized molecule if successful, otherwise the original molecule.
+    """
+    molecule._atoms = dict(sorted(molecule._atoms.items()))
+    molecule_copy = molecule.copy()
+    try:
+        molecule_copy.canonicalize()
+        molecule_copy.clean_stereo()
+        return molecule_copy
+    except InvalidAromaticRing:
+        return molecule
+def standardize_building_blocks(input_file: str, output_file: str) -> str:
+    """Standardizes custom building blocks.
+    :param input_file: The path to the file that stores the original building blocks.
+    :param output_file: The path to the file that will store the standardized building
+        blocks.
+    :return: The path to the file with standardized building blocks.
+    """
+    if input_file == output_file:
+        raise ValueError("input_file name and output_file name cannot be the same.")
+    with MoleculeReader(input_file) as inp_file, MoleculeWriter(
+        output_file
+    ) as out_file:
+        for mol in tqdm(
+            inp_file,
+            desc="Number of building blocks processed: ",
+            bar_format="{desc}{n} [{elapsed}]",
+        ):
+            try:
+                mol = safe_canonicalization(mol)
+            except Exception as e:
+                logging.debug(e)
+                continue
+            out_file.write(mol)
+    return output_file
+def cgr_from_reaction_rule(reaction_rule: ReactionContainer) -> CGRContainer:
+    """Creates a CGR from the given reaction rule.
+    :param reaction_rule: The reaction rule to be converted.
+    :return: The resulting CGR.
+    """
+    reaction_rule = reaction_query_to_reaction(reaction_rule)
+    cgr_rule = ~reaction_rule
+    return cgr_rule
+def hash_from_reaction_rule(reaction_rule: ReactionContainer) -> hash:
+    """Generates hash for the given reaction rule.
+    :param reaction_rule: The reaction rule to be converted.
+    :return: The resulting hash.
+    """
+    reactants_hash = tuple(sorted(hash(r) for r in reaction_rule.reactants))
+    reagents_hash = tuple(sorted(hash(r) for r in reaction_rule.reagents))
+    products_hash = tuple(sorted(hash(r) for r in reaction_rule.products))
+    return hash((reactants_hash, reagents_hash, products_hash))
+def reverse_reaction(
+    reaction: ReactionContainer,
+) -> ReactionContainer:
+    """Reverses the given reaction.
+    :param reaction: The reaction to be reversed.
+    :return: The reversed reaction.
+    """
+    reversed_reaction = ReactionContainer(
+        reaction.products, reaction.reactants, reaction.reagents, reaction.meta
+    )
+    reversed_reaction.name = reaction.name
+    return reversed_reaction
+def cgrtools_to_chython_molecule(molecule):
+    molecule_chython = MoleculeContainerChython()
+    for n, atom in molecule.atoms():
+        molecule_chython.add_atom(atom.atomic_symbol, n)
+    for n, m, bond in molecule.bonds():
+        molecule_chython.add_bond(n, m, int(bond))
+    return molecule_chython
+def chython_query_to_cgrtools(query):
+    cgrtools_query = QueryContainer()
+    for n, atom in query.atoms():
+        cgrtools_query.add_atom(
+            atom=atom.atomic_symbol,
+            charge=atom.charge,
+            neighbors=atom.neighbors,
+            hybridization=atom.hybridization,
+            _map=n,
+        )
+    for n, m, bond in query.bonds():
+        cgrtools_query.add_bond(n, m, int(bond))
+    return cgrtools_query

synplan/interfaces/__init__.py ADDED Viewed

File without changes

synplan/interfaces/building_blocks/building_blocks_em_sa_ln.smi ADDED Viewed

The diff for this file is too large to render. See raw diff

synplan/interfaces/cli.py ADDED Viewed

	@@ -0,0 +1,506 @@

+"""Module containing command-line scripts for training and planning steps."""
+import os
+import warnings
+from pathlib import Path
+import click
+import yaml
+from synplan.chem.data.filtering import ReactionFilterConfig, filter_reactions_from_file
+from synplan.chem.data.standardizing import (
+    ReactionStandardizationConfig,
+    standardize_reactions_from_file,
+)
+from synplan.chem.reaction_rules.extraction import extract_rules_from_reactions
+from synplan.chem.reaction_routes.clustering import run_cluster_cli
+from synplan.chem.utils import standardize_building_blocks
+from synplan.mcts.search import run_search
+from synplan.ml.training.supervised import create_policy_dataset, run_policy_training
+from synplan.ml.training.reinforcement import run_updating
+from synplan.utils.config import (
+    PolicyNetworkConfig,
+    RuleExtractionConfig,
+    TreeConfig,
+    TuningConfig,
+    ValueNetworkConfig,
+)
+from synplan.utils.loading import download_all_data
+from synplan.utils.visualisation import (
+    routes_clustering_report,
+    routes_subclustering_report,
+)
+warnings.filterwarnings("ignore")
+# Root command group of the CLI; the subcommands below attach themselves to it
+# through the @synplan.command(...) decorator.
+@click.group(name="synplan")
+def synplan():
+    """SynPlanner command line interface."""
+@synplan.command(name="download_all_data")
+@click.option(
+    "--save_to",
+    "save_to",
+    help="Path to the folder where downloaded data will be stored.",
+)
+def download_all_data_cli(save_to: str = ".") -> None:
+    """Downloads all data for training, planning and benchmarking SynPlanner."""
+    download_all_data(save_to=save_to)
+@synplan.command(name="building_blocks_standardizing")
+@click.option(
+    "--input",
+    "input_file",
+    required=True,
+    type=click.Path(exists=True),
+    help="Path to the file with building blocks to be standardized.",
+)
+@click.option(
+    "--output",
+    "output_file",
+    required=True,
+    type=click.Path(),
+    help="Path to the file where standardized building blocks will be stored.",
+)
+def building_blocks_standardizing_cli(input_file: str, output_file: str) -> None:
+    """Standardizes building blocks."""
+    standardize_building_blocks(input_file=input_file, output_file=output_file)
+@synplan.command(name="reaction_standardizing")
+@click.option(
+    "--config",
+    "config_path",
+    required=True,
+    type=click.Path(exists=True),
+    help="Path to the configuration file for reactions standardizing.",
+)
+@click.option(
+    "--input",
+    "input_file",
+    required=True,
+    type=click.Path(exists=True),
+    help="Path to the file with reactions to be standardized.",
+)
+@click.option(
+    "--output",
+    "output_file",
+    type=click.Path(),
+    help="Path to the file where standardized reactions will be stored.",
+)
+@click.option(
+    "--num_cpus", default=4, type=int, help="The number of CPUs to use for processing."
+)
+def reaction_standardizing_cli(
+    config_path: str, input_file: str, output_file: str, num_cpus: int
+) -> None:
+    """Standardizes reactions and remove duplicates."""
+    stand_config = ReactionStandardizationConfig.from_yaml(config_path)
+    standardize_reactions_from_file(
+        config=stand_config,
+        input_reaction_data_path=input_file,
+        standardized_reaction_data_path=output_file,
+        num_cpus=num_cpus,
+        batch_size=100,
+    )
+@synplan.command(name="reaction_filtering")
+@click.option(
+    "--config",
+    "config_path",
+    required=True,
+    type=click.Path(exists=True),
+    help="Path to the configuration file for reactions filtering.",
+)
+@click.option(
+    "--input",
+    "input_file",
+    required=True,
+    type=click.Path(exists=True),
+    help="Path to the file with reactions to be filtered.",
+)
+@click.option(
+    "--output",
+    "output_file",
+    default=Path("./"),
+    type=click.Path(),
+    help="Path to the file where successfully filtered reactions will be stored.",
+)
+@click.option(
+    "--num_cpus", default=4, type=int, help="The number of CPUs to use for processing."
+)
+def reaction_filtering_cli(
+    config_path: str, input_file: str, output_file: str, num_cpus: int
+):
+    """Filters erroneous reactions."""
+    reaction_check_config = ReactionFilterConfig().from_yaml(config_path)
+    filter_reactions_from_file(
+        config=reaction_check_config,
+        input_reaction_data_path=input_file,
+        filtered_reaction_data_path=output_file,
+        num_cpus=num_cpus,
+        batch_size=100,
+    )
+@synplan.command(name="rule_extracting")
+@click.option(
+    "--config",
+    "config_path",
+    required=True,
+    type=click.Path(exists=True),
+    help="Path to the configuration file for reaction rules extracting.",
+)
+@click.option(
+    "--input",
+    "input_file",
+    required=True,
+    type=click.Path(exists=True),
+    help="Path to the file with reactions for reaction rules extraction.",
+)
+@click.option(
+    "--output",
+    "output_file",
+    required=True,
+    type=click.Path(),
+    help="Path to the file where extracted reaction rules will be stored.",
+)
+@click.option(
+    "--num_cpus", default=4, type=int, help="The number of CPUs to use for processing."
+)
+def rule_extracting_cli(
+    config_path: str, input_file: str, output_file: str, num_cpus: int
+):
+    """Reaction rules extraction."""
+    reaction_rule_config = RuleExtractionConfig.from_yaml(config_path)
+    extract_rules_from_reactions(
+        config=reaction_rule_config,
+        reaction_data_path=input_file,
+        reaction_rules_path=output_file,
+        num_cpus=num_cpus,
+        batch_size=100,
+    )
+@synplan.command(name="ranking_policy_training")
+@click.option(
+    "--config",
+    "config_path",
+    required=True,
+    type=click.Path(exists=True),
+    help="Path to the configuration file for ranking policy training.",
+)
+@click.option(
+    "--reaction_data",
+    required=True,
+    type=click.Path(exists=True),
+    help="Path to the file with reactions for ranking policy training.",
+)
+@click.option(
+    "--reaction_rules",
+    required=True,
+    type=click.Path(exists=True),
+    help="Path to the file with extracted reaction rules.",
+)
+@click.option(
+    "--results_dir",
+    default=Path("."),
+    type=click.Path(),
+    help="Path to the directory where the trained policy network will be stored.",
+)
+@click.option(
+    "--num_cpus",
+    default=4,
+    type=int,
+    help="The number of CPUs to use for training set preparation.",
+)
+def ranking_policy_training_cli(
+    config_path: str,
+    reaction_data: str,
+    reaction_rules: str,
+    results_dir: str,
+    num_cpus: int,
+) -> None:
+    """Ranking policy network training."""
+    policy_config = PolicyNetworkConfig.from_yaml(config_path)
+    policy_config.policy_type = "ranking"
+    policy_dataset_file = os.path.join(results_dir, "policy_dataset.dt")
+    datamodule = create_policy_dataset(
+        reaction_rules_path=reaction_rules,
+        molecules_or_reactions_path=reaction_data,
+        output_path=policy_dataset_file,
+        dataset_type="ranking",
+        batch_size=policy_config.batch_size,
+        num_cpus=num_cpus,
+    )
+    run_policy_training(datamodule, config=policy_config, results_path=results_dir)
+@synplan.command(name="filtering_policy_training")
+@click.option(
+    "--config",
+    "config_path",
+    required=True,
+    type=click.Path(exists=True),
+    help="Path to the configuration file for filtering policy training.",
+)
+@click.option(
+    "--molecule_data",
+    required=True,
+    type=click.Path(exists=True),
+    help="Path to the file with molecules for filtering policy training.",
+)
+@click.option(
+    "--reaction_rules",
+    required=True,
+    type=click.Path(exists=True),
+    help="Path to the file with extracted reaction rules.",
+)
+@click.option(
+    "--results_dir",
+    default=Path("."),
+    type=click.Path(),
+    help="Path to the directory where the trained policy network will be stored.",
+)
+@click.option(
+    "--num_cpus",
+    default=8,
+    type=int,
+    help="The number of CPUs to use for training set preparation.",
+)
+def filtering_policy_training_cli(
+    config_path: str,
+    molecule_data: str,
+    reaction_rules: str,
+    results_dir: str,
+    num_cpus: int,
+):
+    """Filtering policy network training."""
+    policy_config = PolicyNetworkConfig.from_yaml(config_path)
+    policy_config.policy_type = "filtering"
+    policy_dataset_file = os.path.join(results_dir, "policy_dataset.ckpt")
+    datamodule = create_policy_dataset(
+        reaction_rules_path=reaction_rules,
+        molecules_or_reactions_path=molecule_data,
+        output_path=policy_dataset_file,
+        dataset_type="filtering",
+        batch_size=policy_config.batch_size,
+        num_cpus=num_cpus,
+    )
+    run_policy_training(datamodule, config=policy_config, results_path=results_dir)
+@synplan.command(name="value_network_tuning")
+@click.option(
+    "--config",
+    "config_path",
+    required=True,
+    type=click.Path(exists=True),
+    help="Path to the configuration file for value network training.",
+)
+@click.option(
+    "--targets",
+    required=True,
+    type=click.Path(exists=True),
+    help="Path to the file with target molecules for planning simulations.",
+)
+@click.option(
+    "--reaction_rules",
+    required=True,
+    type=click.Path(exists=True),
+    help="Path to the file with extracted reaction rules. Needed for planning simulations.",
+)
+@click.option(
+    "--building_blocks",
+    required=True,
+    type=click.Path(exists=True),
+    help="Path to the file with building blocks. Needed for planning simulations.",
+)
+@click.option(
+    "--policy_network",
+    required=True,
+    type=click.Path(exists=True),
+    help="Path to the file with trained policy network. Needed for planning simulations.",
+)
+@click.option(
+    "--value_network",
+    default=None,
+    type=click.Path(exists=True),
+    help="Path to the file with trained value network. Needed in case of additional value network fine-tuning",
+)
+@click.option(
+    "--results_dir",
+    default=".",
+    type=click.Path(exists=False),
+    help="Path to the directory where the trained value network will be stored.",
+)
+def value_network_tuning_cli(
+    config_path: str,
+    targets: str,
+    reaction_rules: str,
+    building_blocks: str,
+    policy_network: str,
+    value_network: str,
+    results_dir: str,
+):
+    """Value network tuning."""
+    with open(config_path, "r", encoding="utf-8") as file:
+        config = yaml.safe_load(file)
+    policy_config = PolicyNetworkConfig.from_dict(config["node_expansion"])
+    policy_config.weights_path = policy_network
+    value_config = ValueNetworkConfig.from_dict(config["value_network"])
+    if value_network is None:
+        value_config.weights_path = os.path.join(
+            results_dir, "weights", "value_network.ckpt"
+        )
+    tree_config = TreeConfig.from_dict(config["tree"])
+    tuning_config = TuningConfig.from_dict(config["tuning"])
+    run_updating(
+        targets_path=targets,
+        tree_config=tree_config,
+        policy_config=policy_config,
+        value_config=value_config,
+        reinforce_config=tuning_config,
+        reaction_rules_path=reaction_rules,
+        building_blocks_path=building_blocks,
+        results_root=results_dir,
+    )
+@synplan.command(name="planning")
+@click.option(
+    "--config",
+    "config_path",
+    required=True,
+    type=click.Path(exists=True),
+    help="Path to the configuration file for retrosynthetic planning.",
+)
+@click.option(
+    "--targets",
+    required=True,
+    type=click.Path(exists=True),
+    help="Path to the file with target molecules for retrosynthetic planning.",
+)
+@click.option(
+    "--reaction_rules",
+    required=True,
+    type=click.Path(exists=True),
+    help="Path to the file with extracted reaction rules.",
+)
+@click.option(
+    "--building_blocks",
+    required=True,
+    type=click.Path(exists=True),
+    help="Path to the file with building blocks.",
+)
+@click.option(
+    "--policy_network",
+    required=True,
+    type=click.Path(exists=True),
+    help="Path to the file with trained policy network.",
+)
+@click.option(
+    "--value_network",
+    default=None,
+    type=click.Path(exists=True),
+    help="Path to the file with trained value network.",
+)
+@click.option(
+    "--results_dir",
+    default=".",
+    type=click.Path(exists=False),
+    help="Path to the file where retrosynthetic planning results will be stored.",
+)
+def planning_cli(
+    config_path: str,
+    targets: str,
+    reaction_rules: str,
+    building_blocks: str,
+    policy_network: str,
+    value_network: str,
+    results_dir: str,
+):
+    """Retrosynthetic planning."""
+    with open(config_path, "r", encoding="utf-8") as file:
+        config = yaml.safe_load(file)
+    search_config = {**config["tree"], **config["node_evaluation"]}
+    policy_config = PolicyNetworkConfig.from_dict(
+        {**config["node_expansion"], **{"weights_path": policy_network}}
+    )
+    run_search(
+        targets_path=targets,
+        search_config=search_config,
+        policy_config=policy_config,
+        reaction_rules_path=reaction_rules,
+        building_blocks_path=building_blocks,
+        value_network_path=value_network,
+        results_root=results_dir,
+    )
+@synplan.command(name="clustering")
+@click.option(
+    "--targets",
+    required=True,
+    type=click.Path(exists=True),
+    help="Path to the file with target molecules for retrosynthetic planning.",
+)
+@click.option(
+    "--routes_file",
+    default=".",
+    type=click.Path(exists=False),
+    help="Path to the file where the planning results are stored.",
+)
+@click.option(
+    "--cluster_results_dir",
+    default=".",
+    type=click.Path(exists=False),
+    help="Path to the file where clustering results will be stored.",
+)
+@click.option(
+    "--perform_subcluster",
+    default=None,
+    type=click.Path(exists=False),
+    help="Perform subclustering.",
+)
+@click.option(
+    "--subcluster_results_dir",
+    default=".",
+    type=click.Path(exists=False),
+    help="Path to the file where subclustering results will be stored.",
+)
+def cluster_route_from_file_cli(
+    targets: str,
+    routes_file: str,
+    cluster_results_dir: str,
+    perform_subcluster: bool,
+    subcluster_results_dir: str,
+):
+    """Clustering the routes from planning"""
+    run_cluster_cli(
+        routes_file=routes_file,
+        cluster_results_dir=cluster_results_dir,
+        perform_subcluster=perform_subcluster,
+        subcluster_results_dir=subcluster_results_dir if perform_subcluster else None,
+    )
+# Script entry point: invoke the `synplan` command object that the commands
+# above are registered on when this module is executed directly.
+if __name__ == "__main__":
+    synplan()

synplan/interfaces/gui.py ADDED Viewed

	@@ -0,0 +1,1304 @@

+import base64
+import pickle
+import re
+import uuid
+import io
+import zipfile
+import pandas as pd
+import streamlit as st
+from CGRtools.files import SMILESRead
+from streamlit_ketcher import st_ketcher
+from huggingface_hub import hf_hub_download
+from huggingface_hub.utils import disable_progress_bars
+from synplan.mcts.expansion import PolicyNetworkFunction
+from synplan.mcts.search import extract_tree_stats
+from synplan.mcts.tree import Tree
+from synplan.chem.utils import mol_from_smiles
+from synplan.chem.reaction_routes.route_cgr import *
+from synplan.chem.reaction_routes.clustering import *
+from synplan.utils.visualisation import (
+    routes_clustering_report,
+    routes_subclustering_report,
+    generate_results_html,
+    html_top_routes_cluster,
+    get_route_svg,
+)
+from synplan.utils.config import TreeConfig, PolicyNetworkConfig
+from synplan.utils.loading import load_reaction_rules, load_building_blocks
+import psutil
+import gc
+# Turn off huggingface_hub progress bars at import time.
+disable_progress_bars("huggingface_hub")
+# Module-level SMILES parser created with ignore=True.
+# NOTE(review): presumably this makes parsing tolerant of minor errors - confirm
+# against the CGRtools documentation.
+smiles_parser = SMILESRead.create_parser(ignore=True)
+# SMILES of the molecule used to pre-fill the editor state on first load.
+DEFAULT_MOL = "c1cc(ccc1Cl)C(CCO)NC(C2(CCN(CC2)c3c4cc[nH]c4ncn3)N)=O"
+# --- Helper Functions ---
+def download_button(
+    object_to_download, download_filename, button_text, pickle_it=False
+):
+    """
+    Issued from
+    Generates a link to download the given object_to_download.
+    Params:
+    ------
+    object_to_download:  The object to be downloaded.
+    download_filename (str): filename and extension of file. e.g. mydata.csv,
+    some_txt_output.txt download_link_text (str): Text to display for download
+    link.
+    button_text (str): Text to display on download button (e.g. 'click here to download file')
+    pickle_it (bool): If True, pickle file.
+    Returns:
+    -------
+    (str): the anchor tag to download object_to_download
+    Examples:
+    --------
+    download_link(your_df, 'YOUR_DF.csv', 'Click to download data!')
+    download_link(your_str, 'YOUR_STRING.txt', 'Click to download text!')
+    """
+    if pickle_it:
+        try:
+            object_to_download = pickle.dumps(object_to_download)
+        except pickle.PicklingError as e:
+            st.write(e)
+            return None
+    else:
+        if isinstance(object_to_download, bytes):
+            pass
+        elif isinstance(object_to_download, pd.DataFrame):
+            object_to_download = object_to_download.to_csv(index=False).encode("utf-8")
+    try:
+        b64 = base64.b64encode(object_to_download.encode()).decode()
+    except AttributeError:
+        b64 = base64.b64encode(object_to_download).decode()
+    button_uuid = str(uuid.uuid4()).replace("-", "")
+    button_id = re.sub("\d+", "", button_uuid)
+    custom_css = f"""
+        <style>
+            #{button_id} {{
+                background-color: rgb(255, 255, 255);
+                color: rgb(38, 39, 48);
+                text-decoration: none;
+                border-radius: 4px;
+                border-width: 1px;
+                border-style: solid;
+                border-color: rgb(230, 234, 241);
+                border-image: initial;
+            }}
+            #{button_id}:hover {{
+                border-color: rgb(246, 51, 102);
+                color: rgb(246, 51, 102);
+            }}
+            #{button_id}:active {{
+                box-shadow: none;
+                background-color: rgb(246, 51, 102);
+                color: white;
+                }}
+        </style> """
+    dl_link = (
+        custom_css
+        + f'<a download="{download_filename}" id="{button_id}" href="data:file/txt;base64,{b64}">{button_text}</a><br></br>'
+    )
+    return dl_link
+@st.cache_resource
+def load_planning_resources_cached():  # Renamed to avoid conflict if main calls it directly
+    building_blocks_path = hf_hub_download(
+        repo_id="Laboratoire-De-Chemoinformatique/SynPlanner",
+        filename="building_blocks_em_sa_ln.smi",
+        subfolder="building_blocks",
+        local_dir=".",
+    )
+    ranking_policy_weights_path = hf_hub_download(
+        repo_id="Laboratoire-De-Chemoinformatique/SynPlanner",
+        filename="ranking_policy_network.ckpt",
+        subfolder="uspto/weights",
+        local_dir=".",
+    )
+    reaction_rules_path = hf_hub_download(
+        repo_id="Laboratoire-De-Chemoinformatique/SynPlanner",
+        filename="uspto_reaction_rules.pickle",
+        subfolder="uspto",
+        local_dir=".",
+    )
+    return building_blocks_path, ranking_policy_weights_path, reaction_rules_path
+# --- GUI Sections ---
+def initialize_app():
+    """1. Initialization: Setting up the main window, layout, and initial widgets."""
+    st.set_page_config(page_title="SynPlanner GUI", page_icon="🧪", layout="wide")
+    # Initialize session state variables if they don't exist.
+    if "planning_done" not in st.session_state:
+        st.session_state.planning_done = False
+    if "tree" not in st.session_state:
+        st.session_state.tree = None
+    if "res" not in st.session_state:
+        st.session_state.res = None
+    if "target_smiles" not in st.session_state:
+        st.session_state.target_smiles = (
+            ""  # Initial value, might be overwritten by ketcher
+        )
+    # Clustering state
+    if "clustering_done" not in st.session_state:
+        st.session_state.clustering_done = False
+    if "clusters" not in st.session_state:
+        st.session_state.clusters = None
+    if "reactions_dict" not in st.session_state:
+        st.session_state.reactions_dict = None
+    if "num_clusters_setting" not in st.session_state:  # Store the setting used
+        st.session_state.num_clusters_setting = 10
+    if "route_cgrs_dict" not in st.session_state:
+        st.session_state.route_cgrs_dict = None
+    if "r_route_cgrs_dict" not in st.session_state:
+        st.session_state.r_route_cgrs_dict = None
+    # Subclustering state
+    if "subclustering_done" not in st.session_state:
+        st.session_state.subclustering_done = False
+    if "subclusters" not in st.session_state:  # Renamed from 'sub' for clarity
+        st.session_state.subclusters = None
+    # Download state (less critical now with direct download links)
+    if "clusters_downloaded" not in st.session_state:  # Example, might not be needed
+        st.session_state.clusters_downloaded = False
+    if "ketcher" not in st.session_state:  # For ketcher persistence
+        st.session_state.ketcher = DEFAULT_MOL
+    intro_text = """
+    This is a demo of the graphical user interface of
+    [SynPlanner](https://github.com/Laboratoire-de-Chemoinformatique/SynPlanner/).
+    SynPlanner is a comprehensive tool for reaction data curation, rule extraction, model training and retrosynthetic planning.
+    More information on SynPlanner is available in the [official docs](https://synplanner.readthedocs.io/en/latest/index.html).
+    """
+    st.title("`SynPlanner GUI`")
+    st.write(intro_text)
+def setup_sidebar():
+    """2. Sidebar: Handling the widgets and logic within the sidebar area."""
+    # st.sidebar.image("img/logo.png") # Assuming img/logo.png is available
+    st.sidebar.title("Docs")
+    st.sidebar.markdown("https://synplanner.readthedocs.io/en/latest/")
+    st.sidebar.title("Tutorials")
+    st.sidebar.markdown(
+        "https://github.com/Laboratoire-de-Chemoinformatique/SynPlanner/tree/main/tutorials"
+    )
+    st.sidebar.title("Paper")
+    st.sidebar.markdown(
+        "https://chemrxiv.org/engage/chemrxiv/article-details/66add90bc9c6a5c07ae65796"
+    )
+    st.sidebar.title("Issues")
+    st.sidebar.markdown(
+        "[Report a bug 🐞](https://github.com/Laboratoire-de-Chemoinformatique/SynPlanner/issues/new?assignees=&labels=bug&projects=&template=bug_report.md&title=%5BBUG%5D)"
+    )
+def handle_molecule_input():
+    """3. Molecule Input: Managing the input area for molecule data."""
+    st.header("Molecule input")
+    st.markdown(
+        """
+        You can provide a molecular structure by either providing:
+        * SMILES string + Enter
+        * Draw it + Apply
+        """
+    )
+    # Use st.session_state.ketcher to persist drawn molecule
+    molecule_text_input = st.text_input(
+        "SMILES:", value=st.session_state.ketcher, key="smiles_text_input_key"
+    )
+    smile_code_ketcher = st_ketcher(molecule_text_input, key="ketcher_widget")
+    # col_kethcer, col_info = st.columns([0.8, 0.2])
+    # with col_kethcer:
+    #     smile_code_ketcher = st_ketcher(molecule_text_input, key="ketcher_widget")
+    # with col_info:
+    #     st.subheader("Synthetic Complexity")
+    #     sascore = ()
+    #     st.markdown(f"SAScore: {sascore}")
+    #     syba_score = ()
+    #     st.markdown(f"SYBA: {sascore}")
+    current_smile_code = (
+        smile_code_ketcher  # The output from ketcher is the definitive SMILES
+    )
+    if (
+        "target_smiles" in st.session_state
+        and current_smile_code != st.session_state.target_smiles
+    ):
+        st.warning("Molecule structure changed. Please re-run planning.")
+        st.session_state.planning_done = False
+        st.session_state.clustering_done = False
+        st.session_state.subclustering_done = False
+        st.session_state.tree = None
+        st.session_state.res = None
+        st.session_state.clusters = None
+        st.session_state.reactions_dict = None
+        st.session_state.subclusters = None
+        st.session_state.ketcher = current_smile_code
+    return current_smile_code
+def setup_planning_options():
+    """4. Planning: Encapsulating the logic related to the "planning" functionality.
+    Renders the planning option widgets and the start button. When the button
+    is pressed, all downstream session state is reset, the current SMILES is
+    parsed, the resources are loaded, the search tree is iterated with a
+    progress bar, and the tree plus its statistics are stored in
+    ``st.session_state`` before the script is rerun. Returns None.
+    """
+    st.header("Launch calculation")
+    st.markdown(
+        """If you modified the structure, please ensure you clicked on `Apply` (bottom right of the molecular editor)."""
+    )
+    # This smile_code display will be updated if handle_molecule_input has run and returned a new smile_code
+    # However, to display it correctly, we need the current smile_code from the session or input handler.
+    # For simplicity, let's assume handle_molecule_input has updated st.session_state.ketcher
+    st.markdown(
+        f"The molecule SMILES is actually: ``{st.session_state.get('ketcher', DEFAULT_MOL)}``"
+    )
+    st.subheader("Planning options")
+    st.markdown(
+        """
+        The description of each option can be found in the
+        [Retrosynthetic Planning Tutorial](https://synplanner.readthedocs.io/en/latest/tutorial_files/retrosynthetic_planning.html#Configuring-search-tree).
+        """
+    )
+    # Two-column layout: strategy/UCB settings on the left, search limits on the right.
+    col_options_1, col_options_2 = st.columns(2, gap="medium")
+    with col_options_1:
+        search_strategy_input = st.selectbox(
+            label="Search strategy",
+            options=(
+                "Expansion first",
+                "Evaluation first",
+            ),
+            index=0,
+            key="search_strategy_input",
+        )
+        ucb_type = st.selectbox(
+            label="UCB type",
+            options=("uct", "puct", "value"),
+            index=0,
+            key="ucb_type_input",
+        )  # Fixed label
+        c_ucb = st.number_input(
+            "C coefficient of UCB",
+            value=0.1,
+            placeholder="Type a number...",
+            key="c_ucb_input",
+        )
+    with col_options_2:
+        max_iterations = st.slider(
+            "Total number of MCTS iterations",
+            min_value=50,
+            max_value=1000,
+            value=300,
+            key="max_iterations_slider",
+        )
+        max_depth = st.slider(
+            "Maximal number of reaction steps",
+            min_value=3,
+            max_value=9,
+            value=6,
+            key="max_depth_slider",
+        )
+        min_mol_size = st.slider(
+            "Minimum size of a molecule to be precursor",
+            min_value=0,
+            max_value=7,
+            value=0,
+            key="min_mol_size_slider",
+            help="Number of non-hydrogen atoms in molecule",
+        )
+    # Map the human-readable selectbox labels to the identifiers passed to TreeConfig.
+    search_strategy_translator = {
+        "Expansion first": "expansion_first",
+        "Evaluation first": "evaluation_first",
+    }
+    search_strategy = search_strategy_translator[search_strategy_input]
+    planning_params = {
+        "search_strategy": search_strategy,
+        "ucb_type": ucb_type,
+        "c_ucb": c_ucb,
+        "max_iterations": max_iterations,
+        "max_depth": max_depth,
+        "min_mol_size": min_mol_size,
+    }
+    if st.button("Start retrosynthetic planning", key="submit_planning_button"):
+        # Reset downstream states if replanning
+        st.session_state.planning_done = False
+        st.session_state.clustering_done = False
+        st.session_state.subclustering_done = False
+        st.session_state.tree = None
+        st.session_state.res = None
+        st.session_state.clusters = None
+        st.session_state.reactions_dict = None
+        st.session_state.subclusters = None
+        st.session_state.route_cgrs_dict = None
+        st.session_state.r_route_cgrs_dict = None
+        active_smile_code = st.session_state.get(
+            "ketcher", DEFAULT_MOL
+        )  # Get current SMILES
+        st.session_state.target_smiles = (
+            active_smile_code  # Store the SMILES used for this run
+        )
+        try:
+            target_molecule = mol_from_smiles(active_smile_code)
+            if target_molecule is None:
+                st.error(f"Could not parse the input SMILES: {active_smile_code}")
+            else:
+                (
+                    building_blocks_path,
+                    ranking_policy_weights_path,
+                    reaction_rules_path,
+                ) = load_planning_resources_cached()
+                with st.spinner("Running retrosynthetic planning..."):
+                    with st.status("Loading resources...", expanded=False) as status:
+                        st.write("Loading building blocks...")
+                        building_blocks = load_building_blocks(
+                            building_blocks_path, standardize=False
+                        )
+                        st.write("Loading reaction rules...")
+                        reaction_rules = load_reaction_rules(reaction_rules_path)
+                        st.write("Loading policy network...")
+                        policy_config = PolicyNetworkConfig(
+                            weights_path=ranking_policy_weights_path
+                        )
+                        policy_function = PolicyNetworkFunction(
+                            policy_config=policy_config
+                        )
+                        status.update(label="Resources loaded!", state="complete")
+                    tree_config = TreeConfig(
+                        search_strategy=planning_params["search_strategy"],
+                        evaluation_type="rollout",  # This was hardcoded, keeping it.
+                        max_iterations=planning_params["max_iterations"],
+                        max_depth=planning_params["max_depth"],
+                        min_mol_size=planning_params["min_mol_size"],
+                        init_node_value=0.5,  # This was hardcoded
+                        ucb_type=planning_params["ucb_type"],
+                        c_ucb=planning_params["c_ucb"],
+                        silent=True,  # This was hardcoded
+                    )
+                    tree = Tree(
+                        target=target_molecule,
+                        config=tree_config,
+                        reaction_rules=reaction_rules,
+                        building_blocks=building_blocks,
+                        expansion_function=policy_function,
+                        evaluation_function=None,  # This was hardcoded
+                    )
+                    mcts_progress_text = "Running MCTS iterations..."
+                    mcts_bar = st.progress(0, text=mcts_progress_text)
+                    # Iterating the tree drives the search; the yielded values are
+                    # not used here, only the step count for the progress bar.
+                    for step, (solved, node_id) in enumerate(tree):
+                        # Clamp to 1.0 in case more steps than max_iterations are yielded.
+                        progress_value = min(
+                            1.0, (step + 1) / planning_params["max_iterations"]
+                        )
+                        mcts_bar.progress(
+                            progress_value,
+                            text=f"{mcts_progress_text} ({step+1}/{planning_params['max_iterations']})",
+                        )
+                    res = extract_tree_stats(tree, target_molecule)
+                    st.session_state["tree"] = tree
+                    st.session_state["res"] = res
+                    st.session_state.planning_done = True
+                    # NOTE(review): this rerun sits inside the try block below; it relies
+                    # on st.rerun() not being intercepted by `except Exception` - confirm
+                    # for the Streamlit version in use.
+                    st.rerun()
+        except Exception as e:
+            st.error(f"An error occurred during planning: {e}")
+            st.session_state.planning_done = False
+def display_planning_results():
+    """5. Planning Results Display: Handling the presentation of results."""
+    if st.session_state.get("planning_done", False):
+        res = st.session_state.res
+        tree = st.session_state.tree
+        if res is None or tree is None:
+            st.error(
+                "Planning results are missing from session state. Please re-run planning."
+            )
+            st.session_state.planning_done = False  # Reset state
+            return  # Exit this function if no results
+        if res.get("solved", False):  # Use .get for safety
+            st.header("Planning Results")
+            winning_nodes = (
+                sorted(set(tree.winning_nodes))
+                if hasattr(tree, "winning_nodes") and tree.winning_nodes
+                else []
+            )
+            st.subheader(f"Number of unique routes found: {len(winning_nodes)}")
+            st.subheader("Examples of found retrosynthetic routes")
+            image_counter = 0
+            visualised_node_ids = set()
+            if not winning_nodes:
+                st.warning(
+                    "Planning solved, but no winning nodes found in the tree object."
+                )
+            else:
+                for n, node_id in enumerate(winning_nodes):
+                    if image_counter >= 3:
+                        break
+                    if node_id not in visualised_node_ids:
+                        try:
+                            visualised_node_ids.add(node_id)
+                            num_steps = len(tree.synthesis_route(node_id))
+                            route_score = round(tree.route_score(node_id), 3)
+                            svg = get_route_svg(tree, node_id)
+                            if svg:
+                                st.image(
+                                    svg,
+                                    caption=f"Route {node_id}; {num_steps} steps; Route score: {route_score}",
+                                )
+                                image_counter += 1
+                            else:
+                                st.warning(
+                                    f"Could not generate SVG for route {node_id}."
+                                )
+                        except Exception as e:
+                            st.error(f"Error displaying route {node_id}: {e}")
+        else:  # Not solved
+            st.header("Planning Results")
+            st.warning(
+                "No reaction path found for the target molecule with the current settings."
+            )
+            st.write(
+                "Consider adjusting planning options (e.g., increase iterations, adjust depth, check molecule validity)."
+            )
+            stat_col, _ = st.columns(2)
+            with stat_col:
+                st.subheader("Run Statistics (No Solution)")
+                try:
+                    if (
+                        "target_smiles" not in res
+                        and "target_smiles" in st.session_state
+                    ):
+                        res["target_smiles"] = st.session_state.target_smiles
+                    cols_to_show = [
+                        col
+                        for col in [
+                            "target_smiles",
+                            "num_nodes",
+                            "num_iter",
+                            "search_time",
+                        ]
+                        if col in res
+                    ]
+                    if cols_to_show:
+                        df = pd.DataFrame(res, index=[0])[cols_to_show]
+                        st.dataframe(df)
+                    else:
+                        st.write("No statistics to display for the unsuccessful run.")
+                except Exception as e:
+                    st.error(f"Error displaying statistics: {e}")
+                    st.write(res)
+def download_planning_results():
+    """6. Planning Results Download: Providing functionality to download."""
+    if (
+        st.session_state.get("planning_done", False)
+        and st.session_state.res
+        and st.session_state.res.get("solved", False)
+    ):
+        res = st.session_state.res
+        tree = st.session_state.tree
+        # This section is usually placed within a column in the original script
+        # We'll assume it's called after display_planning_results and can use a new column or area.
+        # For proper layout, this should be integrated with display_planning_results' columns.
+        # For now, creating a placeholder or separate section for downloads:
+        # st.subheader("Downloads") # This might be redundant if called within a layout context.
+        # The original code places downloads in the second column of planning results.
+        # To replicate, we'd need to pass the column object or call this within that context.
+        # Simulating this by just creating the download links:
+        try:
+            html_body = generate_results_html(tree, html_path=None, extended=True)
+            dl_html = download_button(
+                html_body,
+                f"results_synplanner_{st.session_state.target_smiles}.html",
+                "Download results (HTML)",
+            )
+            if dl_html:
+                st.markdown(dl_html, unsafe_allow_html=True)
+            try:
+                res_df = pd.DataFrame(res, index=[0])
+                dl_csv = download_button(
+                    res_df,
+                    f"stats_synplanner_{st.session_state.target_smiles}.csv",
+                    "Download statistics (CSV)",
+                )
+                if dl_csv:
+                    st.markdown(dl_csv, unsafe_allow_html=True)
+            except Exception as e:
+                st.error(f"Could not prepare statistics CSV for download: {e}")
+        except Exception as e:
+            st.error(f"Error generating download links for planning results: {e}")
+def setup_clustering():
+    """7. Clustering: Encapsulating the logic related to the "clustering" functionality.
+    Shown only after a solved planning run. Pressing the button clears previous
+    clustering and subclustering state, composes the route CGRs and reduced
+    route CGRs from the stored tree, clusters the reduced ones, extracts the
+    reactions, stores everything in ``st.session_state`` and reruns the script.
+    Returns None.
+    """
+    if (
+        st.session_state.get("planning_done", False)
+        and st.session_state.res
+        and st.session_state.res.get("solved", False)
+    ):
+        st.divider()
+        st.header("Clustering the retrosynthetic routes")
+        # num_clusters_input = st.number_input( # This input was removed in the final user code, so omitting.
+        #     "Desired Number of Clusters (approximate):",
+        #     min_value=2, max_value=50, value=st.session_state.get("num_clusters_setting", 10),
+        #     key="num_clusters_input_key"
+        # )
+        if st.button("Run Clustering", key="submit_clustering_button"):
+            # st.session_state.num_clusters_setting = num_clusters_input
+            # Clear every result derived from an earlier clustering run.
+            st.session_state.clustering_done = False
+            st.session_state.subclustering_done = False
+            st.session_state.clusters = None
+            st.session_state.reactions_dict = None
+            st.session_state.subclusters = None
+            st.session_state.route_cgrs_dict = None
+            st.session_state.r_route_cgrs_dict = None
+            with st.spinner("Performing clustering..."):
+                try:
+                    current_tree = st.session_state.tree
+                    if not current_tree:
+                        st.error("Tree object not found. Please re-run planning.")
+                        return
+                    st.write("Calculating RoutesCGRs...")
+                    route_cgrs_dict = compose_all_route_cgrs(current_tree)
+                    st.write("Processing ReducedRoutesCGRs...")
+                    r_route_cgrs_dict = compose_all_reduced_route_cgrs(route_cgrs_dict)
+                    results = cluster_routes(
+                        r_route_cgrs_dict, use_strat=False
+                    )  # num_clusters was removed from args
+                    # Order the clusters by the numeric value of their keys.
+                    results = dict(sorted(results.items(), key=lambda x: float(x[0])))
+                    st.session_state.clusters = results
+                    st.session_state.route_cgrs_dict = route_cgrs_dict
+                    st.session_state.r_route_cgrs_dict = r_route_cgrs_dict
+                    st.write("Extracting reactions...")
+                    st.session_state.reactions_dict = extract_reactions(current_tree)
+                    # `clusters` cannot be None here (results.items() above would have
+                    # raised), so in practice this check depends on reactions_dict.
+                    if (
+                        st.session_state.clusters is not None
+                        and st.session_state.reactions_dict is not None
+                    ):  # Check for None explicitly
+                        st.session_state.clustering_done = True
+                        st.success(
+                            f"Clustering complete. Found {len(st.session_state.clusters)} clusters."
+                        )
+                    else:
+                        st.error("Clustering failed or returned empty results.")
+                        st.session_state.clustering_done = False
+                    del results  # route_cgrs_dict, r_route_cgrs_dict are stored
+                    gc.collect()
+                    # NOTE(review): the rerun follows the success/error message directly,
+                    # so the message may not remain visible - confirm this is intended.
+                    st.rerun()
+                except Exception as e:
+                    st.error(f"An error occurred during clustering: {e}")
+                    st.session_state.clustering_done = False
+def display_clustering_results():
+    """8. Clustering Results Display: Handling the presentation of results."""
+    if st.session_state.get("clustering_done", False):
+        clusters = st.session_state.clusters
+        # reactions_dict = st.session_state.reactions_dict # Needed for download, not directly for display here
+        tree = st.session_state.tree
+        MAX_DISPLAY_CLUSTERS_DATA = 10
+        if (
+            clusters is None or tree is None
+        ):  # reactions_dict removed as not critical for display part
+            st.error(
+                "Clustering results (clusters or tree) are missing. Please re-run clustering."
+            )
+            st.session_state.clustering_done = False
+            return
+        st.subheader(f"Best routes from {len(clusters)} Found Clusters")
+        clusters_items = list(clusters.items())
+        first_items = clusters_items[:MAX_DISPLAY_CLUSTERS_DATA]
+        remaining_items = clusters_items[MAX_DISPLAY_CLUSTERS_DATA:]
+        for cluster_num, group_data in first_items:
+            if (
+                not group_data
+                or "node_ids" not in group_data
+                or not group_data["node_ids"]
+            ):
+                st.warning(f"Cluster {cluster_num} has no data or node_ids.")
+                continue
+            st.markdown(
+                f"**Cluster {cluster_num}** (Size: {group_data.get('group_size', 'N/A')})"
+            )
+            node_id = group_data["node_ids"][0]
+            try:
+                num_steps = len(tree.synthesis_route(node_id))
+                route_score = round(tree.route_score(node_id), 3)
+                svg = get_route_svg(tree, node_id)
+                r_route_cgr = group_data.get("r_route_cgr")  # Safely get r_route_cgr
+                r_route_cgr_svg = None
+                if r_route_cgr:
+                    r_route_cgr.clean2d()
+                    r_route_cgr_svg = cgr_display(r_route_cgr)
+                if svg and r_route_cgr_svg:
+                    col1, col2 = st.columns([0.2, 0.8])
+                    with col1:
+                        st.image(r_route_cgr_svg, caption="ReducedRouteCGR")
+                    with col2:
+                        st.image(
+                            svg,
+                            caption=f"Route {node_id}; {num_steps} steps; Route score: {route_score}",
+                        )
+                elif svg:  # Only route SVG available
+                    st.image(
+                        svg,
+                        caption=f"Route {node_id}; {num_steps} steps; Route score: {route_score}",
+                    )
+                    st.warning(
+                        f"ReducedRouteCGR could not be displayed for cluster {cluster_num}."
+                    )
+                else:
+                    st.warning(
+                        f"Could not generate SVG for route {node_id} or its ReducedRouteCGR."
+                    )
+            except Exception as e:
+                st.error(
+                    f"Error displaying route {node_id} for cluster {cluster_num}: {e}"
+                )
+        if remaining_items:
+            with st.expander(f"... and {len(remaining_items)} more clusters"):
+                for cluster_num, group_data in remaining_items:
+                    if (
+                        not group_data
+                        or "node_ids" not in group_data
+                        or not group_data["node_ids"]
+                    ):
+                        st.warning(
+                            f"Cluster {cluster_num} in expansion has no data or node_ids."
+                        )
+                        continue
+                    st.markdown(
+                        f"**Cluster {cluster_num}** (Size: {group_data.get('group_size', 'N/A')})"
+                    )
+                    node_id = group_data["node_ids"][0]
+                    try:
+                        num_steps = len(tree.synthesis_route(node_id))
+                        route_score = round(tree.route_score(node_id), 3)
+                        svg = get_route_svg(tree, node_id)
+                        r_route_cgr = group_data.get("r_route_cgr")
+                        r_route_cgr_svg = None
+                        if r_route_cgr:
+                            r_route_cgr.clean2d()
+                            r_route_cgr_svg = cgr_display(r_route_cgr)
+                        if svg and r_route_cgr_svg:
+                            col1, col2 = st.columns([0.2, 0.8])
+                            with col1:
+                                st.image(r_route_cgr_svg, caption="ReducedRouteCGR")
+                            with col2:
+                                st.image(
+                                    svg,
+                                    caption=f"Route {node_id}; {num_steps} steps; Route score: {route_score}",
+                                )
+                        elif svg:
+                            st.image(
+                                svg,
+                                caption=f"Route {node_id}; {num_steps} steps; Route score: {route_score}",
+                            )
+                            st.warning(
+                                f"ReducedRouteCGR could not be displayed for cluster {cluster_num}."
+                            )
+                        else:
+                            st.warning(
+                                f"Could not generate SVG for route {node_id} or its ReducedRouteCGR."
+                            )
+                    except Exception as e:
+                        st.error(
+                            f"Error displaying route {node_id} for cluster {cluster_num}: {e}"
+                        )
+def download_clustering_results():
+    """10. Clustering Results Download: Providing functionality to download."""
+    if st.session_state.get("clustering_done", False):
+        tree_for_html = st.session_state.get("tree")
+        clusters_for_html = st.session_state.get("clusters")
+        r_route_cgrs_for_html = st.session_state.get(
+            "r_route_cgrs_dict"
+        )  # This was used instead of reactions_dict in the original for report
+        if not tree_for_html:
+            st.warning("MCTS Tree data not found. Cannot generate cluster reports.")
+            return
+        if not clusters_for_html:
+            st.warning("Cluster data not found. Cannot generate cluster reports.")
+            return
+        # r_route_cgrs_for_html is optional for routes_clustering_report if not essential
+        st.subheader("Cluster Reports")  # Changed subheader in original
+        st.write("Generate downloadable HTML reports for each cluster:")
+        MAX_DOWNLOAD_LINKS_DISPLAYED = 10
+        num_clusters_total = len(clusters_for_html)
+        clusters_items = list(clusters_for_html.items())
+        for i, (cluster_idx, group_data) in enumerate(
+            clusters_items
+        ):  # group_data might not be needed here if report uses cluster_idx
+            if i >= MAX_DOWNLOAD_LINKS_DISPLAYED:
+                break
+            try:
+                html_content = routes_clustering_report(
+                    tree_for_html,
+                    clusters_for_html,  # Pass the whole dict
+                    str(cluster_idx),  # Pass the key of the cluster
+                    r_route_cgrs_for_html,  # Pass the r_route_cgrs dict
+                    aam=False,
+                )
+                st.download_button(
+                    label=f"Download report for cluster {cluster_idx}",
+                    data=html_content,
+                    file_name=f"cluster_{cluster_idx}_{st.session_state.target_smiles}.html",
+                    mime="text/html",
+                    key=f"download_cluster_{cluster_idx}",
+                )
+            except Exception as e:
+                st.error(f"Error generating report for cluster {cluster_idx}: {e}")
+        if num_clusters_total > MAX_DOWNLOAD_LINKS_DISPLAYED:
+            remaining_items = clusters_items[MAX_DOWNLOAD_LINKS_DISPLAYED:]
+            remaining_count = len(remaining_items)
+            expander_label = f"Show remaining {remaining_count} cluster reports"
+            with st.expander(expander_label):
+                for (
+                    group_index,
+                    _,
+                ) in remaining_items:  # group_data not needed here either
+                    try:
+                        html_content = routes_clustering_report(
+                            tree_for_html,
+                            clusters_for_html,
+                            str(group_index),
+                            r_route_cgrs_for_html,
+                            aam=False,
+                        )
+                        st.download_button(
+                            label=f"Download report for cluster {group_index}",
+                            data=html_content,
+                            file_name=f"cluster_{group_index}_{st.session_state.target_smiles}.html",
+                            mime="text/html",
+                            key=f"download_cluster_expanded_{group_index}",
+                        )
+                    except Exception as e:
+                        st.error(
+                            f"Error generating report for cluster {group_index} (expanded): {e}"
+                        )
+        try:
+            buffer = io.BytesIO()
+            with zipfile.ZipFile(
+                buffer, mode="w", compression=zipfile.ZIP_DEFLATED
+            ) as zf:
+                for idx, _ in clusters_items:  # group_data not needed
+                    html_content_zip = routes_clustering_report(
+                        tree_for_html,
+                        clusters_for_html,
+                        str(idx),
+                        r_route_cgrs_for_html,
+                        aam=False,
+                    )
+                    filename = f"cluster_{idx}_{st.session_state.target_smiles}.html"
+                    zf.writestr(filename, html_content_zip)
+            buffer.seek(0)
+            st.download_button(
+                label="📦 Download all cluster reports as ZIP",
+                data=buffer,
+                file_name=f"all_cluster_reports_{st.session_state.target_smiles}.zip",
+                mime="application/zip",
+                key="download_all_clusters_zip",
+            )
+        except Exception as e:
+            st.error(f"Error generating ZIP file for cluster reports: {e}")
+def setup_subclustering():
+    """11. Subclustering: Encapsulating the logic related to the "subclustering" functionality."""
+    if st.session_state.get(
+        "clustering_done", False
+    ):  # Subclustering depends on clustering being done
+        st.divider()
+        st.header("Sub-Clustering within a selected Cluster")
+        if st.button("Run Subclustering Analysis", key="submit_subclustering_button"):
+            st.session_state.subclustering_done = False
+            st.session_state.subclusters = None
+            with st.spinner("Performing subclustering analysis..."):
+                try:
+                    clusters_for_sub = st.session_state.get("clusters")
+                    r_route_cgrs_dict_for_sub = st.session_state.get(
+                        "r_route_cgrs_dict"
+                    )
+                    route_cgrs_dict_for_sub = st.session_state.get("route_cgrs_dict")
+                    if (
+                        clusters_for_sub
+                        and r_route_cgrs_dict_for_sub
+                        and route_cgrs_dict_for_sub
+                    ):  # Ensure all are present
+                        all_subgroups = subcluster_all_clusters(
+                            clusters_for_sub,
+                            r_route_cgrs_dict_for_sub,
+                            route_cgrs_dict_for_sub,
+                        )
+                        st.session_state.subclusters = all_subgroups
+                        st.session_state.subclustering_done = True
+                        st.success("Subclustering analysis complete.")
+                        gc.collect()
+                        st.rerun()
+                    else:
+                        missing = []
+                        if not clusters_for_sub:
+                            missing.append("clusters")
+                        if not r_route_cgrs_dict_for_sub:
+                            missing.append("ReducedRouteCGRs dictionary")
+                        if not route_cgrs_dict_for_sub:
+                            missing.append("RouteCGRs dictionary")
+                        st.error(
+                            f"Cannot run subclustering. Missing data: {', '.join(missing)}. Please ensure clustering ran successfully."
+                        )
+                        st.session_state.subclustering_done = False
+                except Exception as e:
+                    st.error(f"An error occurred during subclustering: {e}")
+                    st.session_state.subclustering_done = False
+def display_subclustering_results():
+    """12. Subclustering Results Display: Handling the presentation of results."""
+    if st.session_state.get("subclustering_done", False):
+        sub = st.session_state.get("subclusters")
+        tree = st.session_state.get("tree")
+        # clusters_for_sub_display = st.session_state.get('clusters') # Not directly used in display logic from original code snippet
+        if not sub or not tree:
+            st.error(
+                "Subclustering results (subclusters or tree) are missing. Please re-run subclustering."
+            )
+            st.session_state.subclustering_done = False
+            return
+        sub_input_col, sub_display_col = st.columns([0.25, 0.75])
+        with sub_input_col:
+            st.subheader("Select Cluster and Subcluster")
+            available_cluster_nums = list(sub.keys())
+            if not available_cluster_nums:
+                st.warning("No clusters available in subclustering results.")
+                return  # Exit if no clusters to select
+            user_input_cluster_num_display = st.selectbox(
+                "Select Cluster #:",
+                options=sorted(available_cluster_nums),
+                key="subcluster_num_select_key",
+            )
+            selected_subcluster_idx = 0
+            if user_input_cluster_num_display in sub:
+                sub_step_cluster = sub[user_input_cluster_num_display]
+                allowed_subclusters_indices = sorted(list(sub_step_cluster.keys()))
+                if not allowed_subclusters_indices:
+                    st.warning(
+                        f"No reaction steps (subclusters) found for Cluster {user_input_cluster_num_display}."
+                    )
+                else:
+                    selected_subcluster_idx = st.selectbox(
+                        "Select Subcluster Index:",
+                        options=allowed_subclusters_indices,
+                        key="subcluster_index_select_key",
+                    )
+                    if selected_subcluster_idx in sub[user_input_cluster_num_display]:
+                        current_subcluster_data = sub[user_input_cluster_num_display][
+                            selected_subcluster_idx
+                        ]
+                        if "r_route_cgr" in current_subcluster_data:
+                            cluster_r_route_cgr_display = current_subcluster_data[
+                                "r_route_cgr"
+                            ]
+                            cluster_r_route_cgr_display.clean2d()
+                            st.image(
+                                cluster_r_route_cgr_display.depict(),
+                                caption=f"ReducedRouteCGR of parent Cluster {user_input_cluster_num_display}",
+                            )
+                        else:
+                            st.warning("ReducedRouteCGR for this subcluster not found.")
+            else:
+                st.warning(
+                    f"Selected cluster {user_input_cluster_num_display} not found in subclustering results."
+                )
+                return
+        with sub_display_col:
+            st.subheader("Subcluster Details")
+            if (
+                user_input_cluster_num_display in sub
+                and selected_subcluster_idx in sub[user_input_cluster_num_display]
+            ):
+                subcluster_content = sub[user_input_cluster_num_display][
+                    selected_subcluster_idx
+                ]
+                # subcluster_to_display = post_process_subgroup(subcluster_content) #Under development
+                subcluster_to_display = subcluster_content
+                if (
+                    not subcluster_to_display
+                    or "nodes_data" not in subcluster_to_display
+                    or not subcluster_to_display["nodes_data"]
+                ):
+                    st.info("No routes or data found for this subcluster selection.")
+                else:
+                    MAX_ROUTES_PER_SUBCLUSTER = 5
+                    all_route_ids_in_subcluster = list(
+                        subcluster_to_display["nodes_data"].keys()
+                    )
+                    routes_to_display_direct = all_route_ids_in_subcluster[
+                        :MAX_ROUTES_PER_SUBCLUSTER
+                    ]
+                    remaining_routes_sub = all_route_ids_in_subcluster[
+                        MAX_ROUTES_PER_SUBCLUSTER:
+                    ]
+                    st.markdown(
+                        f"--- \n**Subcluster {user_input_cluster_num_display}.{selected_subcluster_idx}** (Size: {len(all_route_ids_in_subcluster)})"
+                    )
+                    if "synthon_reaction" in subcluster_to_display:
+                        synthon_reaction = subcluster_to_display["synthon_reaction"]
+                        try:
+                            st.image(
+                                depict_custom_reaction(synthon_reaction),
+                                caption=f"Markush-like pseudo reaction of subcluster",
+                            )  # Assuming depict_custom_reaction
+                        except Exception as e_depict:
+                            st.warning(f"Could not depict synthon reaction: {e_depict}")
+                    else:
+                        st.info("No synthon reaction data for this subcluster.")
+                    for route_id in routes_to_display_direct:
+                        try:
+                            route_score_sub = round(tree.route_score(route_id), 3)
+                            svg_sub = get_route_svg(tree, route_id)
+                            if svg_sub:
+                                st.image(
+                                    svg_sub,
+                                    caption=f"Route {route_id}; Score: {route_score_sub}",
+                                )
+                            else:
+                                st.warning(
+                                    f"Could not generate SVG for route {route_id}."
+                                )
+                        except Exception as e:
+                            st.error(
+                                f"Error displaying route {route_id} in subcluster: {e}"
+                            )
+                    if remaining_routes_sub:
+                        with st.expander(
+                            f"... and {len(remaining_routes_sub)} more routes in this subcluster"
+                        ):
+                            for route_id in remaining_routes_sub:
+                                try:
+                                    route_score_sub = round(
+                                        tree.route_score(route_id), 3
+                                    )
+                                    svg_sub = get_route_svg(tree, route_id)
+                                    if svg_sub:
+                                        st.image(
+                                            svg_sub,
+                                            caption=f"Route {route_id}; Score: {route_score_sub}",
+                                        )
+                                    else:
+                                        st.warning(
+                                            f"Could not generate SVG for route {route_id}."
+                                        )
+                                except Exception as e:
+                                    st.error(
+                                        f"Error displaying route {route_id} in subcluster (expanded): {e}"
+                                    )
+            else:
+                st.info("Select a valid cluster and subcluster index to see details.")
+def download_subclustering_results():
+    """13. Subclustering Results Download: Providing functionality to download."""
+    if (
+        st.session_state.get("subclustering_done", False)
+        and "subcluster_num_select_key" in st.session_state
+        and "subcluster_index_select_key" in st.session_state
+    ):
+        sub = st.session_state.get("subclusters")
+        tree = st.session_state.get("tree")
+        r_route_cgrs_for_report = st.session_state.get(
+            "r_route_cgrs_dict"
+        )  # Used by routes_subclustering_report
+        user_input_cluster_num_display = st.session_state.subcluster_num_select_key
+        selected_subcluster_idx = st.session_state.subcluster_index_select_key
+        if not tree or not sub or not r_route_cgrs_for_report:
+            st.warning(
+                "Missing data for subclustering report generation (tree, subclusters, or ReducedRouteCGRs)."
+            )
+            return
+        if (
+            user_input_cluster_num_display in sub
+            and selected_subcluster_idx in sub[user_input_cluster_num_display]
+        ):
+            subcluster_data_for_report = sub[user_input_cluster_num_display][
+                selected_subcluster_idx
+            ]
+            # Apply the same post-processing as in display
+            processed_subcluster_data = post_process_subgroup(
+                subcluster_data_for_report
+            )
+            if "nodes_data" in subcluster_data_for_report and isinstance(
+                subcluster_data_for_report["nodes_data"], dict
+            ):
+                processed_subcluster_data["group_lgs"] = group_by_identical_values(
+                    subcluster_data_for_report["nodes_data"]
+                )
+            else:
+                processed_subcluster_data["group_lgs"] = {}
+            try:
+                subcluster_html_content = routes_subclustering_report(
+                    tree,
+                    processed_subcluster_data,  # Pass the specific post-processed subcluster data
+                    user_input_cluster_num_display,
+                    selected_subcluster_idx,
+                    r_route_cgrs_for_report,  # Pass the whole r_route_cgrs dict
+                    if_lg_group=True,  # This parameter was in the original call
+                )
+                st.download_button(
+                    label=f"Download report for subcluster {user_input_cluster_num_display}.{selected_subcluster_idx}",
+                    data=subcluster_html_content,
+                    file_name=f"subcluster_{user_input_cluster_num_display}.{selected_subcluster_idx}_{st.session_state.target_smiles}.html",
+                    mime="text/html",
+                    key=f"download_subcluster_{user_input_cluster_num_display}_{selected_subcluster_idx}",
+                )
+            except Exception as e:
+                st.error(
+                    f"Error generating download report for subcluster {user_input_cluster_num_display}.{selected_subcluster_idx}: {e}"
+                )
+        # else:
+        # This case is handled by the display logic mostly, download button just won't appear or will be for previous valid selection.
+def implement_restart():
+    """14. Restart: Implementing the logic to reset or restart the application state."""
+    st.divider()
+    st.header("Restart Application State")
+    if st.button("Clear All Results & Restart", key="restart_button"):
+        keys_to_clear = [
+            "planning_done",
+            "tree",
+            "res",
+            "target_smiles",
+            "clustering_done",
+            "clusters",
+            "reactions_dict",
+            "num_clusters_setting",
+            "route_cgrs_dict",
+            "r_route_cgrs_dict",
+            "subclustering_done",
+            "subclusters",  # "sub" was renamed
+            "clusters_downloaded",
+            # Potentially ketcher related keys if they need manual reset beyond new input
+            "ketcher_widget",
+            "smiles_text_input_key",  # Keys for widgets
+            "subcluster_num_select_key",
+            "subcluster_index_select_key",
+        ]
+        for key in keys_to_clear:
+            if key in st.session_state:
+                del st.session_state[key]
+        # Reset ketcher input to default by resetting its session state variable
+        st.session_state.ketcher = DEFAULT_MOL
+        # Also explicitly set target_smiles to empty or default to avoid stale data
+        st.session_state.target_smiles = ""
+        # It's generally better to let Streamlit manage widget state if possible,
+        # but for a full reset, clearing their explicit session state keys might be needed.
+        st.rerun()
+# --- Main Application Flow ---
+def main():
+    initialize_app()
+    setup_sidebar()
+    current_smile_code = handle_molecule_input()
+    # Update session_state.ketcher if current_smile_code has changed from ketcher output
+    if st.session_state.get("ketcher") != current_smile_code:
+        st.session_state.ketcher = current_smile_code
+        # No rerun here, let the flow continue. handle_molecule_input already warns.
+    setup_planning_options()  # This function now also handles the button press and logic for planning
+    # Display planning results and download options together
+    if st.session_state.get("planning_done", False):
+        display_planning_results()  # Displays stats and routes
+        if st.session_state.res and st.session_state.res.get("solved", False):
+            stat_col, download_col = st.columns(
+                2, gap="medium"
+            )  # Placeholder for download column
+            with stat_col:
+                st.subheader("Statistics")
+                try:
+                    res = st.session_state.res
+                    if (
+                        "target_smiles" not in res
+                        and "target_smiles" in st.session_state
+                    ):
+                        res["target_smiles"] = st.session_state.target_smiles
+                    cols_to_show = [
+                        col
+                        for col in [
+                            "target_smiles",
+                            "num_routes",
+                            "num_nodes",
+                            "num_iter",
+                            "search_time",
+                        ]
+                        if col in res
+                    ]
+                    if cols_to_show:  # Ensure there are columns to show
+                        df = pd.DataFrame(res, index=[0])[cols_to_show]
+                        st.dataframe(df)
+                    else:
+                        st.write("No statistics to display from planning results.")
+                except Exception as e:
+                    st.error(f"Error displaying statistics: {e}")
+                    st.write(res)  # Show raw dict if DataFrame fails
+            with download_col:
+                st.subheader("Planning Downloads")  # Adding a subheader for clarity
+                download_planning_results()
+    # Clustering section (setup button, display, download)
+    if (
+        st.session_state.get("planning_done", False)
+        and st.session_state.res
+        and st.session_state.res.get("solved", False)
+    ):
+        setup_clustering()  # Contains the "Run Clustering" button and logic
+        if st.session_state.get("clustering_done", False):
+            display_clustering_results()  # Displays cluster routes and stats
+            cluster_stat_col, cluster_download_col = st.columns(2, gap="medium")
+            with cluster_stat_col:
+                clusters = st.session_state.clusters
+                cluster_sizes = [
+                    cluster.get("group_size", 0)
+                    for cluster in clusters.values()
+                    if cluster
+                ]  # Safe get
+                st.subheader("Cluster Statistics")
+                if cluster_sizes:
+                    cluster_df = pd.DataFrame(
+                        {
+                            "Cluster": [
+                                k for k, v in clusters.items() if v
+                            ],  # Filter out empty clusters
+                            "Number of Routes": [
+                                v["group_size"] for v in clusters.values() if v
+                            ],
+                        }
+                    )
+                    if not cluster_df.empty:
+                        cluster_df.index += 1
+                        st.dataframe(cluster_df)
+                        best_route_html = html_top_routes_cluster(
+                            clusters,
+                            st.session_state.tree,
+                            st.session_state.target_smiles,
+                        )
+                        st.download_button(
+                            label=f"Download best route from each cluster",
+                            data=best_route_html,
+                            file_name=f"cluster_best_{st.session_state.target_smiles}.html",
+                            mime="text/html",
+                            key=f"download_cluster_best",
+                        )
+                    else:
+                        st.write("No valid cluster data to display statistics for.")
+                    # download_top_routes_cluster()
+                else:
+                    st.write("No cluster data to display statistics for.")
+            with cluster_download_col:
+                download_clustering_results()
+    # Subclustering section (setup button, display, download)
+    if st.session_state.get("clustering_done", False):  # Depends on clustering
+        setup_subclustering()  # Contains "Run Subclustering" button
+        if st.session_state.get("subclustering_done", False):
+            display_subclustering_results()  # Displays subcluster details and routes
+            download_subclustering_results()  # This needs to be called after selections are made in display.
+    implement_restart()
+# Run main() only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()

synplan/interfaces/uspto/uspto_reaction_rules.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7d9f275781cc926eeff1d9a9564b4aa335b66506c621f943c82ec902460bf977
+size 45489168

synplan/interfaces/uspto/weights/ranking_policy_network.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c1c8852c4d8177538ba2a815d53e7b29f27e8a6067341f05b136a690bc46d53e
+size 164172437

synplan/mcts/__init__.py ADDED Viewed

	@@ -0,0 +1,8 @@

+from CGRtools.containers import MoleculeContainer
+from .node import *
+from .tree import *
+# Import-time side effect: changes the depiction settings of MoleculeContainer for the
+# whole process. NOTE(review): aam=False presumably hides atom-to-atom mapping numbers
+# in drawings - confirm against the CGRtools documentation.
+MoleculeContainer.depict_settings(aam=False)
+# Names exported by ``from synplan.mcts import *``.
+__all__ = ["Tree", "Node"]

synplan/mcts/evaluation.py ADDED Viewed

	@@ -0,0 +1,45 @@

+"""Module containing a class that represents a value function for prediction of
+synthesisablity of new nodes in the tree search."""
+from typing import List
+import torch
+from synplan.chem.precursor import Precursor, compose_precursors
+from synplan.ml.networks.value import ValueNetwork
+from synplan.ml.training import mol_to_pyg
+class ValueNetworkFunction:
+    """Value function implemented as a value neural network for node evaluation
+    (synthesisability prediction) in tree search."""
+    def __init__(self, weights_path: str) -> None:
+        """The value function predicts the probability to synthesize the target molecule
+        with available building blocks starting from a given precursor.
+        :param weights_path: The value network weights file path.
+        """
+        value_net = ValueNetwork.load_from_checkpoint(
+            weights_path, map_location=torch.device("cpu")
+        )
+        self.value_network = value_net.eval()
+    def predict_value(self, precursors: List[Precursor,]) -> float:
+        """Predicts a value based on the given precursors from the node. For prediction,
+        precursors must be composed into a single molecule (product).
+        :param precursors: The list of precursors.
+        :return: The predicted float value ("synthesisability") of the node.
+        """
+        molecule = compose_precursors(precursors=precursors, exclude_small=True)
+        pyg_graph = mol_to_pyg(molecule)
+        if pyg_graph:
+            with torch.no_grad():
+                value_pred = self.value_network.forward(pyg_graph)[0].item()
+        else:
+            value_pred = -1e6
+        return value_pred

synplan/mcts/expansion.py ADDED Viewed

	@@ -0,0 +1,96 @@

+"""Module containing a class that represents a policy function for node expansion in the
+tree search."""
+from typing import Iterator, List, Tuple, Union
+import torch
+import torch_geometric
+from CGRtools.reactor.reactor import Reactor
+from synplan.chem.precursor import Precursor
+from synplan.ml.networks.policy import PolicyNetwork
+from synplan.ml.training import mol_to_pyg
+from synplan.utils.config import PolicyNetworkConfig
+class PolicyNetworkFunction:
+    """Policy function implemented as a policy neural network for node expansion in tree
+    search."""
+    def __init__(
+        self, policy_config: PolicyNetworkConfig, compile: bool = False
+    ) -> None:
+        """Initializes the expansion function (ranking or filter policy network).
+        :param policy_config: An expansion policy configuration.
+        :param compile: Is supposed to speed up the training with model compilation.
+        """
+        self.config = policy_config
+        policy_net = PolicyNetwork.load_from_checkpoint(
+            self.config.weights_path,
+            map_location=torch.device("cpu"),
+            batch_size=1,
+            dropout=0,
+        )
+        policy_net = policy_net.eval()
+        if compile:
+            self.policy_net = torch_geometric.compile(policy_net, dynamic=True)
+        else:
+            self.policy_net = policy_net
+    def predict_reaction_rules(
+        self, precursor: Precursor, reaction_rules: List[Reactor]
+    ) -> Iterator[Union[Iterator, Iterator[Tuple[float, Reactor, int]]]]:
+        """The policy function predicts the list of reaction rules for a given precursor.
+        :param precursor: The current precursor for which the reaction rules are predicted.
+        :param reaction_rules: The list of reaction rules from which applicable reaction
+            rules are predicted and selected.
+        :return: Yielding the predicted probability for the reaction rule, reaction rule
+            and reaction rule id.
+        """
+        out_dim = list(self.policy_net.modules())[-1].out_features
+        if out_dim != len(reaction_rules):
+            raise Exception(
+                f"The policy network output dimensionality is {out_dim}, but the number of reaction rules is {len(reaction_rules)}. "
+                "Probably you use a different version of the policy network. Be sure to retain the policy network "
+                "with the current set of reaction rules"
+            )
+        pyg_graph = mol_to_pyg(precursor.molecule, canonicalize=False)
+        if pyg_graph:
+            with torch.no_grad():
+                if self.policy_net.policy_type == "filtering":
+                    probs, priority = self.policy_net.forward(pyg_graph)
+                if self.policy_net.policy_type == "ranking":
+                    probs = self.policy_net.forward(pyg_graph)
+            del pyg_graph
+        else:
+            return []
+        probs = probs[0].double()
+        if self.policy_net.policy_type == "filtering":
+            priority = priority[0].double()
+            priority_coef = self.config.priority_rules_fraction
+            probs = (1 - priority_coef) * probs + priority_coef * priority
+        sorted_probs, sorted_rules = torch.sort(probs, descending=True)
+        sorted_probs, sorted_rules = (
+            sorted_probs[: self.config.top_rules],
+            sorted_rules[: self.config.top_rules],
+        )
+        if self.policy_net.policy_type == "filtering":
+            sorted_probs = torch.softmax(sorted_probs, -1)
+        sorted_probs, sorted_rules = sorted_probs.tolist(), sorted_rules.tolist()
+        for prob, rule_id in zip(sorted_probs, sorted_rules):
+            if (
+                prob > self.config.rule_prob_threshold
+            ):  # search may fail if rule_prob_threshold is too low (recommended value is 0.0)
+                yield prob, reaction_rules[rule_id], rule_id

synplan/mcts/node.py ADDED Viewed

	@@ -0,0 +1,47 @@

+"""Module containing a class Node in the tree search."""
+class Node:
+    """Node class represents a node in the tree search."""
+    def __init__(
+        self, precursors_to_expand: tuple = None, new_precursors: tuple = None
+    ) -> None:
+        """The function initializes the new Node object.
+        :param precursors_to_expand: The tuple of precursors to be expanded. The first precursor
+            in the tuple is the current precursor which will be expanded (for which new
+            precursors will be generated by applying the predicted reaction rules). When
+            the first precursor has been successfully expanded, the second precursor becomes
+            the current precursor to be expanded.
+        :param new_precursors: The tuple of new precursors generated by applying the reaction
+            rule.
+        """
+        self.precursors_to_expand = precursors_to_expand
+        self.new_precursors = new_precursors
+        if len(self.precursors_to_expand) == 0:
+            self.curr_precursor = tuple()
+        else:
+            self.curr_precursor = self.precursors_to_expand[0]
+            self.next_precursor = self.precursors_to_expand[1:]
+    def __len__(self) -> int:
+        """Returns the number of precursor in the node to expand."""
+        return len(self.precursors_to_expand)
+    def __repr__(self) -> str:
+        """Returns the SMILES of each precursor in precursor_to_expand and new_precursor."""
+        return (
+            f"New precursors: {self.new_precursors}\n"
+            f"Precursors to expand: {self.precursors_to_expand}\n"
+        )
+    def is_solved(self) -> bool:
+        """If True, it is a terminal node.
+        There are no precursors for expansion.
+        """
+        return len(self.precursors_to_expand) == 0

synplan/mcts/search.py ADDED Viewed

	@@ -0,0 +1,199 @@

+"""Module containing functions for running tree search for the set of target
+molecules."""
+import csv
+import json
+import logging
+import os.path
+from pathlib import Path
+from typing import Union
+from CGRtools.containers import MoleculeContainer
+from tqdm import tqdm
+from synplan.chem.reaction_routes.route_cgr import extract_reactions
+from synplan.chem.reaction_routes.io import write_routes_csv, write_routes_json
+from synplan.chem.utils import mol_from_smiles
+from synplan.mcts.evaluation import ValueNetworkFunction
+from synplan.mcts.expansion import PolicyNetworkFunction
+from synplan.mcts.tree import Tree, TreeConfig
+from synplan.utils.config import PolicyNetworkConfig
+from synplan.utils.loading import load_building_blocks, load_reaction_rules
+from synplan.utils.visualisation import extract_routes, generate_results_html
+def extract_tree_stats(
+    tree: Tree, target: Union[str, MoleculeContainer], init_smiles: str = None
+):
+    """Collects various statistics from a tree and returns them in a dictionary format.
+    :param tree: The built search tree.
+    :param target: The target molecule associated with the tree.
+    :param init_smiles: initial SMILES of the molecule, optional.
+    :return: A dictionary with the calculated statistics.
+    """
+    newick_tree, newick_meta = tree.newickify(visits_threshold=0)
+    newick_meta_line = ";".join(
+        [f"{nid},{v[0]},{v[1]},{v[2]}" for nid, v in newick_meta.items()]
+    )
+    return {
+        "target_smiles": init_smiles if init_smiles is not None else str(target),
+        "num_routes": len(tree.winning_nodes),
+        "num_nodes": len(tree),
+        "num_iter": tree.curr_iteration,
+        "tree_depth": max(tree.nodes_depth.values()),
+        "search_time": round(tree.curr_time, 1),
+        "newick_tree": newick_tree,
+        "newick_meta": newick_meta_line,
+        "solved": True if len(tree.winning_nodes) > 0 else False,
+    }
+def run_search(
+    targets_path: str,
+    search_config: dict,
+    policy_config: PolicyNetworkConfig,
+    reaction_rules_path: str,
+    building_blocks_path: str,
+    value_network_path: str = None,
+    results_root: str = "search_results",
+) -> None:
+    """Performs a tree search on a set of target molecules using specified configuration
+    and reaction rules, logging the results and statistics.
+    :param targets_path: The path to the file containing the target molecules (in SDF or
+        SMILES format).
+    :param search_config: The config object containing the configuration for the tree
+        search.
+    :param policy_config: The config object containing the configuration for the policy.
+    :param reaction_rules_path: The path to the file containing reaction rules.
+    :param building_blocks_path: The path to the file containing building blocks.
+    :param value_network_path: The path to the file containing value weights (optional).
+    :param results_root: The name of the folder where the results of the tree search
+        will be saved.
+    :return: None.
+    """
+    # results folder
+    results_root = Path(results_root)
+    if not results_root.exists():
+        results_root.mkdir()
+    # output files
+    stats_file = results_root.joinpath("tree_search_stats.csv")
+    routes_file = results_root.joinpath("extracted_routes.json")
+    routes_folder = results_root.joinpath("extracted_routes_html")
+    routes_folder.mkdir(exist_ok=True)
+    # stats header
+    stats_header = [
+        "target_smiles",
+        "num_routes",
+        "num_nodes",
+        "num_iter",
+        "tree_depth",
+        "search_time",
+        "newick_tree",
+        "newick_meta",
+        "solved",
+        "error",
+    ]
+    # config
+    policy_function = PolicyNetworkFunction(policy_config=policy_config)
+    if search_config["evaluation_type"] == "gcn" and value_network_path:
+        value_function = ValueNetworkFunction(weights_path=value_network_path)
+    else:
+        value_function = None
+    reaction_rules = load_reaction_rules(reaction_rules_path)
+    building_blocks = load_building_blocks(building_blocks_path, standardize=True)
+    # run search
+    n_solved = 0
+    extracted_routes = []
+    tree_config = TreeConfig.from_dict(search_config)
+    tree_config.silent = True
+    with (
+        open(targets_path, "r", encoding="utf-8") as targets,
+        open(stats_file, "w", encoding="utf-8", newline="\n") as csvfile,
+    ):
+        statswriter = csv.DictWriter(csvfile, delimiter=",", fieldnames=stats_header)
+        statswriter.writeheader()
+        for ti, target_smi in tqdm(
+            enumerate(targets),
+            leave=True,
+            desc="Number of target molecules processed: ",
+            bar_format="{desc}{n} [{elapsed}]",
+        ):
+            target_smi = target_smi.strip()
+            target_mol = mol_from_smiles(target_smi)
+            try:
+                # run search
+                tree = Tree(
+                    target=target_mol,
+                    config=tree_config,
+                    reaction_rules=reaction_rules,
+                    building_blocks=building_blocks,
+                    expansion_function=policy_function,
+                    evaluation_function=value_function,
+                )
+                _ = list(tree)
+            except Exception as e:
+                extracted_routes.append(
+                    [
+                        {
+                            "type": "mol",
+                            "smiles": target_smi,
+                            "in_stock": False,
+                            "children": [],
+                        }
+                    ]
+                )
+                logging.warning(
+                    f"Retrosynthetic_planning {target_smi} failed with the following error: {e}"
+                )
+                continue
+            # is solved
+            n_solved += bool(tree.winning_nodes)
+            if bool(tree.winning_nodes):
+                # extract routes
+                extracted_routes.append(extract_routes(tree))
+                # save routes
+                generate_results_html(
+                    tree,
+                    os.path.join(routes_folder, f"retroroutes_target_{ti}.html"),
+                    extended=True,
+                )
+                # save stats
+                statswriter.writerow(extract_tree_stats(tree, target_smi))
+                csvfile.flush()
+                # save json routes
+                with open(routes_file, "w", encoding="utf-8") as f:
+                    json.dump(extracted_routes, f)
+                # Save mapped reactions (CSV)
+                routes_dict = extract_reactions(tree)
+                write_routes_csv(
+                    routes_dict, os.path.join(routes_folder, f"mapped_routes_{ti}.csv")
+                )
+                # save mapped reactions (JSON)
+                write_routes_json(
+                    routes_dict, os.path.join(routes_folder, f"mapped_routes_{ti}.json")
+                )
+    print(f"Number of solved target molecules: {n_solved}")

synplan/mcts/tree.py ADDED Viewed

	@@ -0,0 +1,635 @@

+"""Module containing a class Tree that used for tree search of retrosynthetic routes."""
+import logging
+import warnings
+from collections import defaultdict, deque
+from math import sqrt
+from random import choice, uniform
+from time import time
+from typing import Dict, List, Set, Tuple
+from CGRtools.reactor import Reactor
+from CGRtools.containers import MoleculeContainer
+from tqdm.auto import tqdm
+from synplan.chem.precursor import Precursor
+from synplan.chem.reaction import Reaction, apply_reaction_rule
+from synplan.mcts.evaluation import ValueNetworkFunction
+from synplan.mcts.expansion import PolicyNetworkFunction
+from synplan.mcts.node import Node
+from synplan.utils.config import TreeConfig
+class Tree:
+    """Tree class with attributes and methods for Monte-Carlo tree search."""
+    def __init__(
+        self,
+        target: MoleculeContainer,
+        config: TreeConfig,
+        reaction_rules: List[Reactor],
+        building_blocks: Set[str],
+        expansion_function: PolicyNetworkFunction,
+        evaluation_function: ValueNetworkFunction = None,
+    ):
+        """Initializes a tree object with optional parameters for tree search for target
+        molecule.
+        :param target: A target molecule for retrosynthetic routes search.
+        :param config: A tree configuration.
+        :param reaction_rules: A loaded reaction rules.
+        :param building_blocks: A loaded building blocks.
+        :param expansion_function: A loaded policy function.
+        :param evaluation_function: A loaded value function. If None, the rollout is
+            used as a default for node evaluation.
+        """
+        # config parameters
+        self.config = config
+        assert isinstance(
+            target, MoleculeContainer
+        ), "Target should be given as MoleculeContainer"
+        assert len(target) > 3, "Target molecule has less than 3 atoms"
+        target_molecule = Precursor(target)
+        target_molecule.prev_precursors.append(Precursor(target))
+        target_node = Node(
+            precursors_to_expand=(target_molecule,), new_precursors=(target_molecule,)
+        )
+        # tree structure init
+        self.nodes: Dict[int, Node] = {1: target_node}
+        self.parents: Dict[int, int] = {1: 0}
+        self.children: Dict[int, Set[int]] = {1: set()}
+        self.winning_nodes: List[int] = []
+        self.visited_nodes: Set[int] = set()
+        self.expanded_nodes: Set[int] = set()
+        self.nodes_visit: Dict[int, int] = {1: 0}
+        self.nodes_depth: Dict[int, int] = {1: 0}
+        self.nodes_prob: Dict[int, float] = {1: 0.0}
+        self.nodes_rules: Dict[int, float] = {}
+        self.nodes_init_value: Dict[int, float] = {1: 0.0}
+        self.nodes_total_value: Dict[int, float] = {1: 0.0}
+        # tree building limits
+        self.curr_iteration: int = 0
+        self.curr_tree_size: int = 2
+        self.start_time: float = 0
+        self.curr_time: float = 0
+        # building blocks and reaction reaction_rules
+        self.reaction_rules = reaction_rules
+        self.building_blocks = building_blocks
+        # policy and value functions
+        self.policy_network = expansion_function
+        if self.config.evaluation_type == "gcn":
+            if evaluation_function is None:
+                raise ValueError(
+                    "Value function not specified while evaluation type is 'gcn'"
+                )
+            if (
+                evaluation_function is not None
+                and self.config.evaluation_type == "rollout"
+            ):
+                raise ValueError(
+                    "Value function is not None while evaluation type is 'rollout'. What should  be evaluation type ?"
+                )
+            self.value_network = evaluation_function
+        # utils
+        self._tqdm = True  # needed to disable tqdm with multiprocessing module
+        target_smiles = str(self.nodes[1].curr_precursor.molecule)
+        if target_smiles in self.building_blocks:
+            self.building_blocks.remove(target_smiles)
+            print(
+                "Target was found in building blocks and removed from building blocks."
+            )
+    def __len__(self) -> int:
+        """Returns the current size (the number of nodes) in the tree."""
+        return self.curr_tree_size - 1
+    def __iter__(self) -> "Tree":
+        """The function is defining an iterator for a Tree object.
+        Also needed for the bar progress display.
+        """
+        self.start_time = time()
+        if self._tqdm:
+            self._tqdm = tqdm(
+                total=self.config.max_iterations, disable=self.config.silent
+            )
+        return self
+    def __repr__(self) -> str:
+        """Returns a string representation of the tree (target SMILES, tree size, and
+        the number of found routes)."""
+        return self.report()
+    def __next__(self) -> Tuple[bool, List[int]]:
+        """The __next__ method is used to do one iteration of the tree building.
+        :return: Returns True if the route was found and the node id of the last node in
+            the route. Otherwise, returns False and the id of the last visited node.
+        """
+        # stop conditions: iteration, tree size and time limits taken from the config.
+        # curr_time is refreshed only after these checks, so the time check uses the
+        # value measured at the start of the previous iteration.
+        if self.curr_iteration >= self.config.max_iterations:
+            raise StopIteration("Iterations limit exceeded.")
+        if self.curr_tree_size >= self.config.max_tree_size:
+            raise StopIteration("Max tree size exceeded or all possible routes found.")
+        if self.curr_time >= self.config.max_time:
+            raise StopIteration("Time limit exceeded.")
+        # start new iteration
+        self.curr_iteration += 1
+        self.curr_time = time() - self.start_time
+        if self._tqdm:
+            self._tqdm.update()
+        curr_depth, node_id = 0, 1  # start from the root node_id
+        explore_route = True
+        # descend from the root until a dead node, a solved node, an expansion or the
+        # depth limit ends the iteration
+        while explore_route:
+            self.visited_nodes.add(node_id)
+            if self.nodes_visit[node_id]:  # already visited
+                if not self.children[node_id]:  # dead node
+                    self._update_visits(node_id)
+                    explore_route = False
+                else:
+                    node_id = self._select_node(node_id)  # select the child node
+                    curr_depth += 1
+            else:
+                if self.nodes[node_id].is_solved():  # found route
+                    self._update_visits(
+                        node_id
+                    )  # this prevents expanding of bb node_id
+                    self.winning_nodes.append(node_id)
+                    return True, [node_id]
+                if (
+                    curr_depth < self.config.max_depth
+                ):  # expand node if depth limit is not reached
+                    self._expand_node(node_id)
+                    if not self.children[node_id]:  # node was not expanded
+                        # no child was created: backpropagate a penalty
+                        value_to_backprop = -1.0
+                    else:
+                        self.expanded_nodes.add(node_id)
+                        # NOTE(review): value_to_backprop stays unbound if search_strategy
+                        # or evaluation_agg has a value not handled below
+                        if self.config.search_strategy == "evaluation_first":
+                            # recalculate node value based on children synthesisability and backpropagation
+                            child_values = [
+                                self.nodes_init_value[child_id]
+                                for child_id in self.children[node_id]
+                            ]
+                            if self.config.evaluation_agg == "max":
+                                value_to_backprop = max(child_values)
+                            elif self.config.evaluation_agg == "average":
+                                value_to_backprop = sum(child_values) / len(
+                                    self.children[node_id]
+                                )
+                        elif self.config.search_strategy == "expansion_first":
+                            value_to_backprop = self._get_node_value(node_id)
+                    # backpropagation
+                    self._backpropagate(node_id, value_to_backprop)
+                    self._update_visits(node_id)
+                    explore_route = False
+                    if self.children[node_id]:
+                        # found after expansion
+                        found_after_expansion = set()
+                        for child_id in iter(self.children[node_id]):
+                            if self.nodes[child_id].is_solved():
+                                found_after_expansion.add(child_id)
+                                self.winning_nodes.append(child_id)
+                        if found_after_expansion:
+                            return True, list(found_after_expansion)
+                else:
+                    # depth limit reached: the node is not expanded, its current total
+                    # value is backpropagated instead
+                    self._backpropagate(node_id, self.nodes_total_value[node_id])
+                    self._update_visits(node_id)
+                    explore_route = False
+        return False, [node_id]
+    def _ucb(self, node_id: int) -> float:
+        """Calculates the Upper Confidence Bound (UCB) statistics for a given node.
+        :param node_id: The id of the node.
+        :return: The calculated UCB.
+        """
+        prob = self.nodes_prob[node_id]  # predicted by policy network score
+        visit = self.nodes_visit[node_id]
+        if self.config.ucb_type == "puct":
+            u = (
+                self.config.c_ucb * prob * sqrt(self.nodes_visit[self.parents[node_id]])
+            ) / (visit + 1)
+            ucb_value = self.nodes_total_value[node_id] + u
+        if self.config.ucb_type == "uct":
+            u = (
+                self.config.c_ucb
+                * sqrt(self.nodes_visit[self.parents[node_id]])
+                / (visit + 1)
+            )
+            ucb_value = self.nodes_total_value[node_id] + u
+        if self.config.ucb_type == "value":
+            ucb_value = self.nodes_init_value[node_id] / (visit + 1)
+        return ucb_value
+    def _select_node(self, node_id: int) -> int:
+        """Selects a node based on its UCB value and returns the id of the node with the
+        highest UCB.
+        :param node_id: The id of the node.
+        :return: The id of the node with the highest UCB.
+        """
+        if self.config.epsilon > 0:
+            n = uniform(0, 1)
+            if n < self.config.epsilon:
+                return choice(list(self.children[node_id]))
+        best_score, best_children = None, []
+        for child_id in self.children[node_id]:
+            score = self._ucb(child_id)
+            if best_score is None or score > best_score:
+                best_score, best_children = score, [child_id]
+            elif score == best_score:
+                best_children.append(child_id)
+        # is needed for tree search reproducibility, when all child nodes has the same score
+        return best_children[0]
+    def _expand_node(self, node_id: int) -> None:
+        """Expands the node by generating new precursor with policy (expansion) function.
+        :param node_id: The id the node to be expanded.
+        :return: None.
+        """
+        curr_node = self.nodes[node_id]
+        prev_precursor = curr_node.curr_precursor.prev_precursors
+        tmp_precursor = set()
+        expanded = False
+        for prob, rule, rule_id in self.policy_network.predict_reaction_rules(
+            curr_node.curr_precursor, self.reaction_rules
+        ):
+            for products in apply_reaction_rule(
+                curr_node.curr_precursor.molecule, rule
+            ):
+                # check repeated products
+                if not products or not set(products) - tmp_precursor:
+                    continue
+                tmp_precursor.update(products)
+                for molecule in products:
+                    molecule.meta["reactor_id"] = rule_id
+                new_precursor = tuple(Precursor(mol) for mol in products)
+                scaled_prob = prob * len(
+                    list(filter(lambda x: len(x) > self.config.min_mol_size, products))
+                )
+                if set(prev_precursor).isdisjoint(new_precursor):
+                    precursors_to_expand = (
+                        *curr_node.next_precursor,
+                        *(
+                            x
+                            for x in new_precursor
+                            if not x.is_building_block(
+                                self.building_blocks, self.config.min_mol_size
+                            )
+                        ),
+                    )
+                    child_node = Node(
+                        precursors_to_expand=precursors_to_expand,
+                        new_precursors=new_precursor,
+                    )
+                    for new_precursor in new_precursor:
+                        new_precursor.prev_precursors = [new_precursor, *prev_precursor]
+                    self._add_node(node_id, child_node, scaled_prob, rule_id)
+                    expanded = True
+        if not expanded and node_id == 1:
+            raise StopIteration("\nThe target molecule was not expanded.")
+    def _add_node(
+        self,
+        node_id: int,
+        new_node: Node,
+        policy_prob: float = None,
+        rule_id: int = None,
+    ) -> None:
+        """Adds a new node to the tree with probability of reaction rules predicted by
+        policy function and applied to the parent node of the new node.
+        :param node_id: The id of the parent node.
+        :param new_node: The new node to be added.
+        :param policy_prob: The probability of reaction rules predicted by policy
+            function for thr parent node.
+        :return: None.
+        """
+        new_node_id = self.curr_tree_size
+        self.nodes[new_node_id] = new_node
+        self.parents[new_node_id] = node_id
+        self.children[node_id].add(new_node_id)
+        self.children[new_node_id] = set()
+        self.nodes_visit[new_node_id] = 0
+        self.nodes_prob[new_node_id] = policy_prob
+        self.nodes_rules[new_node_id] = rule_id
+        self.nodes_depth[new_node_id] = self.nodes_depth[node_id] + 1
+        self.curr_tree_size += 1
+        if self.config.search_strategy == "evaluation_first":
+            node_value = self._get_node_value(new_node_id)
+        elif self.config.search_strategy == "expansion_first":
+            node_value = self.config.init_node_value
+        self.nodes_init_value[new_node_id] = node_value
+        self.nodes_total_value[new_node_id] = node_value
+    def _get_node_value(self, node_id: int) -> float:
+        """Calculates the value for the given node (for example with rollout or value
+        network).
+        :param node_id: The id of the node to be evaluated.
+        :return: The estimated value of the node.
+        """
+        node = self.nodes[node_id]
+        if self.config.evaluation_type == "random":
+            node_value = uniform(0, 1)
+        elif self.config.evaluation_type == "rollout":
+            node_value = min(
+                (
+                    self._rollout_node(
+                        precursor, current_depth=self.nodes_depth[node_id]
+                    )
+                    for precursor in node.precursors_to_expand
+                ),
+                default=1.0,
+            )
+        elif self.config.evaluation_type == "gcn":
+            node_value = self.value_network.predict_value(node.new_precursors)
+        return node_value
+    def _update_visits(self, node_id: int) -> None:
+        """Updates the number of visits from the current node to the root node.
+        :param node_id: The id of the current node.
+        :return: None.
+        """
+        while node_id:
+            self.nodes_visit[node_id] += 1
+            node_id = self.parents[node_id]
+    def _backpropagate(self, node_id: int, value: float) -> None:
+        """Propagates the value from the given node up through all its ancestors.
+        With ``config.backprop_type == "muzero"`` the total value of each node on the
+        way becomes ``(total * visits + value) / (visits + 1)``; with ``"cumulative"``
+        the value is added to the total. Any other type leaves the totals untouched.
+        :param node_id: The id of the node from which to backpropagate the value.
+        :param value: The value to backpropagate.
+        :return: None.
+        """
+        current_id = node_id
+        while current_id:
+            backprop_type = self.config.backprop_type
+            if backprop_type == "muzero":
+                visits = self.nodes_visit[current_id]
+                weighted_total = self.nodes_total_value[current_id] * visits + value
+                self.nodes_total_value[current_id] = weighted_total / (visits + 1)
+            elif backprop_type == "cumulative":
+                self.nodes_total_value[current_id] += value
+            current_id = self.parents[current_id]
+    def _rollout_node(self, precursor: Precursor, current_depth: int = None) -> float:
+        """Performs a rollout simulation from the given precursor and returns its reward.
+        Precursors are taken from a queue one by one. For each of them the first
+        reaction rule proposed by the policy network that yields a non-empty product
+        set is applied, and the products that are not building blocks are queued for
+        further decomposition.
+        Rewards:
+        1.0 - the precursor is a building block, or the queue was exhausted;
+        -0.5 - the number of rollout steps recorded in ``history`` reached the
+        remaining depth budget (``config.max_depth - current_depth``);
+        -1.0 - no reaction rule gave products for some precursor, or a product repeats
+        a precursor that was already processed in this rollout (loop).
+        :param precursor: The precursor to be evaluated.
+        :param current_depth: The depth of the evaluated node in the tree.
+            NOTE(review): the default ``None`` is not handled - it is subtracted from
+            ``config.max_depth`` on the first line; confirm all callers pass a number.
+        :return: The reward (value) assigned to the precursor.
+        """
+        # Remaining number of rollout steps allowed below the current tree depth.
+        max_depth = self.config.max_depth - current_depth
+        # A building block needs no further decomposition.
+        if precursor.is_building_block(self.building_blocks, self.config.min_mol_size):
+            return 1.0
+        # NOTE(review): this only prints; the -0.5 reward for an exhausted budget is
+        # returned by the ``len(history) >= max_depth`` check inside the loop below.
+        if max_depth == 0:
+            print("max depth reached in the beginning")
+        # Rollout state: precursors already processed, the queue of precursors still
+        # to decompose, and a per-step record (target, rule index, products).
+        occurred_precursor = set()
+        precursor_to_expand = deque([precursor])
+        history = defaultdict(dict)
+        rollout_depth = 0
+        while precursor_to_expand:
+            # Iterate through reactors and pick first successful reaction.
+            # Check products of the reaction if you can find them in in-building_blocks data
+            # If not, then add missed products to precursor_to_expand and try to decompose them
+            # ``history`` holds one entry per completed step, so its size is the number
+            # of steps done so far.
+            if len(history) >= max_depth:
+                reward = -0.5
+                return reward
+            current_precursor = precursor_to_expand.popleft()
+            history[rollout_depth]["target"] = current_precursor
+            occurred_precursor.add(current_precursor)
+            # Pick the first successful reaction while iterating through reactors
+            reaction_rule_applied = False
+            for prob, rule, rule_id in self.policy_network.predict_reaction_rules(
+                current_precursor, self.reaction_rules
+            ):
+                # The loop variable ``products`` is deliberately left bound to the
+                # first non-empty product set and is reused after the loops.
+                for products in apply_reaction_rule(current_precursor.molecule, rule):
+                    if products:
+                        reaction_rule_applied = True
+                        break
+                if reaction_rule_applied:
+                    history[rollout_depth]["rule_index"] = rule_id
+                    break
+            # No rule produced any products for this precursor: dead end.
+            if not reaction_rule_applied:
+                reward = -1.0
+                return reward
+            products = tuple(Precursor(product) for product in products)
+            history[rollout_depth]["products"] = products
+            # A product that was already processed in this rollout means a loop.
+            if any(x in occurred_precursor for x in products) and products:
+                # sometimes manual can create a loop, when
+                # print('occurred_precursor')
+                reward = -1.0
+                return reward
+            # NOTE(review): after the loop check above returned for any overlap, this
+            # condition appears to be always true - confirm before simplifying.
+            if occurred_precursor.isdisjoint(products):
+                # Queue only the products that still need to be decomposed.
+                precursor_to_expand.extend(
+                    [
+                        x
+                        for x in products
+                        if not x.is_building_block(
+                            self.building_blocks, self.config.min_mol_size
+                        )
+                    ]
+                )
+                rollout_depth += 1
+        # The queue is empty: nothing is left to decompose.
+        reward = 1.0
+        return reward
+    def report(self) -> str:
+        """Returns a multi-line text summary of the tree search.
+        The summary lists the first precursor to expand of node 1, the elapsed time,
+        the tree size, and the numbers of iterations, visited nodes and found routes.
+        :return: The summary, one item per line.
+        """
+        target = str(self.nodes[1].precursors_to_expand[0])
+        elapsed = round(self.curr_time, 1)
+        summary_lines = (
+            f"Tree for: {target}",
+            f"Time: {elapsed} seconds",
+            f"Number of nodes: {len(self)}",
+            f"Number of iterations: {self.curr_iteration}",
+            f"Number of visited nodes: {len(self.visited_nodes)}",
+            f"Number of found routes: {len(self.winning_nodes)}",
+        )
+        return "\n".join(summary_lines)
+    def route_score(self, node_id: int) -> float:
+        """Calculates the score of the route from the given node up to the root node.
+        The score is the sum of the total values of the nodes on the route divided by
+        the squared route length, so longer routes are penalised.
+        :param node_id: The id of the last node of the route.
+        :return: The route score.
+        """
+        route_ids = []
+        current_id = node_id
+        while current_id:
+            route_ids.append(current_id)
+            current_id = self.parents[current_id]
+        # Accumulate in route order (given node first, root last).
+        value_total = 0
+        for route_id in route_ids:
+            value_total += self.nodes_total_value[route_id]
+        return value_total / (len(route_ids) ** 2)
+    def route_to_node(self, node_id: int) -> List[Node,]:
+        """Returns the nodes on the route from the root node down to the given node.
+        The route is collected by following ``self.parents`` upwards from the given
+        node and is returned in root-first order.
+        :param node_id: The id of the last node of the route.
+        :return: The list of node objects, starting with the topmost ancestor.
+        """
+        route_ids = []
+        current_id = node_id
+        while current_id:
+            route_ids.append(current_id)
+            current_id = self.parents[current_id]
+        route_ids.reverse()
+        return [self.nodes[route_id] for route_id in route_ids]
+    def synthesis_route(self, node_id: int) -> Tuple[Reaction, ...]:
+        """Builds the sequence of reactions of the route that ends in the given node.
+        For every pair of consecutive nodes on the route a reaction is created whose
+        reactants are the new precursors of the lower node and whose single product is
+        the current precursor of the upper node. The sequence is returned in reverse
+        route order, i.e. the reaction nearest to the given node comes first.
+        :param node_id: The id of the last node of the route.
+        :return: The tuple of reactions (of any length, empty for a one-node route).
+        """
+        nodes = self.route_to_node(node_id)
+        reaction_sequence = [
+            Reaction(
+                [x.molecule for x in after.new_precursors],
+                [before.curr_precursor.molecule],
+            )
+            for before, after in zip(nodes, nodes[1:])
+        ]
+        # Recalculate the 2D layout of each reaction.
+        for r in reaction_sequence:
+            r.clean2d()
+        return tuple(reversed(reaction_sequence))
+    def newickify(self, visits_threshold: int = 0, root_node_id: int = 1):
+        """Serialises the tree into a Newick string together with per-node metadata.
+        Adopted from https://stackoverflow.com/questions/50003007/how-to-convert-python-dictionary-to-newick-form-format.
+        Every rendered node is written as ``<node id>:<number of visits>``. Children
+        with fewer visits than ``visits_threshold`` are left out with their subtrees.
+        :param visits_threshold: The minimum number of visits for a child to be rendered.
+        :param root_node_id: The id of the node to start rendering from.
+        :return: The Newick string and a dict that maps each rendered node id to the
+            tuple (total value rounded to 3 digits, initial value rounded to an
+            integer, number of visits).
+        """
+        rendered_ids = set()
+        def newick_render_node(current_node_id: int) -> str:
+            """Recursively renders the subtree of the given node in the Newick format.
+            :param current_node_id: The id of the node to render.
+            :return: The Newick representation of the node and its kept children.
+            """
+            assert (
+                current_node_id not in rendered_ids
+            ), "Error: The tree may not be circular!"
+            visits = self.nodes_visit[current_node_id]
+            rendered_ids.add(current_node_id)
+            label = f"{current_node_id}:{visits}"
+            # Select the children first, then descend into them in the same order.
+            kept_children = [
+                child_id
+                for child_id in self.children[current_node_id]
+                if self.nodes_visit[child_id] >= visits_threshold
+            ]
+            subtree = ",".join(
+                [newick_render_node(child_id) for child_id in kept_children]
+            )
+            if not subtree:
+                # A leaf, or a node whose children are all below the threshold.
+                return label
+            return f"({subtree}){label}"
+        newick_string = newick_render_node(root_node_id) + ";"
+        meta = {
+            rendered_id: (
+                round(self.nodes_total_value[rendered_id], 3),
+                round(self.nodes_init_value[rendered_id]),
+                self.nodes_visit[rendered_id],
+            )
+            for rendered_id in rendered_ids
+        }
+        return newick_string, meta

synplan/ml/__init__.py ADDED Viewed

File without changes

synplan/ml/networks/__init__.py ADDED Viewed

File without changes

synplan/ml/networks/modules.py ADDED Viewed

	@@ -0,0 +1,234 @@

+"""Module containing basic pytorch architectures of policy and value neural networks."""
+from abc import ABC, abstractmethod
+from typing import Dict, List, Tuple, Union
+import torch
+from adabelief_pytorch import AdaBelief
+from pytorch_lightning import LightningModule
+from torch import Tensor
+from torch.nn import GELU, Dropout, Linear, Module, ModuleDict, ModuleList
+from torch.nn.functional import relu
+from torch.optim.lr_scheduler import ReduceLROnPlateau
+from torch_geometric.data.batch import Batch
+from torch_geometric.nn.conv import GCNConv
+from torch_geometric.nn.pool import global_add_pool
+class GraphEmbedding(Module):
+    """Graph convolutional encoder that turns the atom vectors of a molecular graph
+    into a single vector per graph."""
+    def __init__(
+        self, vector_dim: int = 512, dropout: float = 0.4, num_conv_layers: int = 5
+    ):
+        """Creates the input projection, the dropout and the stack of GCN layers.
+        :param vector_dim: The width of the hidden layers and of the resulting graph
+            embedding.
+        :param dropout: The dropout probability applied to the output of every
+            convolutional layer.
+        :param num_conv_layers: The number of graph convolutional layers.
+        """
+        super().__init__()
+        # The input projection expects 11 features per atom.
+        self.expansion = Linear(11, vector_dim)
+        self.dropout = Dropout(dropout)
+        conv_layers = []
+        for _ in range(num_conv_layers):
+            conv_layers.append(GCNConv(vector_dim, vector_dim, improved=True))
+        self.gcn_convs = ModuleList(conv_layers)
+    def forward(self, graph: Batch, batch_size: int) -> Tensor:
+        """Embeds a batch of molecular graphs.
+        The atom features are log-scaled (``log(x + 1)``), projected to
+        ``vector_dim``, refined by the convolutional layers with residual connections
+        and finally summed over the atoms of each graph.
+        :param graph: The batch of molecular graphs, where each atom is represented by
+            the atom/bond vector.
+        :param batch_size: The size of the batch, passed to the pooling as ``size``.
+        :return: Graph embedding.
+        """
+        node_features = graph.x.float()
+        edge_index = graph.edge_index.long()
+        hidden = self.expansion(torch.log(node_features + 1))
+        for conv in self.gcn_convs:
+            update = self.dropout(relu(conv(hidden, edge_index)))
+            hidden = hidden + update
+        return global_add_pool(hidden, graph.batch, size=batch_size)
+class GraphEmbeddingConcat(GraphEmbedding, Module):
+    """Graph convolutional encoder that concatenates the outputs of all its
+    convolutional layers instead of adding them up with residual connections."""
+    def __init__(
+        self, vector_dim: int = 512, dropout: float = 0.4, num_conv_layers: int = 8
+    ):
+        """Creates narrow GCN layers whose concatenated outputs form the embedding.
+        Every layer has the width ``vector_dim // num_conv_layers``, so the resulting
+        embedding width is that value times ``num_conv_layers``.
+        NOTE(review): this equals ``vector_dim`` only when ``vector_dim`` is divisible
+        by ``num_conv_layers`` - confirm that callers guarantee it.
+        :param vector_dim: The target width of the graph embedding.
+        :param dropout: The dropout probability applied after every layer.
+        :param num_conv_layers: The number of graph convolutional layers.
+        """
+        # NOTE(review): this runs GraphEmbedding.__init__ with its default arguments;
+        # the layers it creates are replaced by the assignments below.
+        super().__init__()
+        layer_width = vector_dim // num_conv_layers
+        self.expansion = Linear(11, layer_width)
+        self.dropout = Dropout(dropout)
+        conv_blocks = []
+        for _ in range(num_conv_layers):
+            conv_blocks.append(
+                ModuleDict(
+                    {
+                        "gcn": GCNConv(layer_width, layer_width, improved=True),
+                        "activation": GELU(),
+                    }
+                )
+            )
+        self.gcn_convs = ModuleList(conv_blocks)
+    def forward(self, graph: Batch, batch_size: int) -> Tensor:
+        """Embeds a batch of molecular graphs.
+        The atom features are log-scaled (``log(x + 1)``) and projected, then passed
+        through the layers sequentially (convolution, activation, dropout). The
+        outputs of all layers are concatenated along the feature axis and summed over
+        the atoms of each graph.
+        :param graph: The batch of molecular graphs, where each atom is represented by
+            the atom/bond vector.
+        :param batch_size: The size of the batch, passed to the pooling as ``size``.
+        :return: Graph embedding.
+        """
+        node_features = graph.x.float()
+        edge_index = graph.edge_index.long()
+        hidden = self.expansion(torch.log(node_features + 1))
+        layer_outputs = []
+        for block in self.gcn_convs:
+            hidden = self.dropout(block["activation"](block["gcn"](hidden, edge_index)))
+            layer_outputs.append(hidden)
+        stacked = torch.cat(layer_outputs, dim=-1)
+        return global_add_pool(stacked, graph.batch, size=batch_size)
+class MCTSNetwork(LightningModule, ABC):
+    """Basic class for policy and value networks."""
+    def __init__(
+        self,
+        vector_dim: int,
+        batch_size: int,
+        dropout: float = 0.4,
+        num_conv_layers: int = 5,
+        learning_rate: float = 0.001,
+        gcn_concat: bool = False,
+    ):
+        """The basic class for MCTS graph convolutional neural networks (policy and
+        value network).
+        :param vector_dim: The dimensionality of the hidden layers and output layer of
+            graph convolution module.
+        :param batch_size: The size of the batch; stored on the module and reported
+            as ``batch_size`` in the logging calls.
+        :param dropout: Dropout is a regularization technique used in neural networks to
+            prevent overfitting.
+        :param num_conv_layers: The number of convolutional layers in a graph
+            convolutional module.
+        :param learning_rate: The learning rate determines how quickly the model learns
+            from the training data.
+        :param gcn_concat: If True, the graph embedder is ``GraphEmbeddingConcat``;
+            otherwise it is ``GraphEmbedding``.
+        """
+        super().__init__()
+        if gcn_concat:
+            self.embedder = GraphEmbeddingConcat(vector_dim, dropout, num_conv_layers)
+        else:
+            self.embedder = GraphEmbedding(vector_dim, dropout, num_conv_layers)
+        self.batch_size = batch_size
+        self.lr = learning_rate
+    @abstractmethod
+    def forward(self, batch: Batch) -> Tensor:
+        """The forward function takes a batch of input data and performs forward
+        propagation through the neural network.
+        :param batch: The batch of molecular graphs processed together in a single
+            forward pass through the neural network.
+        """
+    @abstractmethod
+    def _get_loss(self, batch: Batch) -> Dict[str, Tensor]:
+        """Calculate the loss and the metrics for a given batch of data.
+        :param batch: The batch of input data that is used to compute the loss.
+        :return: The dictionary of metric names and values. It must contain the
+            ``"loss"`` key, which ``training_step`` returns for optimisation.
+        """
+    def training_step(self, batch: Batch, batch_idx: int) -> Tensor:
+        """Calculates the metrics for a given training batch and logs them with the
+        ``train_`` prefix.
+        :param batch: The batch of data that is used for training.
+        :param batch_idx: The index of the batch.
+        :return: The value of the training loss.
+        """
+        metrics = self._get_loss(batch)
+        for name, value in metrics.items():
+            self.log(
+                "train_" + name,
+                value,
+                prog_bar=True,
+                on_step=True,
+                on_epoch=True,
+                batch_size=self.batch_size,
+            )
+        return metrics["loss"]
+    def validation_step(self, batch: Batch, batch_idx: int) -> None:
+        """Calculates the metrics for a given validation batch and logs them with the
+        ``val_`` prefix.
+        :param batch: The batch of data that is used for validation.
+        :param batch_idx: The index of the batch.
+        """
+        metrics = self._get_loss(batch)
+        for name, value in metrics.items():
+            self.log("val_" + name, value, on_epoch=True, batch_size=self.batch_size)
+    def test_step(self, batch: Batch, batch_idx: int) -> None:
+        """Calculates the metrics for a given test batch and logs them with the
+        ``test_`` prefix.
+        :param batch: The batch of data that is used for testing.
+        :param batch_idx: The index of the batch.
+        """
+        metrics = self._get_loss(batch)
+        for name, value in metrics.items():
+            self.log("test_" + name, value, on_epoch=True, batch_size=self.batch_size)
+    def configure_optimizers(
+        self,
+    ) -> Tuple[List[AdaBelief], List[Dict[str, Union[bool, str, ReduceLROnPlateau]]]]:
+        """Returns an optimizer and a learning rate scheduler for training a model using
+        the AdaBelief optimizer and ReduceLROnPlateau scheduler.
+        :return: The optimizer and a scheduler.
+        """
+        optimizer = AdaBelief(
+            self.parameters(),
+            lr=self.lr,
+            eps=1e-16,
+            betas=(0.9, 0.999),
+            weight_decouple=True,
+            rectify=True,
+            weight_decay=0.01,
+            print_change_log=False,
+        )
+        lr_scheduler = ReduceLROnPlateau(
+            optimizer, patience=3, factor=0.8, min_lr=5e-5, verbose=True
+        )
+        # The scheduler follows the "val_loss" metric logged by ``validation_step``.
+        scheduler = {
+            "scheduler": lr_scheduler,
+            "reduce_on_plateau": True,
+            "monitor": "val_loss",
+        }
+        return [optimizer], [scheduler]

synplan/ml/networks/policy.py ADDED Viewed

	@@ -0,0 +1,137 @@

+"""Module containing main class for policy network."""
+from abc import ABC
+from typing import Dict
+import torch
+from pytorch_lightning import LightningModule
+from torch import Tensor
+from torch.nn import Linear
+from torch.nn.functional import binary_cross_entropy_with_logits, cross_entropy, one_hot
+from torch_geometric.data.batch import Batch
+from torchmetrics.functional.classification import f1_score, recall, specificity
+from synplan.ml.networks.modules import MCTSNetwork
+class PolicyNetwork(MCTSNetwork, LightningModule, ABC):
+    """Policy network that predicts applicable reaction rules for a molecular graph,
+    either as a ranking (one rule per sample) or as a filtering (multi-label) task."""
+    def __init__(
+        self,
+        *args,
+        n_rules: int,
+        vector_dim: int,
+        policy_type: str = "ranking",
+        **kwargs
+    ):
+        """Initializes a policy network with the given number of reaction rules (output
+        dimension) and vector graph embedding dimension, and creates linear layers for
+        predicting the regular and, for the filtering policy, priority reaction rules.
+        :param n_rules: The number of reaction rules in the policy network.
+        :param vector_dim: The dimensionality of the input vectors.
+        :param policy_type: The type of the policy, ``"ranking"`` or ``"filtering"``.
+        :raises ValueError: If ``policy_type`` is not one of the supported types.
+        """
+        # An unknown type used to be accepted here and only surfaced later, as a
+        # ``None`` result of ``forward`` or an unbound ``metrics`` in ``_get_loss``.
+        if policy_type not in ("ranking", "filtering"):
+            raise ValueError(
+                f"Unsupported policy_type: {policy_type!r} "
+                "(expected 'ranking' or 'filtering')"
+            )
+        super().__init__(vector_dim, *args, **kwargs)
+        self.save_hyperparameters()
+        self.policy_type = policy_type
+        self.n_rules = n_rules
+        self.y_predictor = Linear(vector_dim, n_rules)
+        if self.policy_type == "filtering":
+            self.priority_predictor = Linear(vector_dim, n_rules)
+    def forward(self, batch: Batch) -> Tensor:
+        """Takes a batch of molecular graphs, applies the graph convolution and
+        predicts the reaction rules.
+        :param batch: The input batch of molecular graphs.
+        :return: For the ranking policy, the softmax probabilities over the reaction
+            rules. For the filtering policy, a tuple of the sigmoid probabilities of
+            the regular and of the priority reaction rules.
+        """
+        x = self.embedder(batch, self.batch_size)
+        y = self.y_predictor(x)
+        if self.policy_type == "ranking":
+            y = torch.softmax(y, dim=-1)
+            return y
+        if self.policy_type == "filtering":
+            y = torch.sigmoid(y)
+            priority = torch.sigmoid(self.priority_predictor(x))
+            return y, priority
+    def _get_loss(self, batch: Batch) -> Dict[str, Tensor]:
+        """Calculates the loss and various classification metrics for a given batch for
+        reaction rules prediction.
+        The ranking policy uses the cross entropy against the one-hot encoded rule
+        ids. The filtering policy sums the binary cross entropies of the regular and
+        of the priority rule predictions. The metrics are computed from the raw
+        predictor outputs.
+        :param batch: The batch of molecular graphs.
+        :return: A dictionary with loss value, balanced accuracy and F1 score of
+            reaction rules prediction (and of priority rules for the filtering policy).
+        """
+        true_y = batch.y_rules.long()
+        x = self.embedder(batch, self.batch_size)
+        pred_y = self.y_predictor(x)
+        if self.policy_type == "ranking":
+            true_one_hot = one_hot(true_y, num_classes=self.n_rules)
+            loss = cross_entropy(pred_y, true_one_hot.float())
+            # Balanced accuracy is the mean of recall and specificity.
+            ba_y = (
+                recall(pred_y, true_y, task="multiclass", num_classes=self.n_rules)
+                + specificity(
+                    pred_y, true_y, task="multiclass", num_classes=self.n_rules
+                )
+            ) / 2
+            f1_y = f1_score(pred_y, true_y, task="multiclass", num_classes=self.n_rules)
+            metrics = {"loss": loss, "balanced_accuracy_y": ba_y, "f1_score_y": f1_y}
+        elif self.policy_type == "filtering":
+            loss_y = binary_cross_entropy_with_logits(pred_y, true_y.float())
+            ba_y = (
+                recall(pred_y, true_y, task="multilabel", num_labels=self.n_rules)
+                + specificity(
+                    pred_y, true_y, task="multilabel", num_labels=self.n_rules
+                )
+            ) / 2
+            f1_y = f1_score(pred_y, true_y, task="multilabel", num_labels=self.n_rules)
+            true_priority = batch.y_priority.float()
+            pred_priority = self.priority_predictor(x)
+            loss_priority = binary_cross_entropy_with_logits(
+                pred_priority, true_priority
+            )
+            loss = loss_y + loss_priority
+            # The loss needs float targets, the metrics integer ones.
+            true_priority = true_priority.long()
+            ba_priority = (
+                recall(
+                    pred_priority,
+                    true_priority,
+                    task="multilabel",
+                    num_labels=self.n_rules,
+                )
+                + specificity(
+                    pred_priority,
+                    true_priority,
+                    task="multilabel",
+                    num_labels=self.n_rules,
+                )
+            ) / 2
+            f1_priority = f1_score(
+                pred_priority, true_priority, task="multilabel", num_labels=self.n_rules
+            )
+            metrics = {
+                "loss": loss,
+                "balanced_accuracy_y": ba_y,
+                "f1_score_y": f1_y,
+                "balanced_accuracy_priority": ba_priority,
+                "f1_score_priority": f1_priority,
+            }
+        return metrics

synplan/ml/networks/value.py ADDED Viewed

	@@ -0,0 +1,67 @@

+"""Module containing main class for value network."""
+from abc import ABC
+from typing import Any, Dict
+import torch
+from pytorch_lightning import LightningModule
+from torch import Tensor
+from torch.nn import Linear
+from torch.nn.functional import binary_cross_entropy_with_logits
+from torch_geometric.data.batch import Batch
+from torchmetrics.functional.classification import (
+    binary_f1_score,
+    binary_recall,
+    binary_specificity,
+)
+from synplan.ml.networks.modules import MCTSNetwork
+class ValueNetwork(MCTSNetwork, LightningModule, ABC):
+    """Value network that predicts the synthesisability of a precursor."""
+    def __init__(self, vector_dim: int, *args: Any, **kwargs: Any) -> None:
+        """Initializes the value network and its single-output linear predictor.
+        :param vector_dim: The dimensionality of the graph embedding fed to the
+            predictor.
+        """
+        super().__init__(vector_dim, *args, **kwargs)
+        self.save_hyperparameters()
+        self.predictor = Linear(vector_dim, 1)
+    def forward(self, batch) -> torch.Tensor:
+        """Embeds a batch of molecular graphs with the graph convolution and predicts
+        the synthesisability of each of them.
+        :param batch: The batch of molecular graphs.
+        :return: The predicted synthesisability (sigmoid output, between 0 and 1).
+        """
+        embedding = self.embedder(batch, self.batch_size)
+        logits = self.predictor(embedding)
+        return torch.sigmoid(logits)
+    def _get_loss(self, batch: Batch) -> Dict[str, Tensor]:
+        """Calculates the loss and the classification metrics for a given batch for
+        the precursor synthesisability prediction.
+        :param batch: The batch of molecular graphs.
+        :return: The dictionary with the loss value, the balanced accuracy and the F1
+            score of the precursor synthesisability prediction.
+        """
+        # The labels get a trailing axis to match the predictor output.
+        target = torch.unsqueeze(batch.y.float(), -1)
+        embedding = self.embedder(batch, self.batch_size)
+        logits = self.predictor(embedding)
+        loss = binary_cross_entropy_with_logits(logits, target)
+        # The loss needs float targets, the metrics integer ones.
+        target_labels = target.long()
+        sensitivity = binary_recall(logits, target_labels)
+        specificity_value = binary_specificity(logits, target_labels)
+        balanced_accuracy = (sensitivity + specificity_value) / 2
+        f1 = binary_f1_score(logits, target_labels)
+        return {"loss": loss, "balanced_accuracy": balanced_accuracy, "f1_score": f1}

synplan/ml/training/__init__.py ADDED Viewed

	@@ -0,0 +1,11 @@

+# NOTE(review): the wildcard import below re-exports whatever ``supervised`` exposes,
+# in addition to the two names imported from it explicitly further down.
+from .supervised import *
+from .preprocessing import ValueNetworkDataset, mol_to_pyg, MENDEL_INFO
+from .supervised import create_policy_dataset, run_policy_training
+# Names provided by ``from synplan.ml.training import *``.
+__all__ = [
+    "ValueNetworkDataset",
+    "mol_to_pyg",
+    "MENDEL_INFO",
+    "create_policy_dataset",
+    "run_policy_training",
+]

synplan/ml/training/preprocessing.py ADDED Viewed

	@@ -0,0 +1,516 @@

+"""Module containing functions for preparation of the training sets for policy and value
+network."""
+import logging
+import os
+import pickle
+from abc import ABC
+from typing import Any, Dict, List, Optional, Tuple
+import ray
+import torch
+from CGRtools import smiles
+from CGRtools.containers import MoleculeContainer
+from CGRtools.exceptions import InvalidAromaticRing
+from CGRtools.reactor import Reactor
+from ray.util.queue import Empty, Queue
+from torch import Tensor
+from torch_geometric.data import InMemoryDataset
+from torch_geometric.data.data import Data
+from torch_geometric.data.makedirs import makedirs
+from torch_geometric.transforms import ToUndirected
+from tqdm import tqdm
+from synplan.chem.utils import unite_molecules
+from synplan.utils.files import ReactionReader
+from synplan.utils.loading import load_reaction_rules
+class ValueNetworkDataset(InMemoryDataset, ABC):
+    """In-memory dataset of labelled precursor graphs for the value network."""
+    def __init__(self, extracted_precursor: Dict[str, float]) -> None:
+        """Initializes the dataset and, if any precursors are given, converts them to
+        graphs.
+        :param extracted_precursor: The dictionary that maps the SMILES of the
+            precursors extracted from the built search trees to their labels.
+        """
+        super().__init__(None, None, None)
+        if extracted_precursor:
+            collated = self.graphs_from_extracted_precursor(extracted_precursor)
+            self.data, self.slices = collated
+    @staticmethod
+    def mol_to_graph(molecule: MoleculeContainer, label: float) -> Optional[Data]:
+        """Converts a molecule to a PyTorch geometric graph and stores the label in
+        its ``y`` attribute.
+        :param molecule: The input molecule.
+        :param label: The label (solved/unsolved routes in the tree) of the molecule
+            (precursor).
+        :return: The labelled graph, or None if the molecule has at most two atoms or
+            the conversion gave no graph.
+        """
+        if len(molecule) <= 2:
+            return None
+        graph = mol_to_pyg(molecule)
+        if not graph:
+            return None
+        graph.y = torch.tensor([label])
+        return graph
+    def graphs_from_extracted_precursor(
+        self, extracted_precursor: Dict[str, float]
+    ) -> Tuple[Data, Dict]:
+        """Parses the precursor SMILES, converts them to labelled graphs and collates
+        the graphs; precursors without a graph are skipped.
+        :param extracted_precursor: The dictionary that maps the SMILES of the
+            precursors extracted from the built search trees to their labels.
+        :return: The PyTorch geometric graphs and slices.
+        """
+        graphs = []
+        for precursor_smiles, label in extracted_precursor.items():
+            graph = self.mol_to_graph(smiles(precursor_smiles), label)
+            if not graph:
+                continue
+            graphs.append(graph)
+        return self.collate(graphs)
+class RankingPolicyDataset(InMemoryDataset):
+    """Ranking policy network dataset."""
+    def __init__(self, reactions_path: str, reaction_rules_path: str, output_path: str):
+        """Initializes a policy network dataset.
+        The dataset is loaded from ``output_path`` if that file exists, otherwise it is
+        prepared from the reactions and the reaction rules.
+        :param reactions_path: The path to the file containing the reaction data used
+            for extraction of reaction rules.
+        :param reaction_rules_path: The path to the file containing the reaction rules.
+        :param output_path: The output path to the file where policy network dataset
+            will be saved.
+        """
+        super().__init__(None, None, None)
+        self.reactions_path = reactions_path
+        self.reaction_rules_path = reaction_rules_path
+        self.output_path = output_path
+        if output_path and os.path.exists(output_path):
+            # NOTE(review): torch.load unpickles the file - load trusted files only.
+            self.data, self.slices = torch.load(self.output_path)
+        else:
+            self.data, self.slices = self.prepare_data()
+    @property
+    def num_classes(self) -> int:
+        """The number of classes inferred from the stored rule labels."""
+        return self._infer_num_classes(self._data.y_rules)
+    def prepare_data(self) -> Tuple[Data, Dict[str, Tensor]]:
+        """Prepares data by loading reaction rules, preprocessing the molecules,
+        collating the data, and returning the data and slices.
+        Rules are indexed by descending number of their reactions. Each reaction that
+        belongs to a rule gives one graph (its united products) labelled with the index
+        of the rule. If ``output_path`` is set, the collated data is saved there.
+        :return: The PyTorch geometric graphs and slices.
+        """
+        # NOTE(review): pickle.load executes arbitrary code from the file - load
+        # trusted reaction rule files only.
+        with open(self.reaction_rules_path, "rb") as inp:
+            reaction_rules = pickle.load(inp)
+        reaction_rules = sorted(reaction_rules, key=lambda x: len(x[1]), reverse=True)
+        reaction_rule_pairs = {}
+        for rule_i, (_, reactions_ids) in enumerate(reaction_rules):
+            for reaction_id in reactions_ids:
+                reaction_rule_pairs[reaction_id] = rule_i
+        reaction_rule_pairs = dict(sorted(reaction_rule_pairs.items()))
+        list_of_graphs = []
+        with ReactionReader(self.reactions_path) as reactions:
+            for reaction_id, reaction in tqdm(
+                enumerate(reactions),
+                desc="Number of reactions processed: ",
+                bar_format="{desc}{n} [{elapsed}]",
+            ):
+                rule_id = reaction_rule_pairs.get(reaction_id)
+                # Compare with None: rule index 0 (the most frequent rule) is a valid
+                # label and must not be mistaken for "no rule for this reaction".
+                if rule_id is None:
+                    continue
+                try:  #  MENDEL_INFO does not contain cadmium (Cd) properties
+                    molecule = unite_molecules(reaction.products)
+                    pyg_graph = mol_to_pyg(molecule)
+                except (
+                    Exception
+                ) as e:  # TypeError: can't assign a NoneType to a torch.ByteTensor
+                    # Best effort: a reaction that cannot be converted is skipped.
+                    logging.debug(e)
+                    continue
+                if pyg_graph is not None:
+                    pyg_graph.y_rules = torch.tensor([rule_id], dtype=torch.long)
+                    list_of_graphs.append(pyg_graph)
+        data, slices = self.collate(list_of_graphs)
+        if self.output_path:
+            makedirs(os.path.dirname(self.output_path))
+            torch.save((data, slices), self.output_path)
+        return data, slices
+class FilteringPolicyDataset(InMemoryDataset):
+    """Filtering policy network dataset.
+
+    Molecules are read from a text file (one entry per line), processed in
+    parallel by Ray workers and collated into an in-memory PyTorch Geometric
+    dataset. If a dataset file already exists at ``output_path`` it is loaded
+    instead of being rebuilt.
+    """
+    def __init__(
+        self,
+        molecules_path: str,
+        reaction_rules_path: str,
+        output_path: str,
+        num_cpus: int,
+    ) -> None:
+        """Initializes a filtering policy network dataset object.
+        :param molecules_path: The path to the file containing the molecules for
+            reaction rule appliance.
+        :param reaction_rules_path: The path to the file containing the reaction rules.
+        :param output_path: The output path to the file where policy network dataset
+            will be stored. If the file already exists, it is loaded as is.
+        :param num_cpus: The number of CPUs (Ray workers) to be used for the dataset
+            preparation.
+        :return: None.
+        """
+        super().__init__(None, None, None)
+        self.molecules_path = molecules_path
+        self.reaction_rules_path = reaction_rules_path
+        self.output_path = output_path
+        self.num_cpus = num_cpus
+        # scaling factor for the work queue capacity (batch_size * num_cpus)
+        self.batch_size = 100
+        # reuse a previously stored dataset when possible, otherwise build it
+        if output_path and os.path.exists(output_path):
+            self.data, self.slices = torch.load(self.output_path)
+        else:
+            self.data, self.slices = self.prepare_data()
+    @property
+    def num_classes(self) -> int:
+        """Returns the size of the second dimension of the stored ``y_rules``
+        labels (presumably the number of reaction rules)."""
+        return self._data.y_rules.shape[1]
+    def prepare_data(self) -> Tuple[Data, Dict]:
+        """Prepares data by loading reaction rules, initializing Ray, preprocessing the
+        molecules, collating the data, and returning the data and slices.
+        The collated dataset is also saved to ``output_path`` when it is set.
+        :return: The PyTorch geometric graphs and slices.
+        """
+        ray.init(num_cpus=self.num_cpus, ignore_reinit_error=True)
+        reaction_rules = load_reaction_rules(self.reaction_rules_path)
+        # share the rules once through the Ray object store instead of per task
+        reaction_rules_ids = ray.put(reaction_rules)
+        # bounded queue feeding the workers
+        # NOTE(review): presumably ray.util.queue.Queue - confirm against the imports
+        to_process = Queue(maxsize=self.batch_size * self.num_cpus)
+        processed_data = []
+        # the workers are started BEFORE the queue is filled so that they drain
+        # it while the loop below keeps putting new molecules
+        results_ids = [
+            preprocess_filtering_policy_molecules.remote(to_process, reaction_rules_ids)
+            for _ in range(self.num_cpus)
+        ]
+        with open(self.molecules_path, "r", encoding="utf-8") as inp_data:
+            for molecule in tqdm(
+                inp_data.read().splitlines(),
+                desc="Number of molecules processed: ",
+                bar_format="{desc}{n} [{elapsed}]",
+            ):
+                to_process.put(molecule)
+        # each worker returns a list of graphs; flatten them, skipping empty results
+        results = [graph for res in ray.get(results_ids) if res for graph in res]
+        processed_data.extend(results)
+        ray.shutdown()
+        # the workers build the labels as sparse tensors; densify them before collating
+        for pyg in processed_data:
+            pyg.y_rules = pyg.y_rules.to_dense()
+            pyg.y_priority = pyg.y_priority.to_dense()
+        data, slices = self.collate(processed_data)
+        if self.output_path:
+            makedirs(os.path.dirname(self.output_path))
+            torch.save((data, slices), self.output_path)
+        return data, slices
+def reaction_rules_appliance(
+    molecule: MoleculeContainer, reaction_rules: List[Reactor]
+) -> Tuple[List[int], List[int]]:
+    """Applies each reaction rule from the list of reaction rules to a given molecule
+    and returns the indexes of the successfully applied regular and prioritized reaction
+    rules.
+    A rule that raises during application is logged at debug level and skipped.
+    :param molecule: The input molecule.
+    :param reaction_rules: The list of reaction rules.
+    :return: The two lists of indexes of successfully applied regular reaction rules and
+        priority reaction rules. Every priority index is also present in the list of
+        applied rules.
+    """
+    applied_rules, priority_rules = [], []
+    for i, rule in enumerate(reaction_rules):
+        rule_applied = False
+        rule_prioritized = False
+        try:
+            for reaction in rule([molecule]):
+                for prod in reaction.products:
+                    prod.kekule()
+                    # a truthy check_valence() result is treated as an invalid
+                    # product (presumably it lists atoms with valence errors);
+                    # stop inspecting the remaining products of this reaction
+                    if prod.check_valence():
+                        break
+                    # NOTE(review): the flag is set as soon as one product passes
+                    # the check, even if a later product of the same reaction
+                    # fails it - confirm this is intended
+                    rule_applied = True
+                    # check priority rules
+                    # (the checks below depend on the reaction only, not on prod,
+                    # so they are re-evaluated for every valid product)
+                    if len(reaction.products) > 1:
+                        # check coupling retro manual
+                        # every product has len() > 6 and the products together
+                        # exceed the first reactant by less than 6
+                        # (len() of a molecule is presumably its atom count)
+                        if all(len(mol) > 6 for mol in reaction.products):
+                            if (
+                                sum(len(mol) for mol in reaction.products)
+                                - len(reaction.reactants[0])
+                                < 6
+                            ):
+                                rule_prioritized = True
+                    else:
+                        # check cyclization retro manual
+                        # the single product has fewer sssr entries (presumably
+                        # rings) than the reactants
+                        if sum(len(mol.sssr) for mol in reaction.products) < sum(
+                            len(mol.sssr) for mol in reaction.reactants
+                        ):
+                            rule_prioritized = True
+            # record the rule once, regardless of how many reactions it produced
+            if rule_applied:
+                applied_rules.append(i)
+                # a rule can only be prioritized if it was applied
+                if rule_prioritized:
+                    priority_rules.append(i)
+        except Exception as e:
+            # best effort: a failing rule must not abort the whole molecule
+            logging.debug(e)
+            continue
+    return applied_rules, priority_rules
+@ray.remote
+def preprocess_filtering_policy_molecules(
+    to_process: Queue, reaction_rules: List[Reactor]
+) -> List[Optional[Data]]:
+    """Preprocesses a list of molecules by applying reaction rules and converting
+    molecules into PyTorch geometric graphs. Successfully applied reaction rules are
+    converted to binary vectors for policy network training.
+    :param to_process: The queue containing SMILES of molecules to be converted to the
+        training data.
+    :param reaction_rules: The list of reaction rules.
+    :return: The list of PyGraph objects.
+    """
+    pyg_graphs = []
+    while True:
+        try:
+            molecule = smiles(to_process.get(timeout=30))
+            if not isinstance(molecule, MoleculeContainer):
+                continue
+            # reaction reaction_rules application
+            applied_rules, priority_rules = reaction_rules_appliance(
+                molecule, reaction_rules
+            )
+            y_rules = torch.sparse_coo_tensor(
+                [applied_rules],
+                torch.ones(len(applied_rules)),
+                (len(reaction_rules),),
+                dtype=torch.uint8,
+            )
+            y_priority = torch.sparse_coo_tensor(
+                [priority_rules],
+                torch.ones(len(priority_rules)),
+                (len(reaction_rules),),
+                dtype=torch.uint8,
+            )
+            y_rules = torch.unsqueeze(y_rules, 0)
+            y_priority = torch.unsqueeze(y_priority, 0)
+            pyg_graph = mol_to_pyg(molecule)
+            if not pyg_graph:
+                continue
+            pyg_graph.y_rules = y_rules
+            pyg_graph.y_priority = y_priority
+            pyg_graphs.append(pyg_graph)
+        except Empty:
+            break
+    return pyg_graphs
+def atom_to_vector(atom: Any) -> Tensor:
+    """Given an atom, return a vector of length 8 with the following
+    information:
+    1. Atomic number
+    2. Period
+    3. Group
+    4. Number of electrons + atom's charge
+    5. Shell
+    6. Total number of hydrogens
+    7. Whether the atom is in a ring
+    8. Number of neighbors
+    :param atom: The atom object.
+    :return: The vector of the atom.
+    """
+    vector = torch.zeros(8, dtype=torch.uint8)
+    period, group, shell, electrons = MENDEL_INFO[atom.atomic_symbol]
+    vector[0] = atom.atomic_number
+    vector[1] = period
+    vector[2] = group
+    vector[3] = electrons + atom.charge
+    vector[4] = shell
+    vector[5] = atom.total_hydrogens
+    vector[6] = int(atom.in_ring)
+    vector[7] = atom.neighbors
+    return vector
+def bonds_to_vector(molecule: MoleculeContainer, atom_ind: int) -> Tensor:
+    """Takes a molecule and an atom index as input, and returns a vector representing
+    the bond orders of the atom's bonds.
+    :param molecule: The given molecule.
+    :param atom_ind: The index of the atom in the molecule to be converted to the bond
+        vector.
+    :return: The torch tensor of size 3, with each element representing the order of
+        bonds connected to the atom with the given index in the molecule.
+    """
+    vector = torch.zeros(3, dtype=torch.uint8)
+    for b_order in molecule._bonds[atom_ind].values():
+        vector[int(b_order) - 1] += 1
+    return vector
+def mol_to_matrix(molecule: MoleculeContainer) -> Tensor:
+    """Given a molecule, it returns a vector of shape (max_atoms, 12) where each row is
+    an atom and each column is a feature.
+    :param molecule: The molecule to be converted to a vector
+    :return: The atoms vectors array.
+    """
+    atoms_vectors = torch.zeros((len(molecule), 11), dtype=torch.uint8)
+    for n, atom in molecule.atoms():
+        atoms_vectors[n - 1][:8] = atom_to_vector(atom)
+    for n, _ in molecule.atoms():
+        atoms_vectors[n - 1][8:] = bonds_to_vector(molecule, n)
+    return atoms_vectors
+def mol_to_pyg(
+    molecule: MoleculeContainer, canonicalize: bool = True
+) -> Optional[Data]:
+    """Takes a list of molecules and returns a list of PyTorch Geometric graphs, a one-
+    hot encoded vectors of the atoms, and a matrices of the bonds.
+    :param molecule: The molecule to be converted to PyTorch Geometric graph.
+    :param canonicalize: If True, the input molecule is canonicalized.
+    :return: The list of PyGraph objects.
+    """
+    if len(molecule) == 1:  # to avoid a precursor to be a single atom
+        return None
+    tmp_molecule = molecule.copy()
+    try:
+        if canonicalize:
+            tmp_molecule.canonicalize()
+        tmp_molecule.kekule()
+        if tmp_molecule.check_valence():
+            return None
+    except InvalidAromaticRing:
+        return None
+    # remapping target for torch_geometric because
+    # it is necessary that the elements in edge_index only hold nodes_idx in the range { 0, ..., num_nodes - 1}
+    new_mappings = {n: i for i, (n, _) in enumerate(tmp_molecule.atoms(), 1)}
+    tmp_molecule.remap(new_mappings)
+    # get edge indexes from target mapping
+    edge_index = []
+    for atom, neighbour, bond in tmp_molecule.bonds():
+        edge_index.append([atom - 1, neighbour - 1])
+    edge_index = torch.tensor(edge_index, dtype=torch.long)
+    #
+    x = mol_to_matrix(tmp_molecule)
+    mol_pyg_graph = Data(x=x, edge_index=edge_index.t().contiguous())
+    mol_pyg_graph = ToUndirected()(mol_pyg_graph)
+    assert mol_pyg_graph.is_undirected()
+    return mol_pyg_graph
+# Periodic-table properties per element symbol, used by atom_to_vector to build
+# atom features. Each value is a tuple unpacked as (period, group, shell, electrons).
+# The group is None for Ce, Dy, Er, Gd, Nd, Pr, Sm and Yb.
+# NOTE(review): atom_to_vector writes the group into a uint8 tensor, which
+# presumably fails for these None entries - confirm callers handle that.
+# Elements missing from this table (e.g. Cd) raise KeyError on lookup.
+MENDEL_INFO = {
+    "Ag": (5, 11, 1, 1),
+    "Al": (3, 13, 2, 1),
+    "Ar": (3, 18, 2, 6),
+    "As": (4, 15, 2, 3),
+    "B": (2, 13, 2, 1),
+    "Ba": (6, 2, 1, 2),
+    "Bi": (6, 15, 2, 3),
+    "Br": (4, 17, 2, 5),
+    "C": (2, 14, 2, 2),
+    "Ca": (4, 2, 1, 2),
+    "Ce": (6, None, 1, 2),
+    "Cl": (3, 17, 2, 5),
+    "Cr": (4, 6, 1, 1),
+    "Cs": (6, 1, 1, 1),
+    "Cu": (4, 11, 1, 1),
+    "Dy": (6, None, 1, 2),
+    "Er": (6, None, 1, 2),
+    "F": (2, 17, 2, 5),
+    "Fe": (4, 8, 1, 2),
+    "Ga": (4, 13, 2, 1),
+    "Gd": (6, None, 1, 2),
+    "Ge": (4, 14, 2, 2),
+    "Hg": (6, 12, 1, 2),
+    "I": (5, 17, 2, 5),
+    "In": (5, 13, 2, 1),
+    "K": (4, 1, 1, 1),
+    "La": (6, 3, 1, 2),
+    "Li": (2, 1, 1, 1),
+    "Mg": (3, 2, 1, 2),
+    "Mn": (4, 7, 1, 2),
+    "N": (2, 15, 2, 3),
+    "Na": (3, 1, 1, 1),
+    "Nd": (6, None, 1, 2),
+    "O": (2, 16, 2, 4),
+    "P": (3, 15, 2, 3),
+    "Pb": (6, 14, 2, 2),
+    "Pd": (5, 10, 3, 10),
+    "Pr": (6, None, 1, 2),
+    "Rb": (5, 1, 1, 1),
+    "S": (3, 16, 2, 4),
+    "Sb": (5, 15, 2, 3),
+    "Se": (4, 16, 2, 4),
+    "Si": (3, 14, 2, 2),
+    "Sm": (6, None, 1, 2),
+    "Sn": (5, 14, 2, 2),
+    "Sr": (5, 2, 1, 2),
+    "Te": (5, 16, 2, 4),
+    "Ti": (4, 4, 1, 2),
+    "Tl": (6, 13, 2, 1),
+    "Yb": (6, None, 1, 2),
+    "Zn": (4, 12, 1, 2),
+}

synplan/ml/training/reinforcement.py ADDED Viewed

	@@ -0,0 +1,379 @@

+"""Module containing functions for running value network tuning with reinforcement learning
+approach."""
+import os
+import random
+from collections import defaultdict
+from pathlib import Path
+from random import shuffle
+from typing import Dict, List
+import torch
+from CGRtools.containers import MoleculeContainer
+from pytorch_lightning import Trainer
+from torch.utils.data import random_split
+from torch_geometric.data.lightning import LightningDataset
+from synplan.chem.precursor import compose_precursors
+from synplan.mcts.evaluation import ValueNetworkFunction
+from synplan.mcts.expansion import PolicyNetworkFunction
+from synplan.mcts.tree import Tree
+from synplan.ml.networks.value import ValueNetwork
+from synplan.ml.training.preprocessing import ValueNetworkDataset
+from synplan.utils.config import (
+    PolicyNetworkConfig,
+    TuningConfig,
+    TreeConfig,
+    ValueNetworkConfig,
+)
+from synplan.utils.files import MoleculeReader
+from synplan.utils.loading import (
+    load_building_blocks,
+    load_reaction_rules,
+    load_value_net,
+)
+from synplan.utils.logging import DisableLogger, HiddenPrints
+def create_value_network(value_config: ValueNetworkConfig) -> ValueNetwork:
+    """Creates the initial (untrained) value network and saves it as a checkpoint.
+    :param value_config: The value network configuration. Its ``weights_path`` is the
+        location where the checkpoint is written.
+    :return: The valueNetwork to be trained/tuned.
+    """
+    weights_path = Path(value_config.weights_path)
+    value_network = ValueNetwork(
+        vector_dim=value_config.vector_dim,
+        batch_size=value_config.batch_size,
+        dropout=value_config.dropout,
+        num_conv_layers=value_config.num_conv_layers,
+        learning_rate=value_config.learning_rate,
+    )
+    # no training happens here: the Trainer is only attached to the network so
+    # that a checkpoint file can be written (logging and prints are silenced)
+    with DisableLogger(), HiddenPrints():
+        trainer = Trainer()
+        trainer.strategy.connect(value_network)
+        trainer.save_checkpoint(weights_path)
+    return value_network
+def create_targets_batch(
+    targets: List[MoleculeContainer], batch_size: int
+) -> List[List[MoleculeContainer]]:
+    """Creates the targets batches for planning simulations and value network tuning.
+    :param targets: The list of target molecules.
+    :param batch_size: The size of each target batch.
+    :return: The list of lists corresponding to each target batch.
+    """
+    num_targets = len(targets)
+    batch_splits = list(
+        range(num_targets // batch_size + int(bool(num_targets % batch_size)))
+    )
+    if int(num_targets / batch_size) == 0:
+        print(f"1 batch were created with {num_targets} molecules")
+    else:
+        print(
+            f"{len(batch_splits)} batches were created with {batch_size} molecules each"
+        )
+    targets_batch_list = []
+    for batch_id in batch_splits:
+        batch_slices = [
+            i
+            for i in range(batch_id * batch_size, (batch_id + 1) * batch_size)
+            if i < len(targets)
+        ]
+        targets_batch_list.append([targets[i] for i in batch_slices])
+    return targets_batch_list
+def run_tree_search(
+    target: MoleculeContainer,
+    tree_config: TreeConfig,
+    policy_config: PolicyNetworkConfig,
+    value_config: ValueNetworkConfig,
+    reaction_rules_path: str,
+    building_blocks_path: str,
+) -> Tree:
+    """Runs tree search for the given target molecule.
+    Note that ``tree_config`` is modified in place: ``evaluation_type`` is set to
+    "gcn" and ``silent`` to True.
+    :param target: The target molecule.
+    :param tree_config: The planning configuration of tree search.
+    :param policy_config: The policy network configuration.
+    :param value_config: The value network configuration.
+    :param reaction_rules_path: The path to the file with reaction rules.
+    :param building_blocks_path: The path to the file with building blocks.
+    :return: The built search tree for the given molecule.
+    """
+    # policy and value function loading
+    # (networks, rules and building blocks are loaded again on every call)
+    policy_function = PolicyNetworkFunction(policy_config=policy_config)
+    value_function = ValueNetworkFunction(weights_path=value_config.weights_path)
+    reaction_rules = load_reaction_rules(reaction_rules_path)
+    building_blocks = load_building_blocks(building_blocks_path, standardize=True)
+    # initialize tree
+    # the caller's config object is mutated here, not a copy
+    tree_config.evaluation_type = "gcn"
+    tree_config.silent = True
+    tree = Tree(
+        target=target,
+        config=tree_config,
+        reaction_rules=reaction_rules,
+        building_blocks=building_blocks,
+        expansion_function=policy_function,
+        evaluation_function=value_function,
+    )
+    tree._tqdm = False
+    # remove target from building blocks
+    # (otherwise the target would presumably count as already solved - TODO confirm)
+    if str(target) in tree.building_blocks:
+        tree.building_blocks.remove(str(target))
+    # run tree search: exhausting the tree iterator performs the search steps
+    _ = list(tree)
+    return tree
+def extract_tree_precursor(tree_list: List[Tree]) -> Dict[str, float]:
+    """Takes the built tree and extracts the precursor for value network tuning. The
+    precursor from found retrosynthetic routes are labeled as a positive class and precursor
+    from not solved routes are labeled as a negative class.
+    :param tree_list: The list of built search trees.
+    :return: The dictionary with the precursor SMILES and its class (positive - 1 or negative - 0).
+    """
+    extracted_precursor = defaultdict(float)
+    for tree in tree_list:
+        for idx, node in tree.nodes.items():
+            # add solved nodes to set
+            if node.is_solved():
+                parent = idx
+                while parent and parent != 1:
+                    composed_smi = str(
+                        compose_precursors(tree.nodes[parent].new_precursors)
+                    )
+                    extracted_precursor[composed_smi] = 1.0
+                    parent = tree.parents[parent]
+            else:
+                composed_smi = str(compose_precursors(tree.nodes[idx].new_precursors))
+                extracted_precursor[composed_smi] = 0.0
+    # shuffle extracted precursor
+    processed_keys = list(extracted_precursor.keys())
+    shuffle(processed_keys)
+    extracted_precursor = {i: extracted_precursor[i] for i in processed_keys}
+    return extracted_precursor
+def balance_extracted_precursor(extracted_precursor):
+    extracted_precursor_balanced = {}
+    neg_list = [i for i, j in extracted_precursor.items() if j == 0]
+    for k, v in extracted_precursor.items():
+        if v == 1:
+            extracted_precursor_balanced[k] = v
+        if len(extracted_precursor_balanced) < len(neg_list):
+            neg_list.pop(random.choice(range(len(neg_list))))
+    return extracted_precursor_balanced
+def create_updating_set(
+    extracted_precursor: Dict[str, float], batch_size: int = 1
+) -> LightningDataset:
+    """Creates the value network updating dataset from precursor extracted from the planning
+    simulation.
+    :param extracted_precursor: The dictionary with the extracted precursor and their
+        labels.
+    :param batch_size: The size of the batch in value network updating.
+    :return: A LightningDataset object, which contains the tuning set for value network
+        tuning.
+    """
+    extracted_precursor = balance_extracted_precursor(extracted_precursor)
+    full_dataset = ValueNetworkDataset(extracted_precursor)
+    train_size = int(0.6 * len(full_dataset))
+    val_size = len(full_dataset) - train_size
+    train_set, val_set = random_split(
+        full_dataset, [train_size, val_size], torch.Generator().manual_seed(42)
+    )
+    print(f"Training set size: {len(train_set)}")
+    print(f"Validation set size: {len(val_set)}")
+    return LightningDataset(
+        train_set, val_set, batch_size=batch_size, pin_memory=True, drop_last=True
+    )
+def tune_value_network(
+    datamodule: LightningDataset, value_config: ValueNetworkConfig
+) -> None:
+    """Trains the value network using a given tuning data and saves the trained neural
+    network.
+    The network is loaded from ``value_config.weights_path`` and the tuned weights are
+    written back to the same path.
+    :param datamodule: The tuning dataset (LightningDataset).
+    :param value_config: The value network configuration.
+    :return: None.
+    """
+    current_weights = value_config.weights_path
+    value_network = load_value_net(ValueNetwork, current_weights)
+    with DisableLogger(), HiddenPrints():
+        # NOTE(review): the accelerator and device are hard-coded to the first GPU
+        trainer = Trainer(
+            accelerator="gpu",
+            devices=[0],
+            max_epochs=value_config.num_epoch,
+            enable_checkpointing=False,
+            logger=False,
+            gradient_clip_val=1.0,
+            enable_progress_bar=False,
+        )
+        trainer.fit(value_network, datamodule)
+        # validate() returns one metrics dict per dataloader; only one is passed
+        val_score = trainer.validate(value_network, datamodule.val_dataloader())[0]
+        # overwrite the checkpoint the network was loaded from
+        trainer.save_checkpoint(current_weights)
+    print(f"Value network balanced accuracy: {val_score['val_balanced_accuracy']}")
+def run_training(
+    extracted_precursor: Dict[str, float] = None,
+    value_config: ValueNetworkConfig = None,
+) -> None:
+    """Runs the training stage in value network tuning.
+    :param extracted_precursor: The precursor extracted from the planing simulations.
+    :param value_config: The value network configuration.
+    :return: None.
+    """
+    # create training set
+    training_set = create_updating_set(
+        extracted_precursor=extracted_precursor, batch_size=value_config.batch_size
+    )
+    # retrain value network
+    tune_value_network(datamodule=training_set, value_config=value_config)
+def run_planning(
+    targets_batch: List[MoleculeContainer],
+    tree_config: TreeConfig,
+    policy_config: PolicyNetworkConfig,
+    value_config: ValueNetworkConfig,
+    reaction_rules_path: str,
+    building_blocks_path: str,
+    targets_batch_id: int,
+):
+    """Performs planning stage (tree search) for target molecules and save extracted
+    from built trees precursor for further tuning the value network in the training stage.
+    :param targets_batch:
+    :param tree_config:
+    :param policy_config:
+    :param value_config:
+    :param reaction_rules_path:
+    :param building_blocks_path:
+    :param targets_batch_id:
+    """
+    from tqdm import tqdm
+    print(f"\nProcess batch number {targets_batch_id}")
+    tree_list = []
+    tree_config.silent = False
+    for target in tqdm(targets_batch):
+        try:
+            tree = run_tree_search(
+                target=target,
+                tree_config=tree_config,
+                policy_config=policy_config,
+                value_config=value_config,
+                reaction_rules_path=reaction_rules_path,
+                building_blocks_path=building_blocks_path,
+            )
+            tree_list.append(tree)
+        except Exception as e:
+            print(e)
+            continue
+    num_solved = sum([len(i.winning_nodes) > 0 for i in tree_list])
+    print(f"Planning is finished with {num_solved} solved targets")
+    return tree_list
+def run_updating(
+    targets_path: str,
+    tree_config: TreeConfig,
+    policy_config: PolicyNetworkConfig,
+    value_config: ValueNetworkConfig,
+    reinforce_config: TuningConfig,
+    reaction_rules_path: str,
+    building_blocks_path: str,
+    results_root: str = None,
+) -> None:
+    """Performs updating of value network.
+    :param targets_path: The path to the file with target molecules.
+    :param tree_config: The search tree configuration.
+    :param policy_config: The policy network configuration.
+    :param value_config: The value network configuration.
+    :param reinforce_config: The value network tuning configuration.
+    :param reaction_rules_path: The path to the file with reaction rules.
+    :param building_blocks_path: The path to the file with building blocks.
+    :param results_root: The path to the directory where trained value network will be
+        saved.
+    :return: None.
+    """
+    # create results root folder
+    results_root = Path(results_root)
+    if not results_root.exists():
+        results_root.mkdir()
+    # load targets list
+    with MoleculeReader(targets_path) as targets:
+        targets = list(targets)
+    # create value neural network
+    value_config.weights_path = os.path.join(results_root, "value_network.ckpt")
+    create_value_network(value_config)
+    # create targets batch
+    targets_batch_list = create_targets_batch(
+        targets, batch_size=reinforce_config.batch_size
+    )
+    # run value network tuning
+    for batch_id, targets_batch in enumerate(targets_batch_list, start=1):
+        # start tree planning simulation for batch of targets
+        tree_list = run_planning(
+            targets_batch=targets_batch,
+            tree_config=tree_config,
+            policy_config=policy_config,
+            value_config=value_config,
+            reaction_rules_path=reaction_rules_path,
+            building_blocks_path=building_blocks_path,
+            targets_batch_id=batch_id,
+        )
+        # extract pos and neg precursor from the list of built trees
+        extracted_precursor = extract_tree_precursor(tree_list)
+        # train value network for extracted precursor
+        run_training(extracted_precursor=extracted_precursor, value_config=value_config)

synplan/ml/training/supervised.py ADDED Viewed

	@@ -0,0 +1,153 @@

+"""Module for the preparation and training of a policy network used in the expansion of
+nodes in tree search.
+This module includes functions for creating training datasets and running the training
+process for the policy network.
+"""
+import warnings
+from pathlib import Path
+from typing import Union, List
+import os
+import torch
+from pytorch_lightning import Trainer
+from pytorch_lightning.callbacks import ModelCheckpoint
+from torch.utils.data import random_split
+from torch_geometric.data.lightning import LightningDataset
+from synplan.ml.networks.policy import PolicyNetwork
+from synplan.ml.training.preprocessing import (
+    FilteringPolicyDataset,
+    RankingPolicyDataset,
+)
+from synplan.utils.config import PolicyNetworkConfig
+from synplan.utils.logging import DisableLogger, HiddenPrints
+warnings.filterwarnings("ignore")
+def create_policy_dataset(
+    reaction_rules_path: str,
+    molecules_or_reactions_path: str,
+    output_path: str,
+    dataset_type: str = "filtering",
+    batch_size: int = 100,
+    num_cpus: int = 1,
+    training_data_ratio: float = 0.8,
+):
+    """
+    Create a training dataset for a policy network.
+    :param reaction_rules_path: Path to the reaction rules file.
+    :param molecules_or_reactions_path: Path to the molecules or reactions file used to create the training set.
+    :param output_path: Path to store the processed dataset.
+    :param dataset_type: Type of the dataset to be created ('ranking' or 'filtering').
+    :param batch_size: The size of batch of molecules/reactions.
+    :param training_data_ratio: Ratio of training data to total data.
+    :param num_cpus: Number of CPUs to use for data processing.
+    :return: A `LightningDataset` object containing training and validation datasets.
+    """
+    with DisableLogger(), HiddenPrints():
+        if dataset_type == "filtering":
+            full_dataset = FilteringPolicyDataset(
+                reaction_rules_path=reaction_rules_path,
+                molecules_path=molecules_or_reactions_path,
+                output_path=output_path,
+                num_cpus=num_cpus,
+            )
+        elif dataset_type == "ranking":
+            full_dataset = RankingPolicyDataset(
+                reaction_rules_path=reaction_rules_path,
+                reactions_path=molecules_or_reactions_path,
+                output_path=output_path,
+            )
+    train_size = int(training_data_ratio * len(full_dataset))
+    val_size = len(full_dataset) - train_size
+    train_dataset, val_dataset = random_split(
+        full_dataset, [train_size, val_size], torch.Generator().manual_seed(42)
+    )
+    print(
+        f"Training set size: {len(train_dataset)}, validation set size: {len(val_dataset)}"
+    )
+    datamodule = LightningDataset(
+        train_dataset,
+        val_dataset,
+        batch_size=batch_size,
+        pin_memory=True,
+        drop_last=True,
+    )
+    return datamodule
+def run_policy_training(
+    datamodule: LightningDataset,
+    config: PolicyNetworkConfig,
+    results_path: str,
+    weights_file_name: str = "policy_network",
+    accelerator: str = "gpu",
+    devices: Union[List[int], str, int] = "auto",
+    silent: bool = False,
+) -> None:
+    """
+    Trains a policy network using a given datamodule and training configuration.
+    :param datamodule: A PyTorch Lightning `DataModule` class instance. It is responsible for loading, processing, and preparing the training data for the model.
+    :param config: The dictionary that contains various configuration settings for the policy training process.
+    :param results_path: Path to store the training results and logs.
+    :param accelerator: Supports passing different accelerator types (“cpu”, “gpu”, “tpu”, “hpu”, “mps”, “auto”) as well as custom accelerator instances. Default: "gpu".
+    :param devices: The devices to use. Can be set to a positive number (int or str), a sequence of device indices (list or str), the value -1 to indicate all available devices should be used, or "auto" for automatic selection based on the chosen accelerator. Default: "auto".
+    :param silent: Run in the silent mode with no progress bars. Default: True.
+    :param weights_file_name: The name of weights file to be saved. Default: "policy_network".
+    :return: None.
+    """
+    results_path = Path(results_path)
+    results_path.mkdir(exist_ok=True)
+    network = PolicyNetwork(
+        vector_dim=config.vector_dim,
+        n_rules=datamodule.train_dataset.dataset.num_classes,
+        batch_size=config.batch_size,
+        dropout=config.dropout,
+        num_conv_layers=config.num_conv_layers,
+        learning_rate=config.learning_rate,
+        policy_type=config.policy_type,
+    )
+    checkpoint = ModelCheckpoint(
+        dirpath=results_path, filename=weights_file_name, monitor="val_loss", mode="min"
+    )
+    if silent:
+        enable_progress_bar = False
+    else:
+        enable_progress_bar = True
+    trainer = Trainer(
+        accelerator=accelerator,
+        devices=devices,
+        max_epochs=config.num_epoch,
+        callbacks=[checkpoint],
+        logger=False,
+        gradient_clip_val=1.0,
+        enable_progress_bar=enable_progress_bar,
+    )
+    if silent:
+        with DisableLogger(), HiddenPrints():
+            trainer.fit(network, datamodule)
+    else:
+        trainer.fit(network, datamodule)
+    ba = round(trainer.logged_metrics["train_balanced_accuracy_y_step"].item(), 3)
+    print(f"Policy network balanced accuracy: {ba}")

synplan/utils/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from typing import Union
+from os import PathLike
+# Type alias for filesystem path arguments: a plain string or any os.PathLike object.
+path_type = Union[str, PathLike]

synplan/utils/config.py ADDED Viewed

	@@ -0,0 +1,543 @@

+"""Module containing configuration classes."""
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Dict, List, Union
+from chython import smarts
+import yaml
+from CGRtools.containers import MoleculeContainer, QueryContainer
+@dataclass
+class ConfigABC(ABC):
+    """Common interface shared by all configuration dataclasses.
+    Subclasses provide the ``from_dict`` / ``from_yaml`` constructors and a
+    ``_validate_params`` hook, which is invoked automatically right after the
+    dataclass-generated ``__init__`` has run.
+    """
+    @staticmethod
+    @abstractmethod
+    def from_dict(config_dict: Dict[str, Any]):
+        """Build a configuration object from a plain dictionary."""
+    def to_dict(self) -> Dict[str, Any]:
+        """Return the instance attributes as a dictionary.
+        :return: A mapping of attribute names to values, where ``Path`` values
+            are replaced by their string form.
+        """
+        as_dict = {}
+        for name, value in vars(self).items():
+            # Path objects are stringified; every other value is passed through as is
+            as_dict[name] = str(value) if isinstance(value, Path) else value
+        return as_dict
+    @staticmethod
+    @abstractmethod
+    def from_yaml(file_path: str):
+        """Build a configuration object from a YAML file."""
+    def to_yaml(self, file_path: str):
+        """Dump the configuration (as produced by ``to_dict``) to a YAML file.
+        :param file_path: The path to the output YAML file.
+        """
+        content = self.to_dict()
+        with open(file_path, "w", encoding="utf-8") as out:
+            yaml.dump(content, out)
+    @abstractmethod
+    def _validate_params(self, params: Dict[str, Any]):
+        """Check the configuration parameters given as a dictionary."""
+    def __post_init__(self):
+        """Run the subclass validation on the freshly initialised attributes."""
+        self._validate_params(self.to_dict())
+@dataclass
+class RuleExtractionConfig(ConfigABC):
+    """Configuration class for extracting reaction rules.
+    :param multicenter_rules: If True, extracts a single rule
+        encompassing all centers. If False, extracts separate reaction
+        rules for each reaction center in a multicenter reaction.
+    :param as_query_container: If True, the extracted rules are
+        generated as QueryContainer objects, analogous to SMARTS objects
+        for pattern matching in chemical structures.
+    :param reverse_rule: If True, reverses the direction of the reaction
+        for rule extraction.
+    :param reactor_validation: If True, validates each generated rule in
+        a chemical reactor to ensure correct generation of products from
+        reactants.
+    :param include_func_groups: If True, includes specific functional
+        groups in the reaction rule in addition to the reaction center
+        and its environment.
+    :param func_groups_list: A list of functional groups (SMARTS strings)
+        to be considered when include_func_groups is True. After
+        initialization the strings are replaced by the parsed queries;
+        strings that cannot be parsed are reported and dropped.
+    :param include_rings: If True, includes ring structures in the
+        reaction rules.
+    :param keep_leaving_groups: If True, retains leaving groups in the
+        extracted reaction rule.
+    :param keep_incoming_groups: If True, retains incoming groups in the
+        extracted reaction rule.
+    :param keep_reagents: If True, includes reagents in the extracted
+        reaction rule.
+    :param environment_atom_count: Defines the size of the environment
+        around the reaction center to be included in the rule (0 for
+        only the reaction center, 1 for the first environment, etc.).
+    :param min_popularity: Minimum number of times a rule must be
+        applied to be considered for further analysis.
+    :param keep_metadata: If True, retains metadata associated with the
+        reaction in the extracted rule.
+    :param single_reactant_only: If True, includes only reaction rules
+        with a single reactant molecule.
+    :param atom_info_retention: Controls the amount of information about
+        each atom to retain. A dictionary with the keys
+        "reaction_center" and "environment", each mapping the subkeys
+        "neighbors", "hybridization", "implicit_hydrogens" and
+        "ring_sizes" to booleans. Entries that are not given are filled
+        in from the defaults.
+    """
+    # default low-level parameters
+    single_reactant_only: bool = True
+    keep_metadata: bool = False
+    reactor_validation: bool = True
+    reverse_rule: bool = True
+    as_query_container: bool = True
+    include_func_groups: bool = False
+    func_groups_list: List[str] = field(default_factory=list)
+    # adjustable parameters
+    environment_atom_count: int = 1
+    min_popularity: int = 3
+    include_rings: bool = True
+    multicenter_rules: bool = True
+    keep_leaving_groups: bool = True
+    keep_incoming_groups: bool = True
+    keep_reagents: bool = False
+    atom_info_retention: Dict[str, Dict[str, bool]] = field(default_factory=dict)
+    def __post_init__(self):
+        """Fill in defaults, validate once, then parse the functional groups."""
+        # Defaults must be merged BEFORE validation: the validator requires the complete
+        # atom_info_retention structure, so the empty default would otherwise be rejected.
+        self._initialize_default_atom_info_retention()
+        # The base class validates self.to_dict(); func_groups_list still holds the raw
+        # SMARTS strings at this point, which is what the validator expects.
+        super().__post_init__()
+        self._parse_functional_groups()
+    def _initialize_default_atom_info_retention(self):
+        """Merge the user-supplied atom_info_retention on top of the defaults.
+        A missing value (None or an empty dict) is replaced by the defaults. For a
+        non-empty dict, the defaults are completed with the user's entries. Values of
+        the wrong type are left untouched so that _validate_params can reject them.
+        """
+        default_atom_info = {
+            "reaction_center": {
+                "neighbors": True,
+                "hybridization": True,
+                "implicit_hydrogens": False,
+                "ring_sizes": False,
+            },
+            "environment": {
+                "neighbors": False,
+                "hybridization": False,
+                "implicit_hydrogens": False,
+                "ring_sizes": False,
+            },
+        }
+        user_info = self.atom_info_retention
+        if user_info is None or user_info == {}:
+            self.atom_info_retention = default_atom_info
+            return
+        if not isinstance(user_info, dict):
+            return
+        # Copy so that the dictionary passed in by the caller is not mutated;
+        # unexpected keys are kept on purpose so that validation reports them.
+        merged = dict(user_info)
+        for key, defaults in default_atom_info.items():
+            overrides = user_info.get(key, {})
+            if isinstance(overrides, dict):
+                merged[key] = {**defaults, **overrides}
+        self.atom_info_retention = merged
+    def _parse_functional_groups(self):
+        """Replace the SMARTS strings in func_groups_list by parsed queries."""
+        func_groups_list = []
+        # func_groups_list may be None (the validator allows it)
+        for group_smarts in self.func_groups_list or []:
+            try:
+                query = smarts(group_smarts)
+                func_groups_list.append(query)
+            # Deliberately broad: a group that cannot be parsed is reported and skipped.
+            except Exception as e:
+                print(f"Functional group {group_smarts} was not parsed because of {e}")
+        self.func_groups_list = func_groups_list
+    @staticmethod
+    def from_dict(config_dict: Dict[str, Any]) -> "RuleExtractionConfig":
+        """Create the configuration from a dictionary of constructor arguments."""
+        return RuleExtractionConfig(**config_dict)
+    @staticmethod
+    def from_yaml(file_path: str) -> "RuleExtractionConfig":
+        """Create the configuration from a YAML file.
+        :param file_path: The path to the YAML file.
+        """
+        with open(file_path, "r", encoding="utf-8") as file:
+            config_dict = yaml.safe_load(file)
+        return RuleExtractionConfig.from_dict(config_dict)
+    def _validate_params(self, params: Dict[str, Any]) -> None:
+        """Validate the configuration parameters.
+        :param params: The configuration as a dictionary.
+        :raises ValueError: If any parameter has an invalid type or structure.
+        """
+        # The checks are grouped but kept in their original order, so the first
+        # reported problem is the same as before.
+        for name in (
+            "multicenter_rules",
+            "as_query_container",
+            "reverse_rule",
+            "reactor_validation",
+            "include_func_groups",
+        ):
+            if not isinstance(params[name], bool):
+                raise ValueError(f"{name} must be a boolean.")
+        if params["func_groups_list"] is not None and not all(
+            isinstance(group, str) for group in params["func_groups_list"]
+        ):
+            raise ValueError("func_groups_list must be a list of SMARTS.")
+        for name in (
+            "include_rings",
+            "keep_leaving_groups",
+            "keep_incoming_groups",
+            "keep_reagents",
+        ):
+            if not isinstance(params[name], bool):
+                raise ValueError(f"{name} must be a boolean.")
+        for name in ("environment_atom_count", "min_popularity"):
+            if not isinstance(params[name], int):
+                raise ValueError(f"{name} must be an integer.")
+        for name in ("keep_metadata", "single_reactant_only"):
+            if not isinstance(params[name], bool):
+                raise ValueError(f"{name} must be a boolean.")
+        if params["atom_info_retention"] is not None:
+            if not isinstance(params["atom_info_retention"], dict):
+                raise ValueError("atom_info_retention must be a dictionary.")
+            required_keys = {"reaction_center", "environment"}
+            if not required_keys.issubset(params["atom_info_retention"]):
+                missing_keys = required_keys - set(params["atom_info_retention"].keys())
+                raise ValueError(
+                    f"atom_info_retention missing required keys: {missing_keys}"
+                )
+            expected_subkeys = {
+                "neighbors",
+                "hybridization",
+                "implicit_hydrogens",
+                "ring_sizes",
+            }
+            for key, value in params["atom_info_retention"].items():
+                if key not in required_keys:
+                    raise ValueError(f"Unexpected key in atom_info_retention: {key}")
+                # A non-dict value has no .keys(); report it instead of crashing
+                if not isinstance(value, dict):
+                    raise ValueError(
+                        f"Invalid structure for {key} in atom_info_retention. Expected a dictionary."
+                    )
+                missing_subkeys = expected_subkeys - set(value.keys())
+                if missing_subkeys:
+                    raise ValueError(
+                        f"Invalid structure for {key} in atom_info_retention. Missing subkeys: {missing_subkeys}"
+                    )
+                for subkey, subvalue in value.items():
+                    if not isinstance(subvalue, bool):
+                        raise ValueError(
+                            f"Value for {subkey} in {key} of atom_info_retention must be boolean."
+                        )
+@dataclass
+class PolicyNetworkConfig(ConfigABC):
+    """Configuration class for the policy network.
+    :param policy_type: Mode of operation, either 'filtering' or 'ranking'.
+    :param vector_dim: Dimension of the input vectors.
+    :param batch_size: Number of samples per batch.
+    :param dropout: Dropout rate for regularization.
+    :param learning_rate: Learning rate for the optimizer.
+    :param num_conv_layers: Number of convolutional layers in the network.
+    :param num_epoch: Number of training epochs.
+    :param weights_path: Path to the network weights, or None if not set.
+    :param priority_rules_fraction: Non-negative float, parameter of the
+        filtering policy.
+    :param rule_prob_threshold: Non-negative float, parameter of the
+        filtering policy.
+    :param top_rules: Positive integer, parameter of the filtering policy.
+    """
+    policy_type: str = "ranking"
+    vector_dim: int = 256
+    batch_size: int = 500
+    dropout: float = 0.4
+    learning_rate: float = 0.008
+    num_conv_layers: int = 5
+    num_epoch: int = 100
+    weights_path: Union[str, None] = None
+    # for filtering policy
+    priority_rules_fraction: float = 0.5
+    rule_prob_threshold: float = 0.0
+    top_rules: int = 50
+    @staticmethod
+    def from_dict(config_dict: Dict[str, Any]) -> "PolicyNetworkConfig":
+        """Create the configuration from a dictionary of constructor arguments."""
+        return PolicyNetworkConfig(**config_dict)
+    @staticmethod
+    def from_yaml(file_path: str) -> "PolicyNetworkConfig":
+        """Create the configuration from a YAML file.
+        :param file_path: The path to the YAML file.
+        """
+        with open(file_path, "r", encoding="utf-8") as file:
+            config_dict = yaml.safe_load(file)
+        return PolicyNetworkConfig.from_dict(config_dict)
+    def _validate_params(self, params: Dict[str, Any]):
+        """Validate the configuration parameters.
+        :param params: The configuration as a dictionary.
+        :raises ValueError: If any parameter has an invalid type or value.
+        """
+        if params["policy_type"] not in ["filtering", "ranking"]:
+            raise ValueError("policy_type must be either 'filtering' or 'ranking'.")
+        # Same check for each integer hyper-parameter, in the original order
+        for name in ("vector_dim", "batch_size", "num_conv_layers", "num_epoch"):
+            if not isinstance(params[name], int) or params[name] <= 0:
+                raise ValueError(f"{name} must be a positive integer.")
+        if not isinstance(params["dropout"], float) or not (
+            0.0 <= params["dropout"] <= 1.0
+        ):
+            raise ValueError("dropout must be a float between 0.0 and 1.0.")
+        if (
+            not isinstance(params["learning_rate"], float)
+            or params["learning_rate"] <= 0.0
+        ):
+            raise ValueError("learning_rate must be a positive float.")
+        # Zero is allowed for both filtering thresholds
+        for name in ("priority_rules_fraction", "rule_prob_threshold"):
+            if not isinstance(params[name], float) or params[name] < 0.0:
+                raise ValueError(f"{name} must be a non-negative float.")
+        if not isinstance(params["top_rules"], int) or params["top_rules"] <= 0:
+            raise ValueError("top_rules must be a positive integer.")
+@dataclass
+class ValueNetworkConfig(ConfigABC):
+    """Configuration class for the value network.
+    :param weights_path: Path to the network weights, or None if not set.
+    :param vector_dim: Dimension of the input vectors.
+    :param batch_size: Number of samples per batch.
+    :param dropout: Dropout rate for regularization.
+    :param learning_rate: Learning rate for the optimizer.
+    :param num_conv_layers: Number of convolutional layers in the network.
+    :param num_epoch: Number of training epochs.
+    """
+    weights_path: Union[str, None] = None
+    vector_dim: int = 256
+    batch_size: int = 500
+    dropout: float = 0.4
+    learning_rate: float = 0.008
+    num_conv_layers: int = 5
+    num_epoch: int = 100
+    @staticmethod
+    def from_dict(config_dict: Dict[str, Any]) -> "ValueNetworkConfig":
+        """Create the configuration from a dictionary of constructor arguments."""
+        return ValueNetworkConfig(**config_dict)
+    @staticmethod
+    def from_yaml(file_path: str) -> "ValueNetworkConfig":
+        """Create the configuration from a YAML file.
+        :param file_path: The path to the YAML file.
+        """
+        with open(file_path, "r", encoding="utf-8") as file:
+            config_dict = yaml.safe_load(file)
+        return ValueNetworkConfig.from_dict(config_dict)
+    # to_yaml is inherited from ConfigABC; the former override was an exact copy of it.
+    def _validate_params(self, params: Dict[str, Any]):
+        """Validate the configuration parameters.
+        :param params: The configuration as a dictionary.
+        :raises ValueError: If any parameter has an invalid type or value.
+        """
+        # Same check for each integer hyper-parameter, in the original order
+        for name in ("vector_dim", "batch_size", "num_conv_layers", "num_epoch"):
+            if not isinstance(params[name], int) or params[name] <= 0:
+                raise ValueError(f"{name} must be a positive integer.")
+        if not isinstance(params["dropout"], float) or not (
+            0.0 <= params["dropout"] <= 1.0
+        ):
+            raise ValueError("dropout must be a float between 0.0 and 1.0.")
+        if (
+            not isinstance(params["learning_rate"], float)
+            or params["learning_rate"] <= 0.0
+        ):
+            raise ValueError("learning_rate must be a positive float.")
+@dataclass
+class TuningConfig(ConfigABC):
+    """Configuration class for the network tuning.
+    :param batch_size: The number of targets per batch in the planning simulation step.
+    :param num_simulations: The number of planning simulations.
+    """
+    batch_size: int = 100
+    num_simulations: int = 1
+    @staticmethod
+    def from_dict(config_dict: Dict[str, Any]) -> "TuningConfig":
+        """Create the configuration from a dictionary of constructor arguments."""
+        return TuningConfig(**config_dict)
+    @staticmethod
+    def from_yaml(file_path: str) -> "TuningConfig":
+        """Create the configuration from a YAML file.
+        :param file_path: The path to the YAML file.
+        """
+        with open(file_path, "r", encoding="utf-8") as file:
+            config_dict = yaml.safe_load(file)
+        return TuningConfig.from_dict(config_dict)
+    def _validate_params(self, params: Dict[str, Any]):
+        """Validate the configuration parameters.
+        :param params: The configuration as a dictionary.
+        :raises ValueError: If a parameter is not a positive integer.
+        """
+        if not isinstance(params["batch_size"], int) or params["batch_size"] <= 0:
+            raise ValueError("batch_size must be a positive integer.")
+        # num_simulations was previously accepted without any check
+        if (
+            not isinstance(params["num_simulations"], int)
+            or params["num_simulations"] <= 0
+        ):
+            raise ValueError("num_simulations must be a positive integer.")
+@dataclass
+class TreeConfig(ConfigABC):
+    """Configuration class for the tree search algorithm.
+    :param max_iterations: The number of iterations to run the algorithm
+        for.
+    :param max_tree_size: The maximum number of nodes in the tree.
+    :param max_time: The time limit (in seconds) for the algorithm to
+        run. May be an integer or a float.
+    :param max_depth: The maximum depth of the tree.
+    :param ucb_type: Type of UCB used in the search algorithm. Options
+        are "puct", "uct", "value", defaults to "uct".
+    :param c_ucb: The exploration-exploitation balance coefficient used
+        in Upper Confidence Bound (UCB).
+    :param backprop_type: Type of backpropagation algorithm. Options are
+        "muzero", "cumulative", defaults to "muzero".
+    :param search_strategy: The strategy used for tree search. Options
+        are "expansion_first", "evaluation_first".
+    :param exclude_small: Whether to exclude small molecules during the
+        search.
+    :param evaluation_agg: Method for aggregating evaluation scores.
+        Options are "max", "average", defaults to "max".
+    :param evaluation_type: The method used for evaluating nodes.
+        Options are "random", "rollout", "gcn".
+    :param init_node_value: Initial value for a new node.
+    :param epsilon: A parameter in the epsilon-greedy search strategy
+        representing the chance of random selection of reaction rules
+        during the selection stage in Monte Carlo Tree Search,
+        specifically during Upper Confidence Bound estimation. It
+        balances between exploration and exploitation. Must lie
+        between 0 and 1 (inclusive).
+    :param min_mol_size: Defines the minimum size of a molecule that
+        has to be synthesized. Molecules with 6 or fewer heavy atoms
+        are assumed to be building blocks by definition, thus setting
+        the threshold for considering larger molecules in the search,
+        defaults to 6.
+    :param silent: Whether to suppress progress output.
+    """
+    max_iterations: int = 100
+    max_tree_size: int = 1000000
+    max_time: float = 600
+    max_depth: int = 6
+    ucb_type: str = "uct"
+    c_ucb: float = 0.1
+    backprop_type: str = "muzero"
+    search_strategy: str = "expansion_first"
+    exclude_small: bool = True
+    evaluation_agg: str = "max"
+    evaluation_type: str = "gcn"
+    init_node_value: float = 0.0
+    epsilon: float = 0.0
+    min_mol_size: int = 6
+    silent: bool = False
+    @staticmethod
+    def from_dict(config_dict: Dict[str, Any]) -> "TreeConfig":
+        """Create the configuration from a dictionary of constructor arguments."""
+        return TreeConfig(**config_dict)
+    @staticmethod
+    def from_yaml(file_path: str) -> "TreeConfig":
+        """Create the configuration from a YAML file.
+        :param file_path: The path to the YAML file.
+        """
+        with open(file_path, "r", encoding="utf-8") as file:
+            config_dict = yaml.safe_load(file)
+        return TreeConfig.from_dict(config_dict)
+    def _validate_params(self, params):
+        """Validate the configuration parameters.
+        :param params: The configuration as a dictionary.
+        :raises ValueError: If a parameter has an invalid value.
+        :raises TypeError: If c_ucb, exclude_small, silent or init_node_value
+            has the wrong type.
+        """
+        if params["ucb_type"] not in ["puct", "uct", "value"]:
+            raise ValueError(
+                "Invalid ucb_type. Allowed values are 'puct', 'uct', 'value'."
+            )
+        if params["backprop_type"] not in ["muzero", "cumulative"]:
+            raise ValueError(
+                "Invalid backprop_type. Allowed values are 'muzero', 'cumulative'."
+            )
+        if params["evaluation_type"] not in ["random", "rollout", "gcn"]:
+            raise ValueError(
+                "Invalid evaluation_type. Allowed values are 'random', 'rollout', 'gcn'."
+            )
+        if params["evaluation_agg"] not in ["max", "average"]:
+            raise ValueError(
+                "Invalid evaluation_agg. Allowed values are 'max', 'average'."
+            )
+        if not isinstance(params["c_ucb"], float):
+            raise TypeError("c_ucb must be a float.")
+        if not isinstance(params["max_depth"], int) or params["max_depth"] < 1:
+            raise ValueError("max_depth must be a positive integer.")
+        if not isinstance(params["max_tree_size"], int) or params["max_tree_size"] < 1:
+            raise ValueError("max_tree_size must be a positive integer.")
+        if (
+            not isinstance(params["max_iterations"], int)
+            or params["max_iterations"] < 1
+        ):
+            raise ValueError("max_iterations must be a positive integer.")
+        # max_time is annotated as float, so floats must be accepted as well as ints
+        if not isinstance(params["max_time"], (int, float)) or params["max_time"] <= 0:
+            raise ValueError("max_time must be a positive number.")
+        if not isinstance(params["exclude_small"], bool):
+            raise TypeError("exclude_small must be a boolean.")
+        if not isinstance(params["silent"], bool):
+            raise TypeError("silent must be a boolean.")
+        if not isinstance(params["init_node_value"], float):
+            raise TypeError("init_node_value must be a float if provided.")
+        if params["search_strategy"] not in ["expansion_first", "evaluation_first"]:
+            raise ValueError(
+                f"Invalid search_strategy: {params['search_strategy']}: "
+                f"Allowed values are 'expansion_first', 'evaluation_first'"
+            )
+        # The former test `0 >= epsilon >= 1` could never be true, so the range was
+        # never enforced. Both bounds are inclusive (the default is 0.0).
+        if not isinstance(params["epsilon"], float) or not (
+            0.0 <= params["epsilon"] <= 1.0
+        ):
+            raise ValueError("epsilon must be a float between 0 and 1.")
+        if not isinstance(params["min_mol_size"], int) or params["min_mol_size"] < 0:
+            raise ValueError("min_mol_size must be a non-negative integer.")
+def convert_config_to_dict(config_attr: ConfigABC, config_type) -> Dict | None:
+    """Return a dictionary view of a configuration attribute.
+    A dictionary is handed back unchanged; an instance of ``config_type`` is
+    converted through its ``to_dict`` method; anything else yields None.
+    :param config_attr: The configuration attribute to be converted.
+    :param config_type: The type to check against for conversion.
+    :return: The configuration attribute as a dictionary, or None if it's not an
+        instance of the given type or dict.
+    """
+    if isinstance(config_attr, dict):
+        converted = config_attr
+    elif isinstance(config_attr, config_type):
+        converted = config_attr.to_dict()
+    else:
+        converted = None
+    return converted

synplan/utils/files.py ADDED Viewed

	@@ -0,0 +1,226 @@

+"""Module containing classes and functions needed for reactions/molecules data
+reading/writing."""
+from os.path import splitext
+from pathlib import Path
+from typing import Iterable, Union
+from CGRtools import smiles
+from CGRtools.containers import CGRContainer, MoleculeContainer, ReactionContainer
+from CGRtools.files.RDFrw import RDFRead, RDFWrite
+from CGRtools.files.SDFrw import SDFRead, SDFWrite
+class FileHandler:
+    """General class to handle chemical files.
+    The file type ("SMI", "RDF" or "SDF") is derived from the file extension.
+    The underlying file object is opened by the subclasses and stored in
+    ``self._file``.
+    """
+    def __init__(self, filename: Union[str, Path], **kwargs):
+        """Determine the file type from the extension of the given file name.
+        :param filename: The path and name of the file.
+        :raises ValueError: If the file extension is not supported.
+        :return: None.
+        """
+        self._file = None
+        _, ext = splitext(filename)
+        file_types = {".smi": "SMI", ".smiles": "SMI", ".rdf": "RDF", ".sdf": "SDF"}
+        try:
+            # Extensions are matched case-insensitively (".SDF" is the same as ".sdf")
+            self._file_type = file_types[ext.lower()]
+        except KeyError:
+            raise ValueError(f"I don't know the file extension, {ext!r}") from None
+    def close(self):
+        """Close the underlying file, if one has been opened."""
+        if self._file is not None:
+            self._file.close()
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Close the file when leaving a ``with`` block."""
+        self.close()
+class Reader(FileHandler):
+    """Base reader that forwards iteration and length to the wrapped file object."""
+    def __init__(self, filename: Union[str, Path], **kwargs):
+        """Initialise the reader; subclasses assign the actual file object.
+        :param filename: The path and name of the file.
+        :return: None.
+        """
+        super().__init__(filename, **kwargs)
+    def __enter__(self):
+        # Note: the wrapped file object is returned here, not the Reader itself
+        wrapped = self._file
+        return wrapped
+    def __iter__(self):
+        wrapped = self._file
+        return iter(wrapped)
+    def __next__(self):
+        wrapped = self._file
+        return next(wrapped)
+    def __len__(self):
+        wrapped = self._file
+        return len(wrapped)
+class SMILESRead:
+    """Line-by-line reader for files with one SMILES string per line."""
+    def __init__(self, filename: Union[str, Path], **kwargs):
+        """Open the SMILES file and prepare the lazy record generator.
+        The path is resolved strictly, so a missing file fails at this point.
+        Extra keyword arguments are accepted but not used.
+        :param filename: The path and name of the SMILES file to parse.
+        :return: None.
+        """
+        resolved = Path(filename).resolve(strict=True)
+        self._file = open(str(resolved), "r", encoding="utf-8")
+        self._data = self.__data()
+    def __data(
+        self,
+    ) -> Iterable[Union[ReactionContainer, CGRContainer, MoleculeContainer]]:
+        """Lazily parse the file, yielding one container per accepted line."""
+        accepted = (ReactionContainer, CGRContainer, MoleculeContainer)
+        while True:
+            raw = self._file.readline()
+            if raw == "":
+                # readline returns an empty string only at the end of the file
+                break
+            text = raw.strip()
+            parsed = smiles(text)
+            if not isinstance(parsed, accepted):
+                continue
+            # keep the original (stripped) line next to the parsed object
+            parsed.meta["init_smiles"] = text
+            yield parsed
+    def __enter__(self):
+        return self
+    def read(self):
+        """Parse the rest of the SMILES file.
+        :return: List of parsed molecules or reactions.
+        """
+        return [record for record in self]
+    def __iter__(self):
+        # A thin wrapper over the single shared generator: every iterator created
+        # here continues from the current position in the file.
+        return (record for record in self._data)
+    def __next__(self):
+        one_shot = iter(self)
+        return next(one_shot)
+    def close(self):
+        """Close the underlying text file."""
+        self._file.close()
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+class Writer(FileHandler):
+    """Base writer that stores the mapping flag; subclasses open the output file."""
+    def __init__(self, filename: Union[str, Path], mapping: bool = True, **kwargs):
+        """Initialise the writer and remember the mapping flag.
+        :param filename: The path and name of the file.
+        :param mapping: Whether to save mapping or not.
+        :return: None.
+        """
+        super().__init__(filename, **kwargs)
+        self._mapping = mapping
+    def __enter__(self):
+        """Return the writer itself for use in a ``with`` block."""
+        return self
+class ReactionReader(Reader):
+    """Reader for reaction files in SMILES or RDF format."""
+    def __init__(self, filename: Union[str, Path], **kwargs):
+        """Open the reaction file with the reader matching its file type.
+        :param filename: The path and name of the file.
+        :raises ValueError: If the file is neither a SMILES nor an RDF file.
+        :return: None.
+        """
+        super().__init__(filename, **kwargs)
+        file_type = self._file_type
+        if file_type not in ("SMI", "RDF"):
+            raise ValueError("File type incompatible -", filename)
+        if file_type == "SMI":
+            self._file = SMILESRead(filename, **kwargs)
+        else:
+            self._file = RDFRead(filename, indexable=True, **kwargs)
+class ReactionWriter(Writer):
+    """Writer for reaction files in SMILES or RDF format."""
+    def __init__(self, filename: Union[str, Path], mapping: bool = True, **kwargs):
+        """Open the output file with the writer matching its file type.
+        :param filename: The path and name of the file.
+        :param mapping: Whether to save mapping or not.
+        :raises ValueError: If the file is neither a SMILES nor an RDF file.
+        :return: None.
+        """
+        super().__init__(filename, mapping, **kwargs)
+        file_type = self._file_type
+        if file_type not in ("SMI", "RDF"):
+            raise ValueError("File type incompatible -", filename)
+        if file_type == "SMI":
+            self._file = open(filename, "w", encoding="utf-8", **kwargs)
+        else:
+            self._file = RDFWrite(filename, append=False, **kwargs)
+    def write(self, reaction: ReactionContainer):
+        """Write a single reaction to the file.
+        For SMILES files the reaction is converted to a one-line record; for RDF
+        files it is handed to the RDF writer as is.
+        :param reaction: The reaction to be written.
+        :return: None.
+        """
+        file_type = self._file_type
+        if file_type == "RDF":
+            self._file.write(reaction)
+        elif file_type == "SMI":
+            record = to_reaction_smiles_record(reaction)
+            self._file.write(record + "\n")
+class MoleculeReader(Reader):
+    """Reader for molecule files in SMILES or SDF format."""
+    def __init__(self, filename: Union[str, Path], **kwargs):
+        """Open the molecule file with the reader matching its file type.
+        :param filename: The path and name of the file.
+        :raises ValueError: If the file is neither a SMILES nor an SDF file.
+        :return: None.
+        """
+        super().__init__(filename, **kwargs)
+        file_type = self._file_type
+        if file_type not in ("SMI", "SDF"):
+            raise ValueError("File type incompatible -", filename)
+        if file_type == "SMI":
+            self._file = SMILESRead(filename, ignore=True, **kwargs)
+        else:
+            self._file = SDFRead(filename, indexable=True, **kwargs)
+class MoleculeWriter(Writer):
+    """Writer for molecule files in SMILES or SDF format."""
+    def __init__(self, filename: Union[str, Path], mapping: bool = True, **kwargs):
+        """Open the output file with the writer matching its file type.
+        :param filename: The path and name of the file.
+        :param mapping: Whether to save mapping or not.
+        :raises ValueError: If the file is neither a SMILES nor an SDF file.
+        :return: None.
+        """
+        super().__init__(filename, mapping, **kwargs)
+        file_type = self._file_type
+        if file_type not in ("SMI", "SDF"):
+            raise ValueError("File type incompatible -", filename)
+        if file_type == "SMI":
+            self._file = open(filename, "w", encoding="utf-8", **kwargs)
+        else:
+            self._file = SDFWrite(filename, append=False, **kwargs)
+    def write(self, molecule: MoleculeContainer):
+        """Write a single molecule to the file.
+        For SMILES files the string form of the molecule is written on its own
+        line; for SDF files the molecule is handed to the SDF writer as is.
+        :param molecule: The molecule to be written.
+        :return: None.
+        """
+        file_type = self._file_type
+        if file_type == "SDF":
+            self._file.write(molecule)
+        elif file_type == "SMI":
+            line = str(molecule)
+            self._file.write(line + "\n")
+def to_reaction_smiles_record(reaction: ReactionContainer) -> str:
+    """Converts the reaction to the SMILES record. Needed for reaction/molecule writers.
+    The record is the reaction formatted with the "m" format specifier, followed by
+    the metadata values ordered by their keys, all separated by tabs. Line breaks
+    inside a metadata value are replaced by ";" so that the record stays on one line.
+    :param reaction: The reaction to be written. A string is returned unchanged.
+    :return: The SMILES record to be written.
+    """
+    if isinstance(reaction, str):
+        return reaction
+    reaction_record = [format(reaction, "m")]
+    sorted_meta = sorted(reaction.meta.items(), key=lambda x: x[0])
+    for _, meta_info in sorted_meta:
+        # str() first: metadata values are not guaranteed to be strings.
+        # (The value used to be overwritten with "" here, which emptied every column.)
+        reaction_record.append(str(meta_info).replace("\n", ";"))
+    return "\t".join(reaction_record)