SynPlanner

Sleeping

App Files Files Community

Gilmullin Almaz committed on Mar 21, 2025

Commit

27a7101

1 Parent(s): 57a9d9a

draft subclustering - need to solve resetting

Browse files

Files changed (1) hide show

app.py +147 -315

app.py CHANGED Viewed

@@ -23,6 +23,9 @@ from synplan.utils.visualisation import generate_results_html, get_route_svg
 from cluster.super_cgr import *
 from cluster.rs_cgr import *
 from cluster.clustering import *
 from StructureFingerprint import MorganFingerprint
 import psutil
@@ -33,83 +36,6 @@ disable_progress_bars("huggingface_hub")
 smiles_parser = SMILESRead.create_parser(ignore=True)
-def reassign_nums_chunk(route_dict):
-    """Process a chunk of routes for reassigning numbers"""
-    return {k: reassign_nums(v) for k, v in route_dict.items()}
-def cluster_molecules_optimized(fingerprints_dict, max_clusters):
-    """Memory-optimized version of cluster_molecules.
-    Args:
-        fingerprints_dict (dict): Dictionary of pre-computed fingerprints
-        max_clusters (int): Maximum number of clusters
-    Returns:
-        dict: Clustering results containing clusters_dict and cluster_labels
-    """
-    try:
-        # Convert dictionary to arrays for efficient processing
-        labels = np.array(list(fingerprints_dict.keys()))
-        fingerprints = np.array(list(fingerprints_dict.values()))
-        # Calculate similarity matrix in chunks to save memory
-        chunk_size = 100
-        n_samples = len(fingerprints)
-        similarity_matrix = np.zeros((n_samples, n_samples))
-        for i in range(0, n_samples, chunk_size):
-            chunk_end = min(i + chunk_size, n_samples)
-            chunk = fingerprints[i:chunk_end]
-            # Calculate similarity for this chunk against all fingerprints
-            similarity_chunk = tanimoto_similarity_continuous(chunk, fingerprints)
-            similarity_matrix[i:chunk_end] = similarity_chunk
-            # Clear memory
-            del similarity_chunk
-            gc.collect()
-        # Convert to distance matrix
-        distance_matrix = 1 - similarity_matrix
-        # Free memory
-        del similarity_matrix
-        gc.collect()
-        # Calculate condensed distance matrix
-        condensed_distance = squareform(distance_matrix)
-        # Free memory
-        del distance_matrix
-        gc.collect()
-        # Calculate linkage
-        Z = fastcluster.linkage(condensed_distance, method='average')
-        # Free memory
-        del condensed_distance
-        gc.collect()
-        # Perform clustering
-        cluster_labels = fcluster(Z, max_clusters, criterion='maxclust')
-        # Create clusters dictionary
-        clusters_dict = {}
-        for cluster in range(1, max_clusters + 1):
-            cluster_indices = np.where(cluster_labels == cluster)[0]
-            clusters_dict[cluster] = list(labels[cluster_indices])
-        return {
-            'clusters_dict': clusters_dict,
-            'cluster_labels': cluster_labels,
-            'linkage_matrix': Z
-        }
-    except Exception as e:
-        print(f"Error in cluster_molecules_optimized: {str(e)}")
-        raise e
 def download_button(object_to_download, download_filename, button_text, pickle_it=False):
     """
     Issued from
@@ -186,6 +112,23 @@ def download_button(object_to_download, download_filename, button_text, pickle_i
 st.set_page_config(page_title="SynPlanner GUI", page_icon="🧪", layout="wide")
 intro_text = '''
 This is a demo of the graphical user interface of
 [SynPlanner](https://github.com/Laboratoire-de-Chemoinformatique/SynPlanner/).
@@ -268,7 +211,8 @@ search_strategy = search_strategy_translator[search_strategy_input]
 submit_planning = st.button('Start retrosynthetic planning')
-if submit_planning:
     with st.status("Downloading data"):
         st.write("Downloading building blocks")
         building_blocks = load_building_blocks(building_blocks_path, standardize=False)
@@ -306,12 +250,21 @@ if submit_planning:
     res = extract_tree_stats(tree, target_molecule)
     st.header('Results')
     if res["solved"]:
-        st.balloons()
         st.subheader("Examples of found retrosynthetic routes")
         image_counter = 0
         visualised_node_ids = set()
         for n, node_id in enumerate(sorted(set(tree.winning_nodes))):
@@ -322,255 +275,134 @@ if submit_planning:
                 image_counter += 1
                 num_steps = len(tree.synthesis_route(node_id))
                 route_score = round(tree.route_score(node_id), 3)
-                st.image(get_route_svg(tree, node_id), caption=f"Route {node_id}; {num_steps} steps; Route score: {route_score}")
-        ### Modified part
-        # cluster_box, z = st.columns(2, gap="medium")
-        # with cluster_box:
-        #     num_clusters = st.slider('Number of clusters to display', min_value=2, max_value=10, value=2)
-        # submit_clustering = st.button('Start clustering')
-        # if submit_clustering:
-        #     st.subheader("Examples of clusters")
-        #     super_cgrs_dict = reassign_nums(tree)
-        #     reduced_super_cgrs_dict = process_all_rs_cgrs(super_cgrs_dict)
-        #     mfp = MorganFingerprint()
-        #     results = cluster_molecules(reduced_super_cgrs_dict, mfp, max_clusters=num_clusters)
-        # cluster_box, z = st.columns(2, gap="medium")
-        # with cluster_box:
-        #     # Initialize session state if not exists
-        #     if 'memory_warning_shown' not in st.session_state:
-        #         st.session_state.memory_warning_shown = False
-        #     current_memory = psutil.Process().memory_info().rss / 1024 / 1024
-        #     st.write(f"Current memory usage: {current_memory:.2f} MB")
-        #     st.write(f"Number of winning nodes: {len(tree.winning_nodes)}")
-        #     # Memory warning
-        #     if current_memory > 1000 and not st.session_state.memory_warning_shown:
-        #         st.warning("Memory usage is high. Consider reducing the number of routes or clearing cache.")
-        #         st.session_state.memory_warning_shown = True
-        #     # Store the previous value in session state
-        #     if 'prev_num_clusters' not in st.session_state:
-        #         st.session_state.prev_num_clusters = 2
-        #     num_clusters = st.slider(
-        #         'Number of clusters to display',
-        #         min_value=2,
-        #         max_value=min(10, len(tree.winning_nodes)),
-        #         value=st.session_state.prev_num_clusters
-        #     )
-        #     # Update the stored value only if it changed
-        #     if num_clusters != st.session_state.prev_num_clusters:
-        #         st.session_state.prev_num_clusters = num_clusters
-        # submit_clustering = st.button('Start clustering')
-        # if submit_clustering:
-        #     try:
-        #         with st.spinner("Processing clusters..."):
-        #             # Clear memory before starting
-        #             gc.collect()
-        #             st.write("Starting clustering process...")
-        #             memory_before = psutil.Process().memory_info().rss / 1024 / 1024
-        #             st.write(f"Memory before clustering: {memory_before:.2f} MB")
-        #             super_cgrs_dict = reassign_nums(tree)
-        #             del tree  # Free up memory from the tree object since we don't need it anymore
-        #             gc.collect()
-        #             reduced_super_cgrs_dict = process_all_rs_cgrs(super_cgrs_dict)
-        #             del super_cgrs_dict  # Free up memory
-        #             gc.collect()
-        #             memory_after = psutil.Process().memory_info().rss / 1024 / 1024
-        #             st.write(f"Memory after CGR processing: {memory_after:.2f} MB")
-        #             mfp = MorganFingerprint()
-        #             results = cluster_molecules(reduced_super_cgrs_dict, mfp, max_clusters=num_clusters)
-        #             del reduced_super_cgrs_dict  # Free up memory
-        #             gc.collect()
-        #             st.write("Clustering completed")
-        #     except Exception as e:
-        #         st.error(f"Clustering failed with error: {str(e)}")
-        #         st.write(f"Memory at error: {psutil.Process().memory_info().rss / 1024 / 1024:.2f} MB")
-        #         raise e
-            # Access results
-            # clusters = results['clusters_dict']
-            # for cluster_num, node_id_list in clusters.items():
-            #     st.markdown(f"Cluster's number: ``{cluster_num}``")
-            #     node_id = node_id_list[0]
-            #     num_steps = len(tree.synthesis_route(node_id))
-            #     route_score = round(tree.route_score(node_id), 3)
-            #     st.image(get_route_svg(tree, node_id), caption=f"Route {node_id}; {num_steps} steps; Route score: {route_score}")
-        @st.cache_data(hash_funcs={Tree: lambda _: None})
-        def prepare_clustering_data(tree):
-            try:
-                # Log the start and basic info from the Tree
-                print("Starting clustering data preparation.")
-                total_nodes = len(tree.winning_nodes)
-                print(f"Total winning nodes: {total_nodes}")
-                print(f"Tree id: {id(tree)}")
-                chunk_size = 10
-                super_cgrs_dict = {}
-                # Process winning nodes in chunks
-                for i in range(0, total_nodes, chunk_size):
-                    current_chunk = list(tree.winning_nodes)[i:i+chunk_size]
-                    print(f"Processing chunk {i // chunk_size + 1}: Nodes {current_chunk}")
-                    temp_dict = {}
-                    for node in current_chunk:
-                        try:
-                            # Log before processing each node
-                            print(f"Processing node {node}")
-                            route = tree.synthesis_route(node)
-                            temp_dict[node] = route
-                            print(f"Node {node} processed successfully (route length: {len(route)}).")
-                        except Exception as e:
-                            print(f"Error processing node {node}: {e}")
-                    # Log before calling reassign_nums_chunk
-                    print(f"Calling reassign_nums_chunk for nodes: {list(temp_dict.keys())}")
-                    chunk_super_cgrs = reassign_nums_chunk(temp_dict)
-                    super_cgrs_dict.update(chunk_super_cgrs)
-                    print(f"Chunk {i // chunk_size + 1} processed. Keys: {list(chunk_super_cgrs.keys())}")
-                    del temp_dict
-                    gc.collect()
-                # Process reduced CGRs in chunks
-                reduced_super_cgrs_dict = {}
-                for i in range(0, len(super_cgrs_dict), chunk_size):
-                    keys = list(super_cgrs_dict.keys())[i:i+chunk_size]
-                    chunk_dict = {k: super_cgrs_dict[k] for k in keys}
-                    print(f"Reducing chunk for keys: {keys}")
-                    reduced_chunk = process_all_rs_cgrs(chunk_dict)
-                    reduced_super_cgrs_dict.update(reduced_chunk)
-                    print(f"Reduced chunk processed for keys: {list(reduced_chunk.keys())}")
-                    del chunk_dict
-                    gc.collect()
-                print("Clustering data preparation complete.")
-                return reduced_super_cgrs_dict
-            except Exception as e:
-                print(f"Error in prepare_clustering_data: {str(e)}")
-                st.error(f"Error in prepare_clustering_data: {str(e)}")
-                return None
-        def memory_status():
-            """Get current memory status"""
-            process = psutil.Process()
-            memory = process.memory_info().rss / 1024 / 1024
-            return f"Memory usage: {memory:.2f} MB"
-        # Initialize session state for tree and clustering data
-        if 'tree_data' not in st.session_state:
-            st.session_state.tree_data = tree
-        if 'clustering_state' not in st.session_state:
-            st.session_state.clustering_state = {
-                'prepared': False,
-                'data': None,
-                'num_clusters': 2
-            }
-        cluster_box, z = st.columns(2, gap="medium")
-        with cluster_box:
-            st.write(memory_status())
-            st.write(f"Number of winning nodes: {len(st.session_state.tree_data.winning_nodes)}")
-            # Step 1: Prepare Data Button
-            if not st.session_state.clustering_state['prepared']:
-                if st.button('Step 1: Prepare clustering data'):
-                    with st.spinner("Preparing data..."):
-                        try:
-                            st.session_state.clustering_state['data'] = prepare_clustering_data(st.session_state.tree_data)
-                            st.session_state.clustering_state['prepared'] = True
-                            st.success("Data prepared! Now you can proceed to Step 2.")
-                        except Exception as e:
-                            st.error(f"Preparation failed: {str(e)}")
-            # Step 2: Only show clustering controls if data is prepared
-            if st.session_state.clustering_state['prepared']:
-                st.markdown("### Step 2: Select number of clusters")
-                # Store slider value in session state
-                st.session_state.clustering_state['num_clusters'] = st.slider(
-                    'Number of clusters',
-                    min_value=2,
-                    max_value=min(10, len(st.session_state.tree_data.winning_nodes)),
-                    value=st.session_state.clustering_state['num_clusters']
-                )
-                # Step 3: Generate Clusters Button
-                if st.button('Step 3: Generate clusters'):
-                    with st.spinner("Clustering..."):
-                        try:
-                            results = perform_clustering(
-                                st.session_state.clustering_state['data'],
-                                st.session_state.clustering_state['num_clusters']
-                            )
-                            if results:
-                                st.success("Clustering complete!")
-                                for cluster_num, node_ids in results['clusters_dict'].items():
-                                    with st.expander(f"Cluster {cluster_num}"):
-                                        if node_ids:
-                                            node_id = node_ids[0]
-                                            num_steps = len(st.session_state.tree_data.synthesis_route(node_id))
-                                            route_score = round(st.session_state.tree_data.route_score(node_id), 3)
-                                            st.image(
-                                                get_route_svg(st.session_state.tree_data, node_id),
-                                                caption=f"Route {node_id}; {num_steps} steps; Score: {route_score}"
-                                            )
-                        except Exception as e:
-                            st.error(f"Clustering failed: {str(e)}")
-            # Clear memory button
-            if st.button('Clear memory and start over'):
-                st.cache_data.clear()
-                del st.session_state.clustering_state
-                del st.session_state.tree_data
-                gc.collect()
-                st.success("Memory cleared! Please refresh the page to start over.")
-                st.rerun()
         stat_col, download_col = st.columns(2, gap="medium")
         with stat_col:
             st.subheader("Statistics")
             df = pd.DataFrame(res, index=[0])
             st.write(df[["target_smiles", "num_routes", "num_nodes", "num_iter", "search_time"]])
         with download_col:
             st.subheader("Downloads")
             html_body = generate_results_html(tree, html_path=None, extended=True)
             dl_html = download_button(html_body, 'results_synplanner.html', 'Download results as a HTML file')
-            dl_csv = download_button(pd.DataFrame(res, index=[0]), 'results_synplanner.csv',
-                                     'Download statistics as a csv file')
             st.markdown(dl_html + dl_csv, unsafe_allow_html=True)
-    else:
         st.write("Found no reaction path.")
 st.divider()
 st.header('Restart from the beginning?')
 if st.button("Restart"):
     st.rerun()

 from cluster.super_cgr import *
 from cluster.rs_cgr import *
 from cluster.clustering import *
+from cluster.visualize import *
+from cluster.utils import *
+from cluster.subcluster import *
 from StructureFingerprint import MorganFingerprint
 import psutil
 smiles_parser = SMILESRead.create_parser(ignore=True)
 def download_button(object_to_download, download_filename, button_text, pickle_it=False):
     """
     Issued from
 st.set_page_config(page_title="SynPlanner GUI", page_icon="🧪", layout="wide")
+# Initialize session state variables if they don't exist.
+if "planning_done" not in st.session_state:
+    st.session_state.planning_done = False
+if "tree" not in st.session_state:
+    st.session_state.tree = None
+if "res" not in st.session_state:
+    st.session_state.res = None
+if "num_clusters" not in st.session_state:
+    st.session_state.num_clusters = 10
+if 'clustering_started' not in st.session_state:
+    st.session_state.clustering_started = False
+if 'clusters_downloaded' not in st.session_state:
+    st.session_state.clusters_downloaded = False
+# st.write("Initial session state:", dict(st.session_state))
 intro_text = '''
 This is a demo of the graphical user interface of
 [SynPlanner](https://github.com/Laboratoire-de-Chemoinformatique/SynPlanner/).
 submit_planning = st.button('Start retrosynthetic planning')
+# if submit_planning:
+if submit_planning and not st.session_state.planning_done:
     with st.status("Downloading data"):
         st.write("Downloading building blocks")
         building_blocks = load_building_blocks(building_blocks_path, standardize=False)
     res = extract_tree_stats(tree, target_molecule)
+    # Store planning outputs in session_state so they persist
+    st.session_state['tree'] = tree
+    st.session_state['res'] = res
+    st.session_state.planning_done = True
+    # Display results if planning has been completed
+if st.session_state.planning_done and st.session_state.res is not None and st.session_state.clustering_started:
+    res = st.session_state.res
+    tree = st.session_state.tree
     st.header('Results')
     if res["solved"]:
+        # st.balloons()
         st.subheader("Examples of found retrosynthetic routes")
         image_counter = 0
         visualised_node_ids = set()
         for n, node_id in enumerate(sorted(set(tree.winning_nodes))):
                 image_counter += 1
                 num_steps = len(tree.synthesis_route(node_id))
                 route_score = round(tree.route_score(node_id), 3)
+                st.image(get_route_svg(tree, node_id),
+                        caption=f"Route {node_id}; {num_steps} steps; Route score: {route_score}")
         stat_col, download_col = st.columns(2, gap="medium")
         with stat_col:
             st.subheader("Statistics")
             df = pd.DataFrame(res, index=[0])
             st.write(df[["target_smiles", "num_routes", "num_nodes", "num_iter", "search_time"]])
         with download_col:
             st.subheader("Downloads")
             html_body = generate_results_html(tree, html_path=None, extended=True)
             dl_html = download_button(html_body, 'results_synplanner.html', 'Download results as a HTML file')
+            dl_csv = download_button(pd.DataFrame(res, index=[0]),
+                                    'results_synplanner.csv', 'Download statistics as a csv file')
             st.markdown(dl_html + dl_csv, unsafe_allow_html=True)
+        st.header("Clustering the retrosynthetic routes")
+        # Initialize slider state if not already set
+        if 'num_clusters' not in st.session_state:
+            st.session_state['num_clusters'] = 10
+        cluster_box, _ = st.columns(2, gap="medium")
+        with cluster_box:
+            num_clusters = st.slider(
+                'Number of clusters to display',
+                min_value=2,
+                max_value=10,
+                value=st.session_state['num_clusters'],
+                key='cluster_slider'
+            )
+            # Save the current slider value to session_state
+            st.session_state['num_clusters'] = num_clusters
+        if st.button('Start clustering', key='submit_clustering'):
+            st.session_state.clustering_started = True
+            # st.write("Clustering started; session state now:", dict(st.session_state))
+        #     st.write("Clustering started!")
+            st.subheader("Examples of clusters")
+            super_cgrs_dict = reassign_nums(tree)
+            reduced_super_cgrs_dict = process_all_rs_cgrs(super_cgrs_dict)
+            mfp = MorganFingerprint()
+            results = cluster_molecules(reduced_super_cgrs_dict, mfp, max_clusters=num_clusters)
+            clusters = results['clusters_dict']
+            for cluster_num, node_id_list in clusters.items():
+                st.markdown(f"Cluster's number: {cluster_num}; Size {len(node_id_list)}")
+                node_id = node_id_list[0]
+                num_steps = len(tree.synthesis_route(node_id))
+                route_score = round(tree.route_score(node_id), 3)
+                st.image(get_route_svg(tree, node_id), caption=f"Route {node_id}; {num_steps} steps; Route score: {route_score}")
+            cluster_sizes = [len(cluster) for cluster in clusters.values()]
+            cluster_stat_col, cluster_download_col = st.columns(2, gap="medium")
+            with cluster_stat_col:
+                st.subheader("Statistics")
+                # st.write(cluster_sizes)
+                cluster_df = pd.DataFrame({'Cluster': range(len(cluster_sizes)), 'Routes': cluster_sizes})
+                # cluster_df = pd.DataFrame(cluster_sizes, index=[0])
+                st.write(cluster_df)
+            def on_download_click():
+                st.session_state.clusters_downloaded = True
+                st.write("Download clusters button pressed via on_click. Updated session state:", dict(st.session_state))
+                save_route_images(tree, reactions_dict, cluster_dict=clusters_converted)
+                # Here you can call save_route_images(...) if desired.
+            with cluster_download_col:
+                st.subheader("Downloads: Don't work. Resets evey time")
+                reactions_dict = extract_reactions(tree)
+                clusters_converted = {int(key): value for key, value in clusters.items()} if clusters else clusters
+                # Use on_click to capture the click event reliably.
+                st.button('Download clusters', key='download_clusters_button', on_click=on_download_click)
+                # Log whether the flag has been set after the button definition.
+                st.write("Clusters downloaded flag (from session_state):", st.session_state.get("clusters_downloaded"))
+            #             # save_route_images(tree, reactions_dict, cluster_dict=clusters_converted)
+            # with cluster_download_col:
+            #     st.subheader("Downloads")
+            #     reactions_dict = extract_reactions(tree)
+            #     clusters_converted = {int(key): value for key, value in clusters.items()} if clusters else clusters
+            #     if st.session_state.clustering_started:
+            #         st.write("Rendering download clusters button. Session state:", dict(st.session_state))
+            #         # Use a more unique key for the download button.
+            #         download_clusters = st.button('Download clusters', key='download_clusters_button')
+            #         st.write("download_clusters value:", download_clusters)
+            #         if download_clusters:
+            #             st.session_state.clusters_downloaded = True
+            #             st.write("Download clusters button pressed. Updated session state:", dict(st.session_state))
+            col1, _ = st.columns([.2, .8])
+            with col1:
+                fig = pie_chart(cluster_sizes)
+                st.pyplot(fig)
+            st.header("Sub Clustering the retrosynthetic routes - Resets every time when i interact with input widget")
+            sub = sublcuster_all(clusters, reactions_dict)
+            col2, _ = st.columns([.2, .8])
+            with col2:
+                user_input_cluster_num = st.number_input("Enter a number:", min_value=1,
+                                                          max_value=max(clusters.keys()), value=1, step=1)
+                st.write(f"You entered the # cluster: {user_input_cluster_num}")
+                sub_step_cluster = sub[user_input_cluster_num]
+                allowed_numbers = sub_step_cluster.keys()
+                selected_number = st.selectbox("Choose a number:", allowed_numbers)
+                st.write(f"You entered number of steps: {selected_number}")
+                subclusters = sub_step_cluster[selected_number]
+            st.subheader(f"Found number of subclusters: {len(subclusters)}")
+            for subcluster_num, subcluster_set in enumerate(subclusters):
+                st.write(f"Subcluster #: {subcluster_num + 1}")
+                for route_id in subcluster_set:
+                    st.write(f"Node_ID: {route_id}")
+                    st.image(get_route_svg(tree, route_id), caption=f"Route {node_id}; {num_steps} steps; Route score: {route_score}")
+else:
         st.write("Found no reaction path.")
 st.divider()
 st.header('Restart from the beginning?')
 if st.button("Restart"):
+    st.session_state.planning_done = False
     st.rerun()