Gilmullin Almaz committed on
Commit
dfa290e
·
1 Parent(s): 81a56f7

rm extra files

Browse files
Files changed (4) hide show
  1. __init__.py +0 -0
  2. clustering.py +0 -171
  3. rs_cgr.py +0 -35
  4. super_cgr.py +0 -204
__init__.py DELETED
File without changes
clustering.py DELETED
@@ -1,171 +0,0 @@
1
- import numpy as np
2
- import pandas as pd
3
- from scipy.spatial.distance import squareform
4
- from scipy.cluster.hierarchy import fcluster
5
- from sklearn.metrics import silhouette_score, calinski_harabasz_score
6
- import fastcluster
7
-
8
def tanimoto_similarity_continuous(matrix_1, matrix_2):
    """Compute pairwise continuous Tanimoto similarity between two feature matrices.

    The Tanimoto coefficient is a measure of the similarity between two sets,
    defined as the size of the intersection divided by the size of the union.
    It is also known as the Jaccard index.

    Adapted from https://github.com/cimm-kzn/CIMtools/blob/master/CIMtools/metrics/pairwise.py

    :param matrix_1: 2D array of features, shape (n, d).
    :param matrix_2: 2D array of features, shape (m, d).
    :return: (n, m) array of Tanimoto coefficients between rows of the inputs.
    """
    x_dot = np.dot(matrix_1, matrix_2.T)

    x2 = (matrix_1 ** 2).sum(axis=1)
    y2 = (matrix_2 ** 2).sum(axis=1)

    # Broadcast the row norms instead of tiling them into full (n, m)
    # matrices (the original built two O(n*m) temporaries via
    # np.array([x2] * len_y2).T + np.array([y2] * len_x2)).
    with np.errstate(divide='ignore', invalid='ignore'):
        result = x_dot / (x2[:, None] + y2[None, :] - x_dot)

    # 0/0 happens only when both rows are all-zero vectors; define that as 0.
    result[np.isnan(result)] = 0

    return result
33
-
34
def calculate_fingerprints(cgrs, fingerprint_method):
    """Compute a fingerprint for every CGR in *cgrs*.

    Args:
        cgrs (dict): Mapping of labels to CGR objects.
        fingerprint_method: Initialized fingerprint calculator exposing a
            ``transform`` method (e.g. a MorganFingerprint instance).

    Returns:
        np.ndarray: Stacked fingerprints, one row per CGR, in dict order.
    """
    # Each CGR is transformed individually; transform returns a batch, so
    # take its single element.
    return np.array(
        [fingerprint_method.transform([cgr])[0] for cgr in cgrs.values()]
    )
49
-
50
def create_similarity_matrix(fingerprints, labels):
    """Build a labelled pairwise Tanimoto similarity matrix.

    Args:
        fingerprints (np.ndarray): Array of fingerprints, one row per item.
        labels (list): Row/column labels, aligned with the fingerprint rows.

    Returns:
        pd.DataFrame: Square similarity matrix with *labels* on both axes.
    """
    scores = tanimoto_similarity_continuous(fingerprints, fingerprints)
    return pd.DataFrame(scores, index=labels, columns=labels)
62
-
63
def calculate_linkage(similarity_df, method='average'):
    """Turn a similarity matrix into a hierarchical-clustering linkage matrix.

    Args:
        similarity_df (pd.DataFrame): Square similarity matrix (values in [0, 1]).
        method (str): Linkage method passed through to fastcluster.

    Returns:
        np.ndarray: Linkage matrix in scipy format.
    """
    # Convert similarity to distance, then condense to the 1-D vector form
    # that linkage() expects.
    condensed = squareform(1 - similarity_df)
    return fastcluster.linkage(condensed, method=method)
76
-
77
def optimal_cluster_num(Z, distance_matrix, max_clusters=10):
    """Find the optimal number of clusters using the silhouette score.

    Args:
        Z (np.ndarray): Linkage matrix.
        distance_matrix (np.ndarray): Precomputed square distance matrix.
        max_clusters (int): Maximum number of clusters to consider (inclusive).

    Returns:
        int: Cluster count in [2, max_clusters] with the highest silhouette score.
    """
    # BUG FIX: range(2, max_clusters) silently skipped max_clusters itself,
    # contradicting the documented contract; the range is now inclusive.
    cluster_range = range(2, max_clusters + 1)
    silhouette_scores = []

    for n_clusters in cluster_range:
        cluster_labels = fcluster(Z, n_clusters, criterion='maxclust')
        score = silhouette_score(distance_matrix, cluster_labels, metric='precomputed')
        silhouette_scores.append(score)

    return cluster_range[np.argmax(silhouette_scores)]
97
-
98
def perform_clustering(Z, threshold=0.0, max_clusters=10, distance_matrix=None):
    """Perform hierarchical clustering with automatic cluster number optimization.

    Args:
        Z (np.ndarray): Linkage matrix.
        threshold (float): Distance threshold for the initial flat clustering.
        max_clusters (int): Maximum acceptable number of clusters.
        distance_matrix (np.ndarray | None): Precomputed square distance matrix
            used to pick the optimal cluster count via the silhouette score.
            Optional for backward compatibility; when omitted the cluster
            count is simply capped at *max_clusters*.

    Returns:
        np.ndarray: Cluster labels.
    """
    cluster_labels = fcluster(Z, t=threshold, criterion='distance')
    unique_clusters = np.unique(cluster_labels)

    if max(unique_clusters) > max_clusters:
        # BUG FIX: the original referenced an undefined global `similarity_df`
        # here, raising NameError whenever this branch was taken.
        if distance_matrix is not None:
            optimal_n_clusters = optimal_cluster_num(Z, distance_matrix, max_clusters)
        else:
            optimal_n_clusters = max_clusters
        cluster_labels = fcluster(Z, optimal_n_clusters, criterion='maxclust')

    return cluster_labels
117
-
118
def create_clusters_dict(cluster_labels, labels):
    """Group item labels by their assigned cluster.

    Args:
        cluster_labels (np.ndarray): Cluster assignment per item.
        labels (np.ndarray): Item labels, aligned with *cluster_labels*.

    Returns:
        dict: Cluster number -> list of member labels (clusters in sorted order).
    """
    return {
        cluster: list(labels[np.where(cluster_labels == cluster)[0]])
        for cluster in np.unique(cluster_labels)
    }
136
-
137
def cluster_molecules(cgrs, fingerprint_method, threshold=0.0, max_clusters=10, linkage_method='average'):
    """Main entry point for molecular clustering.

    Args:
        cgrs (dict): Mapping of labels to CGR objects.
        fingerprint_method: Initialized fingerprint calculator.
        threshold (float): Distance threshold for clustering.
        max_clusters (int): Maximum number of clusters.
        linkage_method (str): Method for hierarchical clustering.

    Returns:
        dict: Clustering results with keys 'clusters_dict', 'cluster_labels',
        'similarity_matrix' and 'linkage_matrix'.
    """
    names = list(cgrs.keys())

    # Fingerprints -> pairwise Tanimoto similarity
    fps = calculate_fingerprints(cgrs, fingerprint_method)
    similarity_df = create_similarity_matrix(fps, names)

    # Hierarchical clustering on the derived distances
    Z = calculate_linkage(similarity_df, method=linkage_method)
    cluster_labels = perform_clustering(Z, threshold, max_clusters)

    # Group member labels per cluster
    clusters_dict = create_clusters_dict(cluster_labels, np.array(names))

    return {
        'clusters_dict': clusters_dict,
        'cluster_labels': cluster_labels,
        'similarity_matrix': similarity_df,
        'linkage_matrix': Z,
    }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
rs_cgr.py DELETED
@@ -1,35 +0,0 @@
1
def cleaning_cgr(cgr):
    """Clean a CGR: drop leaving groups, pin weakened bonds, neutralize charges.

    Takes the first connected component of *cgr* as the target, deletes bonds
    that exist in the reactant but vanish in the product (leaving groups),
    replaces bonds whose order decreases with a static bond of the product
    order, and finally zeroes any charged atoms when reactant and product
    charges differ.

    :param cgr: CGR container (CGRtools-style object).
    :return: Cleaned CGR of the main (first) connected component.
    """
    cgr_prods = [cgr.substructure(c) for c in cgr.connected_components]
    target_cgr = cgr_prods[0]

    # BUG FIX: the original called `ReactionContainer.from_cgr(cgr)` here and
    # discarded the result; ReactionContainer is never imported in this
    # module, so the line raised NameError at runtime. Removed as dead code.

    # Snapshot bond items before mutating, since delete_bond/add_bond modify
    # the underlying dictionaries during iteration.
    bond_items = list(target_cgr._bonds.items())
    for atom1, bond_set in bond_items:
        bond_set_items = list(bond_set.items())
        for atom2, bond in bond_set_items:
            # Leaving group: bond present in reactant (order) but absent in
            # product (p_order is None).
            if bond.p_order is None and bond.order is not None:
                target_cgr.delete_bond(atom1, atom2)
            # Bond weakened but kept: replace with a static bond at the
            # product order.
            elif type(bond.p_order) is int and type(bond.order) is int and bond.p_order < bond.order:
                # NOTE(review): DynamicBond is also not imported in this
                # module — confirm the CGRtools import when restoring it.
                p_order = int(bond.p_order)
                target_cgr.delete_bond(atom1, atom2)
                target_cgr.add_bond(atom1, atom2, DynamicBond(p_order, p_order))

    clean_cgr = [target_cgr.substructure(c) for c in target_cgr.connected_components][0]

    # Charge neutralizer: when product charges differ from reactant charges,
    # reset every charged atom to neutral.
    if clean_cgr._p_charges != clean_cgr._charges:
        for num, charge in clean_cgr._charges.items():
            if charge != 0:
                clean_cgr._atoms[num].charge = 0

    return clean_cgr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
super_cgr.py DELETED
@@ -1,204 +0,0 @@
1
def find_next_atom_num(accum_cgr, reactions):
    """Return the next free atom number across all composed reaction CGRs.

    Args:
        accum_cgr: Accumulated CGR (kept for interface compatibility; unused).
        reactions: Iterable of reactions whose composed CGRs are scanned.

    Returns:
        int: One greater than the largest atom number seen (1 if none).
    """
    # max() over a CGR's _atoms dict yields its largest atom key.
    highest = max((max(rxn.compose()._atoms) for rxn in reactions), default=0)
    return highest + 1
8
-
9
def get_clean_mapping(curr_prod, prod, reverse=False):
    """Build a conflict-free atom mapping between two molecules.

    Args:
        curr_prod: Molecule whose ``get_mapping`` supplies candidate mappings.
        prod: Molecule being mapped onto.
        reverse (bool): If True, invert the mapping direction.

    Returns:
        dict: Source -> target atom numbers. Identity pairs, cyclic mappings,
        and targets whose number already exists in the destination molecule
        are skipped. Empty when no candidate mapping exists.
    """
    candidates = list(curr_prod.get_mapping(prod))
    if not candidates:
        return {}

    # Existing atom numbers on both sides, used to detect number collisions.
    curr_atoms = set(curr_prod._atoms.keys())
    prod_atoms = set(prod._atoms.keys())

    first = candidates[0]
    clean = {}
    for key, value in first.items():
        if key == value:
            continue
        # Skip cyclic mappings that could cause conflicts.
        if value in first and first[value] != key:
            continue

        source, target = (value, key) if reverse else (key, value)

        # Skip targets already numbered in the destination molecule.
        destination = curr_atoms if reverse else prod_atoms
        if target in destination:
            continue

        clean[source] = target

    return clean
42
-
43
def validate_molecule_components(curr_mol, node_id):
    """Warn when *curr_mol* splits into more than one connected component."""
    pieces = [curr_mol.substructure(c) for c in curr_mol.connected_components]
    if len(pieces) > 1:
        print(f'Error tree {node_id}: We have more than one molecule in one node')
48
-
49
def get_leaving_groups(products):
    """Extract leaving-group atom numbers from a product list.

    Args:
        products: Product molecules; ``products[0]`` is the main product and
            is skipped, every other product counts as a leaving group.

    Returns:
        list: Atom numbers belonging to leaving groups (empty if there are
        fewer than two products).
    """
    lg_atom_nums = []
    # Slice instead of enumerate-and-skip-index-0: clearer and equivalent.
    for prod in products[1:]:
        lg_atom_nums.extend(prod._atoms.keys())
    return lg_atom_nums
56
-
57
def process_first_reaction(first_react, tree, node_id, min_mol_size):
    """Process the first (deepest) reaction and seed the building-block atom set.

    Args:
        first_react: Reaction whose reactants are scanned.
        tree: Synthesis tree providing the ``building_blocks`` collection.
        node_id: Node identifier, used only for diagnostics.
        min_mol_size (int): Molecules of this size or smaller count as
            building blocks.

    Returns:
        set: Atom numbers of all building-block reactants.
    """
    bb_set = set()

    for curr_mol in first_react.reactants:
        react_key_set = set(curr_mol._atoms)

        if len(curr_mol) <= min_mol_size or str(curr_mol) in tree.building_blocks:
            # BUG FIX: the original did `bb_set = react_key_set`, overwriting
            # the set so only the LAST building-block reactant survived.
            # Accumulate instead, matching update_reaction_dict's union.
            bb_set |= react_key_set

        validate_molecule_components(curr_mol, node_id)

    return bb_set
71
-
72
def update_reaction_dict(reaction, node_id, mapping, react_dict, tree, min_mol_size, bb_set, prev_remap=None):
    """Record per-reactant atom mappings and grow the building-block set.

    Args:
        reaction: Reaction whose reactants are processed.
        node_id: Node identifier, used for diagnostics.
        mapping (dict): Candidate atom renumbering for this step.
        react_dict (dict): Accumulated reactant-key -> mapping dictionary.
        tree: Synthesis tree providing ``building_blocks``.
        min_mol_size (int): Size threshold for building blocks.
        bb_set (set): Accumulated building-block atom numbers.
        prev_remap (dict | None): Remapping from a previous step; its entries
            override *mapping* for overlapping atoms.

    Returns:
        tuple: Updated ``(react_dict, bb_set)``.
    """
    for mol in reaction.reactants:
        atom_key = tuple(mol._atoms)
        atom_key_set = set(atom_key)

        validate_molecule_components(mol, node_id)

        if len(mol) <= min_mol_size or str(mol) in tree.building_blocks:
            bb_set = bb_set.union(atom_key_set)

        # Restrict the step mapping to atoms of this reactant; prev_remap
        # entries win where both mention the same atom.
        scoped = {k: v for k, v in mapping.items() if k in atom_key_set}
        if prev_remap:
            scoped.update({k: v for k, v in prev_remap.items() if k in atom_key_set})
        react_dict[atom_key] = scoped

    return react_dict, bb_set
91
-
92
def process_target_blocks(curr_products, curr_prod, lg_atom_nums, curr_lg_atom_nums, bb_set):
    """Collect atom numbers that must be renumbered for this step.

    Args:
        curr_products: Products from the current reaction's CGR decomposition.
        curr_prod: Main product of the current reaction.
        lg_atom_nums (list): Leaving-group atoms accumulated so far.
        curr_lg_atom_nums (list): Leaving-group atoms of the current step.
        bb_set (set): Building-block atom numbers.

    Returns:
        list: Atom numbers to remap. An atom that is both a leaving group and
        a building block is appended twice, preserving the original behavior.
    """
    target_block = []
    if len(curr_products) > 1:
        for prod in curr_products:
            # DEAD CODE REMOVED: the original computed
            # `dict_map = get_clean_mapping(curr_prod, prod)` here and never
            # used the result.
            if prod._atoms.keys() != curr_prod._atoms.keys():
                for key in list(prod._atoms.keys()):
                    if key in lg_atom_nums or key in curr_lg_atom_nums:
                        target_block.append(key)
                    if key in bb_set:
                        target_block.append(key)
    return target_block
105
-
106
def process_single_route(tree, node_id, min_mol_size=6):
    """Process a single synthesis route, composing it into one accumulated CGR.

    Walks the route's reactions from the deepest (first executed) step to the
    last, remapping atom numbers so leaving-group and building-block atoms do
    not collide, and composes every step's CGR into a single accumulated CGR.

    Args:
        tree: Synthesis tree providing synthesis_route() and building_blocks.
        node_id: Identifier of the route's terminal node.
        min_mol_size (int): Molecules of this size or smaller count as
            building blocks.

    Returns:
        dict: {'cgr': accumulated CGR} on success, or None if any step raised.
    """
    try:
        reactions = tree.synthesis_route(node_id)

        # The route is returned target-first; the last element is the
        # earliest (first executed) reaction.
        first_react = reactions[-1]

        accum_cgr = first_react.compose()
        bb_set = process_first_reaction(first_react, tree, node_id, min_mol_size)

        # reactant-atom-tuple -> remapping carried forward to later steps
        react_dict = {}

        # First atom number guaranteed unused by any reaction in the route.
        max_num = find_next_atom_num(accum_cgr, reactions)

        # Walk remaining steps from second-deepest to last.
        for step in range(len(reactions) - 2, -1, -1):
            # print("\nProcessing step:", step + 1)
            reaction = reactions[step]
            curr_cgr = reaction.compose()

            curr_prod = reaction.products[0]
            # decompose()[1] holds the product side; split() yields molecules.
            accum_products = accum_cgr.decompose()[1].split()
            lg_atom_nums = get_leaving_groups(accum_products)

            curr_products = curr_cgr.decompose()[1].split()

            tuple_atoms = tuple(curr_prod._atoms)
            prev_remap = {}

            # Apply the remapping recorded when this product appeared as a
            # reactant in an earlier (deeper) step.
            if tuple_atoms in react_dict.keys() and len(react_dict[tuple_atoms]) != 0:
                prev_remap = react_dict[tuple_atoms]
                curr_cgr = curr_cgr.remap(prev_remap, copy=True)

            # Leaving-group atoms of the current step (all but main product).
            curr_lg_atom_nums = []
            for i in range(1, len(curr_products)):
                prod = curr_products[i]
                curr_lg_atom_nums += list(prod._atoms.keys())

            target_block = process_target_blocks(curr_products, curr_prod, lg_atom_nums, curr_lg_atom_nums, bb_set)

            # Assign fresh, never-used numbers to every colliding atom.
            mapping = {}
            for atom_num in sorted(target_block):
                if atom_num in accum_cgr._atoms and atom_num not in mapping:
                    mapping[atom_num] = max_num
                    max_num += 1

            # Align current CGR numbering with each accumulated product.
            for i in range(len(accum_products)):
                accum_prod = accum_products[i]
                dict_map = get_clean_mapping(curr_prod, accum_prod, reverse=True)

                if dict_map:
                    curr_cgr.remap(dict_map)


            #maybe remap, then decompose and to BB
            react_dict, bb_set = update_reaction_dict(reaction, node_id, mapping, react_dict, tree, min_mol_size, bb_set, prev_remap)


            if mapping:
                curr_cgr.remap(mapping)

            # Merge this step into the accumulated route CGR.
            accum_cgr = curr_cgr.compose(accum_cgr)


        return {
            'cgr': accum_cgr,
        }

    except Exception as e:
        # Best-effort per-route processing: report and signal failure with None.
        print(f"Error processing node {node_id}: {e}")
        return None
176
-
177
def reassign_nums(tree, node_id=None, min_mol_size=6):
    """
    Process routes and reassign atom numbers.

    Args:
        tree: Synthesis tree
        node_id: Optional specific node ID to process. If None, processes all winning nodes
        min_mol_size: Minimum size for building blocks

    Returns:
        If node_id is None:
            dict: Node ID -> processed CGR, sorted by node ID
        If node_id is specified:
            dict: Information about the processed route (None on failure)
    """
    if node_id is not None:
        return process_single_route(tree, node_id, min_mol_size)

    # Removed unused accumulators from the original (reactions_dict, cgrs_list).
    complex_cgr_dict = {}
    for nid in set(tree.winning_nodes):
        result = process_single_route(tree, nid, min_mol_size)
        if result:
            complex_cgr_dict[nid] = result['cgr']

    return dict(sorted(complex_cgr_dict.items()))
204
-