Gilmullin Almaz committed on
Commit
dfa290e
·
1 Parent(s): 81a56f7

rm extra files

Browse files
Files changed (4) hide show
  1. __init__.py +0 -0
  2. clustering.py +0 -171
  3. rs_cgr.py +0 -35
  4. super_cgr.py +0 -204
__init__.py DELETED
File without changes
clustering.py DELETED
@@ -1,171 +0,0 @@
1
- import numpy as np
2
- import pandas as pd
3
- from scipy.spatial.distance import squareform
4
- from scipy.cluster.hierarchy import fcluster
5
- from sklearn.metrics import silhouette_score, calinski_harabasz_score
6
- import fastcluster
7
-
8
def tanimoto_similarity_continuous(matrix_1, matrix_2):
    """Compute pairwise continuous Tanimoto similarity between two feature matrices.

    The Tanimoto coefficient is a measure of the similarity between two sets,
    defined as the size of the intersection divided by the size of the union.
    It is also known as the Jaccard index.

    Adapted from https://github.com/cimm-kzn/CIMtools/blob/master/CIMtools/metrics/pairwise.py

    :param matrix_1: 2D array of features, shape (n, d).
    :param matrix_2: 2D array of features, shape (m, d).
    :return: (n, m) array of Tanimoto coefficients between rows of the inputs.
    """
    x_dot = np.dot(matrix_1, matrix_2.T)

    x2 = (matrix_1 ** 2).sum(axis=1)
    y2 = (matrix_2 ** 2).sum(axis=1)

    # Broadcast the row norms instead of tiling them into full (n, m)
    # matrices (the original built two O(n*m) temporaries via
    # np.array([x2] * len_y2).T + np.array([y2] * len_x2)).
    with np.errstate(divide='ignore', invalid='ignore'):
        result = x_dot / (x2[:, None] + y2[None, :] - x_dot)

    # 0/0 happens only when both rows are all-zero vectors; define that as 0.
    result[np.isnan(result)] = 0

    return result
33
-
34
def calculate_fingerprints(cgrs, fingerprint_method):
    """Compute a fingerprint for every CGR in *cgrs*.

    Args:
        cgrs (dict): Mapping of labels to CGR objects.
        fingerprint_method: Initialized fingerprint calculator exposing a
            ``transform`` method (e.g. a MorganFingerprint instance).

    Returns:
        np.ndarray: Stacked fingerprints, one row per CGR, in dict order.
    """
    # Each CGR is transformed individually; transform returns a batch, so
    # take its single element.
    return np.array(
        [fingerprint_method.transform([cgr])[0] for cgr in cgrs.values()]
    )
49
-
50
def create_similarity_matrix(fingerprints, labels):
    """Build a labelled pairwise Tanimoto similarity matrix.

    Args:
        fingerprints (np.ndarray): Array of fingerprints, one row per item.
        labels (list): Row/column labels, aligned with the fingerprint rows.

    Returns:
        pd.DataFrame: Square similarity matrix with *labels* on both axes.
    """
    scores = tanimoto_similarity_continuous(fingerprints, fingerprints)
    return pd.DataFrame(scores, index=labels, columns=labels)
62
-
63
def calculate_linkage(similarity_df, method='average'):
    """Turn a similarity matrix into a hierarchical-clustering linkage matrix.

    Args:
        similarity_df (pd.DataFrame): Square similarity matrix (values in [0, 1]).
        method (str): Linkage method passed through to fastcluster.

    Returns:
        np.ndarray: Linkage matrix in scipy format.
    """
    # Convert similarity to distance, then condense to the 1-D vector form
    # that linkage() expects.
    condensed = squareform(1 - similarity_df)
    return fastcluster.linkage(condensed, method=method)
76
-
77
def optimal_cluster_num(Z, distance_matrix, max_clusters=10):
    """Find the optimal number of clusters using the silhouette score.

    Args:
        Z (np.ndarray): Linkage matrix.
        distance_matrix (np.ndarray): Precomputed square distance matrix.
        max_clusters (int): Maximum number of clusters to consider (inclusive).

    Returns:
        int: Cluster count in [2, max_clusters] with the highest silhouette score.
    """
    # BUG FIX: range(2, max_clusters) silently skipped max_clusters itself,
    # contradicting the documented contract; the range is now inclusive.
    cluster_range = range(2, max_clusters + 1)
    silhouette_scores = []

    for n_clusters in cluster_range:
        cluster_labels = fcluster(Z, n_clusters, criterion='maxclust')
        score = silhouette_score(distance_matrix, cluster_labels, metric='precomputed')
        silhouette_scores.append(score)

    return cluster_range[np.argmax(silhouette_scores)]
97
-
98
def perform_clustering(Z, threshold=0.0, max_clusters=10, distance_matrix=None):
    """Perform hierarchical clustering with automatic cluster number optimization.

    Args:
        Z (np.ndarray): Linkage matrix.
        threshold (float): Distance threshold for the initial flat clustering.
        max_clusters (int): Maximum acceptable number of clusters.
        distance_matrix (np.ndarray | None): Precomputed square distance matrix
            used to pick the optimal cluster count via the silhouette score.
            Optional for backward compatibility; when omitted the cluster
            count is simply capped at *max_clusters*.

    Returns:
        np.ndarray: Cluster labels.
    """
    cluster_labels = fcluster(Z, t=threshold, criterion='distance')
    unique_clusters = np.unique(cluster_labels)

    if max(unique_clusters) > max_clusters:
        # BUG FIX: the original referenced an undefined global `similarity_df`
        # here, raising NameError whenever this branch was taken.
        if distance_matrix is not None:
            optimal_n_clusters = optimal_cluster_num(Z, distance_matrix, max_clusters)
        else:
            optimal_n_clusters = max_clusters
        cluster_labels = fcluster(Z, optimal_n_clusters, criterion='maxclust')

    return cluster_labels
117
-
118
def create_clusters_dict(cluster_labels, labels):
    """Group item labels by their assigned cluster.

    Args:
        cluster_labels (np.ndarray): Cluster assignment per item.
        labels (np.ndarray): Item labels, aligned with *cluster_labels*.

    Returns:
        dict: Cluster number -> list of member labels (clusters in sorted order).
    """
    return {
        cluster: list(labels[np.where(cluster_labels == cluster)[0]])
        for cluster in np.unique(cluster_labels)
    }
136
-
137
def cluster_molecules(cgrs, fingerprint_method, threshold=0.0, max_clusters=10, linkage_method='average'):
    """Main entry point for molecular clustering.

    Args:
        cgrs (dict): Mapping of labels to CGR objects.
        fingerprint_method: Initialized fingerprint calculator.
        threshold (float): Distance threshold for clustering.
        max_clusters (int): Maximum number of clusters.
        linkage_method (str): Method for hierarchical clustering.

    Returns:
        dict: Clustering results with keys 'clusters_dict', 'cluster_labels',
        'similarity_matrix' and 'linkage_matrix'.
    """
    names = list(cgrs.keys())

    # Fingerprints -> pairwise Tanimoto similarity
    fps = calculate_fingerprints(cgrs, fingerprint_method)
    similarity_df = create_similarity_matrix(fps, names)

    # Hierarchical clustering on the derived distances
    Z = calculate_linkage(similarity_df, method=linkage_method)
    cluster_labels = perform_clustering(Z, threshold, max_clusters)

    # Group member labels per cluster
    clusters_dict = create_clusters_dict(cluster_labels, np.array(names))

    return {
        'clusters_dict': clusters_dict,
        'cluster_labels': cluster_labels,
        'similarity_matrix': similarity_df,
        'linkage_matrix': Z,
    }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
rs_cgr.py DELETED
@@ -1,35 +0,0 @@
1
def cleaning_cgr(cgr):
    """Clean a CGR: drop leaving groups, pin weakened bonds, neutralize charges.

    Takes the first connected component of *cgr* as the target, deletes bonds
    that exist in the reactant but vanish in the product (leaving groups),
    replaces bonds whose order decreases with a static bond of the product
    order, and finally zeroes any charged atoms when reactant and product
    charges differ.

    :param cgr: CGR container (CGRtools-style object).
    :return: Cleaned CGR of the main (first) connected component.
    """
    cgr_prods = [cgr.substructure(c) for c in cgr.connected_components]
    target_cgr = cgr_prods[0]

    # BUG FIX: the original called `ReactionContainer.from_cgr(cgr)` here and
    # discarded the result; ReactionContainer is never imported in this
    # module, so the line raised NameError at runtime. Removed as dead code.

    # Snapshot bond items before mutating, since delete_bond/add_bond modify
    # the underlying dictionaries during iteration.
    bond_items = list(target_cgr._bonds.items())
    for atom1, bond_set in bond_items:
        bond_set_items = list(bond_set.items())
        for atom2, bond in bond_set_items:
            # Leaving group: bond present in reactant (order) but absent in
            # product (p_order is None).
            if bond.p_order is None and bond.order is not None:
                target_cgr.delete_bond(atom1, atom2)
            # Bond weakened but kept: replace with a static bond at the
            # product order.
            elif type(bond.p_order) is int and type(bond.order) is int and bond.p_order < bond.order:
                # NOTE(review): DynamicBond is also not imported in this
                # module — confirm the CGRtools import when restoring it.
                p_order = int(bond.p_order)
                target_cgr.delete_bond(atom1, atom2)
                target_cgr.add_bond(atom1, atom2, DynamicBond(p_order, p_order))

    clean_cgr = [target_cgr.substructure(c) for c in target_cgr.connected_components][0]

    # Charge neutralizer: when product charges differ from reactant charges,
    # reset every charged atom to neutral.
    if clean_cgr._p_charges != clean_cgr._charges:
        for num, charge in clean_cgr._charges.items():
            if charge != 0:
                clean_cgr._atoms[num].charge = 0

    return clean_cgr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
super_cgr.py DELETED
@@ -1,204 +0,0 @@
1
def find_next_atom_num(accum_cgr, reactions):
    """Return the next free atom number across all composed reaction CGRs.

    Args:
        accum_cgr: Accumulated CGR (kept for interface compatibility; unused).
        reactions: Iterable of reactions whose composed CGRs are scanned.

    Returns:
        int: One greater than the largest atom number seen (1 if none).
    """
    # max() over a CGR's _atoms dict yields its largest atom key.
    highest = max((max(rxn.compose()._atoms) for rxn in reactions), default=0)
    return highest + 1
8
-
9
def get_clean_mapping(curr_prod, prod, reverse=False):
    """Build a conflict-free atom mapping between two molecules.

    Args:
        curr_prod: Molecule whose ``get_mapping`` supplies candidate mappings.
        prod: Molecule being mapped onto.
        reverse (bool): If True, invert the mapping direction.

    Returns:
        dict: Source -> target atom numbers. Identity pairs, cyclic mappings,
        and targets whose number already exists in the destination molecule
        are skipped. Empty when no candidate mapping exists.
    """
    candidates = list(curr_prod.get_mapping(prod))
    if not candidates:
        return {}

    # Existing atom numbers on both sides, used to detect number collisions.
    curr_atoms = set(curr_prod._atoms.keys())
    prod_atoms = set(prod._atoms.keys())

    first = candidates[0]
    clean = {}
    for key, value in first.items():
        if key == value:
            continue
        # Skip cyclic mappings that could cause conflicts.
        if value in first and first[value] != key:
            continue

        source, target = (value, key) if reverse else (key, value)

        # Skip targets already numbered in the destination molecule.
        destination = curr_atoms if reverse else prod_atoms
        if target in destination:
            continue

        clean[source] = target

    return clean
42
-
43
def validate_molecule_components(curr_mol, node_id):
    """Warn when *curr_mol* splits into more than one connected component."""
    pieces = [curr_mol.substructure(c) for c in curr_mol.connected_components]
    if len(pieces) > 1:
        print(f'Error tree {node_id}: We have more than one molecule in one node')
48
-
49
def get_leaving_groups(products):
    """Extract leaving-group atom numbers from a product list.

    Args:
        products: Product molecules; ``products[0]`` is the main product and
            is skipped, every other product counts as a leaving group.

    Returns:
        list: Atom numbers belonging to leaving groups (empty if there are
        fewer than two products).
    """
    lg_atom_nums = []
    # Slice instead of enumerate-and-skip-index-0: clearer and equivalent.
    for prod in products[1:]:
        lg_atom_nums.extend(prod._atoms.keys())
    return lg_atom_nums
56
-
57
def process_first_reaction(first_react, tree, node_id, min_mol_size):
    """Process the first (deepest) reaction and seed the building-block atom set.

    Args:
        first_react: Reaction whose reactants are scanned.
        tree: Synthesis tree providing the ``building_blocks`` collection.
        node_id: Node identifier, used only for diagnostics.
        min_mol_size (int): Molecules of this size or smaller count as
            building blocks.

    Returns:
        set: Atom numbers of all building-block reactants.
    """
    bb_set = set()

    for curr_mol in first_react.reactants:
        react_key_set = set(curr_mol._atoms)

        if len(curr_mol) <= min_mol_size or str(curr_mol) in tree.building_blocks:
            # BUG FIX: the original did `bb_set = react_key_set`, overwriting
            # the set so only the LAST building-block reactant survived.
            # Accumulate instead, matching update_reaction_dict's union.
            bb_set |= react_key_set

        validate_molecule_components(curr_mol, node_id)

    return bb_set
71
-
72
def update_reaction_dict(reaction, node_id, mapping, react_dict, tree, min_mol_size, bb_set, prev_remap=None):
    """Record per-reactant atom mappings and grow the building-block set.

    Args:
        reaction: Reaction whose reactants are processed.
        node_id: Node identifier, used for diagnostics.
        mapping (dict): Candidate atom renumbering for this step.
        react_dict (dict): Accumulated reactant-key -> mapping dictionary.
        tree: Synthesis tree providing ``building_blocks``.
        min_mol_size (int): Size threshold for building blocks.
        bb_set (set): Accumulated building-block atom numbers.
        prev_remap (dict | None): Remapping from a previous step; its entries
            override *mapping* for overlapping atoms.

    Returns:
        tuple: Updated ``(react_dict, bb_set)``.
    """
    for mol in reaction.reactants:
        atom_key = tuple(mol._atoms)
        atom_key_set = set(atom_key)

        validate_molecule_components(mol, node_id)

        if len(mol) <= min_mol_size or str(mol) in tree.building_blocks:
            bb_set = bb_set.union(atom_key_set)

        # Restrict the step mapping to atoms of this reactant; prev_remap
        # entries win where both mention the same atom.
        scoped = {k: v for k, v in mapping.items() if k in atom_key_set}
        if prev_remap:
            scoped.update({k: v for k, v in prev_remap.items() if k in atom_key_set})
        react_dict[atom_key] = scoped

    return react_dict, bb_set
91
-
92
def process_target_blocks(curr_products, curr_prod, lg_atom_nums, curr_lg_atom_nums, bb_set):
    """Collect atom numbers that must be renumbered for this step.

    Args:
        curr_products: Products from the current reaction's CGR decomposition.
        curr_prod: Main product of the current reaction.
        lg_atom_nums (list): Leaving-group atoms accumulated so far.
        curr_lg_atom_nums (list): Leaving-group atoms of the current step.
        bb_set (set): Building-block atom numbers.

    Returns:
        list: Atom numbers to remap. An atom that is both a leaving group and
        a building block is appended twice, preserving the original behavior.
    """
    target_block = []
    if len(curr_products) > 1:
        for prod in curr_products:
            # DEAD CODE REMOVED: the original computed
            # `dict_map = get_clean_mapping(curr_prod, prod)` here and never
            # used the result.
            if prod._atoms.keys() != curr_prod._atoms.keys():
                for key in list(prod._atoms.keys()):
                    if key in lg_atom_nums or key in curr_lg_atom_nums:
                        target_block.append(key)
                    if key in bb_set:
                        target_block.append(key)
    return target_block
105
-
106
def process_single_route(tree, node_id, min_mol_size=6):
    """Process a single synthesis route, composing it into one accumulated CGR.

    Walks the route's reactions from the deepest (first executed) step to the
    last, remapping atom numbers so leaving-group and building-block atoms do
    not collide, and composes every step's CGR into a single accumulated CGR.

    Args:
        tree: Synthesis tree providing synthesis_route() and building_blocks.
        node_id: Identifier of the route's terminal node.
        min_mol_size (int): Molecules of this size or smaller count as
            building blocks.

    Returns:
        dict: {'cgr': accumulated CGR} on success, or None if any step raised.
    """
    try:
        reactions = tree.synthesis_route(node_id)

        # The route is returned target-first; the last element is the
        # earliest (first executed) reaction.
        first_react = reactions[-1]

        accum_cgr = first_react.compose()
        bb_set = process_first_reaction(first_react, tree, node_id, min_mol_size)

        # reactant-atom-tuple -> remapping carried forward to later steps
        react_dict = {}

        # First atom number guaranteed unused by any reaction in the route.
        max_num = find_next_atom_num(accum_cgr, reactions)

        # Walk remaining steps from second-deepest to last.
        for step in range(len(reactions) - 2, -1, -1):
            # print("\nProcessing step:", step + 1)
            reaction = reactions[step]
            curr_cgr = reaction.compose()

            curr_prod = reaction.products[0]
            # decompose()[1] holds the product side; split() yields molecules.
            accum_products = accum_cgr.decompose()[1].split()
            lg_atom_nums = get_leaving_groups(accum_products)

            curr_products = curr_cgr.decompose()[1].split()

            tuple_atoms = tuple(curr_prod._atoms)
            prev_remap = {}

            # Apply the remapping recorded when this product appeared as a
            # reactant in an earlier (deeper) step.
            if tuple_atoms in react_dict.keys() and len(react_dict[tuple_atoms]) != 0:
                prev_remap = react_dict[tuple_atoms]
                curr_cgr = curr_cgr.remap(prev_remap, copy=True)

            # Leaving-group atoms of the current step (all but main product).
            curr_lg_atom_nums = []
            for i in range(1, len(curr_products)):
                prod = curr_products[i]
                curr_lg_atom_nums += list(prod._atoms.keys())

            target_block = process_target_blocks(curr_products, curr_prod, lg_atom_nums, curr_lg_atom_nums, bb_set)

            # Assign fresh, never-used numbers to every colliding atom.
            mapping = {}
            for atom_num in sorted(target_block):
                if atom_num in accum_cgr._atoms and atom_num not in mapping:
                    mapping[atom_num] = max_num
                    max_num += 1

            # Align current CGR numbering with each accumulated product.
            for i in range(len(accum_products)):
                accum_prod = accum_products[i]
                dict_map = get_clean_mapping(curr_prod, accum_prod, reverse=True)

                if dict_map:
                    curr_cgr.remap(dict_map)


            #maybe remap, then decompose and to BB
            react_dict, bb_set = update_reaction_dict(reaction, node_id, mapping, react_dict, tree, min_mol_size, bb_set, prev_remap)


            if mapping:
                curr_cgr.remap(mapping)

            # Merge this step into the accumulated route CGR.
            accum_cgr = curr_cgr.compose(accum_cgr)


        return {
            'cgr': accum_cgr,
        }

    except Exception as e:
        # Best-effort per-route processing: report and signal failure with None.
        print(f"Error processing node {node_id}: {e}")
        return None
176
-
177
def reassign_nums(tree, node_id=None, min_mol_size=6):
    """
    Process routes and reassign atom numbers.

    Args:
        tree: Synthesis tree
        node_id: Optional specific node ID to process. If None, processes all winning nodes
        min_mol_size: Minimum size for building blocks

    Returns:
        If node_id is None:
            dict: Node ID -> processed CGR, sorted by node ID
        If node_id is specified:
            dict: Information about the processed route (None on failure)
    """
    if node_id is not None:
        return process_single_route(tree, node_id, min_mol_size)

    # Removed unused accumulators from the original (reactions_dict, cgrs_list).
    complex_cgr_dict = {}
    for nid in set(tree.winning_nodes):
        result = process_single_route(tree, nid, min_mol_size)
        if result:
            complex_cgr_dict[nid] = result['cgr']

    return dict(sorted(complex_cgr_dict.items()))
204
-