Spaces:
Sleeping
Sleeping
| import re | |
| from typing import Any, Dict, List, Optional, Union | |
| from collections import Counter | |
| from rdkit import Chem | |
| from rdkit.Chem import Draw | |
| from protac_splitter.chemoinformatics import ( | |
| dummy2query, | |
| remove_dummy_atoms, | |
| canonize, | |
| canonize_smiles, | |
| GetSubstructMatchesWithTimeout, | |
| ) | |
| from protac_splitter.display_utils import ( | |
| safe_display, | |
| display_mol, | |
| ) | |
| from protac_splitter.evaluation import check_reassembly | |
def get_substructs_from_mapped_linker(
    protac_smiles: str,
    linker_smiles: str,
    e3_attachment_id: int = 2,
    poi_attachment_id: int = 1,
    verbose: int = 0,
) -> Optional[Dict[str, str]]:
    """ Get the substructures of a PROTAC molecule from a mapped linker SMILES.

    This function will return the substructures given a linker with
    directionality, _i.e._, with the two attachment points mapped.

    Args:
        protac_smiles: The SMILES of the PROTAC molecule.
        linker_smiles: The SMILES of the linker molecule. Must have attachment points.
        e3_attachment_id: The map number that marks the E3 side, i.e., a side
            fragment containing `[*:<e3_attachment_id>]` is stored under 'e3'.
        poi_attachment_id: The map number that marks the POI side, i.e., a side
            fragment containing `[*:<poi_attachment_id>]` is stored under 'poi'.
        verbose: Verbosity level.

    Returns:
        A dictionary with the substructure names as keys ('e3', 'linker', and 'poi') and their SMILES as values. None if the matching fails.
    """
    protac_smiles = canonize_smiles(protac_smiles)
    linker_smiles = canonize_smiles(linker_smiles)
    # NOTE(review): assumes canonize_smiles may signal failure with None, like
    # `canonize` does elsewhere in this file - confirm.
    if protac_smiles is None or linker_smiles is None:
        return None

    protac_mol = Chem.MolFromSmiles(protac_smiles)
    linker_mol = Chem.MolFromSmiles(linker_smiles)
    # MolFromSmiles returns None for SMILES it cannot parse.
    if protac_mol is None or linker_mol is None:
        return None

    # Build the query once: it is used both for the check and for the split.
    linker_query = dummy2query(linker_mol)

    # Check if the linker is a substructure of the PROTAC
    if not protac_mol.HasSubstructMatch(linker_query, useChirality=True):
        return None

    # Split the big molecule into the two fragments
    frags = Chem.ReplaceCore(protac_mol, linker_query, labelByIndex=True, replaceDummies=False)
    if frags is None:
        return None
    try:
        frags = Chem.GetMolFrags(frags, asMols=True, sanitizeFrags=True)
    except Exception:
        # Sanitization of a fragment failed: treat it as a failed match.
        return None

    if verbose:
        safe_display(protac_mol)
        safe_display(linker_mol)

    # The linker has a map number at its attachment points: the following is a
    # dictionary that maps the atom index of the attachment points to their
    # respective map numbers, i.e., the attachment IDs.
    linker_idx2map = {
        atom.GetIdx(): atom.GetAtomMapNum()
        for atom in linker_mol.GetAtoms()
        if atom.GetAtomicNum() == 0
    }
    if verbose:
        print(f'linker indexes: {linker_idx2map}')
        print('-' * 80)

    substructs = {'linker': linker_smiles}
    e3_tag = f'[*:{e3_attachment_id}]'
    poi_tag = f'[*:{poi_attachment_id}]'
    # Matches the labelled dummies written by ReplaceCore, e.g., [9*].
    # NOTE(review): a dummy labelled with index 0 is presumably written
    # without a number and would not be matched by this pattern - confirm.
    dummy_label = re.compile(r'\[(\d+)\*\]')

    # After splitting the PROTAC with ReplaceCore, the fragments will have as
    # attachment points the same atom indexes as the linker. We can then use the
    # map numbers from the linker to identify the attachment points in the
    # PROTAC fragments and assign the correct map number to them, i.e., the
    # attachment ID.
    for i, side_mol in enumerate(frags):
        side_smiles = Chem.MolToSmiles(side_mol, canonical=True)
        label_match = dummy_label.search(side_smiles)
        attachment_point = int(label_match.group(1)) if label_match else None
        if verbose:
            print(f'Side {i + 1} SMILES: {side_smiles}')
            print(f'Attachment point: {attachment_point}')
            safe_display(side_mol)

        # Get the map from the linker
        linker_attachment_point = linker_idx2map.get(attachment_point, None)
        # Modify the SMILES to include the map number
        if linker_attachment_point is not None:
            side_smiles = dummy_label.sub(f'[*:{linker_attachment_point}]', side_smiles)

        if e3_tag in side_smiles:
            substructs['e3'] = canonize_smiles(side_smiles)
        elif poi_tag in side_smiles:
            substructs['poi'] = canonize_smiles(side_smiles)

        if verbose:
            print(f'Modified SMILES: {side_smiles}')
            safe_display(Chem.MolFromSmiles(side_smiles))

    # Canonize the substructures SMILES
    substructs = {k: canonize_smiles(v) for k, v in substructs.items()}
    # A missing SMILES cannot be joined nor reassembled.
    if any(smiles is None for smiles in substructs.values()):
        return None

    # Check that the reassembled PROTAC matches the original PROTAC
    if not check_reassembly(protac_smiles, '.'.join(substructs.values())):
        return None
    return substructs
def get_attachment_bonds(mol: Chem.Mol, match_atoms: List[int]) -> List[int]:
    """ Get the bonds to break to separate the substructure from the PROTAC or R-groups molecule.

    A bond is an attachment bond when it connects a matched heavy atom to a
    heavy atom outside the match. Bonds involving hydrogens are ignored.

    Args:
        mol: The molecule to break, i.e., the PROTAC.
        match_atoms: The atoms matched in the PROTAC molecule, from the GetSubstructMatch function.

    Returns:
        List[int]: The bond indices to break. It holds exactly one index when
            the substructure is attached through a single bond, and it is empty
            when no attachment bond exists or when more than one is found,
            e.g., if the substructure is connected to the PROTAC/R-groups in
            multiple places like in a ring.
    """
    # Set lookup instead of scanning the match tuple for every neighbor.
    matched = set(match_atoms)
    bonds_to_break = []
    for idx in match_atoms:
        atom = mol.GetAtomWithIdx(idx)
        # Skip non-heavy atoms
        if atom.GetAtomicNum() == 1:
            continue
        for bond in atom.GetBonds():
            neighbor_idx = bond.GetOtherAtomIdx(idx)
            # Skip the neighbor atom if non-heavy
            if mol.GetAtomWithIdx(neighbor_idx).GetAtomicNum() == 1:
                continue
            if neighbor_idx in matched:
                continue
            bonds_to_break.append(bond.GetIdx())
            # A second attachment bond means there is no single bond to cut.
            # Stop right away, so that later bonds cannot repopulate the list
            # and make a multiply-attached match look singly attached.
            if len(bonds_to_break) > 1:
                return []
    return bonds_to_break
def get_substructs_from_unmapped_e3_poi(
    protac_smiles: str,
    mol_protac: Chem.Mol,
    mol_poi: Chem.Mol,
    mol_e3: Chem.Mol,
    poi_attachment_id: int = 1,
    e3_attachment_id: int = 2,
    verbose: int = 0,
    stats: Optional[Counter] = None,
) -> Optional[Dict[str, str]]:
    """ Get the matches of the POI, E3, and linker in the PROTAC molecule.

    This function will return the substructures given a PROTAC and its unmapped
    POI and E3 ligand substructures, _i.e._, they do not need to have the
    attachment points in their SMILES strings.

    The POI ligand is cut off first. The E3 binder is then cut off from what
    is left, and the remainder of that second cut is the linker.

    Args:
        protac_smiles: The SMILES of the PROTAC, used to check the reassembly.
        mol_protac: The PROTAC molecule.
        mol_poi: The POI ligand molecule. Must NOT contain the attachment point.
        mol_e3: The E3 binder molecule. Must NOT contain the attachment point.
        poi_attachment_id: The map number given to the POI attachment point.
        e3_attachment_id: The map number given to the E3 attachment point.
        verbose: The verbosity level.
        stats: Optional counter of failure reasons, incremented in place.

    Returns:
        Dict: The matches of the POI, E3, and linker in the PROTAC molecule. None if no match is found.
    """
    if verbose:
        safe_display(mol_protac)

    poi_match = mol_protac.GetSubstructMatch(mol_poi, useChirality=True)
    # Get bonds to break to separate the POI ligand
    bonds_to_break_poi = get_attachment_bonds(mol_protac, poi_match)
    # Return unless there is exactly one bond to break
    if len(bonds_to_break_poi) != 1:
        if stats is not None:
            stats['multiple POI attachment bonds'] += 1
        if verbose:
            print('ERROR: Multiple POI attachment bonds')
        return None

    # Break the bonds to isolate the POI ligand
    frag_mol_poi = Chem.FragmentOnBonds(mol_protac, bonds_to_break_poi, addDummies=True, dummyLabels=[(poi_attachment_id, poi_attachment_id)])
    # Get the fragments resulting from bond breaking
    try:
        frags = Chem.GetMolFrags(frag_mol_poi, asMols=True, sanitizeFrags=True)
    except Exception as e:
        # Only report when asked to, like the other functions in this file.
        if verbose:
            print(e)
        return None

    # Identify the POI ligand fragment
    poi_fragment = next((frag for frag in frags if frag.HasSubstructMatch(mol_poi)), None)
    if poi_fragment is None:
        if stats is not None:
            stats['POI fragment not found'] += 1
        if verbose:
            print('ERROR: POI fragment not found')
        return None

    # The remaining fragments are the R-groups: exactly one is expected
    r_group_mol = [frag for frag in frags if frag is not poi_fragment]
    if len(r_group_mol) != 1:
        if stats is not None:
            stats['multiple POI fragments'] += 1
        if verbose:
            for frag in frags:
                safe_display(frag)
            print('ERROR: Multiple POI fragments')
        return None
    r_group_mol = r_group_mol[0]

    if verbose:
        print('POI:', Chem.MolToSmiles(poi_fragment, canonical=True))
        safe_display(poi_fragment)

    e3_match = r_group_mol.GetSubstructMatch(mol_e3, useChirality=True)
    # Get bonds to break to isolate the E3 binder
    bonds_to_break_e3 = get_attachment_bonds(r_group_mol, e3_match)
    # Return unless there is exactly one bond to break
    if len(bonds_to_break_e3) != 1:
        if stats is not None:
            stats['multiple E3 attachment bonds'] += 1
        if verbose:
            safe_display(r_group_mol)
            print('ERROR: Multiple E3 attachment bonds')
        return None

    # Break the bonds to isolate the E3 binder
    frag_mol_e3 = Chem.FragmentOnBonds(r_group_mol, bonds_to_break_e3, addDummies=True, dummyLabels=[(e3_attachment_id, e3_attachment_id)])
    # Get fragments after breaking bonds in R-groups
    try:
        frags = Chem.GetMolFrags(frag_mol_e3, asMols=True, sanitizeFrags=True)
    except Exception as e:
        if verbose:
            print(e)
        return None

    # Identify the E3 binder fragment
    e3_fragment = next((frag for frag in frags if frag.HasSubstructMatch(mol_e3)), None)
    if e3_fragment is None:
        if stats is not None:
            stats['E3 fragment not found'] += 1
        if verbose:
            print('ERROR: E3 fragment not found')
        return None

    if verbose:
        print('E3:', Chem.MolToSmiles(e3_fragment, canonical=True))
        safe_display(e3_fragment)

    # The remaining fragment is the linker: exactly one is expected
    linker_mol = [frag for frag in frags if frag is not e3_fragment]
    if len(linker_mol) != 1:
        if stats is not None:
            stats['multiple E3 fragments'] += 1
        if verbose:
            for frag in frags:
                safe_display(frag)
            print('ERROR: Multiple E3 fragments')
        return None
    linker_mol = linker_mol[0]

    # Turn the dummy labels, e.g., [1*], into map numbers, e.g., [*:1]
    poi_label, poi_tag = f'[{poi_attachment_id}*]', f'[*:{poi_attachment_id}]'
    e3_label, e3_tag = f'[{e3_attachment_id}*]', f'[*:{e3_attachment_id}]'
    poi_smiles = Chem.MolToSmiles(poi_fragment, canonical=True).replace(poi_label, poi_tag)
    e3_smiles = Chem.MolToSmiles(e3_fragment, canonical=True).replace(e3_label, e3_tag)
    linker_smiles = Chem.MolToSmiles(linker_mol, canonical=True).replace(poi_label, poi_tag).replace(e3_label, e3_tag)

    # Get the substructure names and canonize their SMILES
    substructs = {'poi': poi_smiles, 'e3': e3_smiles, 'linker': linker_smiles}
    substructs = {k: canonize_smiles(v) for k, v in substructs.items()}

    if verbose:
        print('Linker:', Chem.MolToSmiles(linker_mol, canonical=True))
        safe_display(linker_mol)

    # Check that the reassembled PROTAC matches the original PROTAC. A missing
    # SMILES cannot be joined, so it counts as a failed reassembly.
    # NOTE(review): assumes canonize_smiles may return None on failure - confirm.
    if all(smiles is not None for smiles in substructs.values()):
        if check_reassembly(protac_smiles, '.'.join(substructs.values())):
            return substructs

    if stats is not None:
        stats['reassembling failed'] += 1
    if verbose:
        print('ERROR: Reassembling failed')
    return None
def get_substructure_from_non_perfect_match(
    protac_mol: Chem.Mol,
    substruct_mol: Chem.Mol,
    attachment_id: int,
    verbose: int = 0,
) -> Optional[Chem.Mol]:
    """ Extract the correct substructure from a PROTAC molecule, given the
    SMILES of a wrong substructure resulting in many fragments and matches.

    Sometimes the substructure we have is not a _perfect_ substructure of the
    PROTAC, _i.e._, it will generate more than two fragments when trying to
    replace the PROTAC core with it. In this case, this function will perform
    the following steps:

    1. Get the largest fragment by trying to replace the PROTAC core with the
        substructure. This largest fragment will be the other substructure plus
        the linker.
    2. We can now remove the largest fragment from the PROTAC to get the
        "original" substructure without the smaller dangling fragments.

    Args:
        protac_mol (Chem.Mol): The PROTAC molecule.
        substruct_mol (Chem.Mol): The molecule of the wrong substructure, either the POI ligand or the E3 binder.
        attachment_id (int): The attachment ID.
        verbose (int): The verbosity level.

    Returns:
        Chem.Mol: The extracted substructure molecule. If failing, it will return None.
    """
    # Remove the substructure, even if there are "dangling" fragments, to obtain: PROTAC - substruct = (POI + Linker) + remainders
    linker_and_other_mol = Chem.DeleteSubstructs(protac_mol, substruct_mol, useChirality=True)

    # Get the largest fragment, i.e., the PROTAC - substruct = POI + Linker
    try:
        fragments = Chem.GetMolFrags(linker_and_other_mol, asMols=True)
    except Exception as e:
        if verbose:
            print(e)
        return None
    if len(fragments) == 1:
        if verbose:
            print("WARNING. There are no small fragments, there's only one fragment.")
    if not fragments:
        if verbose:
            print('ERROR. No fragments found.')
        return None
    largest_fragment = max(fragments, key=lambda x: x.GetNumAtoms())

    # Get the match of the largest fragment in the PROTAC molecule
    largest_match = protac_mol.GetSubstructMatch(largest_fragment, useChirality=True)

    # Get bonds to break to isolate the substructure, i.e., the opposite of the POI + Linker
    bonds_to_break = get_attachment_bonds(protac_mol, largest_match)
    if len(bonds_to_break) != 1:
        if verbose:
            print(f'ERROR. The bond to break is not a single one: {bonds_to_break}')
        return None

    # Break the bonds to isolate the substructure
    frag_mol_substruct = Chem.FragmentOnBonds(protac_mol, bonds_to_break, addDummies=True, dummyLabels=[(attachment_id, attachment_id)])

    # Get fragments after breaking bonds, i.e., the POI + Linker and the substructure without "remainders"
    try:
        frags = Chem.GetMolFrags(frag_mol_substruct, asMols=True, sanitizeFrags=True)
    except Exception as e:
        if verbose:
            print(e)
        return None

    # Get the smallest between the substructure and the POI+Linker fragments
    smallest_fragment = min(frags, key=lambda x: x.GetNumAtoms())
    # Turn the dummy label, e.g., [1*], into a map number, e.g., [*:1]
    substruct_smiles = Chem.MolToSmiles(smallest_fragment, canonical=True).replace(f'[{attachment_id}*]', f'[*:{attachment_id}]')

    # `canonize` can return None (see get_mapped_substr_from_protac), and
    # MolFromSmiles returns None for SMILES it cannot parse: both are failures.
    substruct_smiles = canonize(substruct_smiles)
    if substruct_smiles is None:
        if verbose:
            print('ERROR. The extracted substructure could not be canonized.')
        return None
    extracted_mol = Chem.MolFromSmiles(substruct_smiles)
    if extracted_mol is None:
        if verbose:
            print('ERROR. The extracted substructure could not be parsed.')
        return None

    # Check that the substructure matches in the PROTAC molecule
    if not protac_mol.HasSubstructMatch(dummy2query(extracted_mol), useChirality=True):
        if verbose:
            print('ERROR. Substructure does not match in PROTAC molecule:')
            print('PROTAC molecule:')
            safe_display(protac_mol)
            print('Substructure molecule:')
            safe_display(extracted_mol)
        return None
    return extracted_mol
def get_mapped_substr_from_protac(
    protac: Chem.Mol,
    substr: Chem.Mol,
    attachment_id: int = 1,
) -> Optional[Chem.Mol]:
    """ Build the mapped version of an unmapped substructure of a PROTAC.

    The substructure is first cut out of the PROTAC. What is left, stripped of
    its dummy atoms, is then cut out of the PROTAC in turn, which gives back
    the substructure with a labelled dummy atom. That label is finally
    rewritten as the requested map number.

    Args:
        protac: The PROTAC molecule.
        substr: The unmapped substructure.
        attachment_id: The attachment point ID to be assigned to the substructure.

    Returns:
        The mapped substructure molecule. None if the substructure does not
        match the PROTAC exactly once, or if any of the steps above fails.
    """
    # Only a unique match is accepted
    matches = protac.GetSubstructMatches(substr, useChirality=True)
    if len(matches) != 1:
        return None

    # PROTAC minus the substructure
    remainder = Chem.ReplaceCore(protac, substr, labelByIndex=False, replaceDummies=False)
    if remainder is None:
        return None

    # PROTAC minus the remainder, i.e., the substructure plus its dummy atom
    remainder_core = remove_dummy_atoms(remainder)
    labelled = Chem.ReplaceCore(protac, remainder_core, labelByIndex=False, replaceDummies=False)
    if labelled is None:
        return None

    # Rewrite any labelled dummy, e.g., "[1*]" or "[2*]", as the map number
    dummy_label = re.compile(r'\[(\d+)\*\]')
    labelled_smiles = Chem.MolToSmiles(labelled, canonical=True)
    canonical_smiles = canonize(dummy_label.sub(f'[*:{attachment_id}]', labelled_smiles))
    return None if canonical_smiles is None else Chem.MolFromSmiles(canonical_smiles)
def get_substructs_from_substr_and_linker(
    protac_smiles: str,
    protac: Chem.Mol,
    substr: Chem.Mol,
    linker: Chem.Mol,
    attachment_id: int = 1,
    poi_attachment_id: int = 1,
    e3_attachment_id: int = 2,
    verbose: int = 0,
    stats: Optional[Counter] = None,
) -> Optional[Dict[str, str]]:
    """ Get the substructures of a PROTAC molecule from an unmapped substructure and linker.

    Args:
        protac_smiles: The SMILES of the PROTAC molecule.
        protac: The RDKit molecule object of the PROTAC.
        substr: The RDKit molecule object of the currently matching substructure. Should be UNMAPPED.
        linker: The RDKit molecule object of the linker.
        attachment_id: The attachment point ID of the currently matching substructure.
        poi_attachment_id: The attachment point ID of the POI ligand.
        e3_attachment_id: The attachment point ID of the E3 binder.
        verbose: The verbosity level.
        stats: Optional counter of failure reasons, incremented in place.

    Returns:
        Dict: The substructures of the PROTAC molecule. None if the function fails to find the substructures.

    Raises:
        ValueError: If `attachment_id` is neither `poi_attachment_id` nor `e3_attachment_id`.
    """
    if attachment_id not in (poi_attachment_id, e3_attachment_id):
        # Report the IDs actually accepted, as they are parameters.
        raise ValueError(f'Attachment ID must be either {poi_attachment_id} or {e3_attachment_id}')
    if substr is None:
        return None

    subr_matches = list(protac.GetSubstructMatches(substr, useChirality=True))
    if len(subr_matches) != 1:
        if stats is not None:
            stats['multiple substructure matches'] += 1
        if verbose:
            print('ERROR: Multiple substructure matches')
        return None
    subr_match = subr_matches[0]

    mapped_substr = get_mapped_substr_from_protac(protac, substr, attachment_id)
    if mapped_substr is None:
        if stats is not None:
            stats['mapped substructure not found'] += 1
        if verbose:
            print('ERROR: Mapped substructure not found')
        return None

    linker_matches = protac.GetSubstructMatches(remove_dummy_atoms(linker), useChirality=True)
    # Pick the first linker match whose intersection with the substructure
    # match is only one atom, i.e., the attachment point. If no match fulfils
    # this, the last match is kept, as it has always been the case here.
    subr_atoms = set(subr_match)
    linker_match = None
    for candidate in linker_matches:
        linker_match = candidate
        if len(subr_atoms.intersection(candidate)) == 1:
            break
    # Without any linker match there is nothing to remove from the PROTAC.
    if linker_match is None:
        if stats is not None:
            stats['linker match not found'] += 1
        if verbose:
            print('ERROR: Linker match not found')
        return None

    # Based on the linker match found, remove it from the PROTAC
    emol = Chem.EditableMol(protac)
    # Remove atoms in descending order of their indices, so that the indices
    # still to be removed stay valid
    for idx in sorted(linker_match, reverse=True):
        emol.RemoveAtom(idx)

    # Get the modified molecule
    try:
        protac_fragments = emol.GetMol()
    except Exception as e:
        if verbose:
            print(e)
        return None
    try:
        Chem.SanitizeMol(protac_fragments)
    except Exception as e:
        if verbose:
            print(e)
        return None

    if verbose:
        # The linker indices refer to the intact PROTAC, so that is the
        # molecule to highlight them on: the atoms are gone from the fragments.
        img = Draw.MolToImage(protac, highlightAtoms=linker_match, size=(800, 300))
        safe_display(img)

    # Get the fragments after removing the linker
    try:
        fragments = Chem.GetMolFrags(protac_fragments, asMols=True, sanitizeFrags=True)
    except Exception as e:
        if verbose:
            print(e)
        return None
    if len(fragments) != 2:
        if stats is not None:
            stats['multiple fragments after removing the linker'] += 1
        if verbose:
            for frag in fragments:
                safe_display(frag)
            print('ERROR: Multiple fragments after removing the linker')
        return None

    substructs = {}
    substructs['linker'] = Chem.MolToSmiles(linker, canonical=True)
    for frag in fragments:
        if frag.HasSubstructMatch(substr, useChirality=True):
            # This fragment is the substructure we were given
            label = 'e3' if attachment_id == e3_attachment_id else 'poi'
            substructs[label] = Chem.MolToSmiles(mapped_substr, canonical=True)
            # Replace "[1*]" or "[2*]" with the correct attachment point with a regex
            substructs[label] = re.sub(r'\[(\d+)\*\]', f'[*:{attachment_id}]', substructs[label])
        else:
            # This fragment is the other ligand: map it with the other ID
            label = 'e3' if attachment_id == poi_attachment_id else 'poi'
            other_attachment_id = e3_attachment_id if label == 'e3' else poi_attachment_id
            other_substr = get_mapped_substr_from_protac(protac, frag, other_attachment_id)
            if other_substr is None:
                return None
            substructs[label] = Chem.MolToSmiles(other_substr, canonical=True)
        if verbose:
            print(f'Found {label.capitalize()} fragment.')
            img = Draw.MolToImage(Chem.MolFromSmiles(substructs[label]), size=(800, 300))
            safe_display(img)

    # Canonicalize the SMILES strings
    substructs = {k: canonize(v) for k, v in substructs.items()}
    # `canonize` can return None (see get_mapped_substr_from_protac), and a
    # missing SMILES cannot be joined nor reassembled.
    if any(smiles is None for smiles in substructs.values()):
        return None

    # Check that the reassembled PROTAC matches the original PROTAC
    if not check_reassembly(protac_smiles, '.'.join(substructs.values()), stats, verbose):
        return None
    return substructs
def swap_attachment_points(
    s: str,
    poi_attachment_id: int = 1,
    e3_attachment_id: int = 2,
) -> str:
    """ Exchange the POI and E3 attachment points in a string.

    Every `[*:<poi_attachment_id>]` becomes `[*:<e3_attachment_id>]` and vice
    versa. The result is then passed through `canonize`.

    Args:
        s: The input string with the mapped attachment points.
        poi_attachment_id: The attachment point ID of the POI ligand.
        e3_attachment_id: The attachment point ID of the E3 binder.

    Returns:
        The output of `canonize` on the string with the attachment points swapped.
    """
    poi_placeholder = '^^^^POI^^^^'
    e3_placeholder = '^^^^E3^^^^'
    # Both IDs are parked on placeholders first, so that the second rename
    # cannot touch what the first one has just written.
    renames = (
        (poi_attachment_id, poi_placeholder),
        (e3_attachment_id, e3_placeholder),
        (poi_placeholder, e3_attachment_id),
        (e3_placeholder, poi_attachment_id),
    )
    swapped = s
    for old_id, new_id in renames:
        swapped = swapped.replace(f'[*:{old_id}]', f'[*:{new_id}]')
    return canonize(swapped)