ribesstefano committed on
Commit 9dd777e · 1 Parent(s): 7ca0099

Setup the spaces app

Files changed (37)
  1. README.md +8 -5
  2. protac_splitter/__init__.py +11 -0
  3. protac_splitter/chemoinformatics.py +487 -0
  4. protac_splitter/data/__init__.py +0 -0
  5. protac_splitter/data/curation/__init__.py +11 -0
  6. protac_splitter/data/curation/bond_adjustments.py +407 -0
  7. protac_splitter/data/curation/curation.py +894 -0
  8. protac_splitter/data/curation/mapping_utils.py +77 -0
  9. protac_splitter/data/curation/substructure_extraction.py +586 -0
  10. protac_splitter/data/generation/__init__.py +11 -0
  11. protac_splitter/data/generation/functional_groups.py +400 -0
  12. protac_splitter/data/generation/generation.py +277 -0
  13. protac_splitter/display_utils.py +199 -0
  14. protac_splitter/drawing_utils.py +177 -0
  15. protac_splitter/evaluation.py +495 -0
  16. protac_splitter/fixing_functions.py +355 -0
  17. protac_splitter/graphs/README.md +114 -0
  18. protac_splitter/graphs/__init__.py +0 -0
  19. protac_splitter/graphs/e3_clustering.py +321 -0
  20. protac_splitter/graphs/edge_classifier.py +582 -0
  21. protac_splitter/graphs/edge_features.py +293 -0
  22. protac_splitter/graphs/splitting_algorithms.py +512 -0
  23. protac_splitter/graphs/utils.py +67 -0
  24. protac_splitter/graphs_utils.py +190 -0
  25. protac_splitter/llms/__init__.py +0 -0
  26. protac_splitter/llms/data_utils.py +296 -0
  27. protac_splitter/llms/evaluation.py +169 -0
  28. protac_splitter/llms/hf_utils.py +36 -0
  29. protac_splitter/llms/model_utils.py +256 -0
  30. protac_splitter/llms/training.py +869 -0
  31. protac_splitter/llms/training_causal_model.py +87 -0
  32. protac_splitter/llms/training_mlm_model.py +287 -0
  33. protac_splitter/llms/training_rl_models.py +406 -0
  34. protac_splitter/protac_cheminformatics.py +120 -0
  35. protac_splitter/protac_splitter.py +370 -0
  36. protac_splitter_app.py +351 -0
  37. requirements.txt +138 -0
README.md CHANGED
@@ -1,14 +1,17 @@
  ---
- title: PROTAC Splitter
- emoji: 👁
- colorFrom: gray
+ title: PROTAC-Splitter
+ emoji: ✂️
+ colorFrom: green
  colorTo: indigo
  sdk: gradio
  sdk_version: 5.35.0
- app_file: app.py
+ python_version: 3.10
+ app_file: protac_splitter_app.py
  pinned: false
  license: mit
  short_description: App to split given PROTACs into their substructures.
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # PROTAC-Splitter
+
+ This repository contains a program to split PROTAC molecules into their substructures.
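
For illustration, a minimal usage sketch of the package added in this commit. It assumes the repository is on the Python path and that `split_protac` (re-exported in `protac_splitter/__init__.py` below) accepts a PROTAC SMILES string; its exact signature and return format are not shown in this diff, so treat this as a sketch rather than the app's actual entry point:

    # Hypothetical usage sketch; split_protac's real signature may differ.
    from protac_splitter import split_protac

    protac_smiles = 'CC1=CC=CC=C1'  # placeholder: a real PROTAC SMILES goes here
    substructures = split_protac(protac_smiles)
    print(substructures)  # expected: POI ligand, linker, and E3 binder substructures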
protac_splitter/__init__.py ADDED
@@ -0,0 +1,11 @@
""" PROTAC Splitter package for splitting PROTAC SMILES into substructures."""
from protac_splitter.protac_splitter import split_protac
from protac_splitter.fixing_functions import fix_prediction
from protac_splitter.graphs.splitting_algorithms import split_protac_graph_based
from protac_splitter.evaluation import (
    check_reassembly,
    split_prediction,
)

__version__ = "1.0.0"
__author__ = "Stefano Ribes and Anders Källberg"
protac_splitter/chemoinformatics.py ADDED
@@ -0,0 +1,487 @@
""" Chemoinformatics utilities for PROTAC Splitter. """
import logging
from typing import List, Union, Optional, Literal
from multiprocessing import Process, Queue
from hashlib import sha256

from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator


def GetSubstructMatchesWorker(q, mol, substruct, useChirality, maxMatches):
    """ Worker function to get substructure matches in a separate process. """
    q.put(list(mol.GetSubstructMatches(
        substruct,
        useChirality=useChirality,
        maxMatches=maxMatches,
    )))


def GetSubstructMatchesWithTimeout(
    mol: Chem.Mol,
    substruct: Chem.Mol,
    useChirality: bool = True,
    maxMatches: int = 50,
    timeout: Union[int, float] = 10,
) -> Optional[List[List[int]]]:
    """ Get substructure matches with a timeout.

    Args:
        mol (Chem.Mol): The molecule to search for substructure matches.
        substruct (Chem.Mol): The substructure to search for in the molecule.
        useChirality (bool, optional): Whether to use chirality in the substructure search. Defaults to True.
        maxMatches (int, optional): The maximum number of matches to return. Defaults to 50.
        timeout (int | float, optional): The timeout in seconds. Defaults to 10.

    Returns:
        Optional[List[List[int]]]: A list of lists containing the atom indices of the substructure matches. Returns None if the search times out or fails.
    """
    q = Queue()
    p = Process(
        target=GetSubstructMatchesWorker,
        args=(q, mol, substruct, useChirality, maxMatches),
    )
    p.start()
    p.join(timeout)

    if p.is_alive():
        p.terminate()
        p.join()
        return None
    return q.get()


def remove_stereo(smiles: str) -> Optional[str]:
    """
    Remove stereochemistry from a SMILES string.

    Args:
        smiles (str): The input SMILES string.

    Returns:
        Optional[str]: The SMILES string with stereochemistry removed, or None on failure.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        Chem.rdmolops.RemoveStereochemistry(mol)
        return Chem.MolToSmiles(mol)
    except Exception as e:
        logging.warning(f"Error removing stereochemistry: {e}")
        return None


def get_mol(smiles: str, remove_stereo: bool = False) -> Chem.Mol:
    """
    Get a molecule object from a SMILES string.

    Args:
        smiles (str): The SMILES string representing the molecule.
        remove_stereo (bool, optional): Whether to strip stereochemistry from the parsed molecule. Defaults to False.

    Returns:
        Chem.Mol: The molecule object, or None if parsing fails.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None and remove_stereo:
        Chem.rdmolops.RemoveStereochemistry(mol)
    return mol


def canonize_smarts(smarts: str) -> Optional[str]:
    """
    Cleans a SMARTS string by converting it to a canonical SMARTS representation.

    NOTE: It might not work for complex patterns: https://github.com/rdkit/rdkit/discussions/6929

    Args:
        smarts (str): The input SMARTS string.

    Returns:
        Optional[str]: The cleaned SMARTS string, or None if parsing fails.
    """
    mol = Chem.MolFromSmarts(smarts)

    if mol is None:
        return None
    canonical_smarts = Chem.MolToSmarts(Chem.MolFromSmiles(Chem.MolToSmiles(mol), sanitize=False))
    return canonical_smarts


def smiles2mol(smiles: str) -> Chem.Mol:
    """Converts a SMILES string to an RDKit molecule object.

    Args:
        smiles (str): The input SMILES string.

    Returns:
        Chem.Mol: The RDKit molecule object.
    """
    return Chem.MolFromSmiles(smiles)


def mol2smiles(mol: Chem.Mol) -> str:
    """Converts an RDKit molecule object to a SMILES string.

    Args:
        mol (Chem.Mol): The RDKit molecule object.

    Returns:
        str: The SMILES string.
    """
    return Chem.MolToSmiles(mol)


def canonize_smiles(smiles: str) -> Optional[str]:
    """ Canonizes a SMILES string by converting it to its canonical SMILES representation.

    Args:
        smiles (str): The input SMILES string.

    Returns:
        Optional[str]: The canonized SMILES string, or None on failure.
    """
    if smiles is None:
        return None
    try:
        mol = Chem.MolFromSmiles(smiles)
    except Exception as e:
        print(f"Error: {e}")
        return None
    if mol is None:
        return None
    try:
        return Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        return None


def canonize(x: Union[str, Chem.Mol]) -> Union[str, Chem.Mol]:
    """ Canonizes a SMILES string or RDKit molecule object.

    Args:
        x: The input SMILES string or RDKit molecule object.

    Returns:
        str | Chem.Mol: The canonized SMILES string or RDKit molecule object, according to the input type.
    """
    if x is None:
        return None
    if isinstance(x, str):
        return canonize_smiles(x)
    return Chem.MolFromSmiles(Chem.MolToSmiles(x, canonical=True))


def compute_RDKitFP(
    smiles: Union[str, List[str], List[Chem.Mol]],
    maxPath: int = 7,
    fpSize: int = 2048,
) -> List:
    """
    Compute RDKit fingerprints for a given list of SMILES strings or RDKit molecules.

    Args:
        smiles (Union[str, List[str], List[Chem.Mol]]): A single SMILES string, a list of SMILES strings,
            or a list of RDKit molecules.
        maxPath (int, optional): The maximum path length for the fingerprints. Defaults to 7.
        fpSize (int, optional): The size of the fingerprint vector. Defaults to 2048.

    Returns:
        List: A list of RDKit count fingerprints computed from the input SMILES strings or molecules.
    """
    if isinstance(smiles, str):
        smiles = [smiles]  # Wrap a single SMILES string into a list
    if isinstance(smiles[0], str):
        mols = [get_mol(smi) for smi in smiles]
    else:
        mols = smiles  # assume mols were fed instead
    rdgen = rdFingerprintGenerator.GetRDKitFPGenerator(
        maxPath=maxPath, fpSize=fpSize)
    fps = [rdgen.GetCountFingerprint(mol) for mol in mols]
    return fps


def remove_dummy_atoms(mol: Union[str, Chem.Mol], canonical: bool = True) -> Union[str, Chem.Mol]:
    """
    Removes all dummy atoms (attachment points) from a molecule.

    Args:
        mol: An RDKit Mol object or SMILES string with dummy atoms.
        canonical: Whether to return a canonical SMILES when the input is a SMILES string. Defaults to True.

    Returns:
        A new RDKit Mol object (or SMILES string, matching the input type) without dummy atoms.
    """
    return_smiles = False
    if isinstance(mol, str):
        return_smiles = True
        mol = Chem.MolFromSmiles(mol)

    if mol is None:
        return None

    # Remove all dummy atoms with a query
    mol_no_dummy = Chem.DeleteSubstructs(mol, Chem.MolFromSmarts('[#0]'))

    if mol_no_dummy is None:
        # --------------------------------------------------------------------------
        # Fallback approach: edit the molecule and remove the dummy atoms directly
        # --------------------------------------------------------------------------
        # Create an editable molecule to remove atoms
        editable_mol = Chem.EditableMol(mol)

        # List of atoms to remove (dummy atoms have atomic number 0)
        dummy_atoms = [atom.GetIdx() for atom in mol.GetAtoms() if atom.GetAtomicNum() == 0]

        # Remove dummy atoms
        for atom_idx in sorted(dummy_atoms, reverse=True):  # Remove from the highest index to avoid index shifts
            editable_mol.RemoveAtom(atom_idx)

        if editable_mol is None:
            return None

        # Return the modified molecule
        if return_smiles:
            return Chem.MolToSmiles(editable_mol.GetMol())
        editable_mol = editable_mol.GetMol()
        editable_mol.UpdatePropertyCache()
        return editable_mol
        # --------------------------------------------------------------------------

    # Return the modified molecule
    if return_smiles:
        return Chem.MolToSmiles(mol_no_dummy, canonical=canonical)
    return mol_no_dummy


def dummy2query(mol: Chem.Mol) -> Chem.Mol:
    """ Converts dummy atoms to query atoms, so that a molecule with attachment points can be used in HasSubstructMatch.

    Args:
        mol: The molecule to convert.

    Returns:
        The molecule with dummy atoms converted to query atoms.
    """
    if mol is None:
        return None
    p = Chem.AdjustQueryParameters.NoAdjustments()
    p.makeDummiesQueries = True
    return Chem.AdjustQueryProperties(mol, p)


def get_substr_match(
    protac_mol: Chem.Mol,
    substr: Chem.Mol,
    max_allowed_fragments: int = 1,
    replace: Literal['core', 'sidechains'] = 'core',
    useChirality: bool = True,
) -> bool:
    """ Check if a molecule contains a substructure match with a given molecule.
    Compared to RDKit's HasSubstructMatch, this function also checks the number of fragments left when replacing the substr in the PROTAC.

    Args:
        protac_mol (Chem.Mol): The PROTAC molecule.
        substr (Chem.Mol): The substructure molecule.
        max_allowed_fragments (int, optional): The maximum number of fragments allowed when replacing the substr in the PROTAC. Defaults to 1. Example when equal to 1: if removing the warhead, a single fragment should remain.
        replace (str, optional): Whether to replace the 'core' or the 'sidechains' of the PROTAC. Defaults to 'core'.
        useChirality (bool, optional): Whether to use chirality in the substructure match. Defaults to True.

    Returns:
        bool: True if the PROTAC contains a substructure match with the given molecule and the fragment count matches, False otherwise.
    """
    # Count the number of fragments when replacing the substr in the PROTAC
    if replace == 'core':
        fragments = Chem.ReplaceCore(protac_mol, dummy2query(substr), useChirality=useChirality)
    elif replace == 'sidechains':
        fragments = Chem.ReplaceSidechains(protac_mol, dummy2query(substr), useChirality=useChirality)
    else:
        raise ValueError(f"replace argument should be either 'core' or 'sidechains', provided: {replace}")
    # Check if the number of fragments is equal to the max allowed fragments
    if fragments is None:
        return False
    try:
        fragments = Chem.GetMolFrags(fragments, sanitizeFrags=False)
    except Exception as e:
        print(e)
        return False
    return len(fragments) == max_allowed_fragments


def remove_attach_atom(mol: Chem.Mol, attach_id: int, sanitize: bool = False) -> Chem.Mol:
    """ Removes the atom with the specified attachment id from the molecule.

    Example:

        >>> remove_attach_atom(Chem.MolFromSmiles('CC[*:1]'), 1)
        CC

    There are no checks on the molecule, so it is assumed it is not None.

    Args:
        mol (Chem.Mol): The molecule.
        attach_id (int): The attachment id of the atom to remove.
        sanitize (bool, optional): Whether to sanitize the molecule after removing the atom. When used in the `fix_prediction` function, it is used to "remove" substructures, so there is no need to have them sanitized. Default: False.

    Returns:
        (Chem.Mol) The molecule with the atom removed.
    """
    atoms_to_remove = []
    for atom in mol.GetAtoms():
        if atom.GetAtomicNum() == 0:  # Dummy atom
            map_num = atom.GetAtomMapNum()
            if map_num == attach_id:  # Targeting only [*:attach_id]
                atoms_to_remove.append(atom.GetIdx())

    # Remove atoms using an EditableMol
    editable_mol = Chem.EditableMol(mol)
    for idx in sorted(atoms_to_remove, reverse=True):  # Remove from highest index to avoid shifting
        editable_mol.RemoveAtom(idx)

    # Convert back to a molecule
    new_mol = editable_mol.GetMol()
    if sanitize:
        Chem.SanitizeMol(new_mol)
    return new_mol


def get_bond_idx(smi: str, bonds_start_end_atoms: List[List[int]]) -> List[int]:
    """
    Get the indices of bonds in a molecule that match the given start and end atom indices.

    Args:
        smi (str): The SMILES representation of the molecule.
        bonds_start_end_atoms (List[List[int]]): A list of lists containing the start and end atom indices of the bonds to search for.

    Returns:
        List[int]: A list of bond indices that match the given start and end atom indices.
    """
    mol = Chem.MolFromSmiles(smi)

    bond_indices = []

    for bond in mol.GetBonds():
        begin_idx = bond.GetBeginAtomIdx()
        end_idx = bond.GetEndAtomIdx()

        if [begin_idx, end_idx] in bonds_start_end_atoms or [end_idx, begin_idx] in bonds_start_end_atoms:
            bond_indices.append(bond.GetIdx())
        elif (begin_idx, end_idx) in bonds_start_end_atoms or (end_idx, begin_idx) in bonds_start_end_atoms:
            bond_indices.append(bond.GetIdx())

    return bond_indices


def get_mol_id(smiles: str) -> str | None:
    """ Get the hash of a given SMILES string.

    Args:
        smiles (str): The SMILES string to hash.

    Returns:
        str | None: The hash of the SMILES string. None if the function failed.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        Chem.RemoveStereochemistry(mol)
    except Exception as e:
        logging.warning(f"Error while removing stereochemistry: {e}")
        logging.warning(f"SMILES: {smiles}")
        return None

    # Get the InChIKey for the molecule
    inchi_key = Chem.MolToInchiKey(mol)
    smiles = Chem.MolToSmiles(mol, canonical=True)

    # Encode the InChIKey and SMILES to create a unique identifier
    return sha256((inchi_key + smiles).encode()).hexdigest()


def get_atom_idx_at_attachment(
    protac: Chem.Mol,
    substruct: Chem.Mol,
    linker: Optional[Chem.Mol] = None,
    timeout: Optional[Union[int, float]] = None,
    return_dict: bool = False,
    verbose: int = 0,
) -> List[int]:
    """ Get the atom index of the attachment point of a substructure in the PROTAC molecule.

    Args:
        protac: The PROTAC molecule.
        substruct: The substructure of the PROTAC that contains the attachment point, e.g., the POI or E3 ligase.
        linker: The linker molecule.
        timeout: Timeout in seconds for the substructure searches. If None and no linker is given, a default of 60 seconds is used.
        return_dict: Whether to return a dictionary mapping 'substruct' and 'linker' to their attachment atom indices instead of a list.
        verbose: Verbosity level.

    Returns:
        List[int]: The two atom indices at the attachment point.
    """
    if linker is None:
        # Get the "other" substructure, i.e., replace the side chain of the PROTAC using the substruct
        linker = Chem.DeleteSubstructs(protac, remove_dummy_atoms(substruct), useChirality=True)
        if timeout is None:
            timeout = 60
            logging.warning(f'No timeout set when linker is not provided, using default value of {timeout} seconds.')

    substruct_match = set(protac.GetSubstructMatch(dummy2query(substruct), useChirality=True))
    if verbose:
        print(f'Substruct match: {substruct_match}')

    linker_no_dummy = remove_dummy_atoms(linker)
    if verbose:
        print('Linker without dummy atoms found.')

    max_matches = 2
    linker_match = set()
    shared_atoms = set()

    # NOTE: The following is a hacky way to speed up the search for linker
    # matches. In fact, the linker can be quite short, so it might match in
    # multiple places of the PROTAC molecule.
    # If the number of max matches in GetSubstructMatches is low, then the
    # search tends to be faster, but imprecise. However, we are interested in
    # the intersection of the matches, so we can progressively increase the
    # number of max matches until we find a single atom in common.
    while len(shared_atoms) != 1 and max_matches <= 50:
        if timeout is None:
            linker_matches = list(protac.GetSubstructMatches(linker_no_dummy, useChirality=True, maxMatches=max_matches))
        else:
            linker_matches = GetSubstructMatchesWithTimeout(protac, linker_no_dummy, useChirality=True, maxMatches=max_matches, timeout=timeout)
        if verbose:
            print(f'Linker matches: {linker_matches}')

        if not linker_matches:
            # return None
            linker_match = set()
            shared_atoms = set()
            max_matches += 1
            continue

        for match in linker_matches:
            shared_atoms = set(match) & set(substruct_match)
            linker_match = match
            if len(shared_atoms) == 1:
                if verbose:
                    print(f'Shared atoms: {list(shared_atoms)}')
                break

        if len(shared_atoms) != 1:
            linker_match = set()
            shared_atoms = set()
            max_matches += 1

    if not shared_atoms:
        if verbose:
            print('No shared atoms found.')
        return None

    attachment_idx = list(shared_atoms)
    attachments = {'substruct': attachment_idx[0]}

    # Get the other atom at the attachment point that is NOT in the linker
    for neighbor in protac.GetAtomWithIdx(attachment_idx[0]).GetNeighbors():
        if neighbor.GetIdx() not in linker_match:
            attachment_idx.append(neighbor.GetIdx())
            attachments['linker'] = neighbor.GetIdx()
            break

    if return_dict:
        return attachments
    return attachment_idx
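
The timeout helper above wraps RDKit's `GetSubstructMatches` in a separate process, which matters because highly symmetric or repetitive queries can make substructure enumeration very slow. A minimal, runnable sketch of calling it with toy molecules (on spawn-based platforms such as Windows and macOS, `multiprocessing` requires the call to live under a `__main__` guard):

    from rdkit import Chem
    from protac_splitter.chemoinformatics import GetSubstructMatchesWithTimeout

    if __name__ == '__main__':
        mol = Chem.MolFromSmiles('CCOC(=O)c1ccccc1')
        query = Chem.MolFromSmiles('c1ccccc1')
        # Returns a list of atom-index tuples, or None if the search
        # exceeds the timeout (here, 5 seconds).
        matches = GetSubstructMatchesWithTimeout(mol, query, timeout=5)
        print(matches)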
protac_splitter/data/__init__.py ADDED
File without changes
protac_splitter/data/curation/__init__.py ADDED
@@ -0,0 +1,11 @@
from .mapping_utils import update_dictionary
from .curation import (
    split_protacs,
    iterative_protac_splitting,
)

__all__ = [
    'update_dictionary',
    'split_protacs',
    'iterative_protac_splitting',
]
protac_splitter/data/curation/bond_adjustments.py ADDED
@@ -0,0 +1,407 @@
""" Adjusts amide and ester bonds in PROTAC substructures. """
from typing import Tuple, Dict

from rdkit import Chem

from protac_splitter.chemoinformatics import (
    dummy2query,
    canonize,
)
from protac_splitter.display_utils import display_mol
from protac_splitter.evaluation import check_reassembly


def adjust_amide_bond(
    substruct: Chem.Mol,
    linker: Chem.Mol,
    substruct_attachment_id: int,
    verbose: int = 0,
) -> Tuple[Chem.Mol, Chem.Mol]:
    """
    Adjust the amide bond between the substruct and linker substructures.
    Handles the case when neighboring atoms of the amide bond are dummy atoms, which represent attachment points.
    The linker will be modified with the required additional atoms.

    Args:
        substruct: The substructure (e.g., the POI ligand) that contains the amide bond.
        linker: The linker molecule that connects the substruct to the E3 ligase.
        substruct_attachment_id: The attachment point ID in the substruct substructure. E.g., 1 for the POI, as in "[*:1]".
        verbose: Verbosity level.

    Returns:
        Tuple[Chem.Mol, Chem.Mol]: The adjusted substruct and linker molecules, in that order.
    """

    # Pseudo-code of the algorithm:
    """
    ```python
    # Check if the amide bond (N-C=O) is in the substructure
    if "N-C(=O)" in substruct:
        if neighbor("N-C(=O)") == "[*:substruct]":
            # If the neighboring atom of the amide bond is a dummy atom, i.e., attachment point
            mark_protac_as_wrong("[PROTAC]")

            # Identify the bond to split, i.e., the nitrogen-carbon bond, and split
            "[*:substruct]-[<optional neighboring atom>]-N-[*:tmp]", "[*:tmp]-C(=O)-[rest of the PROTAC]" = split_PROTAC_at("N-C")

            "[Linker]-N-[*:tmp]" = join("[Linker]-[*:substruct]", "[*:substruct]-N-[*:tmp]")

            rename_attachment_point("[*:tmp]-C(=O)-[rest of the PROTAC]")
            rename_attachment_point("[Linker]-N-[*:tmp]")

        elif neighbor(neighbor("N-C(=O)")) == "[*:substruct]":
            # If the second-order neighbor of the amide bond is a dummy atom, i.e., attachment point
            mark_protac_as_wrong("[PROTAC]")

            # Do as above
            # Identify the bond to split, i.e., the nitrogen-carbon bond, and split
            "[*:substruct]-N-[*:tmp]", "[*:tmp]-C(=O)-[rest of the PROTAC]" = split_PROTAC_at("N-C")

            "[Linker]-N-[*:tmp]" = join("[Linker]-[*:substruct]", "[*:substruct]-N-[*:tmp]")

            rename_attachment_point("[*:tmp]-C(=O)-[rest of the PROTAC]")
            rename_attachment_point("[Linker]-N-[*:tmp]")
    ```
    """

    # Convert dummy atoms in substruct to query atoms for substructure search
    query_substruct = dummy2query(substruct)

    # Identify amide bond (N-C=O) in the substruct substructure
    amide_pattern = Chem.MolFromSmarts("[NX3][CX3](=[OX1])")
    amide_matches = query_substruct.GetSubstructMatches(amide_pattern, useChirality=True)

    if not amide_matches:
        return substruct, linker  # No amide bond found, return the original substruct

    side_atom = None
    nitrogen_idx_found, carbonyl_idx_found = None, None
    for match in amide_matches:
        nitrogen_idx, carbonyl_idx = match[0], match[1]
        nitrogen_atom = query_substruct.GetAtomWithIdx(nitrogen_idx)
        carbonyl_atom = query_substruct.GetAtomWithIdx(carbonyl_idx)

        for amide_atom in [nitrogen_atom, carbonyl_atom]:
            # Check neighboring atoms for attachment points
            # NOTE: The dummy atoms representing attachment points have atomic number 0
            for neighbor in amide_atom.GetNeighbors():
                if neighbor.GetAtomicNum() == 0:
                    nitrogen_idx_found = nitrogen_idx
                    carbonyl_idx_found = carbonyl_idx
                    side_atom = "N" if amide_atom == nitrogen_atom else "C"
                    break

            # If the previous search failed, check the neighbors of the neighboring
            # atoms (second-order neighbors)
            if nitrogen_idx_found is None or carbonyl_idx_found is None:
                for neighbor in amide_atom.GetNeighbors():
                    for second_neighbor in neighbor.GetNeighbors():
                        if second_neighbor.GetIdx() == carbonyl_idx or second_neighbor.GetIdx() == nitrogen_idx:
                            continue  # Skip the opposite atom from the amide bond

                        if second_neighbor.GetAtomicNum() == 0:
                            nitrogen_idx_found = nitrogen_idx
                            carbonyl_idx_found = carbonyl_idx
                            side_atom = "N" if amide_atom == nitrogen_atom else "C"
                            break
                    else:
                        break

    if nitrogen_idx_found is None or carbonyl_idx_found is None or side_atom is None:
        return substruct, linker

    # Split the amide bond and adjust
    dummy_label = 3
    dummy_labels = [(dummy_label, dummy_label)]  # The E3 and substruct will have 1 and 2, so we need a third one
    amide_bond_idx = query_substruct.GetBondBetweenAtoms(nitrogen_idx_found, carbonyl_idx_found).GetIdx()
    fragments = Chem.FragmentOnBonds(query_substruct, [amide_bond_idx], addDummies=True, dummyLabels=dummy_labels)

    # Get the fragments resulting from bond breaking
    try:
        mol_frags = Chem.GetMolFrags(fragments, asMols=True, sanitizeFrags=True)
    except Exception as e:
        print(e)
        return substruct, linker

    # Identify the "[*:substruct][N or C][3*]" fragment; the other one will be the "truncated" substruct
    amide_fragment_pattern = Chem.MolFromSmarts(f"[*:{substruct_attachment_id}][{side_atom}][{dummy_label}*]")
    amide_fragment = None
    substruct_fixed = None

    if verbose:
        print(f'Attachment point: *:{substruct_attachment_id}')
        print('Substruct:')
        display_mol(substruct)
        print('Linker:')
        display_mol(linker)

    for frag in mol_frags:
        if frag.HasSubstructMatch(dummy2query(amide_fragment_pattern)):
            amide_fragment = frag
            if verbose:
                print('Amide fragment:')
                display_mol(frag)
        else:
            if verbose:
                print('Substruct fragment:')
                display_mol(frag)
            substruct_fixed = frag

    if amide_fragment is None or substruct_fixed is None:
        return substruct, linker

    # In order for the function to be usable "on linkers", we need to make sure
    # that the amide fragment contains the attachment point of the substruct.
    # If not, there's nothing to do.
    if f'[*:{substruct_attachment_id}]' not in Chem.MolToSmiles(amide_fragment, canonical=True):
        return substruct, linker

    # Rename the "[3*]" attachment point on the amide fragment to "[*:3]"
    amide_fragment_smiles = Chem.MolToSmiles(amide_fragment, canonical=True)
    amide_fragment_smiles = amide_fragment_smiles.replace(f'[{dummy_label}*]', f'[*:{dummy_label}]')
    amide_fragment_smiles = canonize(amide_fragment_smiles)
    amide_fragment = Chem.MolFromSmiles(amide_fragment_smiles)

    # Use molzip to join the linker and the fragment at the original attachment point
    linker_fixed = Chem.molzip(linker, amide_fragment)

    # Rename the "[*:3]" attachment point back to the original attachment point on the linker
    linker_fixed_smiles = Chem.MolToSmiles(linker_fixed, canonical=True)
    linker_fixed_smiles = linker_fixed_smiles.replace(f'[*:{dummy_label}]', f'[*:{substruct_attachment_id}]')
    linker_fixed_smiles = canonize(linker_fixed_smiles)
    linker_fixed = Chem.MolFromSmiles(linker_fixed_smiles)

    # Rename the "[3*]" attachment point back to the original attachment point on the substruct
    substruct_fixed_smiles = Chem.MolToSmiles(substruct_fixed, canonical=True)
    substruct_fixed_smiles = substruct_fixed_smiles.replace(f'[{dummy_label}*]', f'[*:{substruct_attachment_id}]')
    substruct_fixed_smiles = canonize(substruct_fixed_smiles)
    substruct_fixed = Chem.MolFromSmiles(substruct_fixed_smiles)

    return substruct_fixed, linker_fixed


def adjust_amide_bonds_in_substructs(
    substructs: Dict[str, str],
    protac_smiles: str,
    poi_attachment_id: int = 1,
    e3_attachment_id: int = 2,
) -> Dict[str, str]:
    """ Adjusts the amide bonds in the substructures of a PROTAC. Just a wrapper function to apply it to multiple substructures.

    Args:
        substructs: The substructures of the PROTAC. A dictionary of SMILES with keys 'poi', 'linker', and 'e3'.
        protac_smiles: The SMILES of the PROTAC for checking reassembly.
        poi_attachment_id: The attachment point ID of the POI ligand. Defaults to 1.
        e3_attachment_id: The attachment point ID of the E3 binder. Defaults to 2.

    Returns:
        The updated substructures dictionary.
    """
    poi_mol = Chem.MolFromSmiles(substructs['poi'])
    e3_mol = Chem.MolFromSmiles(substructs['e3'])
    linker_mol = Chem.MolFromSmiles(substructs['linker'])

    # Fix the amide group on the POI ligand
    poi_mol, linker_mol = adjust_amide_bond(poi_mol, linker_mol, poi_attachment_id)
    poi_smiles = Chem.MolToSmiles(poi_mol, canonical=True)
    linker_smiles = Chem.MolToSmiles(linker_mol, canonical=True)
    e3_smiles = substructs['e3']
    if not check_reassembly(protac_smiles, '.'.join([poi_smiles, linker_smiles, e3_smiles])):
        return substructs

    # Fix the amide group on the E3 binder
    e3_mol, linker_mol = adjust_amide_bond(e3_mol, linker_mol, e3_attachment_id)
    e3_smiles = Chem.MolToSmiles(e3_mol, canonical=True)
    linker_smiles = Chem.MolToSmiles(linker_mol, canonical=True)
    if not check_reassembly(protac_smiles, '.'.join([poi_smiles, linker_smiles, e3_smiles])):
        return substructs

    # Fix the amide group on the linker, E3 side
    linker_mol, e3_mol = adjust_amide_bond(linker_mol, e3_mol, e3_attachment_id)
    e3_smiles = Chem.MolToSmiles(e3_mol, canonical=True)
    linker_smiles = Chem.MolToSmiles(linker_mol, canonical=True)
    if not check_reassembly(protac_smiles, '.'.join([poi_smiles, linker_smiles, e3_smiles])):
        return substructs

    # Fix the amide group on the linker, POI side
    linker_mol, poi_mol = adjust_amide_bond(linker_mol, poi_mol, poi_attachment_id)
    poi_smiles = Chem.MolToSmiles(poi_mol, canonical=True)
    linker_smiles = Chem.MolToSmiles(linker_mol, canonical=True)
    if not check_reassembly(protac_smiles, '.'.join([poi_smiles, linker_smiles, e3_smiles])):
        return substructs

    substructs['poi'] = poi_smiles
    substructs['e3'] = e3_smiles
    substructs['linker'] = linker_smiles
    return substructs


def adjust_ester_bond(
    substruct: Chem.Mol,
    linker: Chem.Mol,
    substruct_attachment_id: int,
    verbose: int = 0,
) -> Tuple[Chem.Mol, Chem.Mol]:
    """
    Adjust the ester bond between the substruct and linker substructures.
    Handles the case when neighboring atoms of the ester bond are dummy atoms, which represent attachment points.

    Args:
        substruct: The substructure (e.g., the POI ligand) that contains the ester bond.
        linker: The linker molecule that connects the substruct to the E3 ligase.
        substruct_attachment_id: The attachment point ID in the substruct substructure. E.g., 1 for the POI, as in "[*:1]".
        verbose: Verbosity level.

    Returns:
        Tuple[Chem.Mol, Chem.Mol]: The adjusted substruct and linker molecules, in that order.
    """
    # Convert dummy atoms in substruct to query atoms for substructure search
    query_substruct = dummy2query(substruct)

    # Identify ester group (COOR) in the substruct substructure
    ester_pattern = Chem.MolFromSmarts("[OX2][CX3](=[OX1])")

    ester_matches = query_substruct.GetSubstructMatches(ester_pattern)

    if not ester_matches:
        return substruct, linker  # No ester bond found, return the original substruct

    side_atom = None
    oxygen_idx_found, carbonyl_idx_found = None, None
    for match in ester_matches:
        oxygen_idx, carbonyl_idx = match[0], match[1]
        oxygen_atom = query_substruct.GetAtomWithIdx(oxygen_idx)
        carbonyl_atom = query_substruct.GetAtomWithIdx(carbonyl_idx)

        for ester_atom in [oxygen_atom, carbonyl_atom]:
            # Check neighboring atoms for attachment points
            # NOTE: The dummy atoms representing attachment points have atomic number 0
            for neighbor in ester_atom.GetNeighbors():
                if neighbor.GetAtomicNum() == 0:
                    oxygen_idx_found = oxygen_idx
                    carbonyl_idx_found = carbonyl_idx
                    side_atom = "O" if ester_atom == oxygen_atom else "C"
                    break

            # If the previous search failed, check the neighbors of the neighboring
            # atoms (second-order neighbors)
            if oxygen_idx_found is None or carbonyl_idx_found is None:
                for neighbor in ester_atom.GetNeighbors():
                    for second_neighbor in neighbor.GetNeighbors():
                        if second_neighbor.GetIdx() == carbonyl_idx or second_neighbor.GetIdx() == oxygen_idx:
                            continue  # Skip the opposite atom from the ester bond

                        if second_neighbor.GetAtomicNum() == 0:
                            oxygen_idx_found = oxygen_idx
                            carbonyl_idx_found = carbonyl_idx
                            side_atom = "O" if ester_atom == oxygen_atom else "C"
                            break
                    else:
                        break

    if oxygen_idx_found is None or carbonyl_idx_found is None or side_atom is None:
        return substruct, linker

    # Split the ester bond and adjust
    dummy_label = 3
    dummy_labels = [(dummy_label, dummy_label)]  # The E3 and substruct will have 1 and 2, so we need a third one
    ester_bond_idx = query_substruct.GetBondBetweenAtoms(oxygen_idx_found, carbonyl_idx_found).GetIdx()
    fragments = Chem.FragmentOnBonds(query_substruct, [ester_bond_idx], addDummies=True, dummyLabels=dummy_labels)

    # Get the fragments resulting from bond breaking
    try:
        mol_frags = Chem.GetMolFrags(fragments, asMols=True, sanitizeFrags=True)
    except Exception as e:
        if verbose:
            print(e)
        return substruct, linker

    # Identify the "[*:substruct][O or C][3*]" fragment; the other one will be the "truncated" substruct
    ester_fragment_pattern = Chem.MolFromSmarts(f"[*:{substruct_attachment_id}][{side_atom}][{dummy_label}*]")
    ester_fragment = None
    substruct_fixed = None

    for frag in mol_frags:
        if frag.HasSubstructMatch(dummy2query(ester_fragment_pattern)):
            ester_fragment = frag
        else:
            substruct_fixed = frag

    if ester_fragment is None or substruct_fixed is None:
        return substruct, linker

    # In order for the function to be usable "on linkers", we need to make sure
    # that the ester fragment contains the attachment point of the substruct.
    # If not, there's nothing to do.
    if f'[*:{substruct_attachment_id}]' not in Chem.MolToSmiles(ester_fragment, canonical=True):
        return substruct, linker

    # Rename the "[3*]" attachment point on the ester fragment to "[*:3]"
    ester_fragment_smiles = Chem.MolToSmiles(ester_fragment, canonical=True)
    ester_fragment_smiles = ester_fragment_smiles.replace(f'[{dummy_label}*]', f'[*:{dummy_label}]')
    ester_fragment = Chem.MolFromSmiles(ester_fragment_smiles)

    # Use molzip to join the linker and the fragment at the original attachment point
    linker_fixed = Chem.molzip(linker, ester_fragment)

    # Rename the "[*:3]" attachment point back to the original attachment point on the linker
    linker_fixed_smiles = Chem.MolToSmiles(linker_fixed, canonical=True)
    linker_fixed_smiles = linker_fixed_smiles.replace(f'[*:{dummy_label}]', f'[*:{substruct_attachment_id}]')
    linker_fixed = Chem.MolFromSmiles(linker_fixed_smiles)

    # Rename the "[3*]" attachment point back to the original attachment point on the substruct
    substruct_fixed_smiles = Chem.MolToSmiles(substruct_fixed, canonical=True)
    substruct_fixed_smiles = substruct_fixed_smiles.replace(f'[{dummy_label}*]', f'[*:{substruct_attachment_id}]')
    substruct_fixed = Chem.MolFromSmiles(substruct_fixed_smiles)

    return substruct_fixed, linker_fixed


def adjust_ester_bonds_in_substructs(
    substructs: Dict[str, str],
    protac_smiles: str,
    poi_attachment_id: int = 1,
    e3_attachment_id: int = 2,
) -> Dict[str, str]:
    """ Adjusts the ester bonds in the substructures of a PROTAC. Just a wrapper function to apply it to multiple substructures.

    Args:
        substructs: The substructures of the PROTAC. A dictionary of SMILES with keys 'poi', 'linker', and 'e3'.
        protac_smiles: The SMILES of the PROTAC for checking reassembly.
        poi_attachment_id: The attachment point ID of the POI ligand. Defaults to 1.
        e3_attachment_id: The attachment point ID of the E3 binder. Defaults to 2.

    Returns:
        The updated substructures dictionary.
    """
    poi_mol = Chem.MolFromSmiles(substructs['poi'])
    e3_mol = Chem.MolFromSmiles(substructs['e3'])
    linker_mol = Chem.MolFromSmiles(substructs['linker'])

    # Fix the ester group on the POI ligand
    poi_mol, linker_mol = adjust_ester_bond(poi_mol, linker_mol, poi_attachment_id)
    poi_smiles = Chem.MolToSmiles(poi_mol, canonical=True)
    linker_smiles = Chem.MolToSmiles(linker_mol, canonical=True)
    e3_smiles = substructs['e3']
    if not check_reassembly(protac_smiles, '.'.join([poi_smiles, linker_smiles, e3_smiles])):
        return substructs

    # Fix the ester group on the E3 binder
    e3_mol, linker_mol = adjust_ester_bond(e3_mol, linker_mol, e3_attachment_id)
    e3_smiles = Chem.MolToSmiles(e3_mol, canonical=True)
    linker_smiles = Chem.MolToSmiles(linker_mol, canonical=True)
    if not check_reassembly(protac_smiles, '.'.join([poi_smiles, linker_smiles, e3_smiles])):
        return substructs

    # Fix the ester group on the linker, E3 side
    linker_mol, e3_mol = adjust_ester_bond(linker_mol, e3_mol, e3_attachment_id)
    e3_smiles = Chem.MolToSmiles(e3_mol, canonical=True)
    linker_smiles = Chem.MolToSmiles(linker_mol, canonical=True)
    if not check_reassembly(protac_smiles, '.'.join([poi_smiles, linker_smiles, e3_smiles])):
        return substructs

    # Fix the ester group on the linker, POI side
    linker_mol, poi_mol = adjust_ester_bond(linker_mol, poi_mol, poi_attachment_id)
    poi_smiles = Chem.MolToSmiles(poi_mol, canonical=True)
    linker_smiles = Chem.MolToSmiles(linker_mol, canonical=True)
    if not check_reassembly(protac_smiles, '.'.join([poi_smiles, linker_smiles, e3_smiles])):
        return substructs

    substructs['poi'] = poi_smiles
    substructs['e3'] = e3_smiles
    substructs['linker'] = linker_smiles
    return substructs
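
Both adjustment functions rely on RDKit's `Chem.molzip` to re-join fragments at matching dummy-atom map numbers after the amide or ester bond has been re-cut. A minimal, self-contained sketch of that mechanism with toy fragments (not real PROTAC substructures):

    from rdkit import Chem

    # Two fragments that share the attachment label [*:3] are joined there:
    # the atoms bonded to the matching dummies become bonded to each other.
    linker = Chem.MolFromSmiles('CC[*:3]')
    fragment = Chem.MolFromSmiles('[*:3]NC(C)=O')
    combined = Chem.molzip(linker, fragment)
    print(Chem.MolToSmiles(combined))  # CCNC(C)=O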
protac_splitter/data/curation/curation.py ADDED
@@ -0,0 +1,894 @@
""" Curation utilities for PROTAC Splitter. """
import os
import re
from typing import Any, Dict, Optional, Union, Callable
from joblib import Parallel, delayed

from rdkit import Chem
from rdkit import DataStructs
import pandas as pd
import numpy as np
from tqdm import tqdm

from protac_splitter.chemoinformatics import (
    canonize,
    remove_dummy_atoms,
    canonize_smiles,
    get_mol_id,
    get_substr_match,
)
from protac_splitter.evaluation import check_reassembly
from protac_splitter.data.curation.substructure_extraction import (
    get_substructure_from_non_perfect_match,
    get_substructs_from_unmapped_e3_poi,
    get_substructs_from_substr_and_linker,
    get_substructs_from_mapped_linker,
    swap_attachment_points,
)
from protac_splitter.data.curation.bond_adjustments import (
    adjust_amide_bonds_in_substructs,
    adjust_ester_bonds_in_substructs,
)
from protac_splitter.data.curation.mapping_utils import update_dictionary


def check_substructs_size(
    protac_mol: Chem.Mol,
    substructs: Dict[str, str],
    size_perc_threshold: float = 0.8,
) -> bool:
    """ Check the size of the substructures in the PROTAC. If any of them is too big, return False.

    Args:
        protac_mol: The PROTAC molecule.
        substructs: The substructures to check against.
        size_perc_threshold: The maximum allowed fraction of PROTAC atoms that a single substructure may cover. Defaults to 0.8.

    Returns:
        False if any of the substructures is too big. True otherwise.
    """
    num_protac_atoms = protac_mol.GetNumAtoms()
    for key, smiles in substructs.items():
        substruct = Chem.MolFromSmiles(smiles)
        num_substruct_atoms = substruct.GetNumAtoms()
        if num_substruct_atoms / num_protac_atoms > size_perc_threshold:
            # print(f'Error: {key.upper()} is too big in the PROTAC ({num_substruct_atoms} / {num_protac_atoms} = {num_substruct_atoms / num_protac_atoms:.2%} > {size_perc_threshold:.2%})')
            # display_mol(substruct)
            # display_mol(protac_mol)
            return False
    return True


def check_linker_similarity(
    linker_smiles: str,
    pois: Union[pd.DataFrame, str],
    e3s: Union[pd.DataFrame, str],
    linkers: Optional[Union[pd.DataFrame, str]] = None,
    pois_similarity_threshold: float = 0.7,
    e3s_similarity_threshold: float = 0.7,
    linkers_similarity_threshold: float = 0.6,
    morgan_fp_generator: Optional[Callable] = None,
) -> bool:
    """ Check the similarity of the linker with all the matching POIs and E3s. If it is too similar to any of them, return False.

    Args:
        linker_smiles: The linker SMILES.
        pois: The POI ligands. Must have a 'FP' column with the Morgan fingerprints.
        e3s: The E3 binders. Must have a 'FP' column with the Morgan fingerprints.
        linkers: The known linkers to compare against, if any. Must have a 'FP' column with the Morgan fingerprints.
        pois_similarity_threshold: The similarity threshold for the POIs.
        e3s_similarity_threshold: The similarity threshold for the E3s.
        linkers_similarity_threshold: The similarity threshold for the linkers.
        morgan_fp_generator: The Morgan fingerprint generator.

    Returns:
        False if the linker is too similar to any of the POIs or E3s. True otherwise.
    """

    # Get the linker fingerprint
    if morgan_fp_generator is None:
        morgan_fp_generator = Chem.rdFingerprintGenerator.GetMorganGenerator(
            radius=2,
            fpSize=2048,
            useBondTypes=True,
            includeChirality=True,
        )

    linker = Chem.MolFromSmiles(linker_smiles)
    linker_fp = morgan_fp_generator.GetFingerprint(linker)

    # Check the similarity of the linker with the POIs and E3s (use BulkTanimotoSimilarity)
    if isinstance(e3s, str):
        # Create a one-element list with the E3 fingerprint
        e3s_fps = [morgan_fp_generator.GetFingerprint(Chem.MolFromSmiles(e3s))]
    else:
        e3s_fps = e3s['FP'].to_list()
    e3s_similarities = DataStructs.BulkTanimotoSimilarity(linker_fp, e3s_fps)
    if (np.array(e3s_similarities) > e3s_similarity_threshold).any():
        print(f'WARNING: Linker {linker_smiles} is too similar to an E3 binder')
        # display_mol(linker)
        # display_mol(Chem.MolFromSmiles(e3s[e3s_similarities.argmax()]))
        return False

    # Check if the linker is similar to any of the POIs
    if isinstance(pois, str):
        # Create a one-element list with the POI fingerprint
        pois_fps = [morgan_fp_generator.GetFingerprint(Chem.MolFromSmiles(pois))]
    else:
        pois_fps = pois['FP'].to_list()
    pois_similarities = DataStructs.BulkTanimotoSimilarity(linker_fp, pois_fps)
    if (np.array(pois_similarities) > pois_similarity_threshold).any():
        # print(f'Error: Linker {linker_smiles} is too similar to a POI ligand')
        # display_mol(linker)
        # display_mol(Chem.MolFromSmiles(pois[pois_similarities.argmax()]))
        return False

    # Check if the linker is NOT similar to any of the known linkers
    if linkers is not None:
        if isinstance(linkers, str):
            # Create a one-element list with the linker fingerprint
            linkers_fps = [morgan_fp_generator.GetFingerprint(Chem.MolFromSmiles(linkers))]
        else:
            linkers_fps = linkers['FP'].to_list()
        linkers_similarities = DataStructs.BulkTanimotoSimilarity(linker_fp, linkers_fps)
        if not (np.array(linkers_similarities) > linkers_similarity_threshold).all():
            print(f'WARNING: Linker {linker_smiles} is too similar to a linker')
            # display_mol(linker)
            # display_mol(Chem.MolFromSmiles(linkers[linkers_similarities.argmax()]))
            return False

    return True


def check_substructs_similarity(
    protac: Union[np.ndarray, str, Chem.Mol],
    substructs: Dict[str, str],
    similarity_threshold: float = 0.7,
    similarity_thresholds: Optional[Dict[str, float]] = None,
    morgan_fp_generator: Optional[Callable] = None,
) -> bool:
    """ Check the similarity of the PROTAC with the substructures. If it is too similar to any of them, return False.

    Args:
        protac: The PROTAC molecule, its SMILES, or a precomputed fingerprint.
        substructs: The substructures to check against.
        similarity_threshold: The similarity threshold.
        similarity_thresholds: Per-substructure similarity thresholds, keyed like `substructs`. Overrides `similarity_threshold` when given.
        morgan_fp_generator: The Morgan fingerprint generator.

    Returns:
        False if the PROTAC is too similar to any of the substructures. True otherwise.
    """

    if morgan_fp_generator is None:
        morgan_fp_generator = Chem.rdFingerprintGenerator.GetMorganGenerator(
            radius=2,
            fpSize=2048,
            useBondTypes=True,
            includeChirality=True,
        )

    if isinstance(protac, str):
        protac = Chem.MolFromSmiles(protac)
        protac_fp = morgan_fp_generator.GetFingerprint(protac)
    elif isinstance(protac, Chem.Mol):
        protac_fp = morgan_fp_generator.GetFingerprint(protac)
    else:
        protac_fp = protac

    for key, smiles in substructs.items():
        substr_fp = morgan_fp_generator.GetFingerprint(Chem.MolFromSmiles(smiles))
        threshold = similarity_thresholds[key] if similarity_thresholds is not None else similarity_threshold
        if DataStructs.TanimotoSimilarity(protac_fp, substr_fp) > threshold:
            print(f'WARNING: {key.upper()} is too similar to the PROTAC, similarity: {DataStructs.TanimotoSimilarity(protac_fp, substr_fp):.4f} > {threshold}')
            # display_mol(Chem.MolFromSmiles(smiles))
            return False

    return True


def get_split_row(
    row: pd.Series,
    substructs: Dict[str, str],
    poi_smiles_no_dummy: Optional[str] = None,
    e3_smiles_no_dummy: Optional[str] = None,
) -> Dict[str, Any]:
    """ Update the fields of a row with the substructures and their IDs.

    Args:
        row: The input row.
        substructs: The substructures found in the PROTAC.
        poi_smiles_no_dummy: The POI ligand SMILES without the dummy atoms.
        e3_smiles_no_dummy: The E3 binder SMILES without the dummy atoms.

    Returns:
        The updated row.
    """
    mapped_row = {}
    mapped_row['PROTAC SMILES'] = canonize_smiles(row['SMILES'])
    mapped_row['POI Ligand SMILES with direction'] = substructs['poi']
    mapped_row['E3 Binder SMILES with direction'] = substructs['e3']
    mapped_row['Linker SMILES with direction'] = substructs['linker']
    mapped_row['POI Ligand SMILES'] = remove_dummy_atoms(substructs['poi']) if poi_smiles_no_dummy is None else poi_smiles_no_dummy
    mapped_row['E3 Binder SMILES'] = remove_dummy_atoms(substructs['e3']) if e3_smiles_no_dummy is None else e3_smiles_no_dummy
    mapped_row['Linker SMILES'] = remove_dummy_atoms(substructs['linker'])

    # Get the IDs and update the dictionaries with new substructures
    mapped_row['PROTAC ID'] = get_mol_id(mapped_row['PROTAC SMILES'])
    mapped_row['POI Ligand ID'] = get_mol_id(mapped_row['POI Ligand SMILES with direction'])
    mapped_row['E3 Binder ID'] = get_mol_id(mapped_row['E3 Binder SMILES with direction'])
    mapped_row['Linker ID'] = get_mol_id(mapped_row['Linker SMILES with direction'])

    return mapped_row


def split_single_protac(
    row: pd.Series,
    dictionaries: Dict[str, pd.DataFrame],
    biggest_matches_first: bool = True,
    max_iter_on_linkers: int = 0,
    split_with_substr_and_linker_matching: bool = False,
    similarity_threshold: float = 0.65,
    morgan_radius: Optional[int] = None,
    morgan_fp_size: Optional[int] = None,
    morgan_fp_generator: Optional[Callable] = None,
    poi_attachment_id: int = 1,
    e3_attachment_id: int = 2,
) -> Dict[str, Any]:
    """ Map a PROTAC row to the substructures in the dictionaries.

    Args:
        row: The input row, containing the PROTAC SMILES, ID, and molecule.
        dictionaries: The dictionaries containing the substructures.
        biggest_matches_first: Whether to sort the matches by the number of atoms in the molecule.
        max_iter_on_linkers: The maximum number of iterations to perform on the linkers.
        split_with_substr_and_linker_matching: Whether to also collect linker matches for splitting via substructure-and-linker matching.
        similarity_threshold: The similarity threshold used when checking the linker against POIs and E3s.
        morgan_radius: The radius of the Morgan fingerprint generator (defaults to 2 when a generator is created internally).
        morgan_fp_size: The size of the Morgan fingerprints (defaults to 2048 when a generator is created internally).
        morgan_fp_generator: An optional pre-built Morgan fingerprint generator.
        poi_attachment_id: The attachment point ID of the POI ligand.
        e3_attachment_id: The attachment point ID of the E3 binder.

    Returns:
        The mapped row. None if the mapping was not successful.
    """
    # # Disable the RDKit warnings that pop up when RDKit fails to create molecules
    # # NOTE: The following is done to avoid warning messages during multiprocessing
    # RDLogger.DisableLog("rdApp.*")
    # blocker = rdBase.BlockLogs()

    protac_smiles = row['SMILES']
    protac_mol = row['Molecule']

    if morgan_fp_generator is None:
        morgan_radius = 2 if morgan_radius is None else morgan_radius
        morgan_fp_size = 2048 if morgan_fp_size is None else morgan_fp_size
        morgan_fp_generator = Chem.rdFingerprintGenerator.GetMorganGenerator(
            radius=morgan_radius,
            fpSize=morgan_fp_size,
            useBondTypes=True,
            includeChirality=True,
        )
    else:
        morgan_radius = 'None'
        morgan_fp_size = 'None'
    protac_fp = morgan_fp_generator.GetFingerprint(protac_mol)

    notes = f'({max_iter_on_linkers=})({split_with_substr_and_linker_matching=})({morgan_radius=})({morgan_fp_size=})'

    # Get all substructure matches in the POI dictionary
    # poi_matches = dictionaries['POI Ligand']['Molecule'].apply(lambda x: get_substr_match(protac_mol, x, max_allowed_fragments=1))
    poi_matches = dictionaries['POI Ligand']['Molecule'].apply(lambda x: protac_mol.HasSubstructMatch(x))
    pois = dictionaries['POI Ligand'][poi_matches].drop_duplicates(subset=['SMILES'])

    # Get all substructure matches in the E3 dictionary
    # e3_matches = dictionaries['E3 Binder']['Molecule'].apply(lambda x: get_substr_match(protac_mol, x, max_allowed_fragments=1))
    e3_matches = dictionaries['E3 Binder']['Molecule'].apply(lambda x: protac_mol.HasSubstructMatch(x))
    e3s = dictionaries['E3 Binder'][e3_matches].drop_duplicates(subset=['SMILES'])

    # # Sort the matches by the number of atoms in the molecule
    # ascending = False if biggest_matches_first else True
    # pois = pois.sort_values(by='Molecule', key=lambda s: s.apply(lambda m: m.GetNumAtoms()), ascending=True)
    # e3s = e3s.sort_values(by='Molecule', key=lambda s: s.apply(lambda m: m.GetNumAtoms()), ascending=True)

    # Get the POI median size, then re-arrange the pois dataframe so that the median is the first element
    poi_median = pois['Molecule'].apply(lambda x: x.GetNumAtoms()).median()
    pois = pois.sort_values(by='Molecule', key=lambda s: s.apply(lambda m: m.GetNumAtoms()), ascending=True)
    pois = pois.iloc[np.abs(pois['Molecule'].apply(lambda x: x.GetNumAtoms()) - poi_median).argsort()]

    # Get the E3 median size, then re-arrange the e3s dataframe so that the median is the first element
    e3_median = e3s['Molecule'].apply(lambda x: x.GetNumAtoms()).median()
    e3s = e3s.sort_values(by='Molecule', key=lambda s: s.apply(lambda m: m.GetNumAtoms()), ascending=True)
    e3s = e3s.iloc[np.abs(e3s['Molecule'].apply(lambda x: x.GetNumAtoms()) - e3_median).argsort()]

    # If any of the substructures is not found, get the matching linkers to be
    # used later (do it only once).
    linkers = None
    if len(pois) == 0 or len(e3s) == 0 or split_with_substr_and_linker_matching:
        matches = dictionaries['Linker with direction']['Molecule'].apply(lambda x: get_substr_match(protac_mol, x, max_allowed_fragments=2))
        linkers = dictionaries['Linker with direction'][matches]
        linkers = linkers.sort_values(by='Molecule', key=lambda s: s.apply(lambda m: m.GetNumAtoms()), ascending=False)

    # dummy_attachment_id = 1
    # mapping_found = False
    # for _, linker in linkers.iterrows():
    #     if mapping_found:
    #         break
    #     for _, poi in pois.iterrows():
    #         if mapping_found:
    #             break
    #         for _, e3 in e3s.iterrows():
    #             if mapping_found:
    #                 break
    #             # Get the replaced side chain
    #             e3_mapped = Chem.ReplaceSidechains(protac_mol, e3['Molecule'], useChirality=True)
    #             e3_mapped = rename_attachment_id(e3_mapped, dummy_attachment_id, e3_attachment_id)
    #             if e3_mapped is None:
    #                 continue
    #
    #             poi_mapped = Chem.ReplaceSidechains(protac_mol, poi['Molecule'], useChirality=True)
    #             poi_mapped = rename_attachment_id(poi_mapped, dummy_attachment_id, poi_attachment_id)
    #             if poi_mapped is None:
    #                 continue
    #
    #             # Join the substructures as fragments
    #             protac_candidate = canonize('.'.join([linker['SMILES'], e3_mapped, poi_mapped]))
    #             protac_candidate = Chem.MolFromSmiles(protac_candidate)
    #             protac_candidate = canonize(Chem.molzip(protac_candidate))
    #             if check_reassembly(protac_mol, protac_candidate):
    #                 print('Found a match!')
    #                 mapping_found = True
    #
    #                 # substructs = {
    #                 #     'linker': linker['Molecule'],
    #                 #     'e3': e3['Molecule'],
    #                 #     'poi': poi['Molecule'],
    #                 # }
    #                 # mapped_row = get_split_row(row, dictionaries, substructs, poi['SMILES'], e3['SMILES'])
    #                 # mapped_row['Notes'] = 'Obtained from matching E3, POI, and Linker found in dictionaries.'
    #                 # return mapped_row

    # TODO: Add a variable to get mapped ligands even if the checks failed... add a note when it happens
    best_substructs_candidate = None

    # There were matching E3s and matching POIs: try to recover the linker from
    # an unmapped E3 and an unmapped POI.
    if len(e3s) > 0 and len(pois) > 0:
        for _, poi in pois.iterrows():
            for _, e3 in e3s.iterrows():
                additional_notes = '(matching_poi=True)(matching_e3=True)(matching_linker=None)'
                substructs = get_substructs_from_unmapped_e3_poi(protac_smiles, protac_mol, poi['Molecule'], e3['Molecule'])

                # If the substructure is not found, try to get it from a non-perfect match
                if substructs is None:
                    fixed_poi = get_substructure_from_non_perfect_match(protac_mol, poi['Molecule'], poi_attachment_id)
                    fixed_e3 = get_substructure_from_non_perfect_match(protac_mol, e3['Molecule'], e3_attachment_id)
                    fixed_poi = poi['Molecule'] if fixed_poi is None else fixed_poi
                    fixed_e3 = e3['Molecule'] if fixed_e3 is None else fixed_e3
                    if fixed_poi is not None and fixed_e3 is not None:
                        substructs = get_substructs_from_unmapped_e3_poi(protac_smiles, protac_mol, fixed_poi, fixed_e3)
                        if Chem.MolToSmiles(fixed_e3) != e3['SMILES']:
                            additional_notes += '(non_perfect_e3_match=True)'
                        else:
                            additional_notes += '(non_perfect_e3_match=False)'

                        if Chem.MolToSmiles(fixed_poi) != poi['SMILES']:
                            additional_notes += '(non_perfect_poi_match=True)'
                        else:
                            additional_notes += '(non_perfect_poi_match=False)'

                if substructs is not None:
                    size_check = check_substructs_size(protac_mol, substructs, size_perc_threshold=0.7)

                    # Check if the linker is too similar to any of the matching POIs or E3s (use the bulk Tanimoto similarity)
                    if not check_linker_similarity(substructs['linker'], pois, e3s, morgan_fp_generator=morgan_fp_generator, e3s_similarity_threshold=similarity_threshold, pois_similarity_threshold=similarity_threshold):
379
+ best_substructs_candidate = substructs
380
+ continue
381
+
382
+ if not size_check and not check_substructs_similarity(protac_fp, substructs, similarity_threshold=similarity_threshold, morgan_fp_generator=morgan_fp_generator):
383
+ best_substructs_candidate = substructs
384
+ # display_mol(protac_mol)
385
+ continue
386
+
387
+ # Fix the bonds close to amide and ester groups, if necessary
388
+ substructs_copy = substructs.copy()
389
+ substructs = adjust_amide_bonds_in_substructs(substructs, protac_smiles)
390
+ # Check and report if any SMILES was changed
391
+ if substructs['linker'] != substructs_copy['linker']:
392
+ additional_notes += '(amide_bonds_fixed=True)'
393
+ else:
394
+ additional_notes += '(amide_bonds_fixed=False)'
395
+
396
+ substructs_copy = substructs.copy()
397
+ substructs = adjust_ester_bonds_in_substructs(substructs, protac_smiles)
398
+ # Check and report if any SMILES was changed
399
+ if substructs['linker'] != substructs_copy['linker']:
400
+ additional_notes += '(ester_bonds_fixed=True)'
401
+ else:
402
+ additional_notes += '(ester_bonds_fixed=False)'
403
+
404
+ # Add the mapped PROTAC to the final list
405
+ mapped_row = get_split_row(row, substructs)
406
+ mapped_row['Notes'] = notes + additional_notes
407
+ return mapped_row
408
+
409
+ # Some E3s matched (POIs may or may not have): try to recover the POI
+ # from a matched E3 and a matched linker.
+ if len(e3s) > 0 and split_with_substr_and_linker_matching:
412
+ # NOTE: Only take the largest linker(s) into account
413
+ if max_iter_on_linkers:
414
+ selected_linkers = linkers.iloc[:max_iter_on_linkers, :]
415
+ else:
416
+ selected_linkers = linkers.iloc[:1, :]
417
+
418
+ for _, e3 in e3s.iterrows():
419
+ # Adjust the E3 molecule if it is not a perfect match
420
+ e3_mol_fixed = get_substructure_from_non_perfect_match(protac_mol, e3['Molecule'], e3_attachment_id)
421
+ e3_mol = e3['Molecule'] if e3_mol_fixed is None else e3_mol_fixed
422
+ e3_mol = remove_dummy_atoms(e3_mol)
423
+ if Chem.MolToSmiles(e3_mol) != e3['SMILES']:
424
+ non_perfect_e3_match = True
425
+ else:
426
+ non_perfect_e3_match = False
427
+
428
+ for _, linker in selected_linkers.iterrows():
429
+ additional_notes = f'(matching_poi=False)(matching_e3=True)(matching_linker=True)({non_perfect_e3_match=})'
430
+
431
+ substructs = get_substructs_from_substr_and_linker(
432
+ protac_smiles=protac_smiles,
433
+ protac=protac_mol,
434
+ substr=e3_mol,
435
+ linker=linker['Molecule'],
436
+ attachment_id=e3_attachment_id,
437
+ )
438
+ if substructs is not None:
439
+ size_check = check_substructs_size(protac_mol, substructs, size_perc_threshold=0.7)
440
+
441
+ if not check_linker_similarity(substructs['linker'], substructs['poi'], e3s, morgan_fp_generator=morgan_fp_generator, e3s_similarity_threshold=similarity_threshold, pois_similarity_threshold=similarity_threshold):
442
+ best_substructs_candidate = substructs
443
+ continue
444
+
445
+ if not size_check and not check_substructs_similarity(protac_fp, substructs, similarity_threshold=similarity_threshold, morgan_fp_generator=morgan_fp_generator):
446
+ best_substructs_candidate = substructs
447
+ # display_mol(protac_mol)
448
+ continue
449
+
450
+ # Fix the bonds close to amide and ester groups, if necessary
451
+ substructs_copy = substructs.copy()
452
+ substructs = adjust_amide_bonds_in_substructs(substructs, protac_smiles)
453
+ if substructs['linker'] != substructs_copy['linker']:
454
+ additional_notes += '(amide_bonds_fixed=True)'
455
+ else:
456
+ additional_notes += '(amide_bonds_fixed=False)'
457
+ substructs_copy = substructs.copy()
458
+ substructs = adjust_ester_bonds_in_substructs(substructs, protac_smiles)
459
+ if substructs['linker'] != substructs_copy['linker']:
460
+ additional_notes += '(ester_bonds_fixed=True)'
461
+ else:
462
+ additional_notes += '(ester_bonds_fixed=False)'
463
+
464
+ mapped_row = get_split_row(row, substructs)
465
+ mapped_row['Notes'] = notes + additional_notes
466
+ return mapped_row
467
+
468
+ # Swap the attachment points on the linker and try again
469
+ linker_swapped = swap_attachment_points(linker['SMILES'])
470
+ substructs = get_substructs_from_substr_and_linker(
471
+ protac_smiles=protac_smiles,
472
+ protac=protac_mol,
473
+ substr=e3_mol,
474
+ linker=Chem.MolFromSmiles(linker_swapped),
475
+ attachment_id=e3_attachment_id,
476
+ )
477
+ additional_notes += '(attachment_points_swapped_in_linker=True)'
478
+ if substructs is not None:
479
+
480
+ size_check = check_substructs_size(protac_mol, substructs, size_perc_threshold=0.7)
481
+
482
+ if not check_linker_similarity(substructs['linker'], substructs['poi'], e3s, morgan_fp_generator=morgan_fp_generator, e3s_similarity_threshold=similarity_threshold, pois_similarity_threshold=similarity_threshold):
483
+ continue
484
+
485
+ if not size_check and not check_substructs_similarity(protac_fp, substructs, similarity_threshold=similarity_threshold, morgan_fp_generator=morgan_fp_generator):
486
+ # display_mol(protac_mol)
487
+ continue
488
+
489
+ # Fix the bonds close to amide and ester groups, if necessary
490
+ substructs_copy = substructs.copy()
491
+ substructs = adjust_amide_bonds_in_substructs(substructs, protac_smiles)
492
+ if substructs['linker'] != substructs_copy['linker']:
493
+ additional_notes += '(amide_bonds_fixed=True)'
494
+ else:
495
+ additional_notes += '(amide_bonds_fixed=False)'
496
+ substructs_copy = substructs.copy()
497
+ substructs = adjust_ester_bonds_in_substructs(substructs, protac_smiles)
498
+ if substructs['linker'] != substructs_copy['linker']:
499
+ additional_notes += '(ester_bonds_fixed=True)'
500
+
501
+ mapped_row = get_split_row(row, substructs)
502
+ mapped_row['Notes'] = notes + additional_notes
503
+ return mapped_row
504
+
505
+ # Some POIs matched (E3s may or may not have): try to recover the E3
+ # from a matched POI and a matched linker.
+ if len(pois) > 0 and split_with_substr_and_linker_matching:
508
+ # NOTE: Only take the largest linker(s) into account
509
+ if max_iter_on_linkers:
510
+ selected_linkers = linkers.iloc[:max_iter_on_linkers, :]
511
+ else:
512
+ selected_linkers = linkers.iloc[:1, :]
513
+
514
+ for _, poi in pois.iterrows():
515
+ poi_mol = get_substructure_from_non_perfect_match(protac_mol, poi['Molecule'], poi_attachment_id)
516
+ poi_mol = poi['Molecule'] if poi_mol is None else poi_mol
517
+ poi_mol = remove_dummy_atoms(poi_mol)
518
+ if Chem.MolToSmiles(poi_mol) != poi['SMILES']:
519
+ non_perfect_poi_match = True
520
+ else:
521
+ non_perfect_poi_match = False
522
+
523
+ for _, linker in selected_linkers.iterrows():
524
+ additional_notes = f'(matching_poi=True)(matching_e3=False)(matching_linker=True)({non_perfect_poi_match=})'
525
+
526
+ substructs = get_substructs_from_substr_and_linker(
527
+ protac_smiles=protac_smiles,
528
+ protac=protac_mol,
529
+ substr=poi_mol,
530
+ linker=linker['Molecule'],
531
+ attachment_id=poi_attachment_id,
532
+ )
533
+ if substructs is not None:
534
+ size_check = check_substructs_size(protac_mol, substructs, size_perc_threshold=0.7)
535
+
536
+ if not check_linker_similarity(substructs['linker'], pois, substructs['e3'], morgan_fp_generator=morgan_fp_generator, e3s_similarity_threshold=similarity_threshold, pois_similarity_threshold=similarity_threshold):
537
+ best_substructs_candidate = substructs
538
+ continue
539
+
540
+ if not size_check and not check_substructs_similarity(protac_fp, substructs, similarity_threshold=similarity_threshold, morgan_fp_generator=morgan_fp_generator):
541
+ best_substructs_candidate = substructs
542
+ # display_mol(protac_mol)
543
+ continue
544
+
545
+ # Fix the bonds close to amide and ester groups, if necessary
546
+ substructs_copy = substructs.copy()
547
+ substructs = adjust_amide_bonds_in_substructs(substructs, protac_smiles)
548
+ if substructs['linker'] != substructs_copy['linker']:
549
+ additional_notes += '(amide_bonds_fixed=True)'
550
+ substructs_copy = substructs.copy()
551
+ substructs = adjust_ester_bonds_in_substructs(substructs, protac_smiles)
552
+ if substructs['linker'] != substructs_copy['linker']:
553
+ additional_notes += '(ester_bonds_fixed=True)'
554
+
555
+ mapped_row = get_split_row(row, substructs)
556
+ mapped_row['Notes'] = notes + additional_notes
557
+ return mapped_row
558
+
559
+ # Swap the attachment points on the linker and try again
560
+ linker_swapped = swap_attachment_points(linker['SMILES'])
561
+ substructs = get_substructs_from_substr_and_linker(
562
+ protac_smiles=protac_smiles,
563
+ protac=protac_mol,
564
+ substr=poi_mol,
565
+ linker=Chem.MolFromSmiles(linker_swapped),
566
+ attachment_id=poi_attachment_id,
567
+ )
568
+ additional_notes += '(attachment_points_swapped_in_linker=True)'
569
+ if substructs is not None:
570
+
571
+ size_check = check_substructs_size(protac_mol, substructs, size_perc_threshold=0.7)
572
+
573
+ if not check_linker_similarity(substructs['linker'], pois, substructs['e3'], morgan_fp_generator=morgan_fp_generator, e3s_similarity_threshold=similarity_threshold, pois_similarity_threshold=similarity_threshold):
574
+ best_substructs_candidate = substructs
575
+ continue
576
+
577
+ if not size_check and not check_substructs_similarity(protac_fp, substructs, similarity_threshold=similarity_threshold, morgan_fp_generator=morgan_fp_generator):
578
+ best_substructs_candidate = substructs
579
+ # display_mol(protac_mol)
580
+ continue
581
+
582
+ # Fix the bonds close to amide and ester groups, if necessary
583
+ substructs_copy = substructs.copy()
584
+ substructs = adjust_amide_bonds_in_substructs(substructs, protac_smiles)
585
+ if substructs['linker'] != substructs_copy['linker']:
586
+ additional_notes += '(amide_bonds_fixed=True)'
587
+ substructs_copy = substructs.copy()
588
+ substructs = adjust_ester_bonds_in_substructs(substructs, protac_smiles)
589
+ if substructs['linker'] != substructs_copy['linker']:
590
+ additional_notes += '(ester_bonds_fixed=True)'
591
+
592
+ mapped_row = get_split_row(row, substructs)
593
+ mapped_row['Notes'] = notes + additional_notes
594
+ return mapped_row
595
+
596
+
597
+ # Get all substructure matches in the Linker with direction dictionary
598
+ # NOTE: This code is repeated here for performance reasons, to avoid
599
+ # calculating the matches if not needed.
600
+ if linkers is None and max_iter_on_linkers:
601
+ matches = dictionaries['Linker with direction']['Molecule'].apply(lambda x: get_substr_match(protac_mol, x, num_allowed_fragments=2))
602
+ linkers = dictionaries['Linker with direction'][matches]
603
+ # Sort all the matches by the number of atoms in the linker, the biggest first
604
+ linkers = linkers.sort_values(by='Molecule', key=lambda s: s.apply(lambda m: m.GetNumAtoms()), ascending=False)
605
+
606
+ # for j, (_, linker) in enumerate(linkers.iterrows()):
607
+ # additional_notes = '(matching_poi=False)(matching_e3=False)(matching_linker=True)'
608
+ # if j >= max_iter_on_linkers or max_iter_on_linkers == 0:
609
+ # return None
610
+
611
+ # NOTE: A negative max_iter_on_linkers means "iterate over all matched linkers".
+ num_linkers = 0 if linkers is None else (len(linkers) if max_iter_on_linkers < 0 else min(max_iter_on_linkers, len(linkers)))
+ for j in range(num_linkers):
612
+ additional_notes = '(matching_poi=False)(matching_e3=False)(matching_linker=True)'
613
+ linker = linkers.iloc[j, :]
614
+ substructs = get_substructs_from_mapped_linker(protac_smiles, linker['SMILES'])
615
+
616
+ if substructs is not None:
617
+ if not check_linker_similarity(substructs['linker'], substructs['poi'], substructs['e3'], morgan_fp_generator=morgan_fp_generator, e3s_similarity_threshold=similarity_threshold, pois_similarity_threshold=similarity_threshold):
618
+ best_substructs_candidate = substructs
619
+ continue
620
+
621
+ size_check = check_substructs_size(protac_mol, substructs, size_perc_threshold=0.7)
622
+ if not size_check and not check_substructs_similarity(protac_fp, substructs, similarity_threshold=similarity_threshold, morgan_fp_generator=morgan_fp_generator):
623
+ best_substructs_candidate = substructs
624
+ # display_mol(protac_mol)
625
+ continue
626
+
627
+ # Fix the bonds close to amide and ester groups, if necessary
628
+ substructs_copy = substructs.copy()
629
+ substructs = adjust_amide_bonds_in_substructs(substructs, protac_smiles)
630
+ if substructs['linker'] != substructs_copy['linker']:
631
+ additional_notes += '(amide_bonds_fixed=True)'
632
+ substructs_copy = substructs.copy()
633
+ substructs = adjust_ester_bonds_in_substructs(substructs, protac_smiles)
634
+ if substructs['linker'] != substructs_copy['linker']:
635
+ additional_notes += '(ester_bonds_fixed=True)'
636
+
637
+ if not check_substructs_size(protac_mol, substructs, size_perc_threshold=0.95):
638
+ best_substructs_candidate = substructs
639
+ continue
640
+
641
+ mapped_row = get_split_row(row, substructs)
642
+ mapped_row['Notes'] = notes + additional_notes
643
+ return mapped_row
644
+
645
+ # If we are here, it means that the substructures found in the above loops
646
+ # failed the similarity checks. We add a note and return the best
647
+ # substructure candidate found.
648
+ if best_substructs_candidate is not None:
649
+ substructs = adjust_amide_bonds_in_substructs(best_substructs_candidate, protac_smiles)
651
+ if substructs['linker'] != best_substructs_candidate['linker']:
652
+ notes += '(amide_bonds_fixed=True)'
653
+ substructs_copy = substructs.copy()
654
+ substructs = adjust_ester_bonds_in_substructs(substructs, protac_smiles)
655
+ if substructs['linker'] != substructs_copy['linker']:
656
+ notes += '(ester_bonds_fixed=True)'
657
+ mapped_row = get_split_row(row, substructs)
658
+ mapped_row['Notes'] = notes + '(similarity_checks_failed=True)'
659
+ return mapped_row
660
+
661
+ return None
662
+
663
+
664
+ def split_protacs(
665
+ protac_df: pd.DataFrame,
666
+ dictionaries: Dict[str, pd.DataFrame],
667
+ max_iter_on_linkers: int = 0,
668
+ split_with_substr_and_linker_matching: bool = False,
669
+ biggest_matches_first: bool = True,
670
+ update_dict_if_ids_not_found: bool = False,
671
+ use_multiprocessing: bool = False,
672
+ ) -> pd.DataFrame:
673
+ """ Maps PROTACs to their substructures.
674
+
675
+ Args:
676
+ protac_df: The input PROTAC dataframe.
677
+ dictionaries: The input dictionaries.
678
+ max_iter_on_linkers: The maximum number of matching linkers to iterate over. If zero, there will be no attempt to match linkers in the dictionary. If negative, iterate over all matched linkers. Default is 0.
+ split_with_substr_and_linker_matching: Whether to also try recovering a missing POI/E3 by matching a known substructure together with a dictionary linker. Default is False.
679
+ biggest_matches_first: Whether to sort the matches by the number of atoms in the molecule. Default is True.
680
+ update_dict_if_ids_not_found: DEPRECATED. Whether to update the dictionary if the substructure IDs are not found. Default is False.
681
+ use_multiprocessing: Whether to use multiprocessing. Default is False.
682
+
683
+ Returns:
684
+ The mapped PROTAC dataframe.
685
+ """
686
+ # if use_multiprocessing:
687
+ # global split_single_protac
688
+
689
+ # with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
690
+ # results = pool.map(partial(split_single_protac, dictionaries=dictionaries, biggest_matches_first=biggest_matches_first, max_iter_on_linkers=max_iter_on_linkers), protac_df.copy().to_dict(orient='records'))
691
+
692
+ # mapped_protacs = pd.DataFrame(results)
693
+ # mapped_protacs = mapped_protacs.dropna(subset=['POI Ligand SMILES with direction', 'E3 Binder SMILES with direction', 'Linker SMILES with direction'])
694
+ # return mapped_protacs
695
+
696
+ if use_multiprocessing:
697
+ # TODO: The following does run in parallel, but it gives wrong results. I don't know why. I will have to investigate further.
698
+ results = Parallel(n_jobs=-1)(delayed(split_single_protac)(row, dictionaries=dictionaries, biggest_matches_first=biggest_matches_first, max_iter_on_linkers=max_iter_on_linkers) for _, row in protac_df.iterrows())
699
+ mapped_protacs = pd.DataFrame([r for r in results if r is not None])
700
+ return mapped_protacs
701
+
702
+ mapped_protacs = []
703
+ for i, row in (pbar := tqdm(protac_df.iterrows(), total=len(protac_df))):
704
+ pbar.set_description(f'PROTAC n.{i:4d}')
705
+
706
+ r = split_single_protac(
707
+ row,
708
+ dictionaries,
709
+ biggest_matches_first=biggest_matches_first,
710
+ max_iter_on_linkers=max_iter_on_linkers,
711
+ split_with_substr_and_linker_matching=split_with_substr_and_linker_matching,
712
+ )
713
+ if r is not None:
714
+ mapped_protacs.append(r)
715
+ tmp = pd.DataFrame(mapped_protacs)
716
+ pbar.set_postfix({'len_mapped': len(tmp), 'perc_mapped': f'{len(tmp) / len(protac_df):.1%}'})
717
+
718
+ mapped_protacs = pd.DataFrame(mapped_protacs)
719
+ return mapped_protacs
720
+
721
+
722
+ def parse_notes(notes: str) -> Dict[str, Any]:
+ """ Parse a Notes string of the form '(key1=value1)(key2=value2)...' into a dictionary. """
723
+ # Define the regex pattern to match key-value pairs within parentheses
724
+ pattern = r'\(([^=]+)=([^\)]+)\)'
725
+
726
+ # Find all matches in the string
727
+ matches = re.findall(pattern, notes)
728
+
729
+ # Initialize an empty dictionary to store the parsed key-value pairs
730
+ parsed_dict = {}
731
+
732
+ # Iterate over the matches and add them to the dictionary
733
+ for key, value in matches:
734
+ # Convert the value to the appropriate type (int, bool, None, or str)
735
+ if value.isdigit():
736
+ parsed_dict[key] = int(value)
737
+ elif value.lower() == 'true':
738
+ parsed_dict[key] = True
739
+ elif value.lower() == 'false':
740
+ parsed_dict[key] = False
741
+ elif value.lower() == 'none':
742
+ parsed_dict[key] = None
743
+ else:
744
+ parsed_dict[key] = value
745
+
746
+ return parsed_dict
747
+
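A quick round-trip of the Notes format (hypothetical values):

```python
notes = '(max_iter_on_linkers=0)(matching_poi=True)(morgan_radius=None)(step=3)'
print(parse_notes(notes))
# {'max_iter_on_linkers': 0, 'matching_poi': True, 'morgan_radius': None, 'step': 3}
```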
748
+
749
+ def iterative_protac_splitting(
750
+ dictionaries: Dict[str, pd.DataFrame],
751
+ data_dir: str,
752
+ ) -> Dict[str, pd.DataFrame]:
753
+ """ Map PROTACs to their substructures in an iterative way.
754
+
755
+ Args:
756
+ dictionaries: The input dictionaries. The same format as the output of the `update_dictionary` function.
757
+ data_dir: The directory where the output data is stored.
758
+
759
+ Returns:
760
+ The updated dictionaries, extended with the substructures of the mapped PROTACs. The mapped PROTACs themselves are saved to 'mapped_protacs.csv' in data_dir.
761
+ """
762
+
763
+ final_df = None
764
+ non_mapped_protacs = dictionaries['PROTAC'].copy()
765
+
766
+ start_from_beginning = True # Re-map all PROTACs ignoring loading previous results
767
+ step = -1
768
+ max_iter_on_linkers = 0
769
+ split_with_substr_and_linker_matching = False
770
+
771
+ while True:
772
+ if max_iter_on_linkers == -1 or non_mapped_protacs.empty or step >= 50:
773
+ break
774
+
775
+ if max_iter_on_linkers == 5:
776
+ max_iter_on_linkers = -1 # Iterate over all linkers
777
+
778
+ step += 1
779
+ print('-' * 100)
780
+ print(f'Step n.{step}')
781
+ print(f'Max iterations on linkers: {max_iter_on_linkers}')
782
+ print(f'Map with substr and linker matching: {split_with_substr_and_linker_matching}')
783
+ print('-' * 50)
784
+
785
+ step_filename = os.path.join(data_dir, f'mapped_protacs_{step=}.csv')
786
+ final_filename = os.path.join(data_dir, 'mapped_protacs.csv')
787
+ non_mapped_filename = os.path.join(data_dir, 'non_mapped_protacs.csv')
788
+
789
+ if os.path.exists(step_filename) and not start_from_beginning:
790
+ # Check if all lines of the file are empty
791
+ with open(step_filename, 'r') as f:
792
+ lines = f.readlines()
793
+ if all([len(line.strip()) == 0 for line in lines]):
794
+ mapped_protacs = pd.DataFrame()
795
+ else:
796
+ mapped_protacs = pd.read_csv(step_filename)
797
+ else:
798
+ mapped_protacs = split_protacs(
799
+ non_mapped_protacs,
800
+ dictionaries=dictionaries,
801
+ split_with_substr_and_linker_matching=split_with_substr_and_linker_matching,
802
+ max_iter_on_linkers=max_iter_on_linkers,
803
+ biggest_matches_first=False,
804
+ use_multiprocessing=False,
805
+ )
806
+ # Add a string at the end of the strings in the 'Notes' column
807
+ if not mapped_protacs.empty:
808
+ mapped_protacs['Notes'] = mapped_protacs['Notes'].apply(lambda x: f'{x}({step=})')
809
+ mapped_protacs.to_csv(step_filename, index=False)
810
+
811
+ # Update the final dataframe and save it to file
812
+ if final_df is None:
813
+ final_df = mapped_protacs
814
+ else:
815
+ final_df = pd.concat([final_df, mapped_protacs], axis=0).drop_duplicates(subset=['PROTAC SMILES'])
816
+ final_df.to_csv(final_filename, index=False)
817
+ print(f'All mapped PROTACs saved to: {final_filename}')
818
+
819
+ # Reporting information
820
+ mapped_perc = len(mapped_protacs) / len(non_mapped_protacs)
821
+ total_mapped_perc = len(final_df) / len(dictionaries['PROTAC'])
822
+ print(f'Number of mapped PROTACs: {len(mapped_protacs)} ({mapped_perc:.2%})')
823
+ print(f'Total num. of mapped PROTACs: {len(final_df)} ({total_mapped_perc:.2%})')
824
+ print('-' * 50)
825
+ print(final_df['Notes'].value_counts())
826
+ print('-' * 50)
827
+
828
+ # Get the non-mapped PROTACs yet and save them to file
829
+ non_mapped_protacs = dictionaries['PROTAC'][~dictionaries['PROTAC']['SMILES'].isin(final_df['PROTAC SMILES'])].copy()
830
+ non_mapped_protacs[['SMILES', 'ID']].to_csv(non_mapped_filename, index=False)
831
+ print(f'Non-mapped PROTACs saved to: {non_mapped_filename}')
832
+
833
+ # Control logic for breaking the loop
834
+ if mapped_protacs.empty:
835
+ if max_iter_on_linkers == 0 and not split_with_substr_and_linker_matching:
836
+ split_with_substr_and_linker_matching = True
837
+ continue
838
+ else:
839
+ max_iter_on_linkers += 1
840
+ continue
841
+ else:
842
+ # Using only the linker to map the PROTACs can be unreliable, so if we
843
+ # found new PROTACs, we should set max_iter_on_linkers back to zero and try
844
+ # to map the PROTACs again with the newly found substructures.
845
+ max_iter_on_linkers = 0
846
+ split_with_substr_and_linker_matching = False
847
+
848
+ # Update all dictionaries with the substructures of the mapped PROTACs
849
+ smiles_list = mapped_protacs['Linker SMILES with direction'].unique()
850
+ smiles_list = [canonize(smiles) for smiles in smiles_list]
851
+ dictionaries['Linker with direction'] = update_dictionary(dictionaries['Linker with direction'], smiles_list)
852
+
853
+ # Avoid adding POIs that are in the E3 dictionary!
854
+ smiles_list = mapped_protacs['POI Ligand SMILES'].unique()
855
+ smiles_list = [canonize(smiles) for smiles in smiles_list]
856
+ smiles_list = [s for s in smiles_list if s not in dictionaries['E3 Binder']['SMILES'].values]
857
+
858
+ smiles_list = [remove_dummy_atoms(s) for s in smiles_list if s is not None]
859
+
860
+ # Use Tanimoto similarity to prevent adding POIs too similar to E3s
861
+ similarity_threshold = 0.5
862
+ radius = 2
863
+ nbits = 2048
864
+ morgan_fp_generator = Chem.rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=nbits, useBondTypes=True, includeChirality=True)
865
+
866
+ pois_to_add = []
867
+ for poi_smiles in smiles_list:
868
+ poi_mol = Chem.MolFromSmiles(poi_smiles)
869
+ poi_fp = morgan_fp_generator.GetFingerprint(poi_mol)
870
+ similarities = DataStructs.BulkTanimotoSimilarity(poi_fp, dictionaries['E3 Binder']['FP'].to_list())
871
+ skip_poi = False
872
+ for sim in similarities:
873
+ if sim >= similarity_threshold:
874
+ skip_poi = True
875
+ break
876
+ if not skip_poi:
877
+ pois_to_add.append(poi_smiles)
878
+
879
+ dictionaries['POI Ligand'] = update_dictionary(dictionaries['POI Ligand'], pois_to_add)
880
+
881
+ # Avoid adding E3s that are in the POI dictionary!
882
+ smiles_list = mapped_protacs['E3 Binder SMILES'].unique()
883
+ smiles_list = [canonize(smiles) for smiles in smiles_list]
884
+ smiles_list = [s for s in smiles_list if s not in dictionaries['POI Ligand']['SMILES'].values]
885
+ smiles_list = [remove_dummy_atoms(s) for s in smiles_list if s is not None]
886
+ dictionaries['E3 Binder'] = update_dictionary(dictionaries['E3 Binder'], smiles_list)
887
+
888
+ # Save all dictionaries to file
889
+ for key, dictionary in dictionaries.items():
890
+ filename = os.path.join(data_dir, f'dictionary_{key.lower().replace(" ", "_")}.csv')
891
+ dictionary[['ID', 'SMILES']].to_csv(filename, index=False)
892
+ print(f'Dictionary saved to: {filename}')
893
+
894
+ return dictionaries
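The Tanimoto screen used in the loop above, shown in isolation (toy molecules; the 0.5 threshold mirrors the hard-coded value):

```python
from rdkit import Chem, DataStructs
from rdkit.Chem import rdFingerprintGenerator

gen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
probe = gen.GetFingerprint(Chem.MolFromSmiles('Oc1ccccc1'))
refs = [gen.GetFingerprint(Chem.MolFromSmiles(s)) for s in ('Nc1ccccc1', 'CCCC')]
sims = DataStructs.BulkTanimotoSimilarity(probe, refs)
keep = all(sim < 0.5 for sim in sims)  # keep the probe only if it is unlike every reference
print(sims, keep)
```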
protac_splitter/data/curation/mapping_utils.py ADDED
@@ -0,0 +1,77 @@
1
+ from rdkit import Chem
2
+ import pandas as pd
3
+
4
+ from protac_splitter.chemoinformatics import (
5
+ canonize_smiles,
6
+ remove_stereo,
7
+ get_mol_id,
8
+ )
9
+
10
+ def update_dictionary(
11
+ dictionary: pd.DataFrame,
12
+ substr_to_add: list,
13
+ morgan_fp_generator = None,
14
+ verbose: int = 0,
15
+ ) -> pd.DataFrame:
16
+ """ Updates a dictionary with a list of additional substructures.
17
+
18
+ The dictionary is a dataframe with columns 'SMILES', 'Molecule', 'ID', and 'FP'.
19
+
20
+ Args:
21
+ dictionary: The input dictionary dataframe.
22
+ substr_to_add: The list of additional substructures.
23
+
24
+ Returns:
25
+ The updated dictionary dataframe.
26
+ """
27
+ # Canonize the SMILES strings
28
+ substr_to_add = [canonize_smiles(smiles) for smiles in substr_to_add if smiles is not None]
29
+ substr_to_add = list(set(substr_to_add))
30
+
31
+ # Remove entries already in the dictionary
+ # NOTE: Filter with a list comprehension instead of calling remove() while
+ # iterating over the list, which would silently skip elements.
+ existing_smiles = set() if dictionary.empty else set(dictionary['SMILES'].unique().tolist())
+ if verbose > 1:
+ for smiles in substr_to_add:
+ if smiles in existing_smiles:
+ print(f'\tWARNING. SMILES already in the dictionary: {smiles}')
+ substr_to_add = [smiles for smiles in substr_to_add if smiles not in existing_smiles]
38
+
39
+ new_entries = []
40
+ for smiles in substr_to_add:
41
+ try:
42
+ mol = Chem.MolFromSmiles(smiles)
43
+ except Exception as e:
44
+ if verbose:
45
+ print(e)
46
+ mol = None
47
+ # Remove entries that result in invalid molecules
48
+ if mol is None:
49
+ continue
50
+ new_entries.append({
51
+ 'SMILES': smiles,
52
+ 'Molecule': mol,
53
+ 'ID': get_mol_id(smiles),
54
+ })
55
+ # Try adding its no-stereochemistry version as well
56
+ smiles_nostereo = remove_stereo(smiles)
57
+ if smiles_nostereo is not None and smiles_nostereo != smiles:
58
+ mol_nostereo = Chem.MolFromSmiles(smiles_nostereo)
59
+ if mol_nostereo is not None:
60
+ new_entries.append({
61
+ 'SMILES': canonize_smiles(smiles_nostereo),
62
+ 'Molecule': mol_nostereo,
63
+ 'ID': get_mol_id(smiles_nostereo),
64
+ })
65
+ new_entries = pd.DataFrame(new_entries).drop_duplicates()
66
+
67
+ if len(new_entries) > 0:
68
+ # Add fingerprints to the new entries
69
+ if morgan_fp_generator is None:
70
+ morgan_fp_generator = Chem.rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048, useBondTypes=True, includeChirality=True)
71
+
72
+ new_entries['FP'] = new_entries['Molecule'].apply(lambda x: morgan_fp_generator.GetFingerprint(x) if x is not None else None)
73
+ if verbose:
74
+ print(f'Number of substructures added to the dictionary: {len(new_entries)}')
75
+
76
+ # Return the updated dictionary
77
+ return pd.concat([dictionary, new_entries], axis=0).drop_duplicates(subset='SMILES').reset_index(drop=True)
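A minimal usage sketch (the column layout follows the docstring above):

```python
import pandas as pd

# Start from an empty dictionary dataframe with the expected columns.
e3_dict = pd.DataFrame(columns=['SMILES', 'Molecule', 'ID', 'FP'])
e3_dict = update_dictionary(e3_dict, ['CC(C)(C)c1ccc(O)cc1'], verbose=1)
print(e3_dict[['ID', 'SMILES']])
```

When an input SMILES carries stereochemistry, its stereo-stripped variant is inserted as an additional row.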
protac_splitter/data/curation/substructure_extraction.py ADDED
@@ -0,0 +1,586 @@
1
+ import re
2
+ from typing import Any, Dict, List, Optional, Union
3
+ from collections import Counter
4
+
5
+ from rdkit import Chem
6
+ from rdkit.Chem import Draw
7
+
8
+ from protac_splitter.chemoinformatics import (
9
+ dummy2query,
10
+ remove_dummy_atoms,
11
+ canonize,
12
+ canonize_smiles,
13
+ GetSubstructMatchesWithTimeout,
14
+ )
15
+ from protac_splitter.display_utils import (
16
+ safe_display,
17
+ display_mol,
18
+ )
19
+ from protac_splitter.evaluation import check_reassembly
20
+
21
+
22
+ def get_substructs_from_mapped_linker(
23
+ protac_smiles: str,
24
+ linker_smiles: str,
25
+ e3_attachment_id: int = 2,
26
+ poi_attachment_id: int = 1,
27
+ verbose: int = 0,
28
+ ) -> Dict[str, str]:
29
+ """ Get the substructures of a PROTAC molecule from a mapped linker SMILES.
30
+
31
+ This function will return the substructures given a linker with
32
+ directionality, _i.e._, with the two attachment points mapped.
33
+
34
+ Args:
35
+ protac_smiles: The SMILES of the PROTAC molecule.
36
+ linker_smiles: The SMILES of the linker molecule. Must have atom-mapped attachment points, e.g., [*:1] and [*:2].
37
+ verbose: Verbosity level.
38
+
39
+ Returns:
40
+ A dictionary with the substructure names as keys ('e3', 'linker', and 'poi') and their SMILES as values. None if the matching fails.
41
+ """
42
+ protac_smiles = canonize_smiles(protac_smiles)
43
+ linker_smiles = canonize_smiles(linker_smiles)
44
+
45
+ protac_mol = Chem.MolFromSmiles(protac_smiles)
46
+ linker_mol = Chem.MolFromSmiles(linker_smiles)
47
+
48
+ # Check if the linker is a substructure of the PROTAC
49
+ if not protac_mol.HasSubstructMatch(dummy2query(linker_mol), useChirality=True):
50
+ return None
51
+
52
+ # Split the big molecule into the two fragments
53
+ frags = Chem.ReplaceCore(protac_mol, dummy2query(linker_mol), labelByIndex=True, replaceDummies=False)
54
+ if frags is None:
55
+ return None
56
+ try:
57
+ frags = Chem.GetMolFrags(frags, asMols=True, sanitizeFrags=True)
58
+ except Exception as e:
59
+ # print(e)
60
+ return None
61
+
62
+ if verbose:
63
+ safe_display(protac_mol)
64
+ safe_display(linker_mol)
65
+
66
+ # The linker has a map number at its attachment points: the following is a
67
+ # dictionary that maps the atom index of the attachment points to their
68
+ # respective map numbers, i.e., the attachment IDs.
69
+ linker_idx2map = {}
70
+ for atom in linker_mol.GetAtoms():
71
+ if atom.GetAtomicNum() == 0:
72
+ linker_idx2map[atom.GetIdx()] = atom.GetAtomMapNum()
73
+ if verbose:
74
+ print(f'linker indexes: {linker_idx2map}')
75
+ print('-' * 80)
76
+
77
+ substructs = {'linker': linker_smiles}
78
+
79
+ # After splitting the PROTAC with ReplaceCore, the fragments will have as
80
+ # attachment points the same atom indexes as the linker. We can then use the
81
+ # map numbers from the linker to identify the attachment points in the
82
+ # PROTAC fragments and assign the correct map number to them, i.e., the
83
+ # attachment ID.
84
+ for i, side_mol in enumerate(frags):
85
+
86
+ side_smiles = Chem.MolToSmiles(side_mol, canonical=True)
87
+
88
+ # Use a regex to get the number in the pattern, e.g., [9*], in the SMILES
89
+ attachment_point = re.findall(r'\[(\d+)\*\]', side_smiles)
90
+ if attachment_point:
91
+ attachment_point = int(attachment_point[0])
92
+ else:
93
+ attachment_point = None
94
+
95
+ if verbose:
96
+ print(f'Side {i + 1} SMILES: {side_smiles}')
97
+ print(f'Attachment point: {attachment_point}')
98
+ safe_display(side_mol)
99
+
100
+ # Get the map from the linker
101
+ linker_attachment_point = linker_idx2map.get(attachment_point, None)
102
+
103
+ # Modify the SMILES to include the map number
104
+ if linker_attachment_point is not None:
105
+ side_smiles = re.sub(r'\[(\d+)\*\]', f'[*:{linker_attachment_point}]', side_smiles)
106
+ if f'[*:{e3_attachment_id}]' in side_smiles:
107
+ substructs['e3'] = canonize_smiles(side_smiles)
108
+ elif f'[*:{poi_attachment_id}]' in side_smiles:
109
+ substructs['poi'] = canonize_smiles(side_smiles)
110
+
111
+ if verbose:
112
+ print(f'Modified SMILES: {side_smiles}')
113
+ safe_display(Chem.MolFromSmiles(side_smiles))
114
+
115
+ # Canonize the substructures SMILES
116
+ substructs = {k: canonize_smiles(v) for k, v in substructs.items()}
117
+
118
+ # Check that the reassembled PROTAC matches the original PROTAC
119
+ if not check_reassembly(protac_smiles, '.'.join(substructs.values())):
120
+ return None
121
+
122
+ return substructs
123
+
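The `[n*]` to `[*:id]` relabeling above is plain string surgery on the fragment SMILES; a small sketch with a made-up fragment:

```python
import re

side = '[9*]c1ccc(CN)cc1'                      # fragment SMILES as emitted by ReplaceCore
side = re.sub(r'\[(\d+)\*\]', '[*:2]', side)   # tag it with the E3 attachment ID
print(side)                                    # [*:2]c1ccc(CN)cc1
```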
124
+
125
+ def get_attachment_bonds(mol: Chem.Mol, match_atoms: List[int]) -> List[int]:
126
+ """ Get the bonds to break to separate the substructure from the PROTAC or R-groups molecule.
127
+
128
+ Args:
129
+ mol: The molecule to break, i.e., the PROTAC.
130
+ match_atoms: The atoms matched in the PROTAC molecule, from the GetSubstructMatch function.
131
+
132
+ Returns:
133
+ List[int]: The bond indices to break.
134
+ """
135
+ bonds_to_break = []
136
+ for idx in match_atoms:
137
+ atom = mol.GetAtomWithIdx(idx)
138
+ # Skip non-heavy atoms
139
+ if atom.GetAtomicNum() == 1:
140
+ continue
141
+ for bond in atom.GetBonds():
142
+ neighbor_idx = bond.GetOtherAtomIdx(idx)
143
+ # Skip if the neighbor atom is a hydrogen
144
+ if mol.GetAtomWithIdx(neighbor_idx).GetAtomicNum() == 1:
145
+ continue
146
+ if neighbor_idx not in match_atoms:
147
+ bonds_to_break.append(bond.GetIdx())
148
+ # If more than one bond is found, e.g., if the substructure is
149
+ # connected to the PROTAC/R-groups in multiple places like in a
150
+ # ring, reset list of bonds and go to the next atom.
151
+ if len(bonds_to_break) > 1:
152
+ bonds_to_break = []
153
+ break
154
+ return bonds_to_break
155
+
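A toy check of the boundary-bond logic above (ethylbenzene, with the phenyl ring as the matched "substructure"):

```python
from rdkit import Chem

mol = Chem.MolFromSmiles('CCc1ccccc1')
ring = Chem.MolFromSmiles('c1ccccc1')
match = mol.GetSubstructMatch(ring)   # indices of the six ring atoms
print(get_attachment_bonds(mol, match))
# One bond crosses the match boundary, so a single bond index is returned.
```

If the match touches the rest of the molecule through more than one bond, the list is reset, which the callers above treat as a failed split.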
156
+
157
+ def get_substructs_from_unmapped_e3_poi(
158
+ protac_smiles: str,
159
+ mol_protac: Chem.Mol,
160
+ mol_poi: Chem.Mol,
161
+ mol_e3: Chem.Mol,
162
+ poi_attachment_id: int = 1,
163
+ e3_attachment_id: int = 2,
164
+ verbose: int = 0,
165
+ stats: Counter = None,
166
+ ) -> Optional[Dict[str, str]]:
167
+ """ Get the matches of the POI, E3, and linker in the PROTAC molecule.
168
+
169
+ This function will return the substructures given a PROTAC and its unmapped
170
+ POI and E3 ligand substructures, _i.e._, they do not need to have the
171
+ attachment points in their SMILES strings.
172
+
173
+ Args:
174
+ mol_protac: The PROTAC molecule.
175
+ mol_poi: The POI ligand molecule. Must NOT contain the attachment point.
176
+ mol_e3: The E3 binder molecule. Must NOT contain the attachment point.
177
+ verbose: The verbosity level.
178
+
179
+ Returns:
180
+ Dict: The matches of the POI, E3, and linker in the PROTAC molecule. None if no match is found.
181
+ """
182
+ if verbose:
183
+ safe_display(mol_protac)
184
+
185
+ poi_match = mol_protac.GetSubstructMatch(mol_poi, useChirality=True)
186
+
187
+ # Get bonds to break to separate the POI ligand
188
+ bonds_to_break_poi = get_attachment_bonds(mol_protac, poi_match)
189
+
190
+ # Return if no bonds are found
191
+ if len(bonds_to_break_poi) != 1:
192
+ if stats is not None:
193
+ stats['multiple POI attachment bonds'] += 1
194
+ if verbose:
195
+ print('ERROR: Multiple POI attachment bonds')
196
+ return None
197
+
198
+ # Break the bonds to isolate the POI ligand
199
+ frag_mol_poi = Chem.FragmentOnBonds(mol_protac, bonds_to_break_poi, addDummies=True, dummyLabels=[(poi_attachment_id, poi_attachment_id)])
200
+
201
+ # Get the fragments resulting from bond breaking
202
+ try:
203
+ frags = Chem.GetMolFrags(frag_mol_poi, asMols=True, sanitizeFrags=True)
204
+ except Exception as e:
205
+ print(e)
206
+ return None
207
+
208
+ # Identify the POI ligand fragment
209
+ poi_fragment = None
210
+ for frag in frags:
211
+ if frag.HasSubstructMatch(mol_poi):
212
+ poi_fragment = frag
213
+ break
214
+ if poi_fragment is None:
215
+ if stats is not None:
216
+ stats['POI fragment not found'] += 1
217
+ if verbose:
218
+ print('ERROR: POI fragment not found')
219
+ return None
220
+
221
+ # Combine the remaining fragments to get the R-groups
222
+ # NOTE: There must be exactly one remaining fragment; this is verified below.
223
+ r_group_mol = [frag for frag in frags if frag != poi_fragment]
224
+ if len(r_group_mol) != 1:
225
+ if stats is not None:
226
+ stats['multiple POI fragments'] += 1
227
+ if verbose:
228
+ for frag in frags:
229
+ safe_display(frag)
230
+ print('ERROR: Multiple POI fragments')
231
+ return None
232
+ r_group_mol = r_group_mol[0]
233
+
234
+ if verbose:
235
+ print('POI:', Chem.MolToSmiles(poi_fragment, canonical=True))
236
+ safe_display(poi_fragment)
237
+
238
+ e3_match = r_group_mol.GetSubstructMatch(mol_e3, useChirality=True)
239
+
240
+ # Get bonds to break to isolate the E3 binder
241
+ bonds_to_break_e3 = get_attachment_bonds(r_group_mol, e3_match)
242
+
243
+ # Return if no bonds are found
244
+ if len(bonds_to_break_e3) != 1:
245
+ if stats is not None:
246
+ stats['multiple E3 attachment bonds'] += 1
247
+ if verbose:
248
+ safe_display(r_group_mol)
249
+ print('ERROR: Multiple E3 attachment bonds')
250
+ return None
251
+
252
+ # Break the bonds to isolate the E3 binder
253
+ frag_mol_e3 = Chem.FragmentOnBonds(r_group_mol, bonds_to_break_e3, addDummies=True, dummyLabels=[(e3_attachment_id, e3_attachment_id)])
254
+
255
+ # Get fragments after breaking bonds in R-groups
256
+ try:
257
+ frags = Chem.GetMolFrags(frag_mol_e3, asMols=True, sanitizeFrags=True)
258
+ except Exception as e:
259
+ print(e)
260
+ return None
261
+
262
+ # Identify the E3 binder fragment
263
+ e3_fragment = None
264
+ for frag in frags:
265
+ if frag.HasSubstructMatch(mol_e3):
266
+ e3_fragment = frag
267
+ break
268
+ if e3_fragment is None:
269
+ if stats is not None:
270
+ stats['E3 fragment not found'] += 1
271
+ if verbose:
272
+ print('ERROR: E3 fragment not found')
273
+ return None
274
+
275
+ if verbose:
276
+ print('E3:', Chem.MolToSmiles(e3_fragment, canonical=True))
277
+ safe_display(e3_fragment)
278
+
279
+ # The remaining fragment is the linker
280
+ # NOTE: There must be exactly one remaining fragment, i.e., the linker; this is verified below.
281
+ linker_mol = [frag for frag in frags if frag != e3_fragment]
282
+ if len(linker_mol) != 1:
283
+ if stats is not None:
284
+ stats['multiple E3 fragments'] += 1
285
+ if verbose:
286
+ for frag in frags:
287
+ safe_display(frag)
288
+ print('ERROR: Multiple E3 fragments')
289
+ return None
290
+ linker_mol = linker_mol[0]
291
+
292
+ poi_smiles = Chem.MolToSmiles(poi_fragment, canonical=True).replace(f'[{poi_attachment_id}*]', f'[*:{poi_attachment_id}]')
293
+ e3_smiles = Chem.MolToSmiles(e3_fragment, canonical=True).replace(f'[{e3_attachment_id}*]', f'[*:{e3_attachment_id}]')
294
+ linker_smiles = Chem.MolToSmiles(linker_mol, canonical=True).replace(f'[{poi_attachment_id}*]', f'[*:{poi_attachment_id}]').replace(f'[{e3_attachment_id}*]', f'[*:{e3_attachment_id}]')
295
+
296
+ # Get the substructure names and canonize their SMILES
297
+ substructs = {'poi': poi_smiles, 'e3': e3_smiles, 'linker': linker_smiles}
298
+ substructs = {k: canonize_smiles(v) for k, v in substructs.items()}
299
+
300
+ if verbose:
301
+ print('Linker:', Chem.MolToSmiles(linker_mol, canonical=True))
302
+ safe_display(linker_mol)
303
+
304
+ # Check that the reassembled PROTAC matches the original PROTAC
305
+ if check_reassembly(protac_smiles, '.'.join(substructs.values())):
306
+ return substructs
307
+
308
+ if stats is not None:
309
+ stats['reassembling failed'] += 1
310
+ if verbose:
311
+ print('ERROR: Reassembling failed')
312
+ return None
313
+
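The splitting primitive used above is `Chem.FragmentOnBonds` with labelled dummy atoms; a self-contained sketch on a toy ligand-linker bond:

```python
from rdkit import Chem

mol = Chem.MolFromSmiles('c1ccccc1CCN')   # pretend the phenyl ring is a ligand
bond = mol.GetBondBetweenAtoms(5, 6)      # the ring-to-chain bond
frags = Chem.FragmentOnBonds(mol, [bond.GetIdx()], addDummies=True, dummyLabels=[(1, 1)])
print(Chem.MolToSmiles(frags))            # e.g. '[1*]CCN.[1*]c1ccccc1'
```

`Chem.GetMolFrags` then separates the dot-disconnected pieces into individual molecules, exactly as done for the POI and E3 sides above.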
314
+
315
+ def get_substructure_from_non_perfect_match(
316
+ protac_mol: Chem.Mol,
317
+ substruct_mol: Chem.Mol,
318
+ attachment_id: int,
319
+ verbose: int = 0,
320
+ ) -> Chem.Mol:
321
+ """ Extract the correct substructure from a PROTAC molecule, given the
322
+ molecule of a wrong substructure resulting in many fragments and matches.
323
+
324
+ Sometimes the substructure we have is not a _perfect_ substructure of the
325
+ PROTAC, _i.e._, it will generate more than two fragments when trying to
326
+ replace the PROTAC core with it. In this case, this function will perform
327
+ the following steps:
328
+
329
+ 1. Get the largest fragment by trying to replace the PROTAC core with the
330
+ substructure. This largest fragment will be the other substructure plus
331
+ the linker.
332
+ 2. We can now remove the largest fragment from the PROTAC to get the
333
+ "original" substructure without the smaller dangling fragments.
334
+
335
+ Args:
336
+ protac_mol (Chem.Mol): The PROTAC molecule.
337
+ substruct_mol (Chem.Mol): The molecule of the wrong substructure, either the POI ligand or the E3 binder.
338
+ attachment_id (int): The attachment ID.
339
+
340
+ Returns:
341
+ Chem.Mol: The extracted substructure molecule. If failing, it will return None.
342
+ """
343
+ # Remove the substructure, even if there are "dangling" fragments, to obtain: PROTAC - substruct = (POI + Linker) + remainders
344
+ linker_and_other_mol = Chem.DeleteSubstructs(protac_mol, substruct_mol, useChirality=True)
345
+
346
+ # Get the largest fragment, i.e., the PROTAC - substruct = POI + Linker
347
+ try:
348
+ fragments = Chem.GetMolFrags(linker_and_other_mol, asMols=True)
349
+ except Exception as e:
350
+ if verbose:
351
+ print(e)
352
+ return None
353
+
354
+ if len(fragments) == 1:
355
+ if verbose:
356
+ print("WARNING. There are no small fragments, there's only one fragment.")
357
+
358
+ if not fragments:
359
+ if verbose:
360
+ print('ERROR. No fragments found.')
361
+ return None
362
+ largest_fragment = max(fragments, key=lambda x: x.GetNumAtoms())
363
+
364
+ # Get the match of the largest fragment in the PROTAC molecule
365
+ largest_match = protac_mol.GetSubstructMatch(largest_fragment, useChirality=True)
366
+
367
+ # Get bonds to break to isolate the substructure, i.e., the opposite of the POI + Linker
368
+ bonds_to_break = get_attachment_bonds(protac_mol, largest_match)
369
+
370
+ if len(bonds_to_break) != 1:
371
+ if verbose:
372
+ print(f'ERROR. The bond to break is not a single one: {bonds_to_break}')
373
+ return None
374
+
375
+ # Break the bonds to isolate the substructure
376
+ frag_mol_substruct = Chem.FragmentOnBonds(protac_mol, bonds_to_break, addDummies=True, dummyLabels=[(attachment_id, attachment_id)])
377
+
378
+ # Get fragments after breaking bonds, i.e., the POI + Linker and the substructure without "remainders"
379
+ try:
380
+ frags = Chem.GetMolFrags(frag_mol_substruct, asMols=True, sanitizeFrags=True)
381
+ except Exception as e:
382
+ if verbose:
383
+ print(e)
384
+ return None
385
+
386
+ # Get the smallest between the substructure and the POI+Linker fragments
387
+ substruct_mol = min(frags, key=lambda x: x.GetNumAtoms())
388
+ substruct_smiles = Chem.MolToSmiles(substruct_mol, canonical=True).replace(f'[{attachment_id}*]', f'[*:{attachment_id}]')
389
+ substruct_mol = Chem.MolFromSmiles(canonize(substruct_smiles))
390
+
391
+ # Check that the substructure matches in the PROTAC molecule
392
+ if not protac_mol.HasSubstructMatch(dummy2query(substruct_mol), useChirality=True):
393
+ if verbose:
394
+ print('ERROR. Substructure does not match in PROTAC molecule:')
395
+ print('PROTAC molecule:')
396
+ safe_display(protac_mol)
397
+ print('Substructure molecule:')
398
+ safe_display(substruct_mol)
399
+ return None
400
+
401
+ return substruct_mol
402
+
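`Chem.DeleteSubstructs`, used in the first step, removes every matched atom even when that leaves dangling pieces; in miniature:

```python
from rdkit import Chem

mol = Chem.MolFromSmiles('CCOC(=O)c1ccccc1')   # ethyl benzoate
patt = Chem.MolFromSmiles('c1ccccc1')
print(Chem.MolToSmiles(Chem.DeleteSubstructs(mol, patt)))   # e.g. 'CCOC=O'
```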
403
+
404
+ def get_mapped_substr_from_protac(
405
+ protac: Chem.Mol,
406
+ substr: Chem.Mol,
407
+ attachment_id: int = 1,
408
+ ) -> Optional[Chem.Mol]:
409
+ """ Get the mapped substructure from a PROTAC molecule and an unmapped substructure.
410
+
411
+ Args:
412
+ protac: The PROTAC molecule.
413
+ substr: The unmapped substructure.
414
+ attachment_id: The attachment point ID to be assigned to the substructure.
415
+
416
+ Returns:
417
+ The mapped substructure molecule. None if the function fails to find the substructure.
418
+ """
419
+ num_matches = len(protac.GetSubstructMatches(substr, useChirality=True))
420
+ if num_matches != 1:
421
+ return None
422
+ other_substr = Chem.ReplaceCore(protac, substr, labelByIndex=False, replaceDummies=False)
423
+ if other_substr is None:
424
+ return None
425
+ mapped_substr = Chem.ReplaceCore(protac, remove_dummy_atoms(other_substr), labelByIndex=False, replaceDummies=False)
426
+ if mapped_substr is None:
427
+ return None
428
+ mapped_smiles = Chem.MolToSmiles(mapped_substr, canonical=True)
429
+ # Replace "[1*]" or "[2*]" with the correct attachment point with a regex
430
+ mapped_smiles = re.sub(r'\[(\d+)\*\]', f'[*:{attachment_id}]', mapped_smiles)
431
+ mapped_smiles = canonize(mapped_smiles)
432
+ if mapped_smiles is None:
433
+ return None
434
+ return Chem.MolFromSmiles(mapped_smiles)
435
+
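The double-`ReplaceCore` trick above can be seen on a toy molecule: the first call strips the known core to leave the side chain, and a second call strips that side chain from the parent to leave the core with a labelled attachment point:

```python
from rdkit import Chem

mol = Chem.MolFromSmiles('OCCc1ccccc1')   # stand-in for the PROTAC
core = Chem.MolFromSmiles('c1ccccc1')     # unmapped substructure
side = Chem.ReplaceCore(mol, core, labelByIndex=False, replaceDummies=False)
print(Chem.MolToSmiles(side))             # e.g. '[1*]CCO'
```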
436
+
437
+ def get_substructs_from_substr_and_linker(
438
+ protac_smiles: str,
439
+ protac: Chem.Mol,
440
+ substr: Chem.Mol,
441
+ linker: Chem.Mol,
442
+ attachment_id: int = 1,
443
+ poi_attachment_id: int = 1,
444
+ e3_attachment_id: int = 2,
445
+ verbose: int = 0,
446
+ stats: Counter = None,
447
+ ) -> Optional[Dict[str, str]]:
448
+ """ Get the substructures of a PROTAC molecule from an unmapped substructure and linker.
449
+
450
+ Args:
451
+ protac_smiles: The SMILES of the PROTAC molecule.
452
+ protac: The RDKit molecule object of the PROTAC.
453
+ substr: The RDKit molecule object of the currently matching substructure. Should be UNMAPPED.
454
+ linker: The RDKit molecule object of the linker.
455
+ attachment_id: The attachment point ID of the currently matching substructure.
456
+ verbose: The verbosity level.
457
+
458
+ Returns:
459
+ Dict: The substructures of the PROTAC molecule. None if the function fails to find the substructures.
460
+ """
461
+ if attachment_id not in [poi_attachment_id, e3_attachment_id]:
462
+ raise ValueError('Attachment ID must be either 1 or 2')
463
+
464
+ if substr is None:
465
+ return None
466
+
467
+ subr_matches = list(protac.GetSubstructMatches(substr, useChirality=True))
468
+ if len(subr_matches) != 1:
469
+ if stats is not None:
470
+ stats['multiple substructure matches'] += 1
471
+ if verbose:
472
+ print('ERROR: Multiple substructure matches')
473
+ return None
474
+ subr_match = subr_matches[0]
475
+
476
+ mapped_substr = get_mapped_substr_from_protac(protac, substr, attachment_id)
477
+ if mapped_substr is None:
478
+ if stats is not None:
479
+ stats['mapped substructure not found'] += 1
480
+ if verbose:
481
+ print('ERROR: Mapped substructure not found')
482
+ return None
483
+
484
+ linker_matches = protac.GetSubstructMatches(remove_dummy_atoms(linker), useChirality=True)
+ # Select the linker match whose intersection with the substructure match
+ # is exactly one atom, i.e., the shared attachment-point atom.
+ linker_match = None
+ for match in linker_matches:
+ if len(set(subr_match).intersection(match)) == 1:
+ linker_match = match
+ break
+ if linker_match is None:
+ if verbose:
+ print('ERROR: No linker match sharing a single attachment atom')
+ return None
491
+
492
+ # Based on the linker match found, remove it from the PROTAC
493
+ emol = Chem.EditableMol(protac)
494
+
495
+ # Remove atoms in descending order of their indices
496
+ for idx in sorted(linker_match, reverse=True):
497
+ emol.RemoveAtom(idx)
498
+ # Get the modified molecule
499
+ try:
500
+ protac_fragments = emol.GetMol()
501
+ except Exception as e:
502
+ if verbose:
503
+ print(e)
504
+ return None
505
+ try:
506
+ Chem.SanitizeMol(protac_fragments)
507
+ except Exception as e:
508
+ if verbose:
509
+ print(e)
510
+ return None
511
+ if verbose:
512
+ img = Draw.MolToImage(protac_fragments, highlightAtoms=linker_match, size=(800, 300))
513
+ safe_display(img)
514
+
515
+ # Get the fragments after removing the linker
516
+ try:
517
+ fragments = Chem.GetMolFrags(protac_fragments, asMols=True, sanitizeFrags=True)
518
+ except Exception as e:
519
+ if verbose:
520
+ print(e)
521
+ return None
522
+
523
+ if len(fragments) != 2:
524
+ if stats is not None:
525
+ stats['multiple fragments after removing the linker'] += 1
526
+ if verbose:
527
+ for frag in fragments:
528
+ safe_display(frag)
529
+ print('ERROR: Multiple fragments after removing the linker')
530
+ return None
531
+
532
+ substructs = {}
533
+ substructs['linker'] = Chem.MolToSmiles(linker, canonical=True)
534
+ for frag in fragments:
535
+ if frag.HasSubstructMatch(substr, useChirality=True):
536
+ label = 'e3' if attachment_id == e3_attachment_id else 'poi'
537
+ substructs[label] = Chem.MolToSmiles(mapped_substr, canonical=True)
538
+ # Replace "[1*]" or "[2*]" with the correct attachment point with a regex
539
+ substructs[label] = re.sub(r'\[(\d+)\*\]', f'[*:{attachment_id}]', substructs[label])
540
+ if verbose:
541
+ print(f'Found {label.capitalize()} fragment.')
542
+ img = Draw.MolToImage(Chem.MolFromSmiles(substructs[label]), size=(800, 300))
543
+ safe_display(img)
544
+ else:
545
+ label = 'e3' if attachment_id == poi_attachment_id else 'poi'
546
+ other_attachment_id = e3_attachment_id if label == 'e3' else poi_attachment_id
547
+
548
+ other_substr = get_mapped_substr_from_protac(protac, frag, other_attachment_id)
549
+ if other_substr is None:
550
+ return None
551
+ substructs[label] = Chem.MolToSmiles(other_substr, canonical=True)
552
+
553
+ if verbose:
554
+ print(f'Found {label.capitalize()} fragment.')
555
+ img = Draw.MolToImage(Chem.MolFromSmiles(substructs[label]), size=(800, 300))
556
+ safe_display(img)
557
+ # Canonicalize the SMILES strings
558
+ substructs = {k: canonize(v) for k, v in substructs.items()}
559
+
560
+ # Check that the reassembled PROTAC matches the original PROTAC
561
+ if not check_reassembly(protac_smiles, '.'.join(substructs.values()), stats, verbose):
562
+ return None
563
+
564
+ return substructs
565
+
566
+
567
+ def swap_attachment_points(
568
+ s: str,
569
+ poi_attachment_id: int = 1,
570
+ e3_attachment_id: int = 2,
571
+ ) -> str:
572
+ """ Swaps the attachment points in a SMARTS string.
573
+
574
+ Args:
575
+ s: The input SMILES (or SMARTS) string containing the attachment points.
576
+
577
+ Returns:
578
+ The string with the attachment points swapped, canonicalized.
579
+ """
580
+ tmp_e3_id = '^^^^E3^^^^'
581
+ tmp_poi_id = '^^^^POI^^^^'
582
+ s = s.replace(f'[*:{poi_attachment_id}]', f'[*:{tmp_poi_id}]')
583
+ s = s.replace(f'[*:{e3_attachment_id}]', f'[*:{tmp_e3_id}]')
584
+ s = s.replace(f'[*:{tmp_poi_id}]', f'[*:{e3_attachment_id}]')
585
+ s = s.replace(f'[*:{tmp_e3_id}]', f'[*:{poi_attachment_id}]')
586
+ return canonize(s)
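Illustration of the placeholder-token swap (output shown up to SMILES canonicalization):

```python
linker = '[*:1]NC(=O)CCOCC[*:2]'
print(swap_attachment_points(linker))
# The POI-side tag [*:1] and the E3-side tag [*:2] trade places:
# canonical form of '[*:2]NC(=O)CCOCC[*:1]'
```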
protac_splitter/data/generation/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ from .generation import generate_protacs
2
+ from .functional_groups import (
3
+ get_functional_group_at_attachment,
4
+ get_functional_groups_distributions,
5
+ )
6
+
7
+ __all__ = [
8
+ 'generate_protacs',
9
+ 'get_functional_group_at_attachment',
10
+ 'get_functional_groups_distributions',
11
+ ]
protac_splitter/data/generation/functional_groups.py ADDED
@@ -0,0 +1,400 @@
1
+ from typing import Dict, Optional, Union
2
+ from collections import defaultdict, Counter
3
+ import json
4
+
5
+ import pandas as pd
6
+ from rdkit import Chem
7
+ from rdkit.Chem import Draw
8
+ from tqdm import tqdm
9
+
10
+ from protac_splitter.chemoinformatics import (
11
+ get_atom_idx_at_attachment,
12
+ canonize_smarts,
13
+ )
14
+ from protac_splitter.display_utils import (
15
+ safe_display,
16
+ display_mol,
17
+ )
18
+
19
+
20
+ def get_functional_group_at_attachment(
21
+ protac: Chem.Mol,
22
+ substruct: Chem.Mol,
23
+ linker: Chem.Mol,
24
+ n_hops: int = 1,
25
+ timeout: Optional[Union[int, float]] = None,
26
+ return_dict: bool = False,
27
+ verbose: int = 0,
28
+ ) -> Union[str, Dict[str, str]]:
29
+ """ Get the functional group at the attachment point of a substructure in the PROTAC molecule.
30
+
31
+ Args:
32
+ protac: The PROTAC molecule.
33
+ substruct: The substructure of the PROTAC that contains the attachment point, e.g., the POI ligand or the E3 binder.
34
+ linker: The linker molecule.
35
+ n_hops: The number of hops to consider for the neighborhood.
36
+ timeout: The timeout for the substructure search.
37
+ return_dict: Whether to return the functional groups as a dictionary.
38
+ verbose: Verbosity level.
39
+
40
+ Returns:
41
+ str | Dict[str, str]: The SMARTS of the functional group at the attachment point. If return_dict is True, a dictionary with the SMARTS of the functional groups at the attachment point and at the "two sides" of the attachment point (keys: 'attachment', 'substruct', 'linker').
42
+ """
43
+ protac = Chem.AddHs(protac)
44
+ substruct = Chem.AddHs(substruct)
45
+
46
+ if linker is not None:
47
+ linker = Chem.AddHs(linker)
48
+
49
+ attachment_idxs = get_atom_idx_at_attachment(
50
+ protac=protac,
51
+ substruct=substruct,
52
+ linker=linker,
53
+ timeout=timeout,
54
+ return_dict=True,
55
+ verbose=0,
56
+ )
57
+ # Get all neighboring atoms that are n_hops away from the attachment point
58
+ if attachment_idxs is None:
59
+ return None
60
+ if len(attachment_idxs) != 2:
61
+ return None
62
+ if verbose:
63
+ print(f'Attachment points: {attachment_idxs}')
64
+ img = Draw.MolToImage(protac, highlightAtoms=attachment_idxs.values(), size=(800, 500))
65
+ safe_display(img)
66
+ print('Neighbors:')
67
+
68
+ # Recursively find neighbors at n_hops distance
69
+ neighborhood = set([protac.GetAtomWithIdx(idx) for idx in attachment_idxs.values()])
70
+ def find_neighbors(atom, hops, excluded_atom_idx=None):
71
+ if hops <= 0:
72
+ return
73
+ for neighbor in atom.GetNeighbors():
74
+ if excluded_atom_idx is not None and neighbor.GetIdx() == excluded_atom_idx:
75
+ neighborhood.add(neighbor)
76
+ continue
77
+ neighborhood.add(neighbor)
78
+ find_neighbors(neighbor, hops - 1)
79
+
80
+ for idx in attachment_idxs.values():
81
+ find_neighbors(protac.GetAtomWithIdx(idx), n_hops)
82
+
83
+ # Display the neighborhood
84
+ if verbose:
85
+ print(f'Neighbors at {n_hops} hops:')
86
+ # Get options to display all hydrogen atoms
87
+ options = Draw.DrawingOptions()
88
+ # Add a legend to the image
89
+ options.legend = 'Neighbors at attachment points'
90
+ img = Draw.MolToImage(protac, highlightAtoms=[a.GetIdx() for a in neighborhood], size=(800, 500), options=options)
91
+ safe_display(img)
92
+
93
+ # # NOTE: The following is an overkill, there is an RDKit function to extract a substructure
94
+ # neighborhood_mol = extract_atoms_as_molecule(protac, [a.GetIdx() for a in neighborhood])
95
+ # neighborhood_smarts = canonize_smarts(Chem.MolToSmarts(neighborhood_mol))
96
+
97
+ # Extract the SMARTS given the atom indices of the neighborhood
98
+ neighborhood_idxs = [a.GetIdx() for a in neighborhood]
99
+ neighborhood_smarts = Chem.MolFragmentToSmarts(protac, neighborhood_idxs)
100
+ neighborhood_smarts = canonize_smarts(neighborhood_smarts)
101
+
102
+ if verbose:
103
+ print(neighborhood_smarts)
104
+ display_mol(Chem.MolFromSmarts(neighborhood_smarts), display_svg=False)
105
+
106
+ if return_dict:
107
+ smarts = {}
108
+ smarts['attachment'] = neighborhood_smarts
109
+ # Get the SMARTS at the attachment point and at its "two sides"
110
+ for side, idx in attachment_idxs.items():
111
+ # NOTE: We know that attachment_idxs is a dictionary with two keys,
112
+ # 'susbtruct' and 'linker', so we can directly use the other key
113
+ other_side = 'linker' if side == 'substruct' else 'substruct'
114
+ excluded_atom_idx = attachment_idxs[other_side]
115
+ neighborhood = {protac.GetAtomWithIdx(idx)}
116
+ find_neighbors(protac.GetAtomWithIdx(idx), n_hops, excluded_atom_idx=excluded_atom_idx)
117
+
118
+ # Get the atom indices of the neighborhood
119
+ neighborhood_idxs = [a.GetIdx() for a in neighborhood]
120
+
121
+ # Copy the PROTAC molecule and set the excluded_atom_idx to a dummy
122
+ p = Chem.Mol(protac)
123
+ p.GetAtomWithIdx(excluded_atom_idx).SetAtomicNum(0)
124
+
125
+ # Extract the SMARTS from the copied PROTAC given the indeces
126
+ s = Chem.MolFragmentToSmarts(p, neighborhood_idxs)
127
+ smarts[other_side] = canonize_smarts(s)
128
+
129
+ return smarts
130
+
131
+ return neighborhood_smarts
132
+
133
+
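
A call sketch for the function above, assuming `protac_mol`, `poi_mol`, and `linker_mol` are RDKit molecules built elsewhere (e.g., from the dataframe columns used further below):

    fg = get_functional_group_at_attachment(
        protac=protac_mol,
        substruct=poi_mol,
        linker=linker_mol,
        n_hops=1,
        return_dict=True,
    )
    if fg is not None:
        # SMARTS around the attachment bond, plus its substructure and linker sides
        print(fg['attachment'], fg['substruct'], fg['linker'])
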
+ def get_functional_group_at_attachment_side(
+     substruct: Chem.Mol,
+     attachment_id: Optional[int] = None,
+     n_hops: int = 2,
+     add_Hs: bool = True,
+ ) -> Optional[str]:
+     """ Get the functional group at the attachment point of a substructure in the PROTAC molecule.
+
+     Args:
+         substruct: The substructure of the PROTAC that contains the attachment point, e.g., the POI or E3 ligase.
+         attachment_id: The attachment point ID in the substructure. E.g., 1 for the POI, as in "[*:1]".
+         n_hops: The number of hops to consider for the neighborhood. Default is 2.
+         add_Hs: Whether to add hydrogens to the substructure.
+
+     Returns:
+         str: The SMARTS of the functional group at the attachment point. None if failed.
+     """
+     if add_Hs:
+         substruct = Chem.AddHs(substruct)
+
+     # Get the atom index of the attachment point, i.e., a dummy atom
+     attachment_idx2map = {}
+     for atom in substruct.GetAtoms():
+         if atom.GetAtomicNum() == 0:
+             # Get the mapped atom index
+             attachment_idx2map[atom.GetIdx()] = atom.GetAtomMapNum()
+
+     if not attachment_idx2map:
+         return None
+
+     # If we are dealing with a linker, get the specific attachment point
+     if attachment_id is not None:
+         attachment_idx = [k for k, v in attachment_idx2map.items() if v == attachment_id]
+         if not attachment_idx:
+             return None
+         attachment_idx = attachment_idx[0]
+     else:
+         attachment_idx = list(attachment_idx2map.keys())[0]
+
+     neighborhood = {substruct.GetAtomWithIdx(attachment_idx)}
+     def find_neighbors(atom, hops):
+         if hops <= 0:
+             return
+         for neighbor in atom.GetNeighbors():
+             neighborhood.add(neighbor)
+             find_neighbors(neighbor, hops - 1)
+
+     find_neighbors(substruct.GetAtomWithIdx(attachment_idx), n_hops)
+     neighborhood_idxs = [a.GetIdx() for a in neighborhood]
+
+     neighborhood_smarts = Chem.MolFragmentToSmarts(substruct, neighborhood_idxs)
+     if neighborhood_smarts:
+         return canonize_smarts(neighborhood_smarts)
+
+     return None
+
+
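
A quick sketch of the side-specific variant above; the fragment is a toy example, not a real warhead:

    from rdkit import Chem

    poi_fragment = Chem.MolFromSmiles('c1ccccc1C(=O)N[*:1]')
    # SMARTS of the 2-hop environment around the dummy atom labelled [*:1]
    print(get_functional_group_at_attachment_side(poi_fragment, attachment_id=1, n_hops=2))
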
+ def get_functional_groups_distributions(
+     df: pd.DataFrame,
+     get_side_chain_info: bool = False,
+     timeout: Optional[Union[int, float]] = None,
+     filename_distributions: Optional[str] = None,
+     filename_mappings: Optional[str] = None,
+     filename_df_with_functional_groups: Optional[str] = None,
+     load_from_file: bool = True,
+     verbose: int = 0,
+ ) -> Dict[str, Dict[str, set]]:
+     """ Get the distributions of functional groups at attachment points in a dataframe of PROTACs.
+
+     The input dataframe should contain the following columns:
+     - 'PROTAC SMILES': The SMILES of the PROTAC.
+     - 'POI Ligand SMILES with direction': The SMILES of the POI ligand.
+     - 'Linker SMILES with direction': The SMILES of the linker.
+     - 'E3 Binder SMILES with direction': The SMILES of the E3 binder.
+
+     Args:
+         df: The DataFrame containing the PROTACs.
+         get_side_chain_info: Whether to get the side chain information along with the functional groups at the attachment points.
+         timeout: The timeout for the substructure search. Default is None.
+         filename_distributions: If given, the JSON file to save the distributions to (and to load them from).
+         filename_mappings: If given, the JSON file to save the mappings to (and to load them from).
+         filename_df_with_functional_groups: If given, the CSV file to save the annotated DataFrame to.
+         load_from_file: Whether to load previously computed distributions and mappings from file instead of recomputing them.
+         verbose: Verbosity level.
+
+     Returns:
+         Dict[str, Dict[str, set]]: The distributions of functional groups at attachment points in PROTACs.
+     """
+     smarts_counter = Counter()
+     e3_smarts_counter = Counter()
+     poi_smarts_counter = Counter()
+     substr_smarts_counter = {
+         'poi2linker': defaultdict(Counter),
+         'linker2poi': defaultdict(Counter),
+         'e32linker': defaultdict(Counter),
+         'linker2e3': defaultdict(Counter),
+     }
+     # Map each POI/E3 substructure to the set of functional groups that appear in the df
+     poi_substr2fg = defaultdict(set)
+     e3_substr2fg = defaultdict(set)
+     # Map each functional group to the set of substructures that appear in the df
+     poi_fg_2_substr = defaultdict(set)
+     e3_fg_2_substr = defaultdict(set)
+     substr_fg_2_linker = defaultdict(set)
+
+     linker2fg = defaultdict(dict)
+
+     if load_from_file:
+         if filename_distributions is not None and filename_mappings is not None:
+             with open(filename_distributions, 'r') as f:
+                 fg_distr = json.load(f)
+             with open(filename_mappings, 'r') as f:
+                 fg_mappings = json.load(f)
+             ret = {}
+             ret.update(fg_distr)
+             ret.update(fg_mappings)
+             return ret
+         else:
+             print('WARNING: No filename provided to load the mappings from. The functional groups will be recomputed.')
+
+     df_with_functional_groups = []
+
+     for i, row in tqdm(df.iterrows(), total=len(df)):
+         protac_smiles = row['PROTAC SMILES']
+         poi_smiles = row['POI Ligand SMILES with direction']
+         linker_smiles = row['Linker SMILES with direction']
+         e3_smiles = row['E3 Binder SMILES with direction']
+
+         protac = Chem.MolFromSmiles(protac_smiles)
+         poi = Chem.MolFromSmiles(poi_smiles)
+         e3 = Chem.MolFromSmiles(e3_smiles)
+         linker = Chem.MolFromSmiles(linker_smiles)
+
+         if None in [protac, poi, e3, linker]:
+             print('WARNING: Could not parse the following SMILES:')
+             print(f'PROTAC: {protac_smiles}')
+             print(f'POI: {poi_smiles}')
+             print(f'Linker: {linker_smiles}')
+             print(f'E3: {e3_smiles}')
+             print('-' * 80)
+             # Skip the row: the calls below would fail on a None molecule
+             continue
+
+         # We have to take a bit of care with the linker, as it can be empty
+         try:
+             _ = Chem.molzip(Chem.MolFromSmiles('.'.join([poi_smiles, linker_smiles, e3_smiles])))
+         except Exception:
+             print(f'WARNING: The linker might be empty: {linker_smiles}')
+             linker = None
+
+         if linker is not None:
+             fg_poi = get_functional_group_at_attachment(protac, poi, linker, timeout=timeout, return_dict=get_side_chain_info)
+             fg_e3 = get_functional_group_at_attachment(protac, e3, linker, timeout=timeout, return_dict=get_side_chain_info)
+         else:
+             # If the linker is empty, then we use the other side as the linker
+             fg_poi = get_functional_group_at_attachment(protac, poi, e3, return_dict=get_side_chain_info)
+             fg_e3 = get_functional_group_at_attachment(protac, e3, poi, return_dict=get_side_chain_info)
+
+         if get_side_chain_info:
+             if fg_poi is not None:
+                 smarts_counter.update([fg_poi['attachment']])
+                 poi_smarts_counter.update([fg_poi['substruct']])
+                 substr_smarts_counter['poi2linker'][fg_poi['substruct']].update([fg_poi['linker']])
+                 substr_smarts_counter['linker2poi'][fg_poi['linker']].update([fg_poi['substruct']])
+                 linker2fg[linker_smiles]['poi'] = fg_poi['attachment']
+
+                 # NOTE: poi_substr2fg maps to a set, so use add() rather than append()
+                 poi_substr2fg[poi_smiles].add(fg_poi['attachment'])
+                 poi_fg_2_substr[fg_poi['attachment']].update([poi_smiles])
+
+             if fg_e3 is not None:
+                 smarts_counter.update([fg_e3['attachment']])
+                 e3_smarts_counter.update([fg_e3['substruct']])
+                 substr_smarts_counter['e32linker'][fg_e3['substruct']].update([fg_e3['linker']])
+                 substr_smarts_counter['linker2e3'][fg_e3['linker']].update([fg_e3['substruct']])
+                 linker2fg[linker_smiles]['e3'] = fg_e3['attachment']
+
+                 # NOTE: add the whole SMARTS string; update() on a string would add its single characters
+                 e3_substr2fg[e3_smiles].add(fg_e3['attachment'])
+                 e3_fg_2_substr[fg_e3['attachment']].update([e3_smiles])
+         else:
+             if fg_poi is not None:
+                 smarts_counter.update([fg_poi])
+                 poi_smarts_counter.update([fg_poi])
+                 poi_substr2fg[poi_smiles].update([fg_poi])
+                 poi_fg_2_substr[fg_poi].update([poi_smiles])
+                 substr_fg_2_linker[fg_poi].update([linker_smiles])
+             if fg_e3 is not None:
+                 smarts_counter.update([fg_e3])
+                 e3_smarts_counter.update([fg_e3])
+                 e3_substr2fg[e3_smiles].update([fg_e3])
+                 e3_fg_2_substr[fg_e3].update([e3_smiles])
+                 substr_fg_2_linker[fg_e3].update([linker_smiles])
+
+         # Update the DataFrame with the functional groups
+         if fg_poi is not None:
+             row['POI Ligand Functional Group'] = fg_poi
+         if fg_e3 is not None:
+             row['E3 Binder Functional Group'] = fg_e3
+         df_with_functional_groups.append(row)
+
+     # Normalize all the counts to probability distributions
+     fg_distr = {k: v / smarts_counter.total() for k, v in smarts_counter.items()}
+     e3_fg_distr = {k: v / e3_smarts_counter.total() for k, v in e3_smarts_counter.items()}
+     poi_fg_distr = {k: v / poi_smarts_counter.total() for k, v in poi_smarts_counter.items()}
+
+     # Sort the probability distributions
+     fg_distr = dict(sorted(fg_distr.items(), key=lambda x: x[1], reverse=True))
+     e3_fg_distr = dict(sorted(e3_fg_distr.items(), key=lambda x: x[1], reverse=True))
+     poi_fg_distr = dict(sorted(poi_fg_distr.items(), key=lambda x: x[1], reverse=True))
+
+     if not get_side_chain_info:
+         ret = {
+             'fg_distr': fg_distr,
+             'e3_fg_distr': e3_fg_distr,
+             'poi_fg_distr': poi_fg_distr,
+             'poi_fg_2_substr': poi_fg_2_substr,
+             'e3_fg_2_substr': e3_fg_2_substr,
+             'substr_fg_2_linker': substr_fg_2_linker,
+         }
+
+     # Normalize the linker-to-substructure counts to probability distributions
+     if get_side_chain_info:
+         side_fg_distr = defaultdict(dict)
+         for direction, smarts2counter in substr_smarts_counter.items():
+             for smarts, counter in smarts2counter.items():
+                 side_fg_distr[direction][smarts] = {k: v / counter.total() for k, v in counter.items()}
+                 side_fg_distr[direction][smarts] = dict(sorted(side_fg_distr[direction][smarts].items(), key=lambda x: x[1], reverse=True))
+
+             if verbose:
+                 # Display the top 5 functional groups
+                 print('-' * 80)
+                 print(f'{"-".join(direction.upper().split("2"))}:')
+                 print('-' * len(direction) + '-' * 2)
+                 for i, (smarts, probs) in enumerate(side_fg_distr[direction].items()):
+                     if i >= 5:
+                         break
+                     print(f'{smarts}:')
+                     for j, (sma, prob) in enumerate(probs.items()):
+                         if j >= 5:
+                             break
+                         print(f'\t{prob:.2%} -> {sma}')
+         ret = {
+             'fg_distr': fg_distr,
+             'e3_fg_distr': e3_fg_distr,
+             'poi_fg_distr': poi_fg_distr,
+             'poi_fg_2_substr': poi_fg_2_substr,
+             'e3_fg_2_substr': e3_fg_2_substr,
+             'substr_fg_2_linker': substr_fg_2_linker,
+             'side_fg_distr': side_fg_distr,
+         }
+
+     if filename_distributions is not None:
+         # Save to JSON file
+         distributions = {k: v for k, v in ret.items() if 'distr' in k}
+         with open(filename_distributions, 'w') as f:
+             json.dump(distributions, f, indent=4)
+         print(f'Functional group distributions saved to: {filename_distributions}')
+
+     if filename_mappings is not None:
+         # Convert sets to lists to make the data serializable
+         fg_mappings = {k: {sk: list(s) for sk, s in v.items()} for k, v in ret.items() if 'distr' not in k}
+
+         with open(filename_mappings, 'w') as f:
+             json.dump(fg_mappings, f, indent=4)
+         print(f'Functional group mappings saved to: {filename_mappings}')
+
+     df_with_functional_groups = pd.DataFrame(df_with_functional_groups)
+     ret['dataframe'] = df_with_functional_groups
+
+     if filename_df_with_functional_groups is not None:
+         df_with_functional_groups.to_csv(filename_df_with_functional_groups, index=False)
+         print(f'DataFrame with functional groups saved to: {filename_df_with_functional_groups}')
+
+     return ret
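
A usage sketch for the distribution extraction above, assuming `df` follows the column naming listed in the docstring; the JSON paths are placeholders:

    stats = get_functional_groups_distributions(
        df,
        filename_distributions='fg_distributions.json',
        filename_mappings='fg_mappings.json',
        load_from_file=False,  # force recomputation on the first run
    )
    # The distributions are sorted, so the first entries are the most common ones
    print(list(stats['fg_distr'].items())[:5])
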
protac_splitter/data/generation/generation.py ADDED
@@ -0,0 +1,277 @@
+ import os
+ from concurrent.futures import ThreadPoolExecutor
+ from typing import Dict, List, Optional
+
+ import pandas as pd
+ import numpy as np
+ from tqdm import tqdm
+ from rdkit import Chem
+
+ from protac_splitter.evaluation import check_reassembly
+
+
+ def generate_protacs(
+     poi_fg_distr: Dict[str, float],
+     e3_fg_distr: Dict[str, float],
+     substr_fg_2_linker: Dict[str, List[str]],
+     poi_fg_2_substr: Dict[str, List[str]],
+     e3_fg_2_substr: Dict[str, List[str]],
+     num_samples: int,
+     random_state: int = 42,
+     batch_size: int = 1000,
+     max_workers: int = 4,
+     original_df: Optional[pd.DataFrame] = None,
+     filename_generated_df: Optional[str] = None,
+     base_data_dir: Optional[str] = None,
+     cover_all_smiles: bool = False,
+ ) -> pd.DataFrame:
+     """ Generate PROTACs given the distributions of functional groups at attachment points.
+
+     Args:
+         poi_fg_distr: The distribution of functional groups at the POI attachment point.
+         e3_fg_distr: The distribution of functional groups at the E3 attachment point.
+         substr_fg_2_linker: The mapping of functional groups to linkers.
+         poi_fg_2_substr: The mapping of functional groups to POI substructures.
+         e3_fg_2_substr: The mapping of functional groups to E3 substructures.
+         num_samples: The number of PROTACs to generate.
+         random_state: The random state for reproducibility.
+         batch_size: The batch size for generating PROTACs.
+         max_workers: The maximum number of workers for the ThreadPoolExecutor.
+         original_df: The original DataFrame containing the PROTACs. Must have a
+             column named 'PROTAC SMILES' containing the strings to avoid
+             generating. The check is done on strings, so make sure to
+             canonize/standardize the SMILES strings.
+         filename_generated_df: The filename to save the generated PROTACs.
+         base_data_dir: The directory where intermediate per-batch CSV files are saved.
+         cover_all_smiles: Whether to keep sampling until every available POI, E3, and linker appears in at least one generated PROTAC.
+
+     Returns:
+         pd.DataFrame: The DataFrame containing the generated PROTACs.
+     """
+     np.random.seed(random_state)
+     final_df = pd.DataFrame()
+     total_batches = int(np.ceil(num_samples / batch_size))
+
+     def generate_protac_batch(batch_size: int, random_state: int) -> List[dict]:
+         np.random.seed(random_state)
+
+         # Sample functional groups for POI and E3
+         poi_fgs = np.random.choice(list(poi_fg_distr.keys()), size=batch_size, p=list(poi_fg_distr.values()))
+         e3_fgs = np.random.choice(list(e3_fg_distr.keys()), size=batch_size, p=list(e3_fg_distr.values()))
+
+         # Map functional groups to corresponding substructures
+         # NOTE: When the size argument is specified, the output is a numpy array.
+         # NOTE: If the functional group is not in the dictionary, the output is an empty numpy array.
+         poi_samples = [
+             np.random.choice(poi_fg_2_substr.get(fg, []), size=1 if fg in poi_fg_2_substr and poi_fg_2_substr[fg] else 0)
+             for fg in poi_fgs
+         ]
+         e3_samples = [
+             np.random.choice(e3_fg_2_substr.get(fg, []), size=1 if fg in e3_fg_2_substr and e3_fg_2_substr[fg] else 0)
+             for fg in e3_fgs
+         ]
+
+         generated_protacs = []
+
+         for poi_smiles, poi_fg, e3_smiles, e3_fg in zip(poi_samples, poi_fgs, e3_samples, e3_fgs):
+             # Check that poi_smiles and e3_smiles are not empty numpy arrays
+             if poi_smiles.size == 0 or e3_smiles.size == 0:
+                 continue
+
+             # Convert the numpy arrays to strings
+             poi_smiles, e3_smiles = poi_smiles[0], e3_smiles[0]
+
+             linkers = set(substr_fg_2_linker.get(poi_fg, [])) & set(substr_fg_2_linker.get(e3_fg, []))
+             if not linkers:
+                 continue
+
+             linker_smiles = np.random.choice(list(linkers))
+
+             # Get the PROTAC by combining the POI, linker, and E3
+             ligands_smiles = '.'.join([poi_smiles, linker_smiles, e3_smiles])
+             protac = Chem.MolFromSmiles(ligands_smiles)
+
+             if protac is None:
+                 continue
+             try:
+                 protac = Chem.molzip(protac)
+             except Exception:
+                 continue
+
+             # Sanitize molecule
+             try:
+                 zero_on_success = Chem.SanitizeMol(protac, catchErrors=True)
+                 if zero_on_success != 0:
+                     continue
+                 protac_smiles = Chem.MolToSmiles(protac, canonical=True)
+             except Exception:
+                 continue
+
+             if original_df is not None and protac_smiles in original_df['PROTAC SMILES'].values:
+                 continue
+
+             # Check if the PROTAC can be reassembled
+             if not check_reassembly(protac_smiles, ligands_smiles):
+                 continue
+
+             generated_protacs.append({
+                 'PROTAC SMILES': protac_smiles,
+                 'POI Ligand SMILES with direction': poi_smiles,
+                 'Linker SMILES with direction': linker_smiles,
+                 'E3 Binder SMILES with direction': e3_smiles,
+                 'POI Ligand Functional Group': poi_fg,
+                 'E3 Binder Functional Group': e3_fg,
+             })
+
+         return generated_protacs
+
+     with ThreadPoolExecutor(max_workers=max_workers) as executor:
+         futures = []
+         for i in tqdm(range(total_batches), desc="Generating Batches"):
+             futures.append(executor.submit(generate_protac_batch, batch_size, random_state + i))
+
+         for i, future in tqdm(enumerate(futures), desc="Processing Results", total=total_batches):
+             generated_batch = future.result()
+             if generated_batch:
+                 batch_df = pd.DataFrame(generated_batch)
+                 final_df = pd.concat([final_df, batch_df]).drop_duplicates()
+                 if i % 100 == 0:
+                     if base_data_dir:
+                         batch_df.to_csv(os.path.join(base_data_dir, f'generated_protacs_batch={i}.csv'), index=False)
+                     else:
+                         batch_df.to_csv(f'generated_protacs_batch={i}.csv', index=False)
+                     if filename_generated_df:
+                         final_df.to_csv(filename_generated_df, index=False)
+             if len(final_df) >= num_samples:
+                 break
+
+     if not final_df.empty:
+         generated_pois = set(final_df['POI Ligand SMILES with direction'].unique())
+         generated_e3s = set(final_df['E3 Binder SMILES with direction'].unique())
+         generated_linkers = set(final_df['Linker SMILES with direction'].unique())
+     else:
+         generated_pois = set()
+         generated_e3s = set()
+         generated_linkers = set()
+
+     # Check how well we covered the available substructures
+     avail_pois = set()
+     avail_e3s = set()
+     avail_linkers = set()
+     for fg in poi_fg_2_substr:
+         avail_pois.update(set(poi_fg_2_substr[fg]))
+     for fg in e3_fg_2_substr:
+         avail_e3s.update(set(e3_fg_2_substr[fg]))
+     for fg in substr_fg_2_linker:
+         avail_linkers.update(set(substr_fg_2_linker[fg]))
+
+     e3_coverage = len(generated_e3s) / len(avail_e3s)
+     poi_coverage = len(generated_pois) / len(avail_pois)
+     linker_coverage = len(generated_linkers) / len(avail_linkers)
+
+     print(f"POI coverage: {poi_coverage:.3%}")
+     print(f"E3 coverage: {e3_coverage:.3%}")
+     print(f"Linker coverage: {linker_coverage:.3%}")
+
+     # Get the "leftover" ligands
+     leftover_pois = avail_pois - generated_pois
+     leftover_e3s = avail_e3s - generated_e3s
+     leftover_linkers = avail_linkers - generated_linkers
+
+     covering_df = []
+
+     with tqdm(total=len(leftover_pois) + len(leftover_e3s) + len(leftover_linkers), desc="Covering Leftover Ligands") as pbar:
+         while True:
+             if not cover_all_smiles:
+                 break
+
+             # Randomly select a POI, E3, and linker, preferring the leftover ones
+             if not leftover_pois:
+                 pois_to_sample = avail_pois
+             else:
+                 pois_to_sample = leftover_pois
+             if not leftover_e3s:
+                 e3s_to_sample = avail_e3s
+             else:
+                 e3s_to_sample = leftover_e3s
+             if not leftover_linkers:
+                 linkers_to_sample = avail_linkers
+             else:
+                 linkers_to_sample = leftover_linkers
+
+             poi_smiles = np.random.choice(list(pois_to_sample))
+             e3_smiles = np.random.choice(list(e3s_to_sample))
+             linker_smiles = np.random.choice(list(linkers_to_sample))
+
+             # Get the PROTAC by combining the POI, linker, and E3
+             ligands_smiles = '.'.join([poi_smiles, linker_smiles, e3_smiles])
+             protac = Chem.MolFromSmiles(ligands_smiles)
+             if protac is None:
+                 continue
+             try:
+                 protac = Chem.molzip(protac)
+             except Exception:
+                 continue
+
+             # Sanitize molecule
+             try:
+                 zero_on_success = Chem.SanitizeMol(protac, catchErrors=True)
+                 if zero_on_success != 0:
+                     continue
+                 protac_smiles = Chem.MolToSmiles(protac, canonical=True)
+             except Exception:
+                 continue
+
+             if original_df is not None and protac_smiles in original_df['PROTAC SMILES'].values:
+                 continue
+
+             # Check if the PROTAC can be reassembled
+             if not check_reassembly(protac_smiles, ligands_smiles):
+                 continue
+
+             covering_df.append({
+                 'PROTAC SMILES': protac_smiles,
+                 'POI Ligand SMILES with direction': poi_smiles,
+                 'Linker SMILES with direction': linker_smiles,
+                 'E3 Binder SMILES with direction': e3_smiles,
+                 'POI Ligand Functional Group': None,
+                 'E3 Binder Functional Group': None,
+             })
+
+             generated_pois.add(poi_smiles)
+             generated_e3s.add(e3_smiles)
+             generated_linkers.add(linker_smiles)
+
+             ligands_added = 0
+             if poi_smiles in leftover_pois:
+                 leftover_pois.remove(poi_smiles)
+                 ligands_added += 1
+             if e3_smiles in leftover_e3s:
+                 leftover_e3s.remove(e3_smiles)
+                 ligands_added += 1
+             if linker_smiles in leftover_linkers:
+                 leftover_linkers.remove(linker_smiles)
+                 ligands_added += 1
+
+             e3_coverage = len(generated_e3s) / len(avail_e3s)
+             poi_coverage = len(generated_pois) / len(avail_pois)
+             linker_coverage = len(generated_linkers) / len(avail_linkers)
+
+             # Update the pbar and write the coverage
+             pbar.update(ligands_added)
+             pbar.set_postfix({
+                 'POI': f"{poi_coverage:.2%}",
+                 'E3': f"{e3_coverage:.2%}",
+                 'Linker': f"{linker_coverage:.2%}",
+             })
+
+             if not leftover_pois and not leftover_e3s and not leftover_linkers:
+                 break
+
+     final_df = pd.concat([final_df, pd.DataFrame(covering_df)]).drop_duplicates()
+
+     # Save to file if specified
+     if filename_generated_df:
+         final_df.to_csv(filename_generated_df, index=False)
+         print(f"Generated PROTACs saved to: {filename_generated_df}")
+
+     return final_df
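
A sketch of wiring the two steps together, assuming `stats` holds the output of `get_functional_groups_distributions` with list-valued mappings:

    generated = generate_protacs(
        poi_fg_distr=stats['poi_fg_distr'],
        e3_fg_distr=stats['e3_fg_distr'],
        substr_fg_2_linker=stats['substr_fg_2_linker'],
        poi_fg_2_substr=stats['poi_fg_2_substr'],
        e3_fg_2_substr=stats['e3_fg_2_substr'],
        num_samples=10_000,
        original_df=df,  # skip PROTACs already present in the source data
    )
    print(len(generated), generated['PROTAC SMILES'].iloc[0])
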
protac_splitter/display_utils.py ADDED
@@ -0,0 +1,199 @@
+ import os
+ import sys
+ from typing import Optional
+
+ from rdkit import Chem
+ from rdkit.Chem import Draw, rdDepictor
+ from rdkit.Geometry import Point2D
+
+ if 'ipykernel' in sys.modules:
+     from IPython.display import SVG
+
+ from .chemoinformatics import get_atom_idx_at_attachment, canonize
+
+
+ def safe_display(*args):
+     """Displays content only if running in a Jupyter notebook."""
+     if 'ipykernel' in sys.modules:
+         from IPython.display import display
+         display(*args)
+     else:
+         print(*args)
+
+
+ def display_mol(
+     mol: Chem.Mol,
+     w: int = 800,
+     h: int = 300,
+     legend: Optional[str] = None,
+     use_smiles_as_legend: bool = True,
+     display_svg: bool = True,
+ ):
+     """ Display a molecule in a Jupyter notebook. Useful for having the canonical SMILES shown as the image legend. """
+     if mol is None:
+         print('Molecule is None')
+         return None
+     if use_smiles_as_legend and legend is None:
+         legend = Chem.MolToSmiles(mol)
+     if display_svg:
+         mol.SetProp("_Name", Chem.MolToSmiles(mol, canonical=True))
+         d = Draw.rdMolDraw2D.MolDraw2DSVG(w, h, noFreetype=True)
+         font_path = '/System/Library/Fonts/Supplemental/Arial.ttf'
+         if os.path.exists(font_path):
+             d.drawOptions().fontFile = font_path
+         d.DrawMolecule(mol, legend=legend)
+         d.FinishDrawing()
+         svg = d.GetDrawingText()
+         # Check if in a Jupyter notebook
+         if sys.modules.get('ipykernel', None):
+             from IPython.display import SVG
+             safe_display(SVG(svg))
+     else:
+         img = Draw.MolToImage(mol, size=(w, h))
+         safe_display(img)
+
+
+ def get_mapped_protac_img(
+     protac_smiles: str,
+     poi_smiles: str,
+     linker_smiles: str,
+     e3_smiles: str,
+     w: int = 1000,
+     h: int = 1000,
+     useSVG: bool = False,
+     display_image: bool = False,
+     legend: Optional[str] = None,
+     show_bond_indices: bool = False,
+ ):
+     """ Display a PROTAC molecule with the POI, linker, and E3 ligase attachment bonds highlighted.
+
+     If `useSVG` is True, then the POI-Linker bond is highlighted in purple, whereas the E3-Linker bond is highlighted in green.
+     If `useSVG` is False, then both splitting points are highlighted in purple.
+
+     Args:
+         protac_smiles: The SMILES string of the PROTAC.
+         poi_smiles: The SMILES string of the POI.
+         linker_smiles: The SMILES string of the linker.
+         e3_smiles: The SMILES string of the E3 ligase.
+         w: The width of the image.
+         h: The height of the image.
+         useSVG: Whether to use SVG format.
+         display_image: Whether to display the image.
+         legend: The legend to display.
+         show_bond_indices: Whether to show bond indices in the image.
+     """
+     protac_smiles = canonize(protac_smiles)
+     e3_smiles = canonize(e3_smiles)
+     poi_smiles = canonize(poi_smiles)
+     linker_smiles = canonize(linker_smiles)
+
+     # Check if any of the canonicalized SMILES is None
+     if None in [protac_smiles, e3_smiles, poi_smiles, linker_smiles]:
+         return None
+
+     protac_mol = Chem.MolFromSmiles(protac_smiles)
+     e3_mol = Chem.MolFromSmiles(e3_smiles)
+     poi_mol = Chem.MolFromSmiles(poi_smiles)
+     linker_mol = Chem.MolFromSmiles(linker_smiles)
+
+     if None in [protac_mol, e3_mol, poi_mol, linker_mol]:
+         return None
+
+     if linker_smiles in ['[*:1][*:2]', '[*:2][*:1]']:
+         print('WARNING. Linker is empty.')
+         poi_attachment_idx = get_atom_idx_at_attachment(protac_mol, poi_mol, e3_mol)
+         e3_attachment_idx = get_atom_idx_at_attachment(protac_mol, e3_mol, poi_mol)
+     else:
+         poi_attachment_idx = get_atom_idx_at_attachment(protac_mol, poi_mol, linker_mol)
+         e3_attachment_idx = get_atom_idx_at_attachment(protac_mol, e3_mol, linker_mol)
+
+     cyan = (0, 1, 1, 0.5)
+     red = (1, 0, 0, 0.5)
+     green = (0, 1, 0, 0.5)
+     blue = (0, 0, 1, 0.5)
+     purple = (1, 0, 1, 0.3)
+
+     highlight_atoms = []
+     highlight_bonds = []
+     atom_colors = {}
+     bond_colors = {}
+
+     if poi_attachment_idx is not None:
+         if len(poi_attachment_idx) != 2:
+             if linker_smiles in ['[*:1][*:2]', '[*:2][*:1]']:
+                 print('WARNING. Linker is empty, no highlighting will be shown for the POI.')
+             else:
+                 print(f'WARNING. POI attachment points must be only two, got instead: {poi_attachment_idx}')
+         else:
+             poi_bond_idx = protac_mol.GetBondBetweenAtoms(*poi_attachment_idx).GetIdx()
+             highlight_atoms += poi_attachment_idx
+             highlight_bonds.append(poi_bond_idx)
+             atom_colors[poi_attachment_idx[0]] = purple
+             atom_colors[poi_attachment_idx[1]] = purple
+             bond_colors[poi_bond_idx] = purple
+
+     if e3_attachment_idx is not None:
+         if len(e3_attachment_idx) != 2:
+             if linker_smiles in ['[*:1][*:2]', '[*:2][*:1]']:
+                 print('WARNING. Linker is empty, no highlighting will be shown for the E3.')
+             else:
+                 print(f'WARNING. E3 attachment points must be only two, got instead: {e3_attachment_idx}')
+         else:
+             e3_bond_idx = protac_mol.GetBondBetweenAtoms(*e3_attachment_idx).GetIdx()
+             highlight_atoms += e3_attachment_idx
+             highlight_bonds.append(e3_bond_idx)
+             atom_colors[e3_attachment_idx[0]] = green
+             atom_colors[e3_attachment_idx[1]] = green
+             bond_colors[e3_bond_idx] = green
+
+     if useSVG:
+         drawer = Draw.rdMolDraw2D.MolDraw2DSVG(w, h, noFreetype=True)
+         options = drawer.drawOptions()
+         font_path = '/System/Library/Fonts/Supplemental/Arial.ttf'
+         if os.path.exists(font_path):
+             options.fontFile = font_path
+
+         if legend is None:
+             # legend = '.'.join([e3_smiles, linker_smiles, poi_smiles])
+             legend = ""
+
+         drawer.DrawMolecule(
+             protac_mol,
+             legend=legend,
+             highlightAtoms=highlight_atoms,
+             highlightBonds=highlight_bonds,
+             highlightAtomColors=atom_colors,
+             highlightBondColors=bond_colors,
+         )
+
+         # Add bond indices as text in the center of each bond
+         if show_bond_indices:
+             # Needs coordinates; ensure 2D coords are present
+             rdDepictor.Compute2DCoords(protac_mol)
+             for bond in protac_mol.GetBonds():
+                 idx = bond.GetIdx()
+                 begin = bond.GetBeginAtomIdx()
+                 end = bond.GetEndAtomIdx()
+                 begin_pos = drawer.GetDrawCoords(begin)
+                 end_pos = drawer.GetDrawCoords(end)
+                 mid_y = (begin_pos.y + end_pos.y) / 2
+                 mid_x = (begin_pos.x + end_pos.x) / 2
+                 drawer.DrawString(f"{idx}", Point2D(mid_x, mid_y), rawCoords=True)
+
+         drawer.FinishDrawing()
+         svg_text = drawer.GetDrawingText()
+
+         if display_image:
+             safe_display(SVG(svg_text))
+
+         return svg_text
+     else:
+         img = Draw.MolToImage(
+             protac_mol,
+             size=(w, h),
+             highlightColor=purple,
+             highlightAtoms=highlight_atoms,
+             highlightBonds=highlight_bonds,
+             highlightAtomColors=atom_colors,
+             highlightBondColors=bond_colors,
+         )
+
+         if display_image:
+             safe_display(img)
+
+         return img
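
A rendering sketch for the helper above; the four SMILES are assumed to share consistent '[*:1]'/'[*:2]' attachment points:

    svg = get_mapped_protac_img(
        protac_smiles=protac_smiles,
        poi_smiles=poi_smiles,
        linker_smiles=linker_smiles,
        e3_smiles=e3_smiles,
        useSVG=True,
    )
    if svg is not None:
        with open('protac_highlighted.svg', 'w') as f:
            f.write(svg)
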
protac_splitter/drawing_utils.py ADDED
@@ -0,0 +1,177 @@
+ import re
+
+ import numpy as np
+ from rdkit import Chem, DataStructs
+ from rdkit.Chem import (
+     AllChem,
+     Draw,
+     rdFMCS,
+     rdMolAlign,
+ )
+ from rdkit.Geometry import Point3D
+
+
+ def save_as_svg(svg_content, filename, num_mols):
+     """Save SVG content (an IPython SVG object) to a file, widening the canvas to fit num_mols molecules."""
+     with open(filename, 'w') as file:
+         data = str(svg_content.data)
+         # NOTE: assumes the drawing was created with a default width of 1500
+         data = data.replace('1500', str(500 * num_mols))
+         file.write(data)
+
+
+ def align_molecules_2D(ref_mol, to_align_mol):
+     AllChem.Compute2DCoords(ref_mol)
+     AllChem.Compute2DCoords(to_align_mol)
+     # Find the maximum common substructure and use it to align the molecules
+     mcs = rdFMCS.FindMCS([ref_mol, to_align_mol])
+     mcs_mol = Chem.MolFromSmarts(mcs.smartsString)
+     ref_match = ref_mol.GetSubstructMatch(mcs_mol)
+     align_match = to_align_mol.GetSubstructMatch(mcs_mol)
+     atom_map = list(zip(align_match, ref_match))
+     rdMolAlign.AlignMol(to_align_mol, ref_mol, atomMap=atom_map)
+     return to_align_mol
+
+
+ def align_molecules_by_coordinates(ref_mol, to_align_mol):
+     # Find the maximum common substructure
+     AllChem.Compute2DCoords(to_align_mol)
+     mcs = rdFMCS.FindMCS([ref_mol, to_align_mol])
+     mcs_mol = Chem.MolFromSmarts(mcs.smartsString)
+     ref_match = ref_mol.GetSubstructMatch(mcs_mol)
+     align_match = to_align_mol.GetSubstructMatch(mcs_mol)
+
+     # Copy the coordinates from the reference molecule to the molecule to be aligned
+     ref_conf = ref_mol.GetConformer()
+     align_conf = to_align_mol.GetConformer()
+     for ref_idx, align_idx in zip(ref_match, align_match):
+         ref_pos = ref_conf.GetAtomPosition(ref_idx)
+         align_conf.SetAtomPosition(align_idx, ref_pos)
+
+     return to_align_mol
+
+
+ def draw_molecule_to_svg(mol, size=(500, 500), scale=1.0):
+     drawer = Draw.rdMolDraw2D.MolDraw2DSVG(size[0], size[1])
+     drawer.drawOptions().fixedBondLength = scale
+     drawer.DrawMolecule(mol)
+     drawer.FinishDrawing()
+     svg = drawer.GetDrawingText()
+     svg = re.sub(r'\<\?xml.*?\?\>', '', svg)  # Remove the XML declaration
+     svg = svg.replace('<svg', '<g').replace('</svg>', '</g>')  # Replace svg tags with g tags
+     return svg
+
+
+ def combine_svgs(svgs, output_filename, dimensions=None, size=(500, 500), xy_shifts=None):
+     if dimensions is None:
+         dimensions = (len(svgs), 1)
+     if xy_shifts is None:
+         xy_shifts = [(0, 0) for i in range(dimensions[0] * dimensions[1])]
+
+     width, height = size
+     grid_width, grid_height = dimensions
+     # Include only one XML declaration and the opening <svg> tag
+     combined_svg = '<?xml version="1.0" standalone="no"?>\n'
+     combined_svg += f'<svg width="{grid_width * width}px" height="{grid_height * height}px" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">\n'
+
+     # Arrange the SVGs in a grid
+     for i, (svg, xy_shift) in enumerate(zip(svgs, xy_shifts)):
+         x = (i % grid_width) * width
+         y = (i // grid_width) * height
+         combined_svg += f'<g transform="translate({x + xy_shift[0]},{y - xy_shift[1]})">{svg}</g>\n'
+
+     combined_svg += '</svg>'
+     with open(output_filename, 'w') as file:
+         file.write(combined_svg)
+
+
+ def draw_molecule_with_highlighted_bonds(mol, bonds_to_highlight):
+     """
+     Draws a molecule with the specified bonds highlighted and returns the SVG text.
+
+     Parameters:
+     - mol (Chem.Mol): The molecule to draw.
+     - bonds_to_highlight (list): List of bond indices to highlight.
+     """
+     # Initialize drawer
+     d2d = Draw.rdMolDraw2D.MolDraw2DSVG(350 * 2, 300 * 2)
+
+     # Set drawing options
+     d2d.drawOptions().useBWAtomPalette()
+     d2d.drawOptions().continuousHighlight = False
+     d2d.drawOptions().highlightBondWidthMultiplier = 24
+     d2d.drawOptions().setHighlightColour((0, 0, 1))
+     d2d.drawOptions().fillHighlights = False
+
+     # Draw the molecule with highlights
+     d2d.DrawMolecule(mol,
+                      highlightAtoms=[],
+                      highlightBonds=bonds_to_highlight)
+     d2d.FinishDrawing()
+
+     # Convert the drawing to SVG text
+     svg = d2d.GetDrawingText()
+     svg = svg.replace('svg:', '')
+
+     return svg
+
+
+ def align_mol_2D_ver2(template, query):
+     mcs = rdFMCS.FindMCS([template, query])
+     patt = Chem.MolFromSmarts(mcs.smartsString)
+
+     query_match = query.GetSubstructMatch(patt)
+     template_match = template.GetSubstructMatch(patt)
+
+     rms = AllChem.AlignMol(query, template, atomMap=list(zip(query_match, template_match)))
+     return template, query
+
+
+ def transform_molecule(mol, degrees, translate_x=0, translate_y=0, flip_x_axis=False):
+     """Apply rotation, translation, and optionally flip the molecule."""
+     radians = np.deg2rad(degrees)
+     rotation_matrix = np.array([
+         [np.cos(radians), -np.sin(radians), 0],
+         [np.sin(radians), np.cos(radians), 0],
+         [0, 0, 1]
+     ])
+     AllChem.Compute2DCoords(mol)
+
+     conf = mol.GetConformer()
+     for i in range(conf.GetNumAtoms()):
+         pos = np.array(conf.GetAtomPosition(i))
+         new_pos = np.dot(rotation_matrix, pos)
+         new_pos[0] += translate_x  # Translate along the x-axis
+         new_pos[1] += translate_y  # Translate along the y-axis
+         if flip_x_axis:
+             new_pos[1] = -new_pos[1]  # Flip along the x-axis
+         # Convert back to a Point3D, since the conformer does not accept numpy arrays
+         conf.SetAtomPosition(i, Point3D(float(new_pos[0]), float(new_pos[1]), float(new_pos[2])))
+
+
+ def tailored_framework_example(mol_ms):
+     # Build a generic framework: remove lone atoms, set all atoms to dummies,
+     # and set all bonds to single bonds.
+     mol_ms_w = Chem.RWMol(mol_ms)
+     atom_idx_to_remove = []
+     for atom in mol_ms_w.GetAtoms():
+         # Lone atom: need to remove it to create the generic framework
+         if atom.GetDegree() == 1:
+             atom_idx_to_remove.append(atom.GetIdx())
+             continue
+         atom.SetAtomicNum(0)
+
+     for bond in mol_ms_w.GetBonds():
+         bond.SetBondType(Chem.rdchem.BondType.SINGLE)
+
+     # Remove atoms from the highest index down, so that earlier indices stay valid
+     atom_idx_to_remove.sort(reverse=True)
+     for atom_idx in atom_idx_to_remove:
+         mol_ms_w.RemoveAtom(atom_idx)
+
+     mol_ms_new = mol_ms_w.GetMol()
+     return mol_ms_new
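
A sketch of chaining the drawing helpers above into a 2x1 grid (the molecules are placeholders):

    from rdkit import Chem

    mols = [Chem.MolFromSmiles(s) for s in ('c1ccccc1O', 'c1ccccc1N')]
    svgs = [draw_molecule_to_svg(m, size=(500, 500)) for m in mols]
    combine_svgs(svgs, 'grid.svg', dimensions=(2, 1), size=(500, 500))
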
protac_splitter/evaluation.py ADDED
@@ -0,0 +1,495 @@
+ """ Evaluation functions for the protac_splitter package. They need to be generic to accomodate predictions coming from different models. """
2
+
3
+ import math
4
+ import re
5
+ import logging
6
+ from typing import Tuple, Any, Dict, Optional, Union
7
+
8
+ import numpy as np
9
+ from rdkit import Chem, RDLogger
10
+ from rdkit.Chem import DataStructs
11
+
12
+ # Disable RDKit logging: when checking SMILES validity, we suppress warnings
13
+ RDLogger.DisableLog("rdApp.*")
14
+
15
+ from .chemoinformatics import (
16
+ canonize,
17
+ canonize_smiles,
18
+ remove_stereo,
19
+ get_substr_match,
20
+ )
21
+ from .protac_cheminformatics import reassemble_protac
22
+ from .graphs_utils import (
23
+ get_smiles2graph_edit_distance,
24
+ get_smiles2graph_edit_distance_norm,
25
+ )
26
+
27
+
28
+ def is_valid_smiles(
29
+ smiles: Optional[str],
30
+ return_mol: bool = False,
31
+ ) -> Union[bool, Tuple[bool, Chem.Mol]]:
32
+ """ Check if a SMILES is valid, i.e., it can be parsed by RDKit.
33
+
34
+ Args:
35
+ smiles (Optional[str]): The SMILES to check.
36
+ return_mol (bool): If True, return the RDKit molecule object, i.e., `(is_valid, mol)`.
37
+
38
+ Returns:
39
+ bool | Tuple[bool, Chem.Mol]: True if the SMILES is valid, False otherwise. If return_mol is True, also return the RDKit molecule object.
40
+ """
41
+ if smiles is None:
42
+ return False
43
+ mol = Chem.MolFromSmiles(smiles)
44
+ if return_mol:
45
+ return mol is not None, mol
46
+ return mol is not None
47
+
48
+
49
+ def has_three_substructures(smiles: Optional[str]) -> bool:
50
+ """ Check if a PROTAC SMILES has three substructures. """
51
+ if smiles is None:
52
+ return False
53
+ return smiles.count(".") == 2
54
+
55
+
56
+ def has_all_attachment_points(smiles: Optional[str]) -> bool:
57
+ """ Check if a PROTAC SMILES has all attachment points, i.e., [*:1] and [*:2], two each. """
58
+ if smiles is None:
59
+ return False
60
+ return smiles.count("[*:1]") == 2 and smiles.count("[*:2]") == 2
61
+
62
+
63
+ def split_prediction(
64
+ pred: str,
65
+ poi_attachment_id: int = 1,
66
+ e3_attachment_id: int = 2,
67
+ ) -> Optional[dict[str, str]]:
68
+ """ Split a PROTAC SMILES prediction into its three substructures.
69
+
70
+ Args:
71
+ pred (str): The SMILES of the PROTAC molecule.
72
+ poi_attachment_id (int): The attachment point ID for the POI substructure.
73
+ e3_attachment_id (int): The attachment point ID for the E3 substructure.
74
+
75
+ Returns:
76
+ dict[str, str] | None: A dictionary (with keys: 'e3', 'linker', 'poi') containing the SMILES notations for the POI, linker, and E3 substructures, or None if the prediction is invalid
77
+ """
78
+ ret = {k: None for k in ['poi', 'linker', 'e3']}
79
+ if pred is None:
80
+ return ret
81
+
82
+ ligands = pred.split('.')
83
+ if len(ligands) != 3:
84
+ return ret
85
+
86
+ for ligand in ligands:
87
+ if f'[*:{poi_attachment_id}]' in ligand and f'[*:{e3_attachment_id}]' not in ligand:
88
+ ret['poi'] = ligand
89
+ elif f'[*:{e3_attachment_id}]' in ligand and f'[*:{poi_attachment_id}]' not in ligand:
90
+ ret['e3'] = ligand
91
+ elif f'[*:{poi_attachment_id}]' in ligand and f'[*:{e3_attachment_id}]' in ligand:
92
+ ret['linker'] = ligand
93
+ return ret
94
+
95
+
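
A concrete example of the splitting convention above, using toy fragments:

    pred = 'c1ccccc1[*:1].[*:1]CCOCC[*:2].[*:2]C1CCNCC1'
    parts = split_prediction(pred)
    # parts['poi']    -> 'c1ccccc1[*:1]'     (only [*:1])
    # parts['linker'] -> '[*:1]CCOCC[*:2]'   (both attachment points)
    # parts['e3']     -> '[*:2]C1CCNCC1'     (only [*:2])
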
+ def rename_attachment_id(mol: Union[str, Chem.Mol], old_id: int, new_id: int) -> Union[str, Chem.Mol]:
+     """ Rename an attachment point ID in a molecule.
+
+     Args:
+         mol: The input molecule, given as a SMILES string or an RDKit molecule.
+         old_id: The old attachment point ID.
+         new_id: The new attachment point ID.
+
+     Returns:
+         The renamed molecule, of the same type as the input. None if canonicalization failed.
+     """
+     # Keep track of the input type, so that the same type can be returned
+     return_str = isinstance(mol, str)
+     if isinstance(mol, Chem.Mol):
+         mol = Chem.MolToSmiles(mol, canonical=True)
+     # Regex-replace the patterns "[*:old_id]" or "[old_id*]" with "[*:new_id]"
+     mol = re.sub(rf'\[\*:{old_id}\]', f'[*:{new_id}]', mol)
+     mol = re.sub(rf'\[{old_id}\*\]', f'[*:{new_id}]', mol)
+     mol = canonize_smiles(mol)
+     if mol is None:
+         return None
+     if return_str:
+         return mol
+     return Chem.MolFromSmiles(mol)
+
+
+ def at_least_two_ligands_correct(
+     protac_smiles: str,
+     ligands_smiles: str,
+ ) -> bool:
+     """ Check if at least two ligands are correct.
+
+     NOTE: Currently a stub: it only verifies that the prediction contains
+     more than one fragment; the per-ligand comparison is not implemented yet.
+     """
+     # Check if there is at least one "." in the ligands SMILES
+     if "." not in ligands_smiles:
+         return False
+     return True
+
+
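
A small example of the renaming helper above (string in, string out, up to canonicalization):

    print(rename_attachment_id('[*:2]C1CCNCC1', old_id=2, new_id=3))  # e.g. '[*:3]C1CCNCC1'
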
+ def check_reassembly(
+     protac_smiles: str,
+     ligands_smiles: str,
+     stats: Optional[Dict[str, int]] = None,
+     linker_can_be_null: bool = False,
+     poi_attachment_id: int = 1,
+     e3_attachment_id: int = 2,
+     verbose: int = 0,
+     return_reassembled_smiles: bool = False,
+ ) -> bool:
+     """Check if the reassembled PROTAC matches the original PROTAC SMILES.
+
+     Args:
+         protac_smiles (str): The original PROTAC SMILES.
+         ligands_smiles (str): The SMILES of the joined PROTAC ligands, separated by a "." (dot).
+         stats (Optional[Dict[str, int]]): A dictionary to store statistics about the reassembly process.
+         linker_can_be_null (bool): If False, the linker cannot be empty, and if so, a None will be returned. If True, a special check is performed to rename the E3 and warhead attachment points in order to assemble them together.
+         poi_attachment_id (int): The label of the attachment point for the POI ligand, i.e., "[*:{poi_attachment_id}]". Default is 1.
+         e3_attachment_id (int): The label of the attachment point for the E3 binder, i.e., "[*:{e3_attachment_id}]". Default is 2.
+         verbose (int): The verbosity level.
+         return_reassembled_smiles (bool): If True, also return the reassembled SMILES, i.e., `(is_equal, reassembled_smiles)`.
+
+     Returns:
+         bool: True if the reassembled PROTAC matches the original PROTAC SMILES, False otherwise.
+     """
+     ligands_smiles = canonize_smiles(ligands_smiles)
+     if ligands_smiles is None:
+         if verbose:
+             logging.error('Ligands could not be canonicalized.')
+         return (False, None) if return_reassembled_smiles else False
+
+     null_linker_e3 = f'[*:{e3_attachment_id}][*:{poi_attachment_id}]'
+     null_linker_poi = f'[*:{poi_attachment_id}][*:{e3_attachment_id}]'
+     linker_is_null = False
+     if null_linker_e3 in ligands_smiles or null_linker_poi in ligands_smiles:
+         # If the linker is empty, remove the linker atoms
+         ligands_smiles = ligands_smiles.replace(null_linker_poi, '')
+         ligands_smiles = ligands_smiles.replace(null_linker_e3, '')
+         ligands_smiles = ligands_smiles.replace('..', '.')
+         ligands_smiles = ligands_smiles.rstrip('.')
+         ligands_smiles = ligands_smiles.lstrip('.')
+         ligands_smiles = canonize_smiles(ligands_smiles)
+         if ligands_smiles is None:
+             return (False, None) if return_reassembled_smiles else False
+         linker_is_null = True
+
+     if linker_can_be_null or linker_is_null:
+         if len(ligands_smiles.split('.')) == 2:
+             # Replace the attachment points with a third one (the two ligands will be joined directly)
+             ligands_smiles = rename_attachment_id(ligands_smiles, e3_attachment_id, max([poi_attachment_id, e3_attachment_id]) + 1)
+             ligands_smiles = rename_attachment_id(ligands_smiles, poi_attachment_id, max([poi_attachment_id, e3_attachment_id]) + 1)
+
+     ligands_mol = Chem.MolFromSmiles(ligands_smiles)
+     if ligands_mol is None:
+         if verbose:
+             logging.error('ligands_mol is None')
+         return (False, None) if return_reassembled_smiles else False
+
+     try:
+         reassembled_mol = Chem.molzip(ligands_mol)
+         if reassembled_mol is None:
+             if stats is not None:
+                 stats['molzip failed'] += 1
+             if verbose:
+                 logging.error('molzip failed')
+             return (False, None) if return_reassembled_smiles else False
+     except Exception:
+         if stats is not None:
+             stats['molzip failed (exception)'] += 1
+         if verbose:
+             logging.error('molzip failed (exception)')
+         return (False, None) if return_reassembled_smiles else False
+
+     try:
+         reassembled_smiles = canonize(Chem.MolToSmiles(reassembled_mol))
+         if reassembled_smiles is None:
+             if stats is not None:
+                 stats['MolToSmiles of reassembled failed'] += 1
+             if verbose:
+                 logging.error('MolToSmiles of reassembled failed')
+             return (False, None) if return_reassembled_smiles else False
+     except Exception:
+         if stats is not None:
+             stats['MolToSmiles of reassembled failed'] += 1
+         if verbose:
+             logging.error('MolToSmiles of reassembled failed')
+         return (False, None) if return_reassembled_smiles else False
+
+     is_equal = canonize(protac_smiles) == reassembled_smiles
+
+     return (is_equal, reassembled_smiles) if return_reassembled_smiles else is_equal
+
+
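
A round-trip sketch for the check above, reusing the toy fragments from the `split_prediction` example:

    protac = 'c1ccccc1CCOCCC1CCNCC1'
    ligands = 'c1ccccc1[*:1].[*:1]CCOCC[*:2].[*:2]C1CCNCC1'
    assert check_reassembly(protac, ligands)  # molzip rebuilds the original PROTAC
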
+ def check_substructs(
+     protac_smiles: str,
+     poi_smiles: str = None,
+     linker_smiles: str = None,
+     e3_smiles: str = None,
+     return_bond_types: bool = False,
+     poi_attachment_id: int = 1,
+     e3_attachment_id: int = 2,
+     pred: str = None,
+ ) -> Union[bool, Tuple[bool, dict[str, str]]]:
+     """ DEPRECATED.
+
+     Check if the reassembled PROTAC is correct.
+
+     Args:
+         protac_smiles (str): The SMILES of the PROTAC molecule.
+         poi_smiles (str): The SMILES of the POI ligand.
+         linker_smiles (str): The SMILES of the linker.
+         e3_smiles (str): The SMILES of the E3 binder.
+         return_bond_types (bool): If True, return the bond types used for the reassembly.
+         poi_attachment_id (int): The label of the attachment point for the POI ligand, i.e., "[*:{poi_attachment_id}]".
+         e3_attachment_id (int): The label of the attachment point for the E3 binder, i.e., "[*:{e3_attachment_id}]".
+         pred (str): The SMILES of the predicted PROTAC molecule.
+
+     Returns:
+         bool | Tuple[bool, dict[str, str]]: True if the reassembled PROTAC is correct, False otherwise. If return_bond_types is True, also return the bond types used for the reassembly.
+     """
+     def get_failed_return():
+         if return_bond_types:
+             return False, {}
+         return False
+
+     # Make some checks before starting and fail if necessary
+     all_subs_none = all(v is None for v in [poi_smiles, linker_smiles, e3_smiles])
+     any_subs_none = any(v is None for v in [poi_smiles, linker_smiles, e3_smiles])
+
+     if pred is not None and all_subs_none:
+         # Split the prediction into the substructures
+         pred_substructs = split_prediction(pred, poi_attachment_id, e3_attachment_id)
+         if any(v is None for v in pred_substructs.values()):
+             return get_failed_return()
+         poi_smiles = pred_substructs['poi']
+         linker_smiles = pred_substructs['linker']
+         e3_smiles = pred_substructs['e3']
+     elif pred is None and any_subs_none:
+         return get_failed_return()
+     elif pred is None and all_subs_none:
+         logging.warning("Arguments 'pred' and 'poi_smiles', 'linker_smiles', 'e3_smiles' cannot all be None.")
+         return get_failed_return()
+
+     if f"[*:{poi_attachment_id}]" in e3_smiles:
+         return get_failed_return()
+     if f"[*:{e3_attachment_id}]" in poi_smiles:
+         return get_failed_return()
+     if f"[*:{poi_attachment_id}]" not in linker_smiles:
+         return get_failed_return()
+     if f"[*:{e3_attachment_id}]" not in linker_smiles:
+         return get_failed_return()
+
+     correct_substructs = False
+     protac_mol = Chem.MolFromSmiles(protac_smiles)
+     protac_inchi = Chem.MolToInchi(protac_mol)
+     protac_smiles_canon = canonize_smiles(protac_smiles)
+     bond_types = {}
+     bonds = ['single', 'double', 'triple']
+     # for e3_bond_type, poi_bond_type in itertools.product(bonds, bonds):
+     for e3_bond_type in bonds:
+         for poi_bond_type in bonds:
+             try:
+                 assmbl_smiles, assmbl_mol = reassemble_protac(
+                     poi_smiles,
+                     linker_smiles,
+                     e3_smiles,
+                     e3_bond_type,
+                     poi_bond_type,
+                     poi_attachment_id,
+                     e3_attachment_id,
+                 )
+                 if assmbl_mol is not None:
+                     # If either the InChI or the SMILES of the reassembled
+                     # PROTAC is the same as the original PROTAC, then the
+                     # reassembly is correct.
+                     if protac_inchi == Chem.MolToInchi(assmbl_mol):
+                         correct_substructs = True
+                         bond_types['e3_bond_type'] = e3_bond_type
+                         bond_types['poi_bond_type'] = poi_bond_type
+                         break
+                     if protac_smiles_canon == canonize_smiles(assmbl_smiles):
+                         correct_substructs = True
+                         bond_types['e3_bond_type'] = e3_bond_type
+                         bond_types['poi_bond_type'] = poi_bond_type
+                         break
+             except Exception:
+                 continue
+         # Also stop the outer loop once a matching bond-type pair was found
+         if correct_substructs:
+             break
+     if return_bond_types:
+         return correct_substructs, bond_types
+     return correct_substructs
+
+
323
+ def score_prediction(
324
+ protac_smiles: str,
325
+ label_smiles: str,
326
+ pred_smiles: str,
327
+ rouge = None,
328
+ poi_attachment_id: int = 1,
329
+ e3_attachment_id: int = 2,
330
+ fpgen = Chem.rdFingerprintGenerator.GetMorganGenerator(radius=11, fpSize=2048),
331
+ compute_rdkit_metrics: bool = False,
332
+ compute_graph_metrics: bool = False,
333
+ graph_edit_kwargs: Dict[str, Any] = {},
334
+ ) -> dict[str, float]:
335
+ """ Score a PROTAC SMILES prediction.
336
+
337
+ Args:
338
+ protac_smiles (str): The SMILES of the PROTAC molecule.
339
+ label_smiles (str): The SMILES of the ground truth PROTAC molecule.
340
+ pred_smiles (str): The SMILES of the predicted PROTAC molecule.
341
+ rouge (Rouge | None): The Rouge object to use for scoring. If None, do not compute Rouge scores. Example: `rouge = evaluate.load("rouge")`
342
+ poi_attachment_id (int): The attachment point ID for the POI substructure.
343
+ e3_attachment_id (int): The attachment point ID for the E3 substructure.
344
+
345
+ Returns:
346
+ dict[str, float]: A dictionary containing the scores for the prediction
347
+ """
348
+ protac_mol = Chem.MolFromSmiles(protac_smiles)
349
+ protac_num_atoms = protac_mol.GetNumHeavyAtoms()
350
+
351
+ scores = {
352
+ 'has_three_substructures': has_three_substructures(pred_smiles),
353
+ 'has_all_attachment_points': has_all_attachment_points(pred_smiles),
354
+ 'num_fragments': 0 if pred_smiles is None else pred_smiles.count('.') + 1,
355
+ 'tanimoto_similarity': 0.0, # Default value
356
+ 'valid': False,
357
+ 'reassembly': False,
358
+ 'reassembly_nostereo': False,
359
+ 'heavy_atoms_difference': protac_num_atoms,
360
+ 'heavy_atoms_difference_norm': 1.0,
361
+ 'all_ligands_equal': False,
362
+ }
363
+
364
+ pred_substructs = split_prediction(pred_smiles, poi_attachment_id, e3_attachment_id)
365
+
366
+ # Compute metrics for the "entire" predicted PROTAC molecule
367
+ if None not in list(pred_substructs.values()):
368
+ e3_nostereo = remove_stereo(pred_substructs['e3'])
369
+ linker_nostereo = remove_stereo(pred_substructs['linker'])
370
+ poi_nostereo = remove_stereo(pred_substructs['poi'])
371
+ if None not in [e3_nostereo, linker_nostereo, poi_nostereo]:
372
+ pred_nostereo = f"{e3_nostereo}.{linker_nostereo}.{poi_nostereo}"
373
+ scores['reassembly_nostereo'] = check_reassembly(remove_stereo(protac_smiles), pred_nostereo)
374
+
375
+ scores['valid'] = is_valid_smiles(pred_smiles)
376
+ is_equal, reassembled_smiles = check_reassembly(protac_smiles, pred_smiles, return_reassembled_smiles=True)
377
+ scores['reassembly'] = is_equal
378
+
379
+ # Get the number of heavy atoms difference between the reassembled PROTAC and the ground truth PROTAC
380
+ if reassembled_smiles is not None:
381
+ reassembled_mol = Chem.MolFromSmiles(reassembled_smiles)
382
+ if reassembled_mol is not None:
383
+ scores['heavy_atoms_difference'] -= reassembled_mol.GetNumHeavyAtoms()
384
+ scores['heavy_atoms_difference_norm'] = scores['heavy_atoms_difference'] / protac_num_atoms
385
+
386
+ if scores['valid'] and compute_rdkit_metrics and fpgen is not None:
387
+ # Get Tanimoto similarity between the predicted PROTAC and the ground truth PROTAC
388
+ pred_mol = Chem.MolFromSmiles(pred_smiles)
389
+ label_mol = Chem.MolFromSmiles(label_smiles)
390
+ pred_fp = fpgen.GetFingerprint(pred_mol)
391
+ label_fp = fpgen.GetFingerprint(label_mol)
392
+ scores['tanimoto_similarity'] = DataStructs.TanimotoSimilarity(pred_fp, label_fp)
393
+
394
+ if rouge is not None:
395
+ rouge_output = rouge.compute(predictions=[pred_smiles], references=[label_smiles])
396
+ scores.update({k: v for k, v in rouge_output.items()})
397
+
398
+ # Compute metrics for each substructure
399
+ label_substructs = split_prediction(label_smiles, poi_attachment_id, e3_attachment_id)
400
+
401
+ # Set default values
402
+ for sub in ['e3', 'poi', 'linker']:
403
+ scores[f'{sub}_valid'] = False
404
+ scores[f'{sub}_equal'] = False
405
+ scores[f'{sub}_has_attachment_point(s)'] = False
406
+ scores[f'{sub}_tanimoto_similarity'] = 0.0
407
+
408
+ # NOTE: The graph edit distance can be very high and dependent on the
409
+ # graphs; when the molecule is not valid, it cannot be computed at all.
410
+ # Because of that, we instead default it to something very large, in case
411
+ # the eval metrics need to be summed.
412
+ scores[f'{sub}_graph_edit_distance'] = 1e64
413
+ scores[f'{sub}_graph_edit_distance_norm'] = 1.0
414
+ scores[f'{sub}_heavy_atoms_difference'] = 0
415
+ try:
416
+ scores[f'{sub}_heavy_atoms_difference'] = Chem.MolFromSmiles(label_substructs[sub]).GetNumHeavyAtoms()
417
+ except Exception:
418
+ logging.warning(f"WARNING: {sub} substructure is None in the label: '{label_smiles}' - PROTAC: '{protac_smiles}'")
419
+ scores[f'{sub}_heavy_atoms_difference_norm'] = 1.0
420
+
421
+ # Calculate metrics for each substructure
422
+ for sub in ['e3', 'poi', 'linker']:
423
+ # Skip if the predicted substructure is None from `split_prediction`
424
+ pred_sub = pred_substructs[sub]
425
+ label_sub = label_substructs[sub]
426
+ if pred_sub is None:
427
+ continue
428
+ if label_sub is None:
429
+ logging.warning(f"WARNING: {sub} substructure is None in the label: '{label_smiles}' - PROTAC: '{protac_smiles}'")
430
+ continue
431
+
432
+ # Check if the predicted substructure is a valid RDKit molecule
433
+ sub_valid, sub_mol = is_valid_smiles(pred_sub, return_mol=True)
434
+ scores[f'{sub}_valid'] = sub_valid
435
+
436
+ if sub_mol is None:
437
+ continue
438
+
439
+ # Check if the predicted substructure has the correct attachment point(s)
440
+ if sub == 'e3':
441
+ if f'[*:{e3_attachment_id}]' in pred_sub and f'[*:{poi_attachment_id}]' not in pred_sub:
442
+ scores[f'{sub}_has_attachment_point(s)'] = True
443
+ elif sub == 'poi':
444
+ if f'[*:{poi_attachment_id}]' in pred_sub and f'[*:{e3_attachment_id}]' not in pred_sub:
445
+ scores[f'{sub}_has_attachment_point(s)'] = True
446
+ elif sub == 'linker':
447
+ if f'[*:{poi_attachment_id}]' in pred_sub and f'[*:{e3_attachment_id}]' in pred_sub:
448
+ scores[f'{sub}_has_attachment_point(s)'] = True
449
+
450
+ # Check if the predicted substructure's canonical SMILES matches the ground truth substructure
451
+ if scores[f'{sub}_valid']:
452
+ # scores[f'{sub}_equal'] = Chem.MolToInchi(sub_mol) == Chem.MolToInchi(Chem.MolFromSmiles(label_sub))
453
+ canon_pred = canonize_smiles(pred_sub)
454
+ canon_label = canonize_smiles(label_sub)
455
+ scores[f'{sub}_equal'] = canon_pred == canon_label
456
+
457
+ # Compute graph-related metrics
458
+ if scores[f'{sub}_valid'] and compute_graph_metrics:
459
+ scores[f'{sub}_graph_edit_distance'] = get_smiles2graph_edit_distance(pred_sub, label_sub, **graph_edit_kwargs)
460
+ scores[f'{sub}_graph_edit_distance_norm'] = get_smiles2graph_edit_distance_norm(
461
+ smi1=pred_sub,
462
+ smi2=label_sub,
463
+ ged_G1_G2=scores[f'{sub}_graph_edit_distance'],
464
+ **graph_edit_kwargs,
465
+ )
466
+
467
+ # Get the number of heavy atoms difference between the predicted substructure and the ground truth substructure
468
+ if scores[f'{sub}_valid']:
469
+ pred_mol = Chem.MolFromSmiles(pred_sub)
470
+ label_mol = Chem.MolFromSmiles(label_sub)
471
+ if label_mol is None:
472
+ logging.warning(f"WARNING: {sub} substructure is None in the label: '{label_smiles}' - PROTAC: '{protac_smiles}'")
473
+ continue
474
+ scores[f'{sub}_heavy_atoms_difference'] -= pred_mol.GetNumHeavyAtoms()
475
+ scores[f'{sub}_heavy_atoms_difference_norm'] = scores[f'{sub}_heavy_atoms_difference'] / label_mol.GetNumHeavyAtoms()
476
+
477
+ # Get Tanimoto similarity b/w the predicted substructure and the ground truth
478
+ if scores[f'{sub}_valid'] and compute_rdkit_metrics:
479
+ pred_mol = Chem.MolFromSmiles(pred_sub)
480
+ label_mol = Chem.MolFromSmiles(label_sub)
481
+ if label_mol is None:
482
+ logging.warning(f"WARNING: {sub} substructure is None in the label: '{label_smiles}' - PROTAC: '{protac_smiles}'")
483
+ continue
484
+ pred_fp = fpgen.GetFingerprint(pred_mol)
485
+ label_fp = fpgen.GetFingerprint(label_mol)
486
+ scores[f'{sub}_tanimoto_similarity'] = DataStructs.TanimotoSimilarity(pred_fp, label_fp)
487
+
488
+ # Compute Rouge scores
489
+ if rouge is not None:
490
+ rouge_output = rouge.compute(predictions=[pred_sub], references=[label_sub])
491
+ scores.update({f'{sub}_{k}': v for k, v in rouge_output.items()})
492
+
493
+ scores['all_ligands_equal'] = all([scores[f'{sub}_equal'] for sub in ['e3', 'poi', 'linker']])
494
+
495
+ return scores
protac_splitter/fixing_functions.py ADDED
@@ -0,0 +1,355 @@
1
+ import logging
2
+ from typing import Optional
3
+
4
+ from rdkit import Chem
5
+
6
+ from protac_splitter.chemoinformatics import (
7
+ canonize,
8
+ dummy2query,
9
+ remove_attach_atom,
10
+ remove_dummy_atoms,
11
+ )
12
+ from protac_splitter.evaluation import (
13
+ split_prediction,
14
+ check_reassembly,
15
+ )
16
+ from protac_splitter.data.curation.substructure_extraction import get_attachment_bonds
17
+
18
+ def fix_tetrahedral_centers_ligand(
19
+ protac_mol: Chem.Mol,
20
+ ligand_smiles: str,
21
+ attachment_id: int = 1,
22
+ ) -> Optional[str]:
23
+ """ Fixes the tetrahedral centers of a ligand in a PROTAC molecule.
24
+
25
+ Args:
26
+ protac_mol (Chem.Mol): The RDKit molecule object of the PROTAC.
27
+ ligand_smiles (str): The SMILES of the ligand to fix.
28
+ attachment_id (int): The attachment point id of the ligand. Default is 1.
29
+
30
+ Returns:
31
+ A string containing the fixed ligand SMILES, or None if the fixing process failed.
32
+ """
33
+ ligand_mol = Chem.MolFromSmiles(ligand_smiles)
34
+ if ligand_mol is None:
35
+ logging.error(f"Invalid ligand SMILES: {ligand_smiles}")
36
+ return None
37
+
38
+ ligand_mol = remove_dummy_atoms(ligand_mol)
39
+ ligand_match = protac_mol.GetSubstructMatch(ligand_mol, useChirality=False) # useChirality=True
40
+
41
+ # Get bonds to break to separate the ligand
42
+ bonds_to_break = get_attachment_bonds(protac_mol, ligand_match)
43
+
44
+ # Return if no bonds are found
45
+ if len(bonds_to_break) != 1:
46
+ logging.error('ERROR: Multiple attachment bonds')
47
+ return None
48
+
49
+ # Break the bonds to isolate the ligand
50
+ frag_ligand_mol = Chem.FragmentOnBonds(protac_mol, bonds_to_break, addDummies=True, dummyLabels=[(attachment_id, attachment_id)])
51
+
52
+ # Get the fragments resulting from bond breaking
53
+ try:
54
+ frags = Chem.GetMolFrags(frag_ligand_mol, asMols=True, sanitizeFrags=True)
55
+ except Exception as e:
56
+ logging.error(e)
57
+ return None
58
+
59
+ # Identify the ligand fragment
60
+ ligand_fragment = None
61
+ for frag in frags:
62
+ if frag.HasSubstructMatch(ligand_mol):
63
+ ligand_fragment = frag
64
+ break
65
+ if ligand_fragment is None:
66
+ logging.error('ERROR: Ligand fragment not found')
+ return None
67
+
68
+ ligand_fixed = Chem.MolToSmiles(ligand_fragment)
69
+ ligand_fixed = canonize(ligand_fixed.replace(f'[{attachment_id}*]', f'[*:{attachment_id}]'))
70
+ return ligand_fixed
71
+
72
+
73
+ def fix_prediction(
74
+ protac_smiles: str,
75
+ pred_smiles: str,
76
+ poi_attachment_id: int = 1,
77
+ e3_attachment_id: int = 2,
78
+ remove_stereochemistry: bool = False,
79
+ verbose: int = 0,
80
+ ) -> Optional[str]:
81
+ """ Fixes a prediction by replacing the substructure that does not match the PROTAC with the rest of the PROTAC.
82
+
83
+ Args:
84
+ protac_smiles (str): The SMILES of the PROTAC.
85
+ pred_smiles (str): The SMILES of the prediction.
86
+ poi_attachment_id (int): The attachment point id of the POI. Default is 1.
87
+ e3_attachment_id (int): The attachment point id of the E3 ligase. Default is 2.
88
+ remove_stereochemistry (bool): If True, also attempt the fix after removing stereochemistry. Default is False.
+ verbose (int): The verbosity level. Default is 0.
89
+
90
+ Returns:
91
+ A string containing the fixed prediction, or None if the fixing process failed.
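+
+ Example (illustrative sketch; assumes valid input SMILES):
+ >>> fixed = fix_prediction(protac_smiles, pred_smiles)
+ >>> if fixed is not None:
+ ... assert check_reassembly(protac_smiles, fixed)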
92
+ """
93
+ protac_mol = Chem.MolFromSmiles(protac_smiles)
94
+ if protac_mol is None:
95
+ logging.warning(f"Invalid PROTAC SMILES: {protac_smiles}")
96
+ return None
97
+
98
+ substructs = split_prediction(pred_smiles, poi_attachment_id, e3_attachment_id)
99
+
100
+ # If there are at least two None values, there's nothing we can do to fix it
101
+ if sum(v is None for v in substructs.values()) >= 2:
102
+ logging.warning(f'Unable to continue, more than two substructures are not valid for given input: "{pred_smiles}"')
103
+ return None
104
+
105
+ # Get molecules of PROTAC and substructures
106
+ substructs = {k: {'smiles': v, 'mol': Chem.MolFromSmiles(v) if v is not None else v} for k, v in substructs.items()}
107
+
108
+ # Check if renaming the attachment points might already fix the prediction
109
+ for sub in ['poi', 'e3', 'both']:
110
+ if sub == 'e3':
111
+ if substructs['e3']['smiles'] is None:
112
+ continue
113
+ e3_attempt = substructs['e3']['smiles'].replace(f'[*:{poi_attachment_id}]', f'[*:{e3_attachment_id}]')
114
+ poi_attempt = substructs['poi']['smiles']
115
+ elif sub == 'poi':
116
+ if substructs['poi']['smiles'] is None:
117
+ continue
118
+ e3_attempt = substructs['e3']['smiles']
119
+ poi_attempt = substructs['poi']['smiles'].replace(f'[*:{e3_attachment_id}]', f'[*:{poi_attachment_id}]')
120
+ else:
121
+ if substructs['e3']['smiles'] is None or substructs['poi']['smiles'] is None:
122
+ continue
123
+ e3_attempt = substructs['e3']['smiles'].replace(f'[*:{e3_attachment_id}]', f'[*:{poi_attachment_id}]')
124
+ poi_attempt = substructs['poi']['smiles'].replace(f'[*:{poi_attachment_id}]', f'[*:{e3_attachment_id}]')
125
+
126
+ protac_attempt = f"{e3_attempt}.{substructs['linker']['smiles']}.{poi_attempt}"
127
+ if check_reassembly(protac_smiles, protac_attempt):
128
+ logging.info(f'Input works when renaming attachment points in {sub.title()} substruct. SMILES: "{protac_attempt}"')
129
+ return protac_attempt
130
+
131
+ # Check if swapping the POI and E3 attachments in the linker might already fix the prediction
132
+ if substructs['linker']['smiles'] is None:
133
+ continue
134
+ linker_attempt = substructs['linker']['smiles']
135
+ linker_attempt = linker_attempt.replace(f'[*:{poi_attachment_id}]', f'[*:DUMMY]')
136
+ linker_attempt = linker_attempt.replace(f'[*:{e3_attachment_id}]', f'[*:{poi_attachment_id}]')
137
+ linker_attempt = linker_attempt.replace(f'[*:DUMMY]', f'[*:{e3_attachment_id}]')
138
+
139
+ # Try with the original POI and E3 substructures
140
+ protac_attempt = f"{substructs['e3']['smiles']}.{linker_attempt}.{substructs['poi']['smiles']}"
141
+ if check_reassembly(protac_smiles, protac_attempt):
142
+ logging.info(f'Input works when swapping POI and E3 attachment points in the linker. Fixed SMILES: "{protac_attempt}"')
143
+ return protac_attempt
144
+
145
+ # Try with the swapped POI and E3 substructures
146
+ protac_attempt = f"{e3_attempt}.{linker_attempt}.{poi_attempt}"
147
+ if check_reassembly(protac_smiles, protac_attempt):
148
+ logging.info(f'Input works when swapping POI and E3 attachment points in the linker and in {sub.title()} substruct. Fixed SMILES: "{protac_attempt}"')
149
+ return protac_attempt
150
+
151
+ # Check if removing stereochemistry results in a valid prediction
152
+ if remove_stereochemistry:
153
+ Chem.RemoveStereochemistry(protac_mol)
154
+ protac_smiles = Chem.MolToSmiles(protac_mol, canonical=True)
155
+ for k, v in substructs.items():
156
+ if v['mol'] is not None:
157
+ Chem.RemoveStereochemistry(v['mol'])
158
+ substructs[k]['smiles'] = Chem.MolToSmiles(v['mol'], canonical=True)
159
+
160
+ if all(v['mol'] is not None for v in substructs.values()):
161
+ if check_reassembly(
162
+ protac_smiles,
163
+ '.'.join([v['smiles'] for v in substructs.values()]),
164
+ ):
165
+ logging.info(f'Input works when removing stereochemistry. SMILES: "{pred_smiles}"')
166
+ return f"{substructs['e3']['smiles']}.{substructs['linker']['smiles']}.{substructs['poi']['smiles']}"
167
+
168
+ # Check if any of the substructures is NOT a substructure of the PROTAC, if
169
+ # so, we mark it as the wrong substructure to fix.
170
+ num_matches = 0
171
+ wrong_substruct = None
172
+ for sub in ['poi', 'linker', 'e3']:
173
+ if substructs[sub]['mol'] is None:
174
+ substructs[sub]['match'] = False
175
+ wrong_substruct = sub
176
+ elif protac_mol.HasSubstructMatch(dummy2query(substructs[sub]['mol'])):
177
+ substructs[sub]['match'] = True
178
+ num_matches += 1
179
+ else:
180
+ substructs[sub]['match'] = False
181
+ wrong_substruct = sub
182
+
183
+ if num_matches < 2:
184
+ logging.warning(f'Prediction does not contain at least two matching substructures of the PROTAC. Num matches: {num_matches}. Prediction SMILES: "{pred_smiles}"')
185
+ return None
186
+
187
+ # If the wrong substructure still matches somewhere in the PROTAC, we need a
188
+ # more complex approach to fix the prediction (see below).
189
+ def remove_substructure(mol, substructure, attachment_id, replaceDummies=False):
190
+ if mol is None or substructure is None:
191
+ return None
192
+ smaller_mol = Chem.ReplaceCore(
193
+ mol,
194
+ substructure,
195
+ labelByIndex=False,
196
+ replaceDummies=replaceDummies,
197
+ )
198
+ if smaller_mol is None:
199
+ logging.warning(f'Failed to remove substructure from prediction SMILES: "{pred_smiles}"')
200
+ return None
201
+ smaller_smiles = Chem.MolToSmiles(smaller_mol, canonical=True)
202
+ smaller_smiles = smaller_smiles.replace('[1*]', f'[*:{attachment_id}]')
203
+ smaller_smiles = smaller_smiles.replace('[2*]', f'[*:{attachment_id}]')
204
+ smaller_mol = canonize(Chem.MolFromSmiles(smaller_smiles))
205
+ return smaller_mol
206
+
207
+ # If we still have 3 matches: for each substructure, we progressively remove
208
+ # the other substructures, then we check if the resulting molecule is valid
209
+ # and has only one fragment.
210
+ if num_matches == 3:
211
+ wrong_substruct = None
212
+ for sub in ['poi', 'linker', 'e3']:
213
+ removed_mol = Chem.MolFromSmiles(protac_smiles)
214
+
215
+ # Put the current substructure at the end of the list [poi, e3, linker]
216
+ sub_names = ['poi', 'e3', 'linker']
217
+ sub_names.remove(sub)
218
+ sub_names.append(sub)
219
+ # The linker often matches in many parts of the PROTAC, so we remove
220
+ # it when checking the E3 and POI substructures.
221
+ if sub != 'linker':
222
+ sub_names.remove('linker')
223
+
224
+ for s in sub_names:
225
+ attachment_id = poi_attachment_id if s == 'poi' else e3_attachment_id
226
+ removed_mol = remove_substructure(
227
+ removed_mol,
228
+ dummy2query(substructs[s]['mol']),
229
+ attachment_id=attachment_id,
230
+ )
231
+
232
+ # Check if resulting molecule is None, if so, it is the wrong one
233
+ if removed_mol is None:
234
+ substructs[sub]['match'] = False
235
+ wrong_substruct = sub
236
+ num_matches -= 1
237
+ break
238
+
239
+ # Count the number of fragments in the removed molecule
240
+ fragments = Chem.GetMolFrags(removed_mol, asMols=True, sanitizeFrags=False)
241
+ if len(fragments) > 1:
242
+ substructs[sub]['match'] = False
243
+ wrong_substruct = sub
244
+ num_matches -= 1
245
+ break
246
+
247
+ if num_matches == 3:
248
+ logging.warning(f'Prediction already contains all matching substructures of the PROTAC. Prediction SMILES: "{pred_smiles}"')
249
+ return None
250
+
251
+ # Get the order in which to remove the substructures and get the final one
252
+ # as the fixed molecule.
253
+ if wrong_substruct == 'linker':
254
+ poi_atoms = substructs['poi']['mol'].GetNumAtoms()
255
+ e3_atoms = substructs['e3']['mol'].GetNumAtoms()
256
+ order = ['poi', 'e3'] if poi_atoms > e3_atoms else ['e3', 'poi']
257
+ else:
258
+ if wrong_substruct == 'poi':
259
+ order = ['e3', 'linker']
260
+ else:
261
+ order = ['poi', 'linker']
262
+
263
+ logging.debug(f'Wrong substructure: {wrong_substruct.upper()}. Order: {order}')
264
+
265
+ fixed_mol = protac_mol
266
+ for sub in order:
267
+ logging.debug(f'Removing substructure {sub.upper()} from PROTAC.')
268
+
269
+ if 'linker' not in order:
270
+ fixed_attach_id = poi_attachment_id if sub == 'poi' else e3_attachment_id
271
+ else:
272
+ fixed_attach_id = poi_attachment_id if 'e3' in order else e3_attachment_id
273
+
274
+ if sub == 'linker':
275
+ attach_id = poi_attachment_id if wrong_substruct == 'poi' else e3_attachment_id
276
+ fixed_attach_id = poi_attachment_id if wrong_substruct == 'poi' else e3_attachment_id
277
+ query_mol = remove_attach_atom(substructs[sub]['mol'], attach_id)
278
+ replaceDummies = True
279
+ else:
280
+ query_mol = dummy2query(substructs[sub]['mol'])
281
+ replaceDummies = False
282
+
283
+ if verbose:
284
+ # display(Draw.MolToImage(fixed_mol, legend=f"Starting molecule", size=(800, 300)))
285
+ # display(Draw.MolToImage(query_mol, legend=f"Molecule {sub.upper()} to remove", size=(800, 300)))
286
+ pass
287
+
288
+ fixed_mol_tmp = remove_substructure(
289
+ fixed_mol,
290
+ query_mol,
291
+ attachment_id=fixed_attach_id,
292
+ replaceDummies=replaceDummies,
293
+ )
294
+ if fixed_mol_tmp is None:
295
+ logging.debug(f'Failed to replace substructure "{sub}" in prediction SMILES: "{pred_smiles}"')
296
+ continue
297
+
298
+ fixed_mol = fixed_mol_tmp
299
+
300
+ # If there are multiple fragments, keep the biggest one
301
+ fragments = Chem.GetMolFrags(fixed_mol, asMols=True)
302
+ if len(fragments) > 1:
303
+ logging.debug(f'Fixed molecule contains more than one fragment. Keeping the biggest one.')
304
+ max_frag = max(fragments, key=lambda x: x.GetNumAtoms())
305
+ fixed_mol = max_frag
306
+
307
+ # Get the SMILES of the fixed molecule
308
+ fixed_smiles = Chem.MolToSmiles(canonize(fixed_mol), canonical=True)
309
+ substructs[wrong_substruct]['smiles'] = fixed_smiles
310
+
311
+ if verbose:
312
+ # display(Draw.MolToImage(fixed_mol, legend=f"{wrong_substruct.upper()} fixed molecule: {fixed_smiles}", size=(800, 300)))
313
+ pass
314
+
315
+ # Concatenate the substructures check if the re-assembly is correct
316
+ fixed_pred_smiles = f"{substructs['e3']['smiles']}.{substructs['linker']['smiles']}.{substructs['poi']['smiles']}"
317
+
318
+ if not check_reassembly(
319
+ protac_smiles,
320
+ fixed_pred_smiles,
321
+ ):
322
+ # logging.warning(f"Failed to fix prediction, re-assembly check failed. Generated fixed prediction (failing): {fixed_pred_smiles}")
323
+ # return None
324
+
325
+ # Check if by flipping the tetrahedral centers of the ligands we can
326
+ # still fix the prediction.
327
+ protac_mol = canonize(Chem.MolFromSmiles(protac_smiles))
328
+ chiral_centers = Chem.FindMolChiralCenters(
329
+ protac_mol,
330
+ includeUnassigned=True,
331
+ useLegacyImplementation=False,
332
+ )
333
+ if not chiral_centers:
334
+ logging.warning(f"Failed to fix prediction, re-assembly check failed. Generated fixed prediction (failing): {fixed_pred_smiles}")
335
+ return None
336
+
337
+ # Attempt to fix the tetrahedral centers of the ligands
338
+ e3_fixed = fix_tetrahedral_centers_ligand(protac_mol, substructs['e3']['smiles'], attachment_id=e3_attachment_id)
339
+ poi_fixed = fix_tetrahedral_centers_ligand(protac_mol, substructs['poi']['smiles'], attachment_id=poi_attachment_id)
340
+ if e3_fixed is None or poi_fixed is None:
341
+ logging.warning(f"Failed to fix prediction, re-assembly check failed. Generated fixed prediction (failing): {fixed_pred_smiles}")
342
+ return None
343
+
344
+ # Update the substructures with the fixed ligands and check re-assembly
345
+ substructs['e3']['smiles'] = e3_fixed
346
+ substructs['poi']['smiles'] = poi_fixed
347
+ fixed_pred_smiles = f"{substructs['e3']['smiles']}.{substructs['linker']['smiles']}.{substructs['poi']['smiles']}"
348
+ if not check_reassembly(
349
+ protac_smiles,
350
+ fixed_pred_smiles,
351
+ ):
352
+ logging.warning(f"Failed to fix prediction, re-assembly check failed. Generated fixed prediction (failing): {fixed_pred_smiles}")
353
+ return None
354
+
355
+ return fixed_pred_smiles
protac_splitter/graphs/README.md ADDED
@@ -0,0 +1,114 @@
1
+ # Graph-Based PROTAC-Splitter
2
+
3
+ ## Heuristic Betweenness Centrality
4
+
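+ A usage sketch of the betweenness-centrality splitting heuristic. It assumes a `held_out_df` DataFrame with the standard SMILES columns, plus `representative_e3s_fp` and `morgan_fp_generator` prepared beforehand (e.g., via `get_representative_e3s_fp`); `nx_split` and the drawing helpers come from this package.
+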
5
+ ```python
6
+ for i in range(10):
7
+ # sample = held_out_df.sample(n=1, random_state=42 + i).iloc[0]
8
+ sample = held_out_df.iloc[i]
11
+ protac_smiles = sample['PROTAC SMILES']
12
+ wh_smiles = sample['POI Ligand SMILES with direction']
13
+ lk_smiles = sample['Linker SMILES with direction']
14
+ e3_smiles = sample['E3 Binder SMILES with direction']
15
+
16
+ protac = Chem.MolFromSmiles(protac_smiles)
17
+ wh = Chem.MolFromSmiles(wh_smiles)
18
+ lk = Chem.MolFromSmiles(lk_smiles)
19
+ e3 = Chem.MolFromSmiles(e3_smiles)
20
+
21
+ # display_mol(Chem.MolFromSmiles(protac_smiles), w=1500, h=600)
22
+ get_mapped_protac_img(protac_smiles, wh_smiles, lk_smiles, e3_smiles, w=1500, h=600, display_image=True, useSVG=False)
23
+ # wh_edge = get_atom_idx_at_attachment(protac, wh, lk)
24
+ # e3_edge = get_atom_idx_at_attachment(protac, e3, lk)
25
+
26
+ ret = nx_split(protac_smiles, representative_e3s_fp, morgan_fp_generator, use_capacity_weight=False, betweenness_threshold=0.4)
27
+ e3_smiles = ret['e3']
28
+ wh_smiles = ret['poi']
29
+ linker_smiles = ret['linker']
30
+ top_nodes = ret['top_nodes']
31
+ centrality = ret['centrality']
32
+
33
+ # display_mol(Chem.MolFromSmiles(e3_smiles), w=800, h=400, legend="E3")
34
+ # display_mol(Chem.MolFromSmiles(linker_smiles), w=800, h=400, legend="Linker")
35
+ # display_mol(Chem.MolFromSmiles(wh_smiles), w=800, h=400, legend="WH")
36
+
37
+ display_mol(Chem.MolFromSmiles('.'.join([wh_smiles, linker_smiles, e3_smiles])), w=800, h=400, legend="Graph-based split")
38
+
39
+
40
+ display(Draw.MolToImage(
41
+ protac,
42
+ size=(1500, 400),
43
+ highlightColor=(1, 0, 1, 0.3), # Light purple
44
+ highlightAtoms=top_nodes, # Highlight the top nodes
45
+ legend=f"Graph nodes: {top_nodes} (Betweenness centrality: {centrality[top_nodes[0]]:.3f})",
46
+ ))
47
+ ```
48
+
49
+
50
+ ## Graph Edge Classifier Example
51
+
52
+ Example of how to use the GraphEdgeClassifier to train a model on a dataset of PROTACs and their ligands, and then predict split edges in new PROTACs. The first snippet trains a binary (split / no-split) edge classifier; the second trains the multiclass variant (no split / WH-linker / E3-linker).
53
+
54
+ ```python
55
+ label_cols = [c for c in sets["train"].columns if c.startswith("label_")]
56
+ train_set = sets["train"].dropna(subset=label_cols)
57
+ train_set = train_set[(train_set["label_e3_split"] + train_set["label_wh_split"]) <= 1]
58
+ X_train = train_set.drop(columns=label_cols)
59
+
60
+ graph_features = [c for c in X_train.columns if c.startswith("graph_")]
61
+ # graph_features = [
62
+ # "graph_betweenness",
63
+ # "graph_degree",
64
+ # "graph_degree_r2",
65
+ # "graph_degree_r3",
66
+ # ]
67
+ categorical_features = ["chem_bond_type", "chem_atom_u", "chem_atom_v"]
68
+ fingerprint_features = [c for c in X_train.columns if c.startswith("chem_mol_fp_")]
69
+
70
+ # Instantiate and train
71
+ clf = GraphEdgeClassifier(
72
+ graph_features=graph_features,
73
+ categorical_features=categorical_features,
74
+ fingerprint_features=fingerprint_features,
75
+ use_descriptors=False,
76
+ use_fingerprints=False,
77
+ binary=True,
78
+ )
79
+ y_train = train_set["label_is_split"].astype("int32") if clf.binary else GraphEdgeClassifier.build_multiclass_target(train_set)
80
+
81
+ clf.fit(X_train, y_train)
82
+ clf.save("../models/edge_classifier_bin.joblib")
83
+ print(f"Model saved to ../models/edge_classifier_bin.joblib")
84
+
85
+ label_cols = [c for c in sets["train"].columns if c.startswith("label_")]
86
+ train_set = sets["train"].dropna(subset=label_cols)
87
+ train_set = train_set[(train_set["label_e3_split"] + train_set["label_wh_split"]) <= 1]
88
+ X_train = train_set.drop(columns=label_cols)
89
+
90
+ graph_features = [c for c in X_train.columns if c.startswith("graph_")]
91
+ # graph_features = [
92
+ # "graph_betweenness",
93
+ # "graph_degree",
94
+ # "graph_degree_r2",
95
+ # "graph_degree_r3",
96
+ # ]
97
+ categorical_features = ["chem_bond_type", "chem_atom_u", "chem_atom_v"]
98
+ fingerprint_features = [c for c in X_train.columns if c.startswith("chem_mol_fp_")]
99
+
100
+ # Instantiate and train
101
+ clf = GraphEdgeClassifier(
102
+ graph_features=graph_features,
103
+ categorical_features=categorical_features,
104
+ fingerprint_features=fingerprint_features,
105
+ use_descriptors=False,
106
+ use_fingerprints=False,
107
+ binary=False,
108
+ )
109
+ y_train = train_set["label_is_split"].astype("int32") if clf.binary else GraphEdgeClassifier.build_multiclass_target(train_set)
110
+
111
+ clf.fit(X_train, y_train)
112
+ clf.save("../models/edge_classifier.joblib")
113
+ print(f"Model saved to ../models/edge_classifier.joblib")
114
+ ```
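+
+ Once saved, the classifier can be reloaded for inference. The snippet below is an illustrative sketch: `predict_from_smiles` featurizes against the component SMILES as well, and returns candidate split-bond indices padded with `-1` when fewer than `top_n` split bonds are found. The `*_list` variables are placeholders for your own data.
+
+ ```python
+ clf = GraphEdgeClassifier.load("../models/edge_classifier_bin.joblib")
+
+ # Top-2 candidate split bonds per PROTAC (binary model)
+ bond_idxs = clf.predict_from_smiles(
+     protac_smiles_list,
+     wh_smiles_list,
+     lk_smiles_list,
+     e3_smiles_list,
+     top_n=2,
+ )
+ ```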
protac_splitter/graphs/__init__.py ADDED
File without changes
protac_splitter/graphs/e3_clustering.py ADDED
@@ -0,0 +1,321 @@
1
+ from typing import List, Optional, Tuple, Any, Dict
2
+ import functools
3
+
4
+ import pandas as pd
5
+ import numpy as np
6
+ from tqdm import tqdm
7
+ from sklearn.cluster import AgglomerativeClustering, KMeans
8
+ from scipy.stats import skew
9
+ from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
10
+
11
+ from rdkit import Chem, DataStructs
12
+ from rdkit.Chem import rdFingerprintGenerator
13
+
14
+ from protac_splitter.graphs.utils import get_fp, numpy_to_rdkit_fp
15
+ from protac_splitter.chemoinformatics import remove_dummy_atoms
16
+
17
+
18
+ def get_umap_clusters_fp(fp_list: List[np.ndarray], n_clusters: int = 7) -> np.ndarray:
19
+ """
20
+ Cluster a list of molecular fingerprints using agglomerative clustering.
21
+ From Scaffold Splits Overestimate Virtual Screening Performance
22
+ https://arxiv.org/abs/2406.00873
23
+
24
+ Args:
25
+ fp_list (List[np.ndarray]): List of fingerprint vectors (e.g., numpy arrays).
26
+ n_clusters (int): The number of clusters to use for clustering.
27
+
28
+ Returns:
29
+ np.ndarray: Array of cluster labels corresponding to each fingerprint in the input list.
30
+ """
31
+ ac = AgglomerativeClustering(n_clusters=n_clusters)
32
+ ac.fit_predict(np.stack(fp_list))
33
+ return ac.labels_
34
+
35
+ def get_kmeans_clusters_fp(fp_list: List[np.ndarray], n_clusters: int = 10, return_centroids: bool = False) -> np.ndarray:
36
+ """
37
+ Cluster a list of molecular fingerprints using the KMeans clustering algorithm.
38
+
39
+ Args:
40
+ fp_list (List[np.ndarray]): List of fingerprint vectors.
41
+ n_clusters (int): The number of clusters to use for clustering.
42
+ return_centroids (bool): If True, also return the cluster centroids.
43
+
44
+ Returns:
45
+ np.ndarray: Array of cluster labels corresponding to each fingerprint. If return_centroids is True, a tuple of (labels, centroids) is returned instead.
46
+ """
47
+ km = KMeans(n_clusters=n_clusters, n_init='auto', random_state=42, max_iter=1000)
48
+ if return_centroids:
49
+ km.fit(np.stack(fp_list))
50
+ return km.labels_, km.cluster_centers_
51
+ return km.fit_predict(np.stack(fp_list))
52
+
53
+ def evaluate_clusters(X: np.ndarray, clusters: np.ndarray) -> Dict[str, float]:
54
+ """ Compute clustering metrics and assess cluster size distribution.
55
+
56
+ Args:
57
+ X (np.array): The input data used for clustering.
58
+ clusters (np.ndarray): The cluster labels for each data point in X.
59
+
60
+ Returns:
61
+ Dict[str, float]: A dictionary containing various clustering metrics:
62
+ - silhouette: Silhouette score of the clustering.
63
+ - davies_bouldin: Davies-Bouldin index of the clustering.
64
+ - calinski_harabasz: Calinski-Harabasz index of the clustering.
65
+ - avg_cluster_size: Average size of clusters.
66
+ - avg_cluster_data_ratio: Ratio of average cluster size to total data size.
67
+ - std_cluster_size: Standard deviation of cluster sizes.
68
+ - min_cluster_size: Minimum size of clusters.
69
+ - median_cluster_size: Median size of clusters.
70
+ - max_cluster_size: Maximum size of clusters.
71
+ - cluster_size_skewness: Skewness of cluster sizes indicating imbalance.
72
+ - num_clusters: Number of unique clusters found.
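+
+ Example (illustrative):
+ >>> X = np.random.rand(100, 16)
+ >>> labels = KMeans(n_clusters=5, n_init='auto', random_state=0).fit_predict(X)
+ >>> metrics = evaluate_clusters(X, labels)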
73
+ """
74
+
75
+ unique_clusters = list(set(clusters))
76
+
77
+ if len(unique_clusters) < 2: # Avoid single-cluster issues
78
+ return {
79
+ "silhouette": -1,
80
+ "davies_bouldin": float("inf"),
81
+ "calinski_harabasz": -1,
82
+ "avg_cluster_size": len(X),
83
+ "avg_cluster_data_ratio": 1,
84
+ "std_cluster_size": 0,
85
+ "min_cluster_size": len(X),
86
+ "median_cluster_size": len(X),
87
+ "max_cluster_size": len(X),
88
+ "cluster_size_skewness": 0,
89
+ "num_clusters": 1,
90
+ }
91
+
92
+ # Compute standard clustering metrics
93
+ silhouette = silhouette_score(X, clusters)
94
+ davies_bouldin = davies_bouldin_score(X, clusters)
95
+ calinski_harabasz = calinski_harabasz_score(X, clusters)
96
+
97
+ # Compute cluster size statistics
98
+ cluster_sizes = [len(np.where(clusters == i)[0]) for i in np.unique(clusters)]
99
+ avg_cluster_size = np.mean(cluster_sizes)
100
+ avg_cluster_data_ratio = avg_cluster_size / len(X)
101
+ std_cluster_size = np.std(cluster_sizes)
102
+ median_cluster_size = np.median(cluster_sizes)
103
+ min_cluster_size = np.min(cluster_sizes)
104
+ max_cluster_size = np.max(cluster_sizes)
105
+ cluster_size_skewness = skew(cluster_sizes, nan_policy="omit") # Indicates imbalance in cluster sizes
106
+
107
+ return {
108
+ "silhouette": silhouette,
109
+ "davies_bouldin": davies_bouldin,
110
+ "calinski_harabasz": calinski_harabasz,
111
+ "avg_cluster_size": avg_cluster_size,
112
+ "avg_cluster_data_ratio": avg_cluster_data_ratio,
113
+ "std_cluster_size": std_cluster_size,
114
+ "min_cluster_size": min_cluster_size,
115
+ "median_cluster_size": median_cluster_size,
116
+ "max_cluster_size": max_cluster_size,
117
+ "cluster_size_skewness": cluster_size_skewness,
118
+ "num_clusters": len(unique_clusters),
119
+ }
120
+
121
+ def get_representative_e3s(
122
+ train_df: pd.DataFrame,
123
+ fp_generator: Optional[Any] = None,
124
+ n_clusters_candidates: List[int] = [10, 25, 50, 100, 150],
125
+ e3_column: str = 'E3 Binder SMILES with direction',
126
+ ) -> Tuple[List[str], List[Any], int, pd.DataFrame]:
127
+ """
128
+ Get representative E3 ligands from a DataFrame of training data by clustering their fingerprints.
129
+ This function computes Morgan fingerprints for unique E3 ligands, clusters them using KMeans and UMAP,
130
+ evaluates the clusters using silhouette, Davies-Bouldin, and Calinski-Harabasz scores, and identifies
131
+ the optimal number of clusters based on these metrics.
132
+ It returns the representative E3 ligands, their fingerprints, the best number of clusters, and a DataFrame
133
+ containing the clustering metrics.
134
+
135
+ Parameters:
136
+ train_df (pd.DataFrame): DataFrame containing training data with E3 ligands.
137
+ fp_generator (Optional[Any]): RDKit fingerprint generator. If None, a default Morgan fingerprint generator with 1024 bits and radius 16 is used.
138
+ n_clusters_candidates (List[int]): List of candidate numbers of clusters to evaluate.
139
+ e3_column (str): The column name in the DataFrame that contains the E3 ligand SMILES strings.
140
+
141
+ Returns:
142
+ Tuple[List[str], List[Any], int, pd.DataFrame]: A tuple containing:
143
+ - List of representative E3 ligand SMILES strings.
144
+ - List of RDKit fingerprints corresponding to the representative E3 ligands.
145
+ - The best number of clusters determined from the clustering metrics.
146
+ - DataFrame containing clustering metrics for each candidate number of clusters.
147
+ """
148
+ if e3_column not in train_df.columns:
149
+ raise ValueError(f"Column '{e3_column}' not found in the DataFrame.")
150
+
151
+ if fp_generator is None:
152
+ fp_generator = rdFingerprintGenerator.GetMorganGenerator(
153
+ radius=16,
154
+ fpSize=1024,
155
+ useBondTypes=True,
156
+ includeChirality=True,
157
+ )
158
+
159
+ fp_dict = {}
160
+ for smi in tqdm(train_df[e3_column].unique()):
161
+ fp = get_fp(remove_dummy_atoms(smi), fp_generator)
162
+ if fp is not None:
163
+ fp_dict[smi] = fp
164
+
165
+ fp_list = list(fp_dict.values())
166
+ fp2smiles = {fp.tobytes(): smi for smi, fp in fp_dict.items() if fp is not None}
167
+
168
+ centroids_dict = {}
169
+ clusters_dict = {}
170
+ metrics_df = []
171
+ for n_clusters in tqdm(n_clusters_candidates, desc="Clustering and evaluating"):
172
+ clusters, centroids = get_kmeans_clusters_fp(fp_list, n_clusters=n_clusters, return_centroids=True)
173
+ metrics = evaluate_clusters(fp_list, clusters)
174
+ clusters_dict[f'kmeans_n{n_clusters}'] = clusters.copy()
175
+ centroids_dict[n_clusters] = centroids.copy()
176
+
177
+ metrics['num_clusters'] = n_clusters
178
+ metrics['cluster_algorithm'] = 'kmeans'
179
+ metrics_df.append(metrics.copy())
180
+
181
+ clusters = get_umap_clusters_fp(fp_list, n_clusters=n_clusters)
182
+ metrics = evaluate_clusters(fp_list, clusters)
183
+ clusters_dict[f'umap_n{n_clusters}'] = clusters.copy()
184
+
185
+ metrics['num_clusters'] = n_clusters
186
+ metrics['cluster_algorithm'] = 'umap'
187
+ metrics_df.append(metrics.copy())
188
+
189
+ metrics_df = pd.DataFrame(metrics_df)
190
+
191
+ # Get the sweet spot for the number of clusters
192
+ # Flip davies_bouldin so that all metrics are to be maximized
193
+ metrics_df['-davies_bouldin'] = -metrics_df['davies_bouldin']
194
+
195
+ # Normalize all three metrics (by group if you want per algorithm)
196
+ metrics = ['silhouette', '-davies_bouldin', 'calinski_harabasz']
197
+ df_norm = metrics_df.copy()
198
+ df_norm[metrics] = df_norm.groupby('cluster_algorithm')[metrics].transform(
199
+ lambda x: (x - x.min()) / (x.max() - x.min())
200
+ )
201
+
202
+ # Measure divergence: standard deviation of normalized metrics per row
203
+ df_norm['metric_divergence'] = df_norm[metrics].std(axis=1)
204
+
205
+ # Pick the point with lowest divergence, possibly applying constraints (e.g. not too many clusters)
206
+ sweet_spots = df_norm.loc[df_norm.groupby('cluster_algorithm')['metric_divergence'].idxmin()]
207
+
208
+ best_n_clusters = sweet_spots['num_clusters'].unique()[0]
209
+
210
+ # Get the centroids of the clusters
211
+ centroids = centroids_dict[best_n_clusters]
212
+
213
+ # Get the cluster labels computed for the selected number of clusters
214
+ clusters = np.array(clusters_dict[f'kmeans_n{best_n_clusters}'])
215
+ representative_e3s = []
216
+ representative_e3s_fp = []
217
+ for label, centroid in enumerate(centroids):
218
+ # Isolate the FP with the same label as the centroid
219
+ fp_cluster = np.array(fp_list)[clusters == label]
220
+ # Get the closest FP for the centroid, use euclidean distance
221
+ distances = np.linalg.norm(fp_cluster - centroid, axis=1)
222
+ closest_fp = np.argmin(distances)
223
+ # To get the SMILES from the FP, use the fp2smiles dictionary
224
+ closest_smiles = fp2smiles[fp_cluster[closest_fp].tobytes()]
225
+ # Append the closest SMILES to the representative_e3s list
226
+ representative_e3s.append(closest_smiles)
227
+ representative_e3s_fp.append(fp_cluster[closest_fp])
228
+
229
+ # Convert the representative E3s to RDKit fingerprints
230
+ representative_e3s_fp = [numpy_to_rdkit_fp(fp) for fp in representative_e3s_fp]
231
+
232
+ return representative_e3s, representative_e3s_fp, best_n_clusters, metrics_df
233
+
234
+
235
+ DEFAULT_REPRESENTATIVE_E3S = [
236
+ 'Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O)CN[*:2])cc1',
237
+ 'O=C1CCC(N2Cc3c(N=[*:2])cccc3C2=O)C(=O)N1',
238
+ 'CC(=O)NC(C(=O)N1CC(O)CC1C(=O)[*:2])C(C)(C)C',
239
+ 'CN[C@@H](C)C(=O)N[C@H](C(=O)N1C[C@@H](Oc2ccccc2[*:2])C[C@H]1C(=O)N[C@@H]1CCCc2ccccc21)C1CCCCC1',
240
+ 'Cc1ncsc1-c1ccc(CNC(=O)C2CC(O)CN2C(=O)C(NC(=O)CCO[*:2])C(C)(C)C)cc1',
241
+ 'O=C1CCC(N2Cc3ccc([*:2])cc3C2=O)C(=O)N1',
242
+ 'COc1ccc(C2=N[C@@H](c3ccc(Cl)cc3)[C@@H](c3ccc(Cl)cc3)N2C(=O)N2CCN(CC(=O)[*:2])C(=O)C2)c(OC(C)C)c1',
243
+ 'CC(NC(=O)C1CC(O)CN1C(=O)C(N[*:2])C(C)(C)C)c1ccc(C2CC2)cc1',
244
+ 'CCOc1cc(C(C)(C)C)ccc1C1=NC(c2ccc(Cl)cc2)C(c2ccc(Cl)cc2)N1C(=O)N1CCN(CCCC[*:2])CC1',
245
+ 'CNC(C)C(=O)NC(C(=O)N1CCCC1c1cncc(C(=O)c2cccc([*:2])c2)c1)C1CCCCC1',
246
+ 'CN[C@@H](C)C(=O)N[C@H](C(=O)N1CCC[C@H]1c1nc(C(=O)c2ccc([*:2])cc2)cs1)C1CCCCC1',
247
+ 'O=C1CCC(N2C(=O)c3cccc(OC[*:2])c3C2=O)C(=O)N1',
248
+ 'CCOc1cc(C(C)(C)C)ccc1C1=NC(c2ccc(Cl)cc2)C(c2ccc(Cl)cc2)N1C(=O)N1CCN([*:2])CC1',
249
+ 'Cc1ncsc1-c1ccc(CNC(=O)[C@H]2C[C@H](O)CN2C(=O)C(N[*:2])C(C)(C)C)cc1',
250
+ 'Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)CN2C(=O)[C@@H](N[*:2])C(C)(C)C)cc1',
251
+ 'CN[C@@H](C)C(=O)N[C@H](C(=O)N1CCC[C@H]1c1cncc(C(=O)c2cccc([*:2])c2)c1)C1CCCCC1',
252
+ 'Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O)[C@@H](N[*:2])C(C)(C)C)c(OC2CCNCC2)c1',
253
+ 'CNC(C)C(=O)NC(C(=O)N1CC(Oc2ccc([*:2])cc2)CC1C(=O)NC1CCCc2ccccc21)C1CCCCC1',
254
+ 'C[C@H](NC(=O)[C@@H]1C[C@@H](O)CN1C(=O)[C@@H](N[*:2])C(C)(C)C)c1ccc(C(C)(C)C)cc1',
255
+ 'CNC(C)C(=O)NC(C(=O)N1CCCC1c1nc(C(=O)c2ccc([*:2])cc2)cs1)C1CCCCC1',
256
+ 'CC(=O)NC(C(=O)N1CC(O)CC1C(=O)NCc1ccc(-c2scnc2C)cc1[*:2])C(C)(C)C',
257
+ 'Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O)[C@@H](NC(=O)C2(F)CC2)C(C)(C)C)c([*:2])c1',
258
+ 'CCOc1cc(C(C)(C)C)ccc1C1=NC(C)(c2ccc(Cl)cc2)C(C)(c2ccc(Cl)cc2)N1C(=O)N1CCN(CC(=O)[*:2])CC1',
259
+ 'COc1ccc(C(=O)[*:2])cc1N1CCC(=O)NC1=O',
260
+ 'CN[C@@H](C)C(=O)N[C@H](C(=O)N[C@H]1C[C@H]2CC[C@@H]1N(CCc1ccc([*:2])cc1)C2)C1CCCCC1',
261
+ 'CNC(C)C(=O)NC(C(=O)N1CC(N[*:2])CC1C(=O)NC1CCCc2ccccc21)C1CCCCC1',
262
+ 'CN[C@@H](C)C(=O)N[C@@H](CCCCN[*:2])C(=O)N1CCC[C@H]1C(=O)Nc1snnc1-c1ccccc1',
263
+ 'CNC(C)C(=O)NC(C(=O)NC1CC2CCC1N(CCc1cccc([*:2])c1)C2)C1CCCCC1',
264
+ 'O=C1CCC(N2C(=O)c3ccc(N[*:2])cc3C2=O)C(=O)N1',
265
+ 'CNC(C)C(=O)NC(C(=O)N1CC(NC(=O)CC[*:2])CC1C(=O)Nc1c(F)cccc1F)C(C)(C)C',
266
+ 'Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O)[C@H](N[*:2])C(C)(C)C)cc1',
267
+ 'Cc1nc[nH]c1-c1ccc(CNC(=O)C2CC(O)CN2C(=O)C(N[*:2])C(C)(C)C)cc1',
268
+ 'Cc1ncsc1-c1ccc(C(C)NC(=O)C2CC(O)CN2C(=O)C(N[*:2])C(C)(C)C)cc1',
269
+ 'Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O)[C@@H](N[*:2])C(C)(C)C)cc1',
270
+ 'O=C1CCC(c2cccc([*:2])c2)C(=O)N1',
271
+ 'CC(=O)N[C@H](C(=O)N1C[C@@H](O)C[C@@H]1C(=O)N[C@@H](CC(=O)N1CCC([*:2])CC1)c1ccccc1)C(C)C',
272
+ 'O=C(CCl)[*:2]',
273
+ 'CC[C@@H](NC(=O)[C@@H]1C[C@H](N[*:2])CN1C(=O)[C@@H](NC(=O)[C@H](C)NC)C(C)(C)C)c1ccccc1',
274
+ 'CN[C@H](C)C(=O)N[C@@H]1CCO[C@@H]2CC(C)(C)[C@H](C(=O)N[C@@H]3CCCc4cc([*:2])ccc43)N2C1=O',
275
+ 'CN[C@@H](C)C(=O)N[C@H](C(=O)N1CCC[C@H]1c1nc(C(=O)c2ccc(F)cc2)cs1)C1CCN(C[*:2])CC1',
276
+ 'Cc1ncsc1-c1ccc(CNC(=O)C2CC(O)CN2C(=O)C(N[*:2])C(C)(C)C)cc1',
277
+ 'CNC(C)C(=O)NC(CCCCN[*:2])C(=O)N1CCCC1C(=O)Nc1snnc1-c1ccccc1',
278
+ 'O=C1CCC(N2C(=O)c3cccc([*:2])c3C2=O)C(=O)O1',
279
+ 'COc1ccc(C2=N[C@@H](c3ccc(Cl)cc3)[C@@H](c3ccc(Cl)cc3)N2C(=O)N2CCN(CC(=O)[*:2])C(=O)C2)cc1OC(C)C',
280
+ 'Cc1ncsc1-c1ccc(CNC(=O)C2CC(O)CN2C(=O)C(N[*:2])C(C)(C)C)c(OC2CCNCC2)c1',
281
+ 'CNC(C)C(=O)NC(C(=O)N1CCCC1c1cncc(-n2ccc3c(C(=O)[*:2])cccc32)c1)C(C)C',
282
+ 'CCN1CCN(Cc2ccc(NC(=O)c3cccc(-c4ccc5nc(N[*:2])sc5n4)c3)cc2C(F)(F)F)CC1',
283
+ 'CN[C@@H](C)C(=O)N[C@H](C(=O)N1C[C@@H](NC(=O)CC[*:2])C[C@H]1C(=O)Nc1c(F)cccc1F)C(C)(C)C',
284
+ 'CNC(C)C(=O)NC(C(=O)N1CCCC1C(=O)NC(C(=O)[*:2])C(c1ccccc1)c1ccccc1)C1CCCCC1',
285
+ 'CC(=O)NCC(C(=O)N1CC(O)CC1C(=O)NC(CC(=O)N1CCC(N2CCC([*:2])CC2)CC1)c1ccccc1)C(C)C',
286
+ ]
287
+
288
+
289
+ @functools.lru_cache(maxsize=1, typed=False)
290
+ def get_representative_e3s_fp(
291
+ e3_list: Optional[List[str]] = None,
292
+ fp_generator: Optional[Any] = None,
293
+ verbose: int = 0,
294
+ ) -> List[DataStructs.ExplicitBitVect]:
295
+ """
296
+ Generate Morgan fingerprints for a list of E3 ligands. If no list is provided,
297
+ it uses a default list of representative E3 ligands.
298
+
299
+ Parameters:
300
+ e3_list (Optional[List[str]]): SMILES strings for E3 ligands. If None, the built-in DEFAULT_REPRESENTATIVE_E3S list is used. Note that, because this function is wrapped in functools.lru_cache, the argument must be hashable (pass a tuple rather than a list, or None).
301
+ fp_generator (Optional[Any]): RDKit fingerprint generator. If None, a default Morgan fingerprint generator is used.
+ verbose (int): If > 0, show a tqdm progress bar while generating fingerprints.
302
+
303
+ Returns:
304
+ List[DataStructs.ExplicitBitVect]: List of RDKit Morgan fingerprints for the E3 ligands.
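+
+ Example (illustrative; uses the built-in default list and caches the result):
+ >>> fps = get_representative_e3s_fp()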
305
+ """
306
+ representative_e3s_fp = []
307
+ if verbose > 0:
308
+ iterable = tqdm(e3_list or DEFAULT_REPRESENTATIVE_E3S, desc="Generating fingerprints for E3 ligands")
309
+ else:
310
+ iterable = e3_list or DEFAULT_REPRESENTATIVE_E3S
311
+ for smi in iterable:
312
+ # Get the Morgan fingerprint for the SMILES string
313
+ fp = get_fp(remove_dummy_atoms(smi), fp_generator, return_np=False)
314
+ if fp is not None:
315
+ representative_e3s_fp.append(fp)
316
+ else:
317
+ print(f"Warning: Invalid SMILES string '{smi}' encountered, skipping.")
318
+ if not representative_e3s_fp:
319
+ raise ValueError("No valid E3 ligands found in the provided list.")
320
+ return representative_e3s_fp
321
+
protac_splitter/graphs/edge_classifier.py ADDED
@@ -0,0 +1,582 @@
1
+ import joblib
2
+ from pathlib import Path
3
+ from typing import Optional, List, Dict, Union, Any, Literal
4
+
5
+ import pandas as pd
6
+ import numpy as np
7
+ from sklearn.base import BaseEstimator, ClassifierMixin
8
+ from sklearn.compose import ColumnTransformer
9
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder
10
+ from sklearn.decomposition import TruncatedSVD
11
+ from imblearn.over_sampling import SMOTE
12
+ from imblearn.pipeline import Pipeline as ImbPipeline
13
+ from sklearn.pipeline import Pipeline
14
+ from sklearn.metrics import classification_report
15
+ from sklearn.metrics import confusion_matrix
16
+ from xgboost import XGBClassifier
17
+ import optuna
18
+ from optuna.samplers import QMCSampler
19
+ from sklearn.metrics import accuracy_score, f1_score
20
+
21
+ try:
22
+ import seaborn as sns
23
+ import matplotlib.pyplot as plt
24
+ HAS_VISUALIZATION = True
25
+ except ImportError:
26
+ HAS_VISUALIZATION = False
27
+
28
+ from .edge_features import extract_edge_features, get_edge_features
29
+
30
+
31
+ class GraphEdgeClassifier(BaseEstimator, ClassifierMixin):
32
+ """
33
+ Edge-level graph classifier for PROTACs with integrated pipeline building.
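+
+ The underlying pipeline one-hot encodes the categorical bond/atom features,
+ optionally standard-scales the numeric graph and descriptor features,
+ optionally compresses fingerprint bits with TruncatedSVD, optionally
+ rebalances classes with SMOTE, and fits an XGBoost classifier (binary
+ split/no-split or multiclass no-split/WH-linker/E3-linker).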
34
+ """
35
+ def __init__(
36
+ self,
37
+ graph_features: List[str],
38
+ categorical_features: Optional[List[str]] = None,
39
+ descriptor_features: Optional[List[str]] = None,
40
+ fingerprint_features: Optional[List[str]] = None,
41
+ use_descriptors: bool = True,
42
+ use_fingerprints: bool = True,
43
+ scaler_graph: Literal["passthrough", "standard"] = "passthrough",
44
+ scaler_desc: Literal["passthrough", "standard"] = "passthrough",
45
+ use_svd_fp: bool = True,
46
+ n_svd_components: int = 100,
47
+ binary: bool = False,
48
+ smote_k_neighbors: Optional[int] = 5,
49
+ xgb_params: Optional[dict] = None,
50
+ n_bits: int = 512,
51
+ radius: int = 6,
52
+ descriptor_names: Optional[List[str]] = None
53
+ ):
54
+ self.graph_features = graph_features
55
+ self.categorical_features = categorical_features
56
+ self.descriptor_features = descriptor_features
57
+ self.fingerprint_features = fingerprint_features
58
+ self.use_descriptors = use_descriptors
59
+ self.use_fingerprints = use_fingerprints
60
+ self.scaler_graph = scaler_graph
61
+ self.scaler_desc = scaler_desc
62
+ self.use_svd_fp = use_svd_fp
63
+ self.n_svd_components = n_svd_components
64
+ self.binary = binary
65
+ self.smote_k_neighbors = smote_k_neighbors
66
+ self.xgb_params = xgb_params or {}
67
+ self.n_bits = n_bits
68
+ self.radius = radius
69
+ self.descriptor_names = descriptor_names or [
70
+ "MolWt", "HeavyAtomCount", "NumHAcceptors", "NumHDonors",
71
+ "TPSA", "NumRotatableBonds", "RingCount", "MolLogP"
72
+ ]
73
+ self.pipeline = self._build_pipeline()
74
+
75
+ def _build_pipeline(self):
76
+ transformers = []
77
+ if self.categorical_features:
78
+ transformers.append(("cat", OneHotEncoder(handle_unknown="ignore"), self.categorical_features))
79
+ if self.scaler_graph == "standard":
80
+ transformers.append(("num", StandardScaler(), self.graph_features))
81
+ else:
82
+ transformers.append(("num", "passthrough", self.graph_features))
83
+
84
+ if self.use_descriptors and self.descriptor_features:
85
+ desc_block = (
86
+ ("desc", StandardScaler(), self.descriptor_features)
87
+ if self.scaler_desc == "standard"
88
+ else ("desc", "passthrough", self.descriptor_features)
89
+ )
90
+ transformers.append(desc_block)
91
+
92
+ if self.use_fingerprints and self.fingerprint_features:
93
+ if self.use_svd_fp:
94
+ fp_block = ("fp",
95
+ ImbPipeline([
96
+ ("svd", TruncatedSVD(n_components=self.n_svd_components, random_state=42))
97
+ ]),
98
+ self.fingerprint_features)
99
+ else:
100
+ fp_block = ("fp", "passthrough", self.fingerprint_features)
101
+ transformers.append(fp_block)
102
+
103
+ preprocessor = ColumnTransformer(transformers)
104
+
105
+ # Define the classifier
106
+ classifier = XGBClassifier(
107
+ random_state=42,
108
+ eval_metric="logloss" if self.binary else "mlogloss",
109
+ objective="binary:logistic" if self.binary else "multi:softprob",
110
+ **self.xgb_params
111
+ )
112
+
113
+ if self.smote_k_neighbors is not None:
114
+ return ImbPipeline([
115
+ ("preprocess", preprocessor),
116
+ ("smote", SMOTE(random_state=42, k_neighbors=self.smote_k_neighbors)),
117
+ ("clf", classifier)
118
+ ])
119
+ else:
120
+ return Pipeline([
121
+ ("preprocess", preprocessor),
122
+ ("clf", classifier)
123
+ ])
124
+
125
+ def fit(self, X: pd.DataFrame, y: pd.Series):
126
+ self.pipeline.fit(X, y)
127
+ return self
128
+
129
+ def predict(self, X: Union[pd.DataFrame, List[Dict], List[str]]) -> Any:
130
+ X_proc = self._ensure_features(X)
131
+ return self.pipeline.predict(X_proc)
132
+
133
+ def predict_proba(self, X: Union[pd.DataFrame, List[Dict], List[str]]) -> Any:
134
+ X_proc = self._ensure_features(X)
135
+ return self.pipeline.predict_proba(X_proc)
136
+
137
+ def save(self, path: Union[str, Path]):
138
+ joblib.dump(self, str(path))
139
+
140
+ @classmethod
141
+ def load(cls, path: Union[str, Path]) -> "GraphEdgeClassifier":
142
+ return joblib.load(str(path))
143
+
144
+ @staticmethod
145
+ def extract_graph_features(
146
+ protac_smiles: Union[str, List[str]],
147
+ wh_smiles: Optional[Union[str, List[str]]] = None,
148
+ lk_smiles: Optional[Union[str, List[str]]] = None,
149
+ e3_smiles: Optional[Union[str, List[str]]] = None,
150
+ n_bits: int = 512,
151
+ radius: int = 6,
152
+ descriptor_names: Optional[List[str]] = None,
153
+ verbose: int = 0
154
+ ) -> pd.DataFrame:
155
+ if any(x is None for x in [wh_smiles, lk_smiles, e3_smiles]):
156
+ # Get features from PROTAC only, for inference
157
+ return extract_edge_features(
158
+ protac_smiles=protac_smiles,
159
+ n_bits=n_bits,
160
+ radius=radius,
161
+ descriptor_names=descriptor_names,
162
+ )
163
+ else:
164
+ # Get features and labels from all components, for training
165
+ return get_edge_features(
166
+ protac_smiles=protac_smiles,
167
+ wh_smiles=wh_smiles,
168
+ lk_smiles=lk_smiles,
169
+ e3_smiles=e3_smiles,
170
+ n_bits=n_bits,
171
+ radius=radius,
172
+ descriptor_names=descriptor_names,
173
+ verbose=verbose
174
+ )
175
+
176
+ @staticmethod
177
+ def build_multiclass_target(
178
+ df: pd.DataFrame,
179
+ poi_attachment_id: int = 1,
180
+ e3_attachment_id: int = 2,
181
+ ) -> pd.Series:
182
+ """
183
+ Returns multiclass target: 0 = no split, 1 = WH split, 2 = E3 split (with the default attachment ids poi=1, e3=2).
184
+ """
185
+ assert ((df["label_e3_split"] + df["label_wh_split"]) <= 1).all()
186
+ y = (
187
+ df["label_wh_split"] * poi_attachment_id +
188
+ df["label_e3_split"] * e3_attachment_id
189
+ )
190
+ return y.astype("int32")
191
+
192
+ def _ensure_features(self, X: Union[pd.DataFrame, List[Dict], List[str]]) -> pd.DataFrame:
193
+ """ Filter out features/columns that are are not used in the pipeline. """
194
+ required_columns = (
195
+ (self.graph_features or []) +
196
+ (self.categorical_features or []) +
197
+ (self.descriptor_features or []) +
198
+ (self.fingerprint_features or [])
199
+ )
200
+ # If input is a DataFrame with SMILES, assume already featurized
201
+ if isinstance(X, pd.DataFrame):
202
+ Xf = X
203
+ elif isinstance(X, list) and isinstance(X[0], dict):
204
+ Xf = pd.DataFrame(X)
205
+ else:
206
+ raise ValueError("Provide either a DataFrame or list of feature dicts. Use extract_graph_features for SMILES.")
207
+ missing = set(required_columns) - set(Xf.columns)
208
+ if missing:
209
+ raise ValueError(f"Input data missing required columns: {missing}")
210
+ return Xf[required_columns].copy()
211
+
212
+ def predict_proba_from_smiles(
213
+ self,
214
+ protac_smiles: Union[str, List[str]],
215
+ wh_smiles: Union[str, List[str]],
216
+ lk_smiles: Union[str, List[str]],
217
+ e3_smiles: Union[str, List[str]],
218
+ verbose: int = 0,
219
+ ):
220
+ features = self.extract_graph_features(
221
+ protac_smiles, wh_smiles, lk_smiles, e3_smiles,
222
+ n_bits=self.n_bits,
223
+ radius=self.radius,
224
+ descriptor_names=self.descriptor_names,
225
+ verbose=verbose
226
+ )
227
+ Xf = self._ensure_features(features)
228
+ return self.pipeline.predict_proba(Xf)
229
+
230
+ def predict_from_smiles(
231
+ self,
232
+ protac_smiles: Union[str, List[str]],
233
+ wh_smiles: Union[str, List[str]],
234
+ lk_smiles: Union[str, List[str]],
235
+ e3_smiles: Union[str, List[str]],
236
+ top_n: int = 1,
237
+ return_array: bool = True,
238
+ verbose: int = 0,
239
+ ) -> Union[pd.DataFrame, np.ndarray]:
240
+ """
241
+ For binary classification:
242
+ For each SMILES, return the top_n edge chem_bond_idx indices among those predicted as class 1,
243
+ sorted by predicted probability. If not enough edges are class 1, pad with -1.
244
+ For multiclass:
245
+ For each SMILES, return the chem_bond_idx with highest probability for class 1 (E3 split)
246
+ and for class 2 (WH split). Shape: (num_smiles, 2).
247
+ If no edge is predicted as that class, value is -1.
248
+ """
249
+ features = self.extract_graph_features(
250
+ protac_smiles, wh_smiles, lk_smiles, e3_smiles,
251
+ n_bits=self.n_bits,
252
+ radius=self.radius,
253
+ descriptor_names=self.descriptor_names,
254
+ verbose=verbose
255
+ )
256
+ Xf = self._ensure_features(features)
257
+ pred_proba = self.pipeline.predict_proba(Xf)
258
+ pred_label = self.pipeline.predict(Xf)
259
+ features = features.copy()
260
+ features["pred_label"] = pred_label
261
+ features["pred_proba"] = pred_proba[:, 1] if pred_proba.shape[1] > 1 else pred_proba[:, 0]
262
+
263
+ unique_smiles = pd.Series(features["chem_mol_smiles"]).drop_duplicates().tolist()
264
+ groupby = features.groupby("chem_mol_smiles")
265
+
266
+ results = []
267
+
268
+ if return_array:
269
+ if pred_proba.shape[1] == 2: # Binary case
270
+ for mol_smiles in unique_smiles:
271
+ group = groupby.get_group(mol_smiles)
272
+ # Only consider edges predicted as label 1
273
+ edges_class1 = group[group["pred_label"] == 1]
274
+ # If none, pad with -1
275
+ if len(edges_class1) == 0:
276
+ results.append(np.full(top_n, -1))
277
+ continue
278
+ # Sort by proba, take top_n
279
+ top_edges = edges_class1.nlargest(top_n, "pred_proba")
280
+ idxs = top_edges["chem_bond_idx"].to_numpy()
281
+ if len(idxs) < top_n:
282
+ idxs = np.pad(idxs, (0, top_n - len(idxs)), constant_values=-1)
283
+ results.append(idxs[:top_n])
284
+ return np.vstack(results)
285
+ else: # Multiclass case
286
+ for mol_smiles in unique_smiles:
287
+ group = groupby.get_group(mol_smiles)
288
+ # For class 1
289
+ class1_idx = -1
290
+ if (group["pred_label"] == 1).any():
291
+ # Take the edge with highest class-1 probability
292
+ mask = group["pred_label"] == 1
293
+ idx1 = group.loc[mask, "pred_proba"].idxmax()
294
+ class1_idx = group.loc[idx1, "chem_bond_idx"]
295
+ # For class 2
296
+ class2_idx = -1
297
+ if (group["pred_label"] == 2).any():
298
+ mask = group["pred_label"] == 2
299
+ idx2 = group.loc[mask, "pred_proba"].idxmax()
300
+ class2_idx = group.loc[idx2, "chem_bond_idx"]
301
+ results.append([class1_idx, class2_idx])
302
+ return np.array(results, dtype=int)
303
+ else:
304
+ return features
305
+
306
+ def get_classification_report(y_true, y_pred, labels):
307
+ report = classification_report(y_true, y_pred, target_names=labels, output_dict=True)
308
+ df_report = pd.DataFrame(report).transpose().round(2)
309
+ print(df_report)
310
+ return df_report
311
+
312
+ def plot_confusion_matrix(y_true, y_pred, labels):
313
+ cm = confusion_matrix(y_true, y_pred)
314
+ if HAS_VISUALIZATION:
315
+ plt.figure(figsize=(8, 6))
316
+ sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels)
317
+ plt.xlabel("Predicted")
318
+ plt.ylabel("True")
319
+ plt.title("Confusion Matrix")
320
+ plt.show()
321
+ else:
322
+ print("Visualization libraries not available. Skipping confusion matrix plot.")
323
+ print("Confusion Matrix:")
324
+ print(cm)
325
+
326
+ def get_classification_report_and_plot(y_true, y_pred, labels):
327
+ report = get_classification_report(y_true, y_pred, labels)
328
+ plot_confusion_matrix(y_true, y_pred, labels)
329
+ return report
330
+
331
+def train_edge_classifier(
+    train_df: pd.DataFrame,
+    val_df: Optional[pd.DataFrame] = None,
+    test_df: Optional[pd.DataFrame] = None,
+    model_filename: Optional[Union[str, Path]] = None,
+    edge_classifier_kwargs: Optional[Dict[str, Any]] = None,
+    cache_dir: Optional[Union[str, Path]] = None,
+    return_reports: bool = True,
+    show_confusion_matrix: bool = False,
+) -> GraphEdgeClassifier:
+    """
+    Train an edge-level graph classifier for PROTACs.
+
+    Args:
+        train_df (pd.DataFrame): Training data with columns:
+            - 'PROTAC SMILES'
+            - 'POI Ligand SMILES with direction'
+            - 'Linker SMILES with direction'
+            - 'E3 Binder SMILES with direction'
+        val_df (Optional[pd.DataFrame]): Validation data, same format as train_df.
+        test_df (Optional[pd.DataFrame]): Test data, same format as train_df.
+        model_filename (Optional[Union[str, Path]]): Path to save the trained model.
+        edge_classifier_kwargs (Optional[Dict[str, Any]]): Additional parameters for GraphEdgeClassifier. Feature-list entries left as None are filled from the training feature columns.
+        cache_dir (Optional[Union[str, Path]]): Directory for caching extracted features as CSV files.
+        return_reports (bool): Whether to also return the classification report of the last evaluated set.
+        show_confusion_matrix (bool): Whether to plot a confusion matrix for the validation and test sets.
+
+    Returns:
+        GraphEdgeClassifier: Trained edge classifier instance. If return_reports is True, a (classifier, report) tuple is returned instead.
+    """
+    sets = {}
+    for set_name, df in [
+        ("train", train_df),
+        ("val", val_df),
+        ("test", test_df),
+    ]:
+        if cache_dir is not None:
+            cache_path = Path(cache_dir) / f"{set_name}.csv"
+            if cache_path.exists():
+                print(f"Loading cached features for {set_name} from {cache_path}")
+                sets[set_name] = pd.read_csv(cache_path)
+                continue
+            else:
+                print(f"Cache not found for {set_name}, extracting features...")
+
+        if df is None or df.empty:
+            continue
+
+        print(f"Set: {set_name}, size: {len(df):,}")
+        required_columns = [
+            'PROTAC SMILES',
+            'POI Ligand SMILES with direction',
+            'Linker SMILES with direction',
+            'E3 Binder SMILES with direction',
+        ]
+        missing = [c for c in required_columns if c not in df.columns]
+        if missing:
+            raise ValueError(f"DataFrame for {set_name} is missing required columns: {missing}")
+
+        sets[set_name] = GraphEdgeClassifier.extract_graph_features(
+            df['PROTAC SMILES'].tolist(),
+            df['POI Ligand SMILES with direction'].tolist(),
+            df['Linker SMILES with direction'].tolist(),
+            df['E3 Binder SMILES with direction'].tolist(),
+            verbose=1,
+        )
+        # Drop rows where label_e3_split + label_wh_split > 1
+        sets[set_name] = sets[set_name][(sets[set_name]["label_e3_split"] + sets[set_name]["label_wh_split"]) <= 1]
+        print(f"Set: {set_name}, size: {len(sets[set_name]):,}")
+        if cache_dir is not None:
+            cache_path = Path(cache_dir) / f"{set_name}.csv"
+            cache_path.parent.mkdir(parents=True, exist_ok=True)
+            sets[set_name].to_csv(cache_path, index=False)
+            print(f"Saved {set_name} features to {cache_path}")
+
+    train_set = sets["train"]
+    label_cols = [c for c in train_set.columns if c.startswith("label_")]
+    train_set = train_set.dropna(subset=label_cols)
+    train_set = train_set[(train_set["label_e3_split"] + train_set["label_wh_split"]) <= 1]
+    X_train = train_set.drop(columns=label_cols)
+
+    # Instantiate and train. Feature lists left as None (e.g., by the Optuna
+    # objective below) are derived from the training feature columns.
+    edge_classifier_kwargs = edge_classifier_kwargs or {
+        "use_descriptors": False,
+        "use_fingerprints": True,
+        "n_svd_components": 50,
+        "binary": True,
+        "smote_k_neighbors": 10,
+        "xgb_params": {
+            "max_depth": 6,
+            "learning_rate": 0.3,
+            "alpha": 0.1,   # Default: 0
+            "lambda": 0.5,  # Default: 1
+            "gamma": 0.1,   # Default: 0
+        },
+    }
+    if edge_classifier_kwargs.get("graph_features") is None:
+        edge_classifier_kwargs["graph_features"] = [c for c in X_train.columns if c.startswith("graph_")]
+    if edge_classifier_kwargs.get("categorical_features") is None:
+        edge_classifier_kwargs["categorical_features"] = ["chem_bond_type", "chem_atom_u", "chem_atom_v"]
+    if edge_classifier_kwargs.get("fingerprint_features") is None:
+        edge_classifier_kwargs["fingerprint_features"] = [c for c in X_train.columns if c.startswith("chem_mol_fp_")]
+    clf = GraphEdgeClassifier(**edge_classifier_kwargs)
+
+    # Prepare target variable according to classification type
+    if clf.binary:
+        y_train = train_set["label_is_split"].astype("int32")
+    else:
+        y_train = GraphEdgeClassifier.build_multiclass_target(train_set)
+
+    print(f"Training set size: {len(X_train):,}, labels: {y_train.unique()}")
+    clf.fit(X_train, y_train)
+    print("Training complete.")
+
+    if model_filename is not None:
+        clf.save(model_filename)
+        print(f"Model saved to {model_filename}")
+
+    target_labels = ["No Split", "Split"] if clf.binary else ["No Split", "WH-Linker", "E3-Linker"]
+
+    report = None
+    if "val" in sets:
+        # Get validation data
+        val_set = sets["val"].dropna(subset=label_cols)
+        val_set = val_set[(val_set["label_e3_split"] + val_set["label_wh_split"]) <= 1]
+        X_val = val_set.drop(columns=label_cols)
+        y_val = val_set["label_is_split"].astype("int32") if clf.binary else GraphEdgeClassifier.build_multiclass_target(val_set)
+        y_pred = clf.predict(X_val)
+        if show_confusion_matrix:
+            report = get_classification_report_and_plot(y_val, y_pred, target_labels)
+        else:
+            report = get_classification_report(y_val, y_pred, target_labels)
+        print(f"Validation set classification report:\n{report.to_markdown()}")
+
+    if "test" in sets:
+        # Get test data
+        test_set = sets["test"].dropna(subset=label_cols)
+        test_set = test_set[(test_set["label_e3_split"] + test_set["label_wh_split"]) <= 1]
+        X_test = test_set.drop(columns=label_cols)
+        y_test = test_set["label_is_split"].astype("int32") if clf.binary else GraphEdgeClassifier.build_multiclass_target(test_set)
+        y_pred = clf.predict(X_test)
+        if show_confusion_matrix:
+            report = get_classification_report_and_plot(y_test, y_pred, target_labels)
+        else:
+            report = get_classification_report(y_test, y_pred, target_labels)
+        print(f"Test set classification report:\n{report.to_markdown()}")
+
+    if return_reports:
+        return clf, report
+    else:
+        return clf
+
+
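For reference, a minimal usage sketch of `train_edge_classifier`; the CSV paths are hypothetical placeholders, and the input files must contain the four SMILES columns listed in the docstring:

```python
import pandas as pd

train_df = pd.read_csv("train.csv")  # placeholder path
val_df = pd.read_csv("val.csv")      # placeholder path

clf, val_report = train_edge_classifier(
    train_df=train_df,
    val_df=val_df,
    model_filename="edge_classifier.joblib",  # hypothetical output path
    cache_dir="./feature_cache",  # features are extracted once and reused
    return_reports=True,
)
```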
+def objective(trial, train_df, val_df):
+    # HP space
+    max_depth = trial.suggest_int("max_depth", 3, 10)
+    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
+    alpha = trial.suggest_float("alpha", 0.0, 2.0)
+    reg_lambda = trial.suggest_float("lambda", 0.0, 2.0)
+    gamma = trial.suggest_float("gamma", 0.0, 1.0)
+    n_svd_components = trial.suggest_int("n_svd_components", 16, 128)
+    smote_k_neighbors = trial.suggest_int("smote_k_neighbors", 3, 15)
+    use_descriptors = trial.suggest_categorical("use_descriptors", [False, True])
+    use_fingerprints = trial.suggest_categorical("use_fingerprints", [True, False])
+
+    edge_classifier_kwargs = {
+        "graph_features": None,  # Filled in by train_edge_classifier
+        "categorical_features": None,
+        "fingerprint_features": None,
+        "use_descriptors": use_descriptors,
+        "use_fingerprints": use_fingerprints,
+        "n_svd_components": n_svd_components,
+        "binary": True,
+        "smote_k_neighbors": smote_k_neighbors,
+        "xgb_params": {
+            "max_depth": max_depth,
+            "learning_rate": learning_rate,
+            "alpha": alpha,
+            "lambda": reg_lambda,
+            "gamma": gamma,
+        },
+    }
+
+    _, val_report = train_edge_classifier(
+        train_df=train_df,
+        val_df=val_df,
+        edge_classifier_kwargs=edge_classifier_kwargs,
+        return_reports=True,
+    )
+
+    # Evaluate metrics on the validation set. val_report is the transposed
+    # classification_report DataFrame: class names in the index, metric columns
+    # ['precision', 'recall', 'f1-score', 'support']. The binary positive
+    # class is "Split".
+    try:
+        f1_1 = float(val_report.loc["Split", "f1-score"])
+    except Exception:
+        f1_1 = 0.0
+    try:
+        acc = float(val_report.loc["accuracy", "f1-score"])
+    except Exception:
+        acc = 0.0
+
+    # Multi-objective: prioritize F1 for the minority class, but keep accuracy.
+    # Adjust the weights depending on the task (here they are equal).
+    score = 0.5 * acc + 0.5 * f1_1
+    return score
+
+def run_optuna_search(
+    train_df: pd.DataFrame,
+    val_df: pd.DataFrame,
+    n_trials: int = 50,
+    study_name: str = "edge_classifier_hp_search",
+    study_dir: str = "./optuna_studies",
+    seed: int = 42,
+) -> Any:
+    """Run an Optuna hyperparameter search and retrain a classifier with the best trial."""
+    import os
+    os.makedirs(study_dir, exist_ok=True)
+    study_path = f"sqlite:///{os.path.join(study_dir, study_name)}.db"
+
+    study = optuna.create_study(
+        study_name=study_name,
+        direction="maximize",
+        sampler=QMCSampler(seed=seed, qmc_type="sobol"),
+        storage=study_path,
+        load_if_exists=True,
+    )
+    func = lambda trial: objective(trial, train_df, val_df)
+    study.optimize(func, n_trials=n_trials, show_progress_bar=True)
+
+    print("Best trial:")
+    print(study.best_trial)
+
+    # Train a classifier with the best hyperparameters and return it
+    best_params = study.best_trial.params
+    edge_classifier_kwargs = {
+        "graph_features": None,
+        "categorical_features": None,
+        "fingerprint_features": None,
+        "use_descriptors": best_params["use_descriptors"],
+        "use_fingerprints": best_params["use_fingerprints"],
+        "n_svd_components": best_params["n_svd_components"],
+        "binary": True,
+        "smote_k_neighbors": best_params["smote_k_neighbors"],
+        "xgb_params": {
+            "max_depth": best_params["max_depth"],
+            "learning_rate": best_params["learning_rate"],
+            "alpha": best_params["alpha"],
+            "lambda": best_params["lambda"],
+            "gamma": best_params["gamma"],
+        },
+    }
+    clf, _ = train_edge_classifier(
+        train_df=train_df,
+        val_df=val_df,
+        edge_classifier_kwargs=edge_classifier_kwargs,
+        return_reports=True,
+    )
+    study_file = os.path.join(study_dir, f"{study_name}_study.pkl")
+    import joblib
+    joblib.dump(study, study_file)
+    print(f"Optuna study saved to {study_file}")
+    return clf, study
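A minimal usage sketch for the search; the CSV paths are hypothetical placeholders:

```python
import pandas as pd

train_df = pd.read_csv("train.csv")  # placeholder path
val_df = pd.read_csv("val.csv")      # placeholder path

# Resumes the study from the SQLite storage if it already exists.
clf, study = run_optuna_search(train_df, val_df, n_trials=25)
print(study.best_trial.params)
```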
protac_splitter/graphs/edge_features.py ADDED
@@ -0,0 +1,293 @@
+from typing import Tuple, List
+
+from rdkit import Chem
+from rdkit.Chem import AllChem, Descriptors, Draw
+import networkx as nx
+import pandas as pd
+import numpy as np
+from tqdm import tqdm
+
+from protac_splitter.chemoinformatics import get_atom_idx_at_attachment
+from protac_splitter.display_utils import safe_display, get_mapped_protac_img
+
+
+def bond_capacity(bond: Chem.Bond) -> int:
+    """ Calculate the capacity of a bond based on its type and properties.
+
+    Parameters:
+        bond (Chem.Bond): The bond object from RDKit.
+
+    Returns:
+        int: The capacity of the bond, where higher values indicate less preference for cutting.
+    """
+    # High capacity for aromatic and ring bonds to avoid cutting them
+    if bond.GetIsAromatic() or bond.IsInRing():
+        return 1000  # very high capacity: avoid cutting aromatic bonds
+    elif bond.GetBondType() == Chem.BondType.SINGLE:
+        return 1  # low capacity: prefer to cut here
+    elif bond.GetBondType() == Chem.BondType.DOUBLE:
+        return 10  # medium penalty
+    elif bond.GetBondType() == Chem.BondType.TRIPLE:
+        return 20  # stronger penalty
+    else:
+        return 50  # fallback for unknown/rare types
+
+def smiles_to_nx(
+    smiles: str,
+    use_capacity: bool = False,
+) -> nx.Graph:
+    """ Convert a SMILES string to a NetworkX graph.
+
+    Parameters:
+        smiles (str): The SMILES string to convert.
+        use_capacity (bool): Whether to store bond capacities as edge attributes.
+
+    Returns:
+        nx.Graph: The NetworkX graph representation of the molecule.
+    """
+    mol = Chem.MolFromSmiles(smiles)
+    if mol is None:
+        raise ValueError(f"Input SMILES could not be parsed: {smiles}")
+    # Canonicalize the SMILES
+    mol = Chem.MolFromSmiles(Chem.MolToSmiles(mol, canonical=True))
+    if mol is None:
+        raise ValueError(f"Input SMILES could not be canonicalized: {smiles}")
+    # Convert the molecule to a NetworkX graph
+    G = nx.Graph()
+    if use_capacity:
+        for bond in mol.GetBonds():
+            capacity = bond_capacity(bond)
+            G.add_edge(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx(), capacity=capacity)
+    else:
+        for bond in mol.GetBonds():
+            G.add_edge(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())
+    return G
+
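For instance, a minimal sketch of the capacity-annotated graph; ethanol is an arbitrary example molecule:

```python
# Inspect the capacity-annotated graph of a small molecule.
G = smiles_to_nx("CCO", use_capacity=True)  # ethanol: two single bonds
for u, v, data in G.edges(data=True):
    print(u, v, data)  # e.g. 0 1 {'capacity': 1}
```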
+def extract_edge_features(
+    protac_smiles: str,
+    e3_split_pair: Tuple[int, int] = None,
+    wh_split_pair: Tuple[int, int] = None,
+    n_bits: int = 512,
+    radius: int = 6,
+    descriptor_names: List[str] = None,
+    fp_as_string: bool = False,
+) -> pd.DataFrame:
+    """Extract features from the edges of a PROTAC molecule represented as a SMILES string.
+
+    Parameters:
+        protac_smiles (str): SMILES representation of the PROTAC molecule.
+        e3_split_pair (Tuple[int, int]): Indices of the atoms at the E3 split bond.
+        wh_split_pair (Tuple[int, int]): Indices of the atoms at the warhead split bond.
+        n_bits (int): Number of bits for Morgan fingerprints.
+        radius (int): Radius for Morgan fingerprints.
+        descriptor_names (List[str]): List of RDKit descriptor names to compute.
+        fp_as_string (bool): Whether to store the fingerprint as a single bit string instead of one boolean column per bit.
+
+    Returns:
+        pd.DataFrame: DataFrame containing edge features.
+    """
+    mol = Chem.MolFromSmiles(protac_smiles)
+    if mol is None:
+        raise ValueError(f"Input SMILES could not be parsed: {protac_smiles}")
+    # Canonicalize the SMILES
+    mol = Chem.MolFromSmiles(Chem.MolToSmiles(mol, canonical=True))
+    if mol is None:
+        raise ValueError(f"Input SMILES could not be canonicalized: {protac_smiles}")
+
+    # Step 1: Convert SMILES to NetworkX
+    G = smiles_to_nx(protac_smiles, use_capacity=False)
+
+    num_nodes = G.number_of_nodes()
+    num_edges = G.number_of_edges()
+
+    # Step 2: Create line graph and compute betweenness + degree
+    LG = nx.line_graph(G)
+    line_betweenness = nx.betweenness_centrality(LG, endpoints=True)
+    betweenness = nx.betweenness_centrality(G, endpoints=True)
+
+    # Compute k-hop degrees (number of nodes at exactly 2 and 3 hops)
+    # TODO: Shall I get the degree of the node in the line graph or the original graph?
+    line_degree = dict(LG.degree())
+    line_degree_r2 = {}
+    line_degree_r3 = {}
+    for node in LG.nodes():
+        # Nodes at distance 2 and 3 (excluding the center node)
+        neighbors_r2 = nx.single_source_shortest_path_length(LG, node, cutoff=2)
+        neighbors_r3 = nx.single_source_shortest_path_length(LG, node, cutoff=3)
+        line_degree_r2[node] = len([n for n, d in neighbors_r2.items() if d == 2])
+        line_degree_r3[node] = len([n for n, d in neighbors_r3.items() if d == 3])
+
+    degree = dict(G.degree())
+    degree_r2 = {}
+    degree_r3 = {}
+    for node in G.nodes():
+        # Nodes at distance 2 and 3 (excluding the center node)
+        neighbors_r2 = nx.single_source_shortest_path_length(G, node, cutoff=2)
+        neighbors_r3 = nx.single_source_shortest_path_length(G, node, cutoff=3)
+        degree_r2[node] = len([n for n, d in neighbors_r2.items() if d == 2])
+        degree_r3[node] = len([n for n, d in neighbors_r3.items() if d == 3])
+
+    if e3_split_pair is not None and wh_split_pair is not None:
+        true_split_edges = {frozenset(e3_split_pair), frozenset(wh_split_pair)}
+
+    # Get molecular characteristics, i.e., Morgan fingerprints and descriptors
+    # Generate Morgan fingerprint
+    fp_bitvec = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
+    fp = np.zeros((n_bits,), dtype=np.float32)
+    AllChem.DataStructs.ConvertToNumpyArray(fp_bitvec, fp)
+    if fp_as_string:
+        fp = {"chem_mol_fp": "".join([str(int(bit)) for bit in fp])}
+    else:
+        fp = {f"chem_mol_fp_{i}": bool(fp[i]) for i in range(n_bits)}
+    # Generate RDKit descriptors
+    descriptor_func_names = descriptor_names or [
+        "MolWt", "HeavyAtomCount", "NumHAcceptors", "NumHDonors",
+        "TPSA", "NumRotatableBonds", "RingCount", "MolLogP"
+    ]
+    functions = [getattr(Descriptors, name) for name in descriptor_func_names]
+    descriptors = {f"chem_mol_desc_{name}": func(mol) for name, func in zip(descriptor_func_names, functions)}
+
+    # Step 3: Gather edge features
+    # NOTE: Only consider bridge bonds (cutting a non-bridge bond cannot split the molecule)
+    edge_features = []
+    for (u, v) in nx.bridges(G):
+        bond = mol.GetBondBetweenAtoms(u, v)
+
+        # Avoid reporting the same edge twice (i.e., swap u and v if needed) and
+        # ensure to find the node pair in the line graph
+        node = (u, v) if (u, v) in LG else (v, u)
+        node_key = node if node in line_betweenness else (v, u)
+
+        features = {
+            "graph_num_nodes": num_nodes,
+            "graph_num_edges": num_edges,
+            "graph_betweenness": line_betweenness.get(node_key, 0.0),
+            "graph_degree": line_degree.get(node_key, 0),
+            "graph_degree_r2": line_degree_r2.get(node_key, 0),
+            "graph_degree_r3": line_degree_r3.get(node_key, 0),
+            "graph_node_u_degree": degree.get(u, 0),
+            "graph_node_u_degree_r2": degree_r2.get(u, 0),
+            "graph_node_u_degree_r3": degree_r3.get(u, 0),
+            "graph_node_v_degree": degree.get(v, 0),
+            "graph_node_v_degree_r2": degree_r2.get(v, 0),
+            "graph_node_v_degree_r3": degree_r3.get(v, 0),
+            "graph_node_u_betweenness": betweenness.get(u, 0.0),
+            "graph_node_v_betweenness": betweenness.get(v, 0.0),
+            "chem_bond_idx": bond.GetIdx(),
+            "chem_bond_type": str(bond.GetBondType()),
+            "chem_atom_u": mol.GetAtomWithIdx(u).GetSymbol(),
+            "chem_atom_v": mol.GetAtomWithIdx(v).GetSymbol(),
+            "chem_is_aromatic": bond.GetIsAromatic(),
+            "chem_is_in_ring": bond.IsInRing(),
+            "chem_mol_smiles": protac_smiles,
+            "chem_mol_n_bits": n_bits,
+            "chem_mol_radius": radius,
+        }
+        # Add RDKit descriptors and Morgan fingerprint
+        features.update(fp)
+        features.update(descriptors)
+
+        # Add E3 and warhead split labels
+        if e3_split_pair is not None and wh_split_pair is not None:
+            features.update({
+                "label_is_split": frozenset([u, v]) in true_split_edges,
+                "label_e3_split": frozenset([u, v]) == frozenset(e3_split_pair),
+                "label_wh_split": frozenset([u, v]) == frozenset(wh_split_pair),
+            })
+
+        # Append the features to the list of edge features
+        edge_features.append(features)
+
+    df = pd.DataFrame(edge_features)
+
+    # Downcast int64 columns to int32 to save memory
+    int64_cols = df.select_dtypes(include=['int64']).columns
+    dtype_mapping = {col: np.int32 for col in int64_cols}
+    df = df.astype(dtype_mapping)
+
+    return df
+
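A minimal sketch of calling `extract_edge_features` on a small molecule; the SMILES below is an arbitrary short chain, not a real PROTAC, and no split pairs are passed, so no label columns are produced:

```python
# Illustration only: a small molecule stands in for a PROTAC.
df = extract_edge_features("CCOCCNC(=O)c1ccccc1", n_bits=64, radius=2)
print(df[["chem_bond_idx", "chem_bond_type", "graph_betweenness"]].head())
```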
+def get_edge_features(
+    protac_smiles: str | List[str],
+    wh_smiles: str | List[str],
+    lk_smiles: str | List[str],
+    e3_smiles: str | List[str],
+    n_bits: int = 512,
+    radius: int = 6,
+    descriptor_names: List[str] = None,
+    fp_as_string: bool = False,
+    verbose: int = 0,
+) -> pd.DataFrame:
+    """Get edge features for a given PROTAC molecule and its components.
+
+    Parameters:
+        protac_smiles (str | List[str]): SMILES representation of the PROTAC molecule(s).
+        wh_smiles (str | List[str]): SMILES representation of the warhead(s).
+        lk_smiles (str | List[str]): SMILES representation of the linker(s).
+        e3_smiles (str | List[str]): SMILES representation of the E3 binder(s).
+        n_bits (int): Number of bits for Morgan fingerprints.
+        radius (int): Radius for Morgan fingerprints.
+        descriptor_names (List[str]): List of RDKit descriptor names to compute.
+        fp_as_string (bool): Whether to store the fingerprint as a single bit string.
+        verbose (int): Verbosity level; > 0 shows a progress bar, > 1 also displays images of sampled edges.
+
+    Returns:
+        pd.DataFrame: DataFrame containing edge features.
+    """
+    if isinstance(protac_smiles, str):
+        protac_smiles = [protac_smiles]
+    if isinstance(wh_smiles, str):
+        wh_smiles = [wh_smiles]
+    if isinstance(lk_smiles, str):
+        lk_smiles = [lk_smiles]
+    if isinstance(e3_smiles, str):
+        e3_smiles = [e3_smiles]
+
+    iterables = zip(protac_smiles, wh_smiles, lk_smiles, e3_smiles)
+    iterables = tqdm(iterables, desc="Extracting edge features", total=len(protac_smiles), disable=verbose == 0)
+    features_list = []
+    for protac_smi, wh_smi, lk_smi, e3_smi in iterables:
+        if verbose > 1:
+            get_mapped_protac_img(protac_smi, wh_smi, lk_smi, e3_smi, w=1500, h=600, display_image=True, useSVG=True)
+
+        # Convert SMILES to RDKit molecules
+        protac = Chem.MolFromSmiles(protac_smi)
+        wh = Chem.MolFromSmiles(wh_smi)
+        lk = Chem.MolFromSmiles(lk_smi)
+        e3 = Chem.MolFromSmiles(e3_smi)
+        if protac is None or wh is None or lk is None or e3 is None:
+            raise ValueError(f"Invalid SMILES string: {protac_smi}, {wh_smi}, {lk_smi}, {e3_smi}")
+
+        # Get the attachment points
+        wh_edge = get_atom_idx_at_attachment(protac, wh, lk)
+        e3_edge = get_atom_idx_at_attachment(protac, e3, lk)
+
+        # Extract features
+        features = extract_edge_features(
+            protac_smi,
+            e3_split_pair=e3_edge,
+            wh_split_pair=wh_edge,
+            n_bits=n_bits,
+            radius=radius,
+            descriptor_names=descriptor_names,
+            fp_as_string=fp_as_string,
+        )
+
+        if verbose > 1:
+            # Randomly sample and display a few edges
+            sample_edges = features.sample(n=min(5, len(features)), random_state=42)
+            for _, row in sample_edges.iterrows():
+                bond = protac.GetBondWithIdx(row['chem_bond_idx'])
+                u, v = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
+                safe_display(Draw.MolToImage(
+                    protac,
+                    size=(1500, 400),
+                    highlightColor=(1, 0, 1, 0.3),  # Light purple
+                    highlightAtoms=[u, v],  # Highlight the two atoms
+                    legend=f"Graph nodes: {u}, {v} (Betweenness centrality: {row['graph_betweenness']:.3f})",
+                ))
+                # print(row[[c for c in features.columns if c.startswith('graph_')] + ['chem_atom_u', 'chem_atom_v', 'chem_is_in_ring']])
+                print(row)
+
+        # Append the features to the list
+        features_list.append(features)
+
+    return pd.concat(features_list, ignore_index=True)
protac_splitter/graphs/splitting_algorithms.py ADDED
@@ -0,0 +1,512 @@
+import re
+from typing import Dict, Any, Optional, List, Union
+from pathlib import Path
+from joblib import Parallel, delayed
+
+import numpy as np
+import networkx as nx
+from rdkit import Chem, DataStructs
+from rdkit.Chem import rdFingerprintGenerator
+
+from .edge_classifier import GraphEdgeClassifier
+from .e3_clustering import get_representative_e3s_fp
+from .utils import average_tanimoto_distance
+from protac_splitter.data.curation.bond_adjustments import (
+    adjust_amide_bonds_in_substructs,
+    adjust_ester_bonds_in_substructs,
+)
+
+def bond_capacity(bond: Chem.Bond) -> int:
+    """Capacity of a bond: higher values make the bond less preferable to cut."""
+    if bond.GetIsAromatic() or bond.IsInRing():
+        return 1000  # very high capacity: avoid cutting aromatic/ring bonds
+    elif bond.GetBondType() == Chem.BondType.SINGLE:
+        return 1  # low capacity: prefer to cut here
+    elif bond.GetBondType() == Chem.BondType.DOUBLE:
+        return 10  # medium penalty
+    elif bond.GetBondType() == Chem.BondType.TRIPLE:
+        return 20  # stronger penalty
+    else:
+        return 50  # fallback for unknown/rare types
+
+def smiles_to_nx(smiles: str) -> nx.Graph:
+    """Convert a SMILES string to a capacity-annotated NetworkX graph."""
+    mol = Chem.MolFromSmiles(smiles)
+    if mol is None:
+        raise ValueError(f"Input SMILES could not be parsed: {smiles}")
+    G = nx.Graph()
+    for bond in mol.GetBonds():
+        capacity = bond_capacity(bond)
+        G.add_edge(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx(), capacity=capacity)
+    return G
+
+def extract_attachment_point(smiles):
+    """
+    Extracts the number X from the pattern [X*] in a SMILES string.
+
+    Parameters:
+        smiles (str): The SMILES string containing the attachment point.
+
+    Returns:
+        str or None: The extracted number as a string, or None if not found.
+    """
+    match = re.search(r'\[(\d+)\*\]', smiles)
+    return match.group(1) if match else None
+
+def split_protac_with_betweenness_centrality(
+    protac_smiles: str,
+    representative_e3s_fp: List[DataStructs.ExplicitBitVect] = None,
+    morgan_fp_generator: Optional[Any] = None,
+    use_capacity_weight: bool = False,
+    betweenness_threshold: float = 0.4,
+) -> Dict[str, str]:
+    """
+    Split the PROTAC molecule into its three substructures using the NetworkX library.
+
+    Parameters:
+        protac_smiles (str): The SMILES string of the PROTAC molecule.
+        representative_e3s_fp (list): List of representative E3 ligand fingerprints.
+        morgan_fp_generator: RDKit Morgan fingerprint generator (should be the same as the one that generated the E3 fingerprints).
+        use_capacity_weight (bool): Whether to use bond capacity as weight for the graph.
+        betweenness_threshold (float): Threshold for betweenness centrality to consider a node as a candidate for splitting.
+
+    Returns:
+        dict: A dictionary containing the E3 ligand, warhead, linker, top nodes, and centrality scores.
+    """
+    if morgan_fp_generator is None:
+        # Create a default Morgan fingerprint generator
+        morgan_fp_generator = rdFingerprintGenerator.GetMorganGenerator(
+            radius=16,
+            fpSize=1024,
+            useBondTypes=True,
+            includeChirality=True,
+        )
+
+    if representative_e3s_fp is None:
+        # Get the representative E3 ligands fingerprints
+        representative_e3s_fp = get_representative_e3s_fp(fp_generator=morgan_fp_generator)
+
+    # -----------------------------------
+    # Deterministic graph-based algorithm
+    # -----------------------------------
+    protac = Chem.MolFromSmiles(protac_smiles)
+    if protac is None:
+        raise ValueError(f"Invalid SMILES string: {protac_smiles}")
+
+    G = smiles_to_nx(protac_smiles)
+
+    # Compute betweenness centrality
+    weight = 'capacity' if use_capacity_weight else None
+    centrality = nx.betweenness_centrality(G, normalized=True, endpoints=True, weight=weight)
+
+    # Sort the nodes by decreasing betweenness centrality
+    sorted_nodes = sorted(centrality.items(), key=lambda x: x[1], reverse=True)
+
+    # Get the list of bridges in the graph and the set of atoms they connect
+    bridges = list(nx.bridges(G))
+    bridge_nodes = {n for edge in bridges for n in edge}
+
+    # Get the top two bridge nodes by betweenness centrality
+    top_nodes = [n for n, _ in sorted_nodes if n in bridge_nodes][:2]
+
+    # Get the top nodes with the highest betweenness centrality that are not in
+    # a ring, but are adjacent to the top nodes or have a high betweenness
+    for node, score in sorted_nodes:
+        # Check if the node is in a ring in the protac molecule
+        atom = protac.GetAtomWithIdx(node)
+        if not atom.IsInRing():
+            # Check if the atom is adjacent to any of the top nodes, if so, add it to the list
+            for neighbor in G.neighbors(node):
+                if neighbor in top_nodes:
+                    top_nodes.append(node)
+                    break
+            if score > betweenness_threshold:
+                top_nodes.append(node)
+
+    # If a node has only top nodes as neighbors, add it to the list
+    for node in G.nodes():
+        if node not in top_nodes:
+            neighbors = list(G.neighbors(node))
+            if all(neighbor in top_nodes for neighbor in neighbors):
+                top_nodes.append(node)
+
+    # Get all paths between the top nodes, e.g., rings
+    for i in range(len(top_nodes)):
+        for j in range(i + 1, len(top_nodes)):
+            node1 = top_nodes[i]
+            node2 = top_nodes[j]
+
+            for path in nx.all_simple_paths(G, node1, node2):
+                for node in path:
+                    if node not in top_nodes:
+                        top_nodes.append(node)
+
+    # Remove duplicates
+    top_nodes = list(set(top_nodes))
+
+    # Loop over the top nodes and collect the bonds that connect them to nodes
+    # outside the top set
+    edge_nodes = set()
+    for top_node in top_nodes:
+        for neighbor in G.neighbors(top_node):
+            if neighbor not in top_nodes:
+                edge_nodes.update([(top_node, neighbor)])
+                break
+
+    # Get molecule fragment from the top nodes
+    bonds = [protac.GetBondBetweenAtoms(i, j) for (i, j) in edge_nodes]
+    bonds_idx = [bond.GetIdx() for bond in bonds if bond is not None]
+
+    # Try every pair of indexes; if the number of resulting fragments is not 3,
+    # then do not consider them as candidates for splitting
+    candidate_bonds = []
+    for i in range(len(bonds_idx)):
+        for j in range(i + 1, len(bonds_idx)):
+            bond1 = bonds_idx[i]
+            bond2 = bonds_idx[j]
+
+            # Get the fragments
+            fragments = Chem.FragmentOnBonds(protac, [bond1, bond2])
+
+            # Check if there are 3 fragments
+            if Chem.MolToSmiles(fragments).count(".") == 2:
+                frag_lens = []
+                avg_len = 0
+                for frag in Chem.GetMolFrags(fragments, asMols=True):
+                    frag_len = frag.GetNumAtoms()
+                    frag_lens.append(frag_len)
+                    avg_len += frag_len
+                avg_len /= 3
+
+                # Calculate the standard deviation of the fragment lengths
+                len_std = 0
+                for frag_len in frag_lens:
+                    len_std += (frag_len - avg_len) ** 2
+                len_std = (len_std / 3) ** 0.5
+                candidate_bonds.append(((bond1, bond2), len_std))
+
+    # Sort the candidate bond pairs by the standard deviation of the fragment
+    # sizes (smallest first, i.e., most balanced splits first)
+    candidate_bonds = sorted(candidate_bonds, key=lambda x: x[1])
+
+    ligands = None
+    while ligands is None and len(candidate_bonds) > 0:
+        bonds_idx = candidate_bonds[0][0]
+        try:
+            ligands = Chem.FragmentOnBonds(protac, bonds_idx, addDummies=True, dummyLabels=[(1, 1), (2, 2)])
+        except Exception as e:
+            print(f"Error fragmenting the molecule: {e}")
+            candidate_bonds.pop(0)
+
+    # If no candidate bonds were found, return None for each substructure
+    if ligands is None:
+        print(f"No candidate bonds found for splitting PROTAC: {protac_smiles}")
+        return {'e3': None, 'poi': None, 'linker': None, 'top_nodes': None, 'centrality': None}
+
+    # Get the linker (the fragment with two attachment points)
+    linker_smiles = None
+    substructures = []
+    for ligand in Chem.GetMolFrags(ligands, asMols=True):
+        ligand_smiles = Chem.MolToSmiles(ligand, canonical=True)
+        if ligand_smiles.count("*") == 2:
+            linker_smiles = ligand_smiles
+        else:
+            substructures.append(ligand_smiles)
+
+    # The substructure closest (on average) to the representative E3 ligands is
+    # taken as the E3 binder; the other one is the warhead
+    sub1_dist = average_tanimoto_distance(substructures[0], representative_e3s_fp, morgan_fp_generator)
+    sub2_dist = average_tanimoto_distance(substructures[1], representative_e3s_fp, morgan_fp_generator)
+    if sub1_dist < sub2_dist:
+        e3_smiles = substructures[0]
+        wh_smiles = substructures[1]
+    else:
+        e3_smiles = substructures[1]
+        wh_smiles = substructures[0]
+
+    # Get the attachment point using a regex, e.g., should return 1 if [1*] is in the SMILES
+    e3_attach_point = extract_attachment_point(e3_smiles)
+    e3_smiles = e3_smiles.replace(f"[{e3_attach_point}*]", "[*:2]")
+    linker_smiles = linker_smiles.replace(f"[{e3_attach_point}*]", "[*:2]")
+
+    wh_attach_point = extract_attachment_point(wh_smiles)
+    wh_smiles = wh_smiles.replace(f"[{wh_attach_point}*]", "[*:1]")
+    linker_smiles = linker_smiles.replace(f"[{wh_attach_point}*]", "[*:1]")
+    return {'e3': e3_smiles, 'poi': wh_smiles, 'linker': linker_smiles, 'top_nodes': top_nodes, 'centrality': centrality}
+
+
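A minimal usage sketch of the deterministic splitter; it assumes `protac_smiles` holds a valid PROTAC SMILES string, and it loads the default representative E3 fingerprints internally via `get_representative_e3s_fp()`:

```python
# Assumes `protac_smiles` holds a valid PROTAC SMILES string.
result = split_protac_with_betweenness_centrality(protac_smiles)
if result["linker"] is not None:
    print("warhead:", result["poi"])
    print("linker: ", result["linker"])
    print("E3:     ", result["e3"])
```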
+def split_protac_with_edge_classifier(
+    protac_smiles: str,
+    pipeline: Union[str, Path, GraphEdgeClassifier],
+    representative_e3s_fp: Optional[List[np.array]] = None,
+    morgan_fp_generator: Optional[Any] = None,
+) -> Dict[str, str]:
+    """ Split the PROTAC molecule into its three substructures using the pretrained edge classifier.
+
+    Parameters:
+        protac_smiles (str): The SMILES string of the PROTAC molecule.
+        pipeline (Union[str, Path, GraphEdgeClassifier]): A trained GraphEdgeClassifier instance, or a path to load one from.
+        representative_e3s_fp (Optional[List[np.array]]): Precomputed fingerprints of representative E3 ligands.
+        morgan_fp_generator (Optional[Any]): RDKit Morgan fingerprint generator (should be the same as the one that generated the E3 fingerprints).
+
+    Returns:
+        dict: A dictionary containing the E3 ligand, warhead, linker, and bonds_idx.
+    """
+    if morgan_fp_generator is None:
+        # Create a default Morgan fingerprint generator
+        morgan_fp_generator = rdFingerprintGenerator.GetMorganGenerator(
+            radius=16,
+            fpSize=1024,
+            useBondTypes=True,
+            includeChirality=True,
+        )
+
+    if representative_e3s_fp is None:
+        # Get the representative E3 ligands fingerprints
+        representative_e3s_fp = get_representative_e3s_fp(fp_generator=morgan_fp_generator)
+
+    protac = Chem.MolFromSmiles(protac_smiles)
+    if protac is None:
+        raise ValueError(f"Invalid SMILES string: {protac_smiles}")
+
+    if isinstance(pipeline, (str, Path)):
+        pipeline = GraphEdgeClassifier.load(pipeline)
+
+    # TODO: Get the top-n bonds; if splitting results in more than 3 ligands,
+    # test other pairs of bonds, then repeat until we get exactly 3 ligands.
+    bonds_idx = pipeline.predict_from_smiles(
+        protac_smiles,
+        wh_smiles=None,
+        lk_smiles=None,
+        e3_smiles=None,
+        top_n=2,
+        return_array=True,
+    ).flatten().tolist()
+
+    if -1 in bonds_idx:
+        bonds_idx = [bond for bond in bonds_idx if bond != -1]
+        # Randomly select non-ring bond indices from the PROTAC molecule
+        # that are not among the predicted bonds
+        for _ in range(2 - len(bonds_idx)):
+            bond = np.random.choice([bond.GetIdx() for bond in protac.GetBonds() if bond.GetIdx() not in bonds_idx and not bond.IsInRing()])
+            bonds_idx.append(int(bond))
+
+    ligands = Chem.FragmentOnBonds(protac, bonds_idx, addDummies=True, dummyLabels=[(1, 1), (2, 2)])
+
+    # Get the linker (the fragment with two attachment points)
+    linker_smiles = None
+    substructures = []
+    for ligand in Chem.GetMolFrags(ligands, asMols=True):
+        ligand_smiles = Chem.MolToSmiles(ligand, canonical=True)
+        if ligand_smiles.count("*") == 2:
+            linker_smiles = ligand_smiles
+        else:
+            substructures.append(ligand_smiles)
+
+    if not pipeline.binary:
+        e3_smiles = substructures[0]
+        wh_smiles = substructures[1]
+        # NOTE: The classifier was trained on the following label assignment:
+        e3_attach_point = 1
+        wh_attach_point = 2
+    else:
+        if representative_e3s_fp is None or morgan_fp_generator is None:
+            raise ValueError("For a pipeline trained on binary classification, representative_e3s_fp and morgan_fp_generator must be provided.")
+        sub1_dist = average_tanimoto_distance(substructures[0], representative_e3s_fp, morgan_fp_generator)
+        sub2_dist = average_tanimoto_distance(substructures[1], representative_e3s_fp, morgan_fp_generator)
+        if sub1_dist < sub2_dist:
+            e3_smiles = substructures[0]
+            wh_smiles = substructures[1]
+        else:
+            e3_smiles = substructures[1]
+            wh_smiles = substructures[0]
+        # Get the attachment point using a regex, e.g., should return 1 if [1*] is in the SMILES
+        e3_attach_point = extract_attachment_point(e3_smiles)
+        wh_attach_point = extract_attachment_point(wh_smiles)
+
+    e3_smiles = e3_smiles.replace(f"[{e3_attach_point}*]", "[*:2]")
+    linker_smiles = linker_smiles.replace(f"[{e3_attach_point}*]", "[*:2]")
+
+    wh_smiles = wh_smiles.replace(f"[{wh_attach_point}*]", "[*:1]")
+    linker_smiles = linker_smiles.replace(f"[{wh_attach_point}*]", "[*:1]")
+    return {'e3': e3_smiles, 'poi': wh_smiles, 'linker': linker_smiles, "bonds_idx": bonds_idx}
+
+def split_protac_graph_based(
+    protac_smiles: str,
+    use_classifier: bool = False,
+    classifier: Optional['GraphEdgeClassifier'] = None,
+    representative_e3s_fp: Optional[List[Any]] = None,
+    morgan_fp_generator: Optional[Any] = None,
+    use_capacity_weight: bool = False,
+    betweenness_threshold: float = 0.4,
+) -> Dict[str, str]:
+    """
+    Splits a PROTAC molecule using either the ML edge classifier or the deterministic betweenness-centrality algorithm.
+    Returns a dictionary with the keys 'e3', 'poi', 'linker', and 'bonds_idx'.
+    """
+    if representative_e3s_fp is None:
+        if morgan_fp_generator is None:
+            # Create a default Morgan fingerprint generator
+            morgan_fp_generator = rdFingerprintGenerator.GetMorganGenerator(
+                radius=16,
+                fpSize=1024,
+                useBondTypes=True,
+                includeChirality=True,
+            )
+        # Get the representative E3 ligands fingerprints
+        representative_e3s_fp = get_representative_e3s_fp(fp_generator=morgan_fp_generator)
+
+    if use_classifier:
+        ret = split_protac_with_edge_classifier(
+            protac_smiles=protac_smiles,
+            pipeline=classifier,
+            representative_e3s_fp=representative_e3s_fp,
+            morgan_fp_generator=morgan_fp_generator,
+        )
+    else:
+        ret = split_protac_with_betweenness_centrality(
+            protac_smiles=protac_smiles,
+            representative_e3s_fp=representative_e3s_fp,
+            morgan_fp_generator=morgan_fp_generator,
+            use_capacity_weight=use_capacity_weight,
+            betweenness_threshold=betweenness_threshold,
+        )
+
+    substructs = {
+        "e3": ret["e3"],
+        "poi": ret["poi"],
+        "linker": ret["linker"],
+    }
+
+    # If none of the substructures is None, adjust the amide and ester bonds
+    if all(x is not None for x in substructs.values()):
+        substructs = adjust_amide_bonds_in_substructs(substructs, protac_smiles)
+        substructs = adjust_ester_bonds_in_substructs(substructs, protac_smiles)
+        ret["e3"] = substructs["e3"]
+        ret["poi"] = substructs["poi"]
+        ret["linker"] = substructs["linker"]
+
+    return ret
+
+def split_protac_with_graphs_wrapper(
+    protac_smiles: List[str],
+    use_classifier: bool = False,
+    classifier: Optional['GraphEdgeClassifier'] = None,
+    representative_e3s: Optional[List[Any]] = None,
+    representative_e3s_fp: Optional[List[Any]] = None,
+    morgan_fp_generator: Optional[Any] = None,
+    use_capacity_weight: bool = False,
+    betweenness_threshold: float = 0.4,
+) -> List[Dict[str, str]]:
+    """ Wrapper function to apply split_protac_graph_based over a list of PROTAC SMILES.
+
+    Parameters:
+        protac_smiles (List[str]): List of SMILES strings of PROTAC molecules.
+        use_classifier (bool): Whether to use a classifier for splitting.
+        classifier (Optional[GraphEdgeClassifier]): Classifier to use if use_classifier is True.
+        representative_e3s (Optional[List[Any]]): List of representative E3 ligands; converted to fingerprints if representative_e3s_fp is not given.
+        representative_e3s_fp (Optional[List[Any]]): Precomputed fingerprints of representative E3 ligands.
+        morgan_fp_generator (Optional[Any]): RDKit Morgan fingerprint generator.
+        use_capacity_weight (bool): Whether to use bond capacity as weight for the graph.
+        betweenness_threshold (float): Threshold for betweenness centrality to consider a node as a candidate for splitting.
+
+    Returns:
+        List[Dict[str, str]]: List of dictionaries containing the split results for each PROTAC molecule.
+    """
+    if morgan_fp_generator is None and (representative_e3s is None or representative_e3s_fp is None):
+        # Create a default Morgan fingerprint generator
+        morgan_fp_generator = rdFingerprintGenerator.GetMorganGenerator(
+            radius=16,
+            fpSize=1024,
+            useBondTypes=True,
+            includeChirality=True,
+        )
+
+    if representative_e3s is None and representative_e3s_fp is None:
+        # Get the representative E3 ligands fingerprints
+        representative_e3s_fp = get_representative_e3s_fp(fp_generator=morgan_fp_generator)
+    elif representative_e3s is not None and representative_e3s_fp is None:
+        # Convert representative E3 ligands to fingerprints
+        representative_e3s_fp = get_representative_e3s_fp(e3_list=representative_e3s, fp_generator=morgan_fp_generator)
+
+    # Load the classifier if it is a string or Path
+    if use_classifier and classifier is not None and isinstance(classifier, (str, Path)):
+        classifier = GraphEdgeClassifier.load(classifier)
+
+    return [
+        split_protac_graph_based(
+            protac_smiles=smi,
+            use_classifier=use_classifier,
+            classifier=classifier,
+            representative_e3s_fp=representative_e3s_fp,
+            morgan_fp_generator=morgan_fp_generator,
+            use_capacity_weight=use_capacity_weight,
+            betweenness_threshold=betweenness_threshold,
+        ) for smi in protac_smiles
+    ]
+
+
+def split_protac_with_graphs_parallel(
+    protac_smiles: List[str],
+    use_classifier: bool = False,
+    classifier: Optional['GraphEdgeClassifier'] = None,
+    representative_e3s: Optional[List[Any]] = None,
+    representative_e3s_fp: Optional[List[Any]] = None,
+    morgan_fp_generator: Optional[Any] = None,
+    use_capacity_weight: bool = False,
+    betweenness_threshold: float = 0.4,
+    n_jobs: int = 1,
+    batch_size: int = 1,
+) -> List[Dict[str, str]]:
+    """ Splits a list of PROTAC molecules using either the ML classifier or deterministic betweenness centrality.
+
+    Parameters:
+        protac_smiles (List[str]): List of SMILES strings of PROTAC molecules.
+        use_classifier (bool): Whether to use a classifier for splitting.
+        classifier (Optional[GraphEdgeClassifier]): Classifier to use if use_classifier is True.
+        representative_e3s (Optional[List[Any]]): List of representative E3 ligands. If None, uses precomputed fingerprints.
+        representative_e3s_fp (Optional[List[Any]]): Precomputed fingerprints of representative E3 ligands.
+        morgan_fp_generator (Optional[Any]): RDKit Morgan fingerprint generator.
+        use_capacity_weight (bool): Whether to use bond capacity as weight for the graph.
+        betweenness_threshold (float): Threshold for betweenness centrality to consider a node as a candidate for splitting.
+        n_jobs (int): Number of parallel jobs to run. If 1, runs sequentially.
+        batch_size (int): Size of each batch for parallel processing.
+
+    Returns:
+        List[Dict[str, str]]: List of dictionaries containing the split results for each PROTAC molecule.
+    """
+    # Load the classifier if it is a string or Path
+    if use_classifier and classifier is not None and isinstance(classifier, (str, Path)):
+        classifier = GraphEdgeClassifier.load(classifier)
+
+    if n_jobs < 1:
+        raise ValueError("n_jobs must be a positive integer.")
+    if n_jobs == 1:
+        # If n_jobs is 1, run the function sequentially
+        return split_protac_with_graphs_wrapper(
+            protac_smiles=protac_smiles,
+            use_classifier=use_classifier,
+            classifier=classifier,
+            representative_e3s=representative_e3s,
+            representative_e3s_fp=representative_e3s_fp,
+            morgan_fp_generator=morgan_fp_generator,
+            use_capacity_weight=use_capacity_weight,
+            betweenness_threshold=betweenness_threshold,
+        )
+
+    # Warn if a custom fingerprint generator is used with multiple workers
+    if morgan_fp_generator is not None:
+        print("Warning: a custom Morgan fingerprint generator may not be picklable when n_jobs > 1.")
+
+    # Split the SMILES list into batches (the last batch may be smaller than
+    # batch_size) and drop any empty batches for safety
+    smiles_batches = [protac_smiles[i:i + batch_size] for i in range(0, len(protac_smiles), batch_size)]
+    smiles_batches = [batch for batch in smiles_batches if batch]
+
+    # Run each batch in parallel
+    results = Parallel(n_jobs=n_jobs)(
+        delayed(split_protac_with_graphs_wrapper)(
+            protac_smiles=batch,
+            use_classifier=use_classifier,
+            classifier=classifier,
+            representative_e3s=representative_e3s,
+            representative_e3s_fp=representative_e3s_fp,
+            morgan_fp_generator=morgan_fp_generator,
+            use_capacity_weight=use_capacity_weight,
+            betweenness_threshold=betweenness_threshold,
+        ) for batch in smiles_batches
+    )
+
+    # Flatten the list of lists into a single list
+    return [item for batch_result in results for item in batch_result]
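A usage sketch of the parallel entry point; it assumes `smiles_list` is a list of PROTAC SMILES strings, and the worker/batch settings here are arbitrary examples:

```python
# Assumes `smiles_list` is a list of PROTAC SMILES strings.
results = split_protac_with_graphs_parallel(
    protac_smiles=smiles_list,
    n_jobs=4,        # four worker processes
    batch_size=32,   # 32 molecules per dispatched task
)
linkers = [r["linker"] for r in results]
```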
protac_splitter/graphs/utils.py ADDED
@@ -0,0 +1,67 @@
+from typing import Any, Optional, List
+
+import numpy as np
+from rdkit import Chem, DataStructs
+from rdkit.Chem import rdFingerprintGenerator
+
+def get_fp(
+    smiles: str,
+    fp_generator: Optional[Any] = None,
+    return_np: bool = True,
+) -> Optional[np.ndarray]:
+    """
+    Get the Morgan fingerprint of a molecule from its SMILES representation.
+
+    Parameters:
+        smiles (str): The SMILES string of the molecule.
+        fp_generator (Any, optional): The fingerprint generator to use. If None, a default generator is used.
+        return_np (bool): Whether to return the fingerprint as a NumPy array instead of an RDKit bit vector. Defaults to True.
+
+    Returns:
+        Optional[np.ndarray]: The Morgan fingerprint of the molecule, or None if the SMILES is invalid.
+    """
+    mol = Chem.MolFromSmiles(smiles)
+    if mol is None:
+        return None
+
+    if fp_generator is None:
+        fp_generator = rdFingerprintGenerator.GetMorganGenerator(
+            radius=16,
+            fpSize=1024,
+            useBondTypes=True,
+            includeChirality=True,
+        )
+
+    if return_np:
+        return fp_generator.GetFingerprintAsNumPy(mol)
+    else:
+        return fp_generator.GetFingerprint(mol)
+
+def average_tanimoto_distance(
+    smiles: str,
+    fingerprints: List[DataStructs.ExplicitBitVect],
+    morgan_fp_generator: Optional[Any] = None,
+) -> float:
+    """
+    Compute the average Tanimoto distance between a query SMILES and a list of RDKit fingerprints.
+
+    Parameters:
+        smiles (str): SMILES string of the query molecule.
+        fingerprints (list): List of RDKit fingerprint objects (e.g., ExplicitBitVect).
+        morgan_fp_generator: RDKit Morgan fingerprint generator.
+
+    Returns:
+        float: Average Tanimoto distance (1 - similarity) between the query and the fingerprints.
+    """
+    query_fp = get_fp(smiles, morgan_fp_generator, return_np=False)
+    if query_fp is None:
+        raise ValueError(f"Invalid SMILES string: {smiles}")
+    distances = DataStructs.BulkTanimotoSimilarity(query_fp, fingerprints, returnDistance=True)
+
+    return np.array(distances).mean()
+
+def numpy_to_rdkit_fp(arr: np.ndarray) -> DataStructs.ExplicitBitVect:
+    """
+    Convert a binary NumPy array to an RDKit ExplicitBitVect.
+    """
+    return DataStructs.CreateFromBitString(''.join(arr.astype(str)))
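A small sketch of how these helpers compose; the reference molecules below are arbitrary examples, not E3 ligands:

```python
# Arbitrary example molecules, illustration only.
ref_fps = [get_fp(s, return_np=False) for s in ["c1ccccc1O", "c1ccccc1N"]]
dist = average_tanimoto_distance("c1ccccc1C", ref_fps)
print(f"average Tanimoto distance: {dist:.3f}")  # lower = more similar
```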
protac_splitter/graphs_utils.py ADDED
@@ -0,0 +1,190 @@
+from typing import Optional
+
+from numba import njit
+import numpy as np
+import networkx as nx
+from rdkit import Chem
+
+
+def mol2graph(mol: Chem.Mol) -> nx.Graph:
+    """ Convert an RDKit molecule to a NetworkX graph.
+
+    Args:
+        mol (Chem.Mol): The RDKit molecule to convert.
+
+    Returns:
+        nx.Graph: The NetworkX graph representation of the molecule.
+    """
+    # NOTE: https://github.com/maxhodak/keras-molecules/pull/32/files
+    # TODO: Double check this implementation too: https://gist.github.com/jhjensen2/6450138cda3ab796a30850610843cfff
+    if mol is None:
+        return nx.empty_graph()
+    G = nx.Graph()
+    for atom in mol.GetAtoms():
+        # Skip dummy atoms (atomic number 0, e.g., attachment points)
+        if atom.GetAtomicNum() != 0:
+            G.add_node(atom.GetIdx(), label=atom.GetSymbol())
+    for bond in mol.GetBonds():
+        # Skip bonds involving dummy atoms
+        if bond.GetBeginAtom().GetAtomicNum() == 0 or bond.GetEndAtom().GetAtomicNum() == 0:
+            continue
+        G.add_edge(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx(), label=bond.GetBondType())
+    return G
+
+def smiles2graph(smiles: str) -> nx.Graph:
+    """ Convert a SMILES string to a NetworkX graph.
+
+    Args:
+        smiles (str): The SMILES string to convert.
+
+    Returns:
+        nx.Graph: The NetworkX graph representation of the molecule.
+    """
+    return mol2graph(Chem.MolFromSmiles(smiles))
+
+def get_smiles2graph_edit_distance(smi1: str, smi2: str, **kwargs) -> float:
+    """ Compute the graph edit distance between two SMILES strings.
+
+    Args:
+        smi1 (str): The first SMILES string.
+        smi2 (str): The second SMILES string.
+        **kwargs: Additional keyword arguments for `nx.graph_edit_distance`.
+
+    Returns:
+        float: The graph edit distance between the two SMILES strings.
+    """
+    ged = nx.graph_edit_distance(smiles2graph(smi1), smiles2graph(smi2), **kwargs)
+    return ged if ged is not None else np.inf
+
+def get_mol2graph_edit_distance(mol1: Chem.Mol, mol2: Chem.Mol, **kwargs) -> float:
+    """ Compute the graph edit distance between two RDKit molecules.
+
+    Args:
+        mol1 (Chem.Mol): The first RDKit molecule.
+        mol2 (Chem.Mol): The second RDKit molecule.
+        **kwargs: Additional keyword arguments for `nx.graph_edit_distance`.
+
+    Returns:
+        float: The graph edit distance between the two RDKit molecules.
+    """
+    ged = nx.graph_edit_distance(mol2graph(mol1), mol2graph(mol2), **kwargs)
+    return ged if ged is not None else np.inf
+
+def get_smiles2graph_edit_distance_norm(
+    smi1: str,
+    smi2: str,
+    ged_G1_G2: Optional[float] = None,
+    eps: float = 1e-9,
+    **kwargs,
+) -> float:
+    """ Compute the normalized graph edit distance between two SMILES strings.
+
+    Args:
+        smi1 (str): The first SMILES string.
+        smi2 (str): The second SMILES string.
+        ged_G1_G2 (Optional[float]): The graph edit distance between the two graphs. If None, it is computed with `nx.graph_edit_distance`.
+        eps (float): A small value to avoid division by zero.
+        **kwargs: Additional keyword arguments for `nx.graph_edit_distance`.
+
+    Returns:
+        float: The normalized graph edit distance between the two SMILES strings.
+    """
+    G1 = smiles2graph(smi1)
+    G2 = smiles2graph(smi2)
+    G0 = nx.empty_graph()
+    ged_G1_G2 = ged_G1_G2 if ged_G1_G2 is not None else nx.graph_edit_distance(G1, G2, **kwargs)
+    ged_G1_G0 = nx.graph_edit_distance(G1, G0, **kwargs)
+    ged_G2_G0 = nx.graph_edit_distance(G2, G0, **kwargs)
+    if None in [ged_G1_G2, ged_G1_G0, ged_G2_G0]:
+        return np.inf
+    return ged_G1_G2 / (ged_G1_G0 + ged_G2_G0 + eps)
+
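In other words, the edit distance is normalized by the cost of editing each graph down to the empty graph; by the triangle inequality the result lies in roughly [0, 1], with identical graphs giving 0. A quick sanity check (note that exact `nx.graph_edit_distance` gets very slow for large molecules):

```python
# Identical molecules give ~0; unrelated ones give a larger value.
print(get_smiles2graph_edit_distance_norm("CCO", "CCO"))       # 0.0
print(get_smiles2graph_edit_distance_norm("CCO", "c1ccccc1"))  # > 0
```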
+def smiles2adjacency_matrix(smiles: str) -> np.ndarray:
+    """ Return the dense adjacency matrix of the molecular graph of a SMILES string."""
+    return nx.adjacency_matrix(smiles2graph(smiles)).todense()
+
+def build_label_mapping(G1, G2):
+    """ Map every node label occurring in G1 or G2 to a unique integer."""
+    labels = set()
+    for G in [G1, G2]:
+        for node in G.nodes():
+            labels.add(G.nodes[node]['label'])
+    label_to_int = {label: idx for idx, label in enumerate(sorted(labels))}
+    return label_to_int
+
+def preprocess_graph(G, label_to_int):
+    """ Convert a labeled NetworkX graph to an adjacency matrix and an integer label array."""
+    n = G.number_of_nodes()
+    adj = np.zeros((n, n), dtype=np.int32)
+    labels = np.zeros(n, dtype=np.int32)
+    node_id_to_idx = {}
+    for idx, node in enumerate(G.nodes()):
+        node_id_to_idx[node] = idx
+        label = G.nodes[node]['label']
+        labels[idx] = label_to_int[label]
+    for u, v in G.edges():
+        idx_u = node_id_to_idx[u]
+        idx_v = node_id_to_idx[v]
+        adj[idx_u, idx_v] = 1
+        adj[idx_v, idx_u] = 1  # Assuming undirected graph
+    return adj, labels
+
+@njit
+def compute_cost_matrix(labels1, labels2, degrees1, degrees2):
+    """ Node substitution costs: label mismatch plus absolute degree difference."""
+    n1 = labels1.shape[0]
+    n2 = labels2.shape[0]
+    C = np.zeros((n1, n2), dtype=np.float64)
+    for i in range(n1):
+        for j in range(n2):
+            label_cost = 0.0 if labels1[i] == labels2[j] else 1.0
+            neighborhood_cost = abs(degrees1[i] - degrees2[j])
+            C[i, j] = label_cost + neighborhood_cost
+    return C
+
+@njit
+def greedy_assignment(C):
+    """ Greedily assign each row of the cost matrix to its cheapest unassigned column."""
+    n1, n2 = C.shape
+    assigned_cols = np.full(n2, False)
+    row_ind = np.full(n1, -1, dtype=np.int32)
+    for i in range(n1):
+        min_cost = np.inf
+        min_j = -1
+        for j in range(n2):
+            if not assigned_cols[j] and C[i, j] < min_cost:
+                min_cost = C[i, j]
+                min_j = j
+        if min_j != -1:
+            row_ind[i] = min_j
+            assigned_cols[min_j] = True
+    return row_ind
+
+@njit
+def compute_total_cost(C, row_ind, n1, n2, c_node_del, c_node_ins):
+    """ Total cost of an assignment, charging deletions and insertions for unmatched nodes."""
+    total_cost = 0.0
+    assigned_cols = np.full(n2, False)
+    for i in range(n1):
+        j = row_ind[i]
+        if j != -1:
+            total_cost += C[i, j]
+            assigned_cols[j] = True
+        else:
+            total_cost += c_node_del
+    for j in range(n2):
+        if not assigned_cols[j]:
+            total_cost += c_node_ins
+    return total_cost
+
+def approximate_graph_edit_distance(adj1, labels1, adj2, labels2, c_node_del=1.0, c_node_ins=1.0):
+    """ Approximate the graph edit distance via a greedy node assignment."""
+    degrees1 = adj1.sum(axis=1)
+    degrees2 = adj2.sum(axis=1)
+    C = compute_cost_matrix(labels1, labels2, degrees1, degrees2)
+    row_ind = greedy_assignment(C)
+    total_cost = compute_total_cost(C, row_ind, labels1.shape[0], labels2.shape[0], c_node_del, c_node_ins)
+    return total_cost
+
+def get_approximate_ged(G1, G2):
+    """ Approximate graph edit distance between two labeled NetworkX graphs."""
+    label_to_int = build_label_mapping(G1, G2)
+    adj1, labels1 = preprocess_graph(G1, label_to_int)
+    adj2, labels2 = preprocess_graph(G2, label_to_int)
+    cost = approximate_graph_edit_distance(adj1, labels1, adj2, labels2)
+    return cost
+
+def get_smiles2graph_edit_distance_approx(smi1: str, smi2: str) -> float:
+    """ Approximate graph edit distance between the molecular graphs of two SMILES strings."""
+    G1 = smiles2graph(smi1)
+    G2 = smiles2graph(smi2)
+    return get_approximate_ged(G1, G2)
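Since exact `nx.graph_edit_distance` is exponential in the worst case, the greedy approximation above is the practical choice for batch evaluation. It is only an estimate of the true GED, as the sketch below illustrates on toy molecules:

```python
# Toy molecules, illustration only.
print(get_smiles2graph_edit_distance_approx("CCO", "CCO"))  # 0.0 (identical)
print(get_smiles2graph_edit_distance_approx("CCO", "CCN"))  # 1.0 (one label substitution)
```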
protac_splitter/llms/__init__.py ADDED
File without changes
protac_splitter/llms/data_utils.py ADDED
@@ -0,0 +1,296 @@
+ import os
2
+ import random
3
+ import logging
4
+ from typing import Optional, Union
5
+
6
+ import torch
7
+ from datasets import load_dataset, concatenate_datasets, Dataset
8
+ from transformers import AutoTokenizer
9
+ from rdkit import Chem
10
+
11
+ from protac_splitter.evaluation import split_prediction
12
+
13
+
14
+ def randomize_smiles_dataset(
15
+ batch: dict,
16
+ repeat: int = 1,
17
+ prob: float = 0.5,
18
+ apply_to_text: bool = True,
19
+ apply_to_labels: bool = False,
20
+ ) -> dict:
21
+ """ Randomize SMILES in a batch of data.
22
+
23
+ Args:
24
+ batch (dict): Batch of data with "text" and "labels" keys.
25
+ repeat (int, optional): Number of times to repeat the randomization. Defaults to 1.
26
+ prob (float, optional): Probability of randomizing SMILES. Defaults to 0.5.
27
+ apply_to_text (bool, optional): Whether to apply randomization to text. Defaults to True.
28
+ apply_to_labels (bool, optional): Whether to apply randomization to labels. Defaults to False.
29
+
30
+ Returns:
31
+ dict: Randomized batch of data.
32
+ """
33
+ new_texts, new_labels = [], []
34
+ for text, label in zip(batch["text"], batch["labels"]):
35
+ try:
36
+ mol_text = Chem.MolFromSmiles(text)
37
+ mol_label = Chem.MolFromSmiles(label)
38
+ except Exception:
39
+ logging.error("Failed to convert SMILES to Mol!")
40
+ new_texts.append(text)
41
+ new_labels.append(label)
42
+ continue
43
+
44
+ if random.random() < prob:
45
+ if apply_to_text:
46
+ rand_texts = [Chem.MolToSmiles(mol_text, canonical=False, doRandom=True) for _ in range(repeat)]
47
+ else:
48
+ rand_texts = [text] * repeat
49
+
50
+ if apply_to_labels:
51
+ rand_labels = [Chem.MolToSmiles(mol_label, canonical=False, doRandom=True) for _ in range(repeat)]
52
+ else:
53
+ rand_labels = [label] * repeat
54
+
55
+ new_texts.extend(rand_texts)
56
+ new_labels.extend(rand_labels)
57
+ else:
58
+ new_texts.append(text)
59
+ new_labels.append(label)
60
+
61
+ return {"text": new_texts, "labels": new_labels}
62
+
63
+
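
To make the augmentation behavior concrete, here is a toy call (the SMILES and batch are illustrative, not from the dataset):

```python
# Toy batch; real rows hold PROTAC SMILES in "text" and substructure SMILES in "labels".
batch = {"text": ["c1ccccc1O", "CCO"], "labels": ["c1ccccc1O", "CCO"]}

out = randomize_smiles_dataset(batch, repeat=2, prob=1.0, apply_to_text=True)
# With prob=1.0 and repeat=2, every row is expanded into two randomized text
# variants, each paired with the (unchanged) label: len(out["text"]) == 4.
print(out["text"])
```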
+ def process_data_to_model_inputs(
+     batch,
+     tokenizer: Union[AutoTokenizer, str] = "seyonec/ChemBERTa-zinc-base-v1",
+     encoder_max_length: int = 512,
+     decoder_max_length: int = 512,
+ ):
+     if isinstance(tokenizer, str):
+         tokenizer = AutoTokenizer.from_pretrained(tokenizer)
+     # Tokenize the inputs and labels
+     inputs = tokenizer(batch["text"], truncation=True, max_length=encoder_max_length)
+     outputs = tokenizer(batch["labels"], truncation=True, max_length=decoder_max_length)
+     batch["input_ids"] = inputs.input_ids
+     batch["attention_mask"] = inputs.attention_mask
+     batch["labels"] = outputs.input_ids.copy()
+
+     # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     # batch["input_ids"] = batch["input_ids"].to(device)
+     # batch["attention_mask"] = batch["attention_mask"].to(device)
+     # batch["labels"] = batch["labels"].to(device)
+
+     # Because BERT automatically shifts the labels, the labels correspond exactly to `decoder_input_ids`.
+     # We have to make sure that the PAD token is ignored when calculating the loss.
+     # NOTE: Check the `ignore_index` argument in nn.CrossEntropyLoss.
+     # NOTE: The following is already done in the DataCollatorForSeq2Seq:
+     # batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]
+     return batch
+
+
+ def get_fragments_in_labels(labels: str, linkers_only_as_labels: bool = True) -> Optional[str]:
+     """ Get the fragments in the labels.
+
+     Args:
+         labels (str): The labels, i.e., the concatenated substructure SMILES.
+         linkers_only_as_labels (bool, optional): Whether to keep only the linker in the labels. Defaults to True.
+
+     Returns:
+         Optional[str]: The fragments in the labels, or None if any fragment is missing.
+     """
+     ligands = split_prediction(labels)
+     if linkers_only_as_labels:
+         return ligands.get("linker", None)
+     if None in ligands.values():
+         return None
+     return f"{ligands['e3']}.{ligands['poi']}"
+
+
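
Schematically, `get_fragments_in_labels` reduces a three-part label to the supervision target. The placeholder strings and the exact fragment ordering below are illustrative assumptions; `split_prediction` does the actual parsing:

```python
# Schematic only; <E3>, <LINKER>, <POI> stand in for real substructure SMILES.
labels = "<E3>.<LINKER>.<POI>"
# get_fragments_in_labels(labels, linkers_only_as_labels=True)  -> "<LINKER>"
# get_fragments_in_labels(labels, linkers_only_as_labels=False) -> "<E3>.<POI>"
```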
+ def load_tokenized_dataset(
+     dataset_dir: str,
+     dataset_config: str = 'default',
+     tokenizer: Union[AutoTokenizer, str] = "seyonec/ChemBERTa-zinc-base-v1",
+     batch_size: int = 512,
+     encoder_max_length: int = 512,
+     decoder_max_length: int = 512,
+     token: Optional[str] = None,
+     num_proc_map: int = 1,
+     randomize_smiles: bool = False,
+     randomize_smiles_prob: float = 0.5,
+     randomize_smiles_repeat: int = 1,
+     randomize_text: bool = True,
+     randomize_labels: bool = False,
+     cache_dir: Optional[str] = None,
+     all_fragments_as_labels: bool = True,
+     linkers_only_as_labels: bool = False,
+     causal_language_modeling: bool = False,
+     train_size_ratio: float = 1.0,
+ ) -> Dataset:
+     """ Load a dataset and tokenize it.
+
+     Args:
+         dataset_dir (str): The directory of the dataset or the name of the dataset on the Hugging Face Hub.
+         dataset_config (str, optional): The configuration of the dataset. Defaults to 'default'.
+         tokenizer (AutoTokenizer | str, optional): The tokenizer to use for tokenization. If a string, the tokenizer will be loaded using `AutoTokenizer.from_pretrained(tokenizer)`. Defaults to "seyonec/ChemBERTa-zinc-base-v1".
+         batch_size (int, optional): The batch size for tokenization. Defaults to 512.
+         encoder_max_length (int, optional): The maximum length of the encoder input sequence. Defaults to 512.
+         decoder_max_length (int, optional): The maximum length of the decoder input sequence. Defaults to 512.
+         token (Optional[str], optional): The Hugging Face API token. Defaults to None.
+         num_proc_map (int, optional): The number of processes to use for mapping. Defaults to 1.
+         randomize_smiles (bool, optional): Whether to randomize SMILES. Defaults to False.
+         randomize_smiles_prob (float, optional): The probability of randomizing SMILES. Defaults to 0.5.
+         randomize_smiles_repeat (int, optional): The number of times to repeat the randomization. Defaults to 1.
+         randomize_text (bool, optional): Whether to randomize the text. Defaults to True.
+         randomize_labels (bool, optional): Whether to randomize the labels. Defaults to False.
+         cache_dir (Optional[str], optional): The directory to cache the dataset. Defaults to None.
+         all_fragments_as_labels (bool, optional): Whether to keep all fragments in the labels. Defaults to True.
+         linkers_only_as_labels (bool, optional): Whether to keep only the linkers in the labels. Defaults to False.
+         causal_language_modeling (bool, optional): Whether to use causal language modeling. Defaults to False.
+         train_size_ratio (float, optional): The ratio of the training dataset to use. Defaults to 1.0.
+
+     Returns:
+         Dataset: The tokenized dataset.
+     """
+     if isinstance(tokenizer, str):
+         tokenizer = AutoTokenizer.from_pretrained(tokenizer)
+     if os.path.exists(dataset_dir):
+         # NOTE: We need a different argument to load a dataset from disk:
+         dataset = load_dataset(
+             dataset_dir,
+             data_dir=dataset_config,
+         )
+         print(f"Dataset loaded from disk at: \"{dataset_dir}\". Length: {dataset.num_rows}")
+     else:
+         dataset = load_dataset(
+             dataset_dir,
+             dataset_config,
+             token=token,
+             cache_dir=cache_dir,
+         )
+         print(f"Dataset loaded from hub. Length: {dataset.num_rows}")
+
+     if train_size_ratio < 1.0 and train_size_ratio > 0:
+         # Reduce the size of the training dataset by selecting just a fraction of the samples
+         dataset["train"] = dataset["train"].select(range(int(train_size_ratio * dataset["train"].num_rows)))
+         print(f"Reduced training dataset size to {train_size_ratio}. Length: {dataset.num_rows}")
+     elif train_size_ratio > 1.0 or train_size_ratio < 0:
+         raise ValueError("train_size_ratio must be between 0 and 1.")
+
+     if not all_fragments_as_labels:
+         dataset = dataset.map(
+             lambda x: {
+                 "text": x["text"],
+                 "labels": get_fragments_in_labels(x["labels"], linkers_only_as_labels),
+             },
+             batched=False,
+             num_proc=num_proc_map,
+             load_from_cache_file=True,
+             desc="Getting fragments in labels",
+         )
+         # Filter out the samples with None labels
+         dataset = dataset.filter(lambda x: x["labels"] is not None)
+
+         if linkers_only_as_labels:
+             print(f"Set labels to linkers only. Length: {dataset.num_rows}")
+         else:
+             print(f"Set labels to E3 and WH only. Length: {dataset.num_rows}")
+
+     if randomize_smiles:
+         dataset["train"] = dataset["train"].map(
+             randomize_smiles_dataset,
+             batched=True,
+             batch_size=batch_size,
+             fn_kwargs={
+                 "repeat": randomize_smiles_repeat,
+                 "prob": randomize_smiles_prob,
+                 "apply_to_text": randomize_text,
+                 "apply_to_labels": randomize_labels,
+             },
+             num_proc=num_proc_map,
+             load_from_cache_file=True,
+             desc="Randomizing SMILES",
+         )
+         print(f"Randomized SMILES in dataset. Length: {dataset.num_rows}")
+
+     if causal_language_modeling:
+         dataset = dataset.map(
+             lambda x: {
+                 "text": x["text"] + "." + x["labels"],
+                 "labels": x["labels"],
+             },
+             batched=False,
+             num_proc=num_proc_map,
+             load_from_cache_file=True,
+             desc="Setting labels to text",
+         )
+         print(f"Appended labels to text. Length: {dataset.num_rows}")
+
+     # NOTE: Remove the "labels" column if causal language modeling, since the
+     # DataCollatorForLM will automatically set the labels to the input_ids.
+     dataset = dataset.map(
+         process_data_to_model_inputs,
+         batched=True,
+         batch_size=batch_size,
+         remove_columns=["text", "labels"] if causal_language_modeling else ["text"],
+         fn_kwargs={
+             "tokenizer": tokenizer,
+             "encoder_max_length": encoder_max_length,
+             "decoder_max_length": decoder_max_length,
+         },
+         num_proc=num_proc_map,
+         load_from_cache_file=True,
+         desc="Tokenizing dataset",
+     )
+     print(f"Tokenized dataset. Length: {dataset.num_rows}")
+
+     return dataset
+
+
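
Taken together, a typical call looks like the sketch below (the dataset id and config mirror the defaults used elsewhere in this commit; the column names printed at the end are indicative):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
dataset = load_tokenized_dataset(
    "ailab-bio/PROTAC-Splitter-Dataset",  # not a local path, so it is fetched from the Hub
    dataset_config="standard",
    tokenizer=tokenizer,
    randomize_smiles=True,       # augmentation is applied to the train split only
    randomize_smiles_prob=0.5,
    num_proc_map=4,
)
print(dataset["train"].column_names)  # e.g., ['labels', 'input_ids', 'attention_mask', ...]
```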
+ def load_trl_dataset(
+     tokenizer: Union[AutoTokenizer, str] = "seyonec/ChemBERTa-zinc-base-v1",
+     token: Optional[str] = None,
+     max_length: int = 512,
+     dataset_name: str = "ailab-bio/PROTAC-Splitter-Dataset",
+     ds_config: str = "standard",
+     ds_unalabeled: Optional[str] = None,
+ ) -> Dataset:
+     if isinstance(tokenizer, str):
+         tokenizer = AutoTokenizer.from_pretrained(tokenizer)
+     # Load training data
+     train_dataset = load_dataset(
+         dataset_name,
+         ds_config,
+         split="train",
+         token=token,
+     )
+     train_dataset = train_dataset.rename_column("text", "query")
+     train_dataset = train_dataset.remove_columns(["labels"])
+
+     if ds_unalabeled is not None:
+         # Load un-labelled data
+         unlabeled_dataset = load_dataset(
+             dataset_name,
+             ds_unalabeled,
+             split="train",
+             token=token,
+         )
+         unlabeled_dataset = unlabeled_dataset.rename_column("text", "query")
+         unlabeled_dataset = unlabeled_dataset.remove_columns(["labels"])
+         # Concatenate datasets row-wise
+         dataset = concatenate_datasets([train_dataset, unlabeled_dataset])
+     else:
+         dataset = train_dataset
+
+     def tokenize(sample, tokenizer, max_length=512):
+         input_ids = tokenizer.encode(sample["query"], padding="max_length", max_length=max_length)
+         return {"input_ids": input_ids, "query": sample["query"]}
+
+     return dataset.map(lambda x: tokenize(x, tokenizer, max_length), batched=False)
+
+
+ def data_collator_for_trl(batch):
+     return {
+         "input_ids": [torch.tensor(x["input_ids"]) for x in batch],
+         "query": [x["query"] for x in batch],
+     }
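
These two helpers target the RL fine-tuning scripts added later in this commit. A rough sketch of how they would plug into a `trl` PPO loop; the `PPOTrainer`/`PPOConfig` wiring and the value-head model are assumptions based on the `trl` library, not code from this repo:

```python
from transformers import AutoTokenizer
from trl import PPOConfig, PPOTrainer, AutoModelForSeq2SeqLMWithValueHead

tokenizer = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained("t5-small")  # placeholder model id

ds = load_trl_dataset(tokenizer=tokenizer)
ppo_trainer = PPOTrainer(
    config=PPOConfig(batch_size=16),
    model=model,
    tokenizer=tokenizer,
    dataset=ds,
    data_collator=data_collator_for_trl,  # keeps "query" strings next to the tensors
)
```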
protac_splitter/llms/evaluation.py ADDED
@@ -0,0 +1,169 @@
+ from typing import Union
+
+ from transformers import AutoTokenizer, EvalPrediction
+ import numpy as np
+ from rdkit import Chem, DataStructs
+ import evaluate
+ import multiprocessing as mp
+ import datetime
+
+ from protac_splitter.evaluation import (
+     # is_valid_smiles,
+     # has_three_substructures,
+     # has_all_attachment_points,
+     # check_substructs,
+     score_prediction,
+ )
+
+
+ def process_predictions(args) -> list:
+     """ Process one batch of the prediction scoring.
+
+     Args:
+         args (tuple): Tuple of arguments for the scoring function.
+
+     Returns:
+         list: The scores for each prediction in the batch.
+     """
+     pred_smiles, protac_smiles, label_smiles, fpgen, compute_rdkit_metrics, compute_graph_metrics = args
+     scores = []
+     for protac, pred, label in zip(protac_smiles, pred_smiles, label_smiles):
+         scores.append(score_prediction(
+             protac_smiles=protac,
+             label_smiles=label,
+             pred_smiles=pred,
+             fpgen=fpgen,
+             compute_rdkit_metrics=compute_rdkit_metrics,
+             compute_graph_metrics=compute_graph_metrics,
+             graph_edit_kwargs={"timeout": 0.05},
+         ))
+     return scores
+
+
+ def decode_and_get_metrics(
+     pred: EvalPrediction,
+     tokenizer: Union[AutoTokenizer, str] = "seyonec/ChemBERTa-zinc-base-v1",
+     rouge = None,  # Optional[evaluate.metrics.rouge.Rouge] = None
+     fpgen = None,  # Optional[Chem.rdFingerprintGenerator] = None
+     compute_rdkit_metrics: bool = False,
+     compute_graph_metrics: bool = True,
+     num_proc: int = 1,
+     batch_size: int = 128,
+     use_nan_for_missing: bool = True,
+     causal_language_modeling: bool = False,
+ ) -> dict[str, float]:
+     """ Compute metrics for tokenized PROTAC predictions.
+
+     Args:
+         pred (transformers.EvalPrediction): The predictions from the model.
+         tokenizer (AutoTokenizer | str): The tokenizer to use for decoding the predictions. If a string, the tokenizer will be loaded using `AutoTokenizer.from_pretrained(tokenizer)`. Default: "seyonec/ChemBERTa-zinc-base-v1"
+         rouge (Rouge): The Rouge object to use for scoring. Example: `rouge = evaluate.load("rouge")`
+         fpgen (Chem.rdFingerprintGenerator): The fingerprint generator to use for computing the Tanimoto similarity. Example: `Chem.rdFingerprintGenerator.GetMorganGenerator(radius=8, fpSize=2048)`
+
+     Returns:
+         dict[str, float]: A dictionary containing the scores for the predictions.
+     """
+     print(f"[{datetime.datetime.now()}] Starting decode_and_get_metrics (protac_splitter/llms/evaluation.py)")
+
+     if causal_language_modeling:
+         # NOTE: For causal language models, we only care about perplexity, so we
+         # only need the eval_loss, which is automatically added.
+         return {}
+
+     if isinstance(tokenizer, str):
+         tokenizer = AutoTokenizer.from_pretrained(tokenizer)
+
+     labels_ids = pred.label_ids
+     pred_ids = pred.predictions
+     input_ids = pred.inputs
+
+     if causal_language_modeling:
+         # NOTE: Unreachable given the early return above; kept for when
+         # generation-based evaluation of causal models is re-enabled.
+         # The prediction logits will be of shape: (batch_size, sequence_length, vocabulary_size)
+         # So we need to get the argmax of the last dimension to get the
+         # predicted token IDs.
+         # NOTE: Not exactly the same as what would happen during generation, but
+         # hopefully it's close enough to assess model performance during
+         # training.
+         pred_ids = np.argmax(pred_ids, axis=-1)
+
+     # Replace -100 in the IDs with the tokenizer pad token id
+     # NOTE: Check the `ignore_index` argument in nn.CrossEntropyLoss.
+     # TODO: Understand why this needs to be done to the inputs as well
+     ignore_index = -100
+     labels_ids[labels_ids == ignore_index] = tokenizer.pad_token_id
+     pred_ids[pred_ids == ignore_index] = tokenizer.pad_token_id
+
+     # Get strings from IDs
+     pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
+     label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
+
+     if not causal_language_modeling:
+         input_ids[input_ids == ignore_index] = tokenizer.pad_token_id
+         input_str = tokenizer.batch_decode(input_ids, skip_special_tokens=True)
+     else:
+         # NOTE: For causal language models, i.e., decoder only, the input PROTAC
+         # is in the label. Therefore, we need to decode the label to get the
+         # input. The label looks something like "PROTAC.E3.Linker.WH", so we
+         # need to split it and get the last (three) parts.
+         input_str = [str(s.split('.')[0]) for s in label_str]
+         label_str = ['.'.join(s.split('.')[1:]) for s in label_str]
+         pred_str = ['.'.join(s.split('.')[1:]) if '.' in s else s for s in pred_str]
+
+     # Get scores
+     if num_proc == 1:
+         scores = process_predictions((
+             pred_str, input_str, label_str, fpgen, compute_rdkit_metrics, compute_graph_metrics
+         ))
+     else:
+         # Split the predictions into batches and score them in parallel.
+         # NOTE: Submitting all batches in a single `pool.map` call (instead of
+         # one call per batch) is what actually lets the workers run concurrently.
+         batches = [
+             (pred_str[i:i+batch_size], input_str[i:i+batch_size], label_str[i:i+batch_size], fpgen, compute_rdkit_metrics, compute_graph_metrics)
+             for i in range(0, len(pred_str), batch_size)
+         ]
+         with mp.Pool(processes=num_proc) as pool:
+             scores = pool.map(process_predictions, batches)
+         # Flatten the list of scores
+         scores = [s for ls in scores for s in ls]
+
+     # Aggregate scores
+     scores_labels = set()
+     for s in scores:
+         scores_labels.update(s.keys())
+
+     aggregated_scores = {}
+     for k in scores_labels:
+         values = np.array([s.get(k, np.nan) for s in scores], dtype=float)
+
+         # If values is all NaN, set the aggregated score to None and continue
+         if np.all(np.isnan(values)):
+             aggregated_scores[k] = None
+             continue
+
+         # Compute average, excluding `NaN` values if necessary
+         if use_nan_for_missing:
+             aggregated_scores[k] = np.nanmean(values)
+         else:
+             valid_values = values[~np.isnan(values)]
+             aggregated_scores[k] = np.mean(valid_values) if valid_values.size > 0 else float('nan')
+
+     # Get Rouge score
+     if rouge is not None:
+         rouge_output = rouge.compute(predictions=pred_str, references=label_str)
+         aggregated_scores.update({k: v for k, v in rouge_output.items()})
+
+     # TODO
+     # # Get tanimoto score
+     # pred_str = np.array(pred_str)[valid_smiles == 1]
+     # label_str = np.array(label_str)[valid_smiles == 1]
+     # if len(pred_str) == 0:
+     #     scores['tanimoto'] = 0.0
+     #     return scores
+     # pred_mols = [Chem.MolFromSmiles(s) for s in pred_str]
+     # label_mols = [Chem.MolFromSmiles(s) for s in label_str]
+     # pred_fps = [fpgen.GetFingerprint(m) for m in pred_mols]
+     # label_fps = [fpgen.GetFingerprint(m) for m in label_mols]
+     # tanimoto = [DataStructs.TanimotoSimilarity(l, p) for l, p in zip(label_fps, pred_fps)]
+     # scores['tanimoto'] = np.array(tanimoto).mean()
+
+     print(f"[{datetime.datetime.now()}] Done with decode_and_get_metrics (protac_splitter/llms/evaluation.py)")
+
+     return aggregated_scores
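
In practice this function is not called directly but bound with `functools.partial` and handed to a `Trainer`, as the hyperparameter search in `training.py` does; a condensed sketch:

```python
from functools import partial

import evaluate

compute_metrics = partial(
    decode_and_get_metrics,
    tokenizer="seyonec/ChemBERTa-zinc-base-v1",
    rouge=evaluate.load("rouge"),   # optional: adds ROUGE scores to the output dict
    compute_rdkit_metrics=False,
    compute_graph_metrics=False,
    num_proc=1,
)
# Then: Seq2SeqTrainer(..., compute_metrics=compute_metrics, ...)
```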
protac_splitter/llms/hf_utils.py ADDED
@@ -0,0 +1,36 @@
+ """ Hugging Face Hub utilities for repository management and file uploads. """
+ from typing import Optional
+
+ import huggingface_hub as hf
+ from huggingface_hub import repo_info
+ from huggingface_hub.utils import RepositoryNotFoundError
+
+
+ def repo_exists(repo_id: str, token: Optional[str] = None) -> bool:
+     """ Checks if a Hugging Face repository exists. """
+     try:
+         print(repo_info(repo_id, token=token))
+         return True
+     except RepositoryNotFoundError:
+         return False
+
+
+ def create_hf_repository(**kwargs):
+     """Creates a new Hugging Face repository."""
+     api = hf.HfApi()
+     return api.create_repo(**kwargs)
+
+
+ def delete_hf_repository(**kwargs):
+     """Deletes a Hugging Face repository."""
+     print(f'Deleting repository {kwargs["repo_id"]}.')
+     api = hf.HfApi()
+     return api.delete_repo(**kwargs)
+
+
+ def upload_single_file(**kwargs):
+     """Uploads a single file to a Hugging Face repository."""
+     try:
+         api = hf.HfApi()
+         api.upload_file(**kwargs)
+     except Exception as e:
+         print(e)
+         print("WARNING. Best parameters NOT pushed to the hub.")
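
A short usage sketch; the repo id and token are placeholders, and the kwargs are forwarded to the standard `huggingface_hub` `create_repo`/`upload_file` calls:

```python
token = "hf_..."             # placeholder; never hard-code real tokens
repo_id = "my-org/my-model"  # placeholder

if not repo_exists(repo_id, token=token):
    create_hf_repository(repo_id=repo_id, private=True, exist_ok=True, token=token)
upload_single_file(
    path_or_fileobj="best_hyperparameters.md",
    path_in_repo="best_hyperparameters.md",
    repo_id=repo_id,
    token=token,
)
```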
protac_splitter/llms/model_utils.py ADDED
@@ -0,0 +1,256 @@
+ """ Hugging Face utilities for model loading and pipeline creation. """
+ from typing import Optional, List, Dict, Union
+ from datasets import Dataset
+ from transformers import (
+     AutoTokenizer,
+     EncoderDecoderModel,
+     AutoModelForCausalLM,
+     pipeline,
+     GenerationConfig,
+ )
+ from transformers.pipelines.pt_utils import KeyDataset
+ from tqdm import tqdm
+ import torch
+
+
+ def get_encoder_decoder_model(
+     pretrained_encoder: str = "seyonec/ChemBERTa-zinc-base-v1",
+     pretrained_decoder: str = "seyonec/ChemBERTa-zinc-base-v1",
+     max_length: Optional[int] = 512,
+     tie_encoder_decoder: bool = False,
+ ) -> EncoderDecoderModel:
+     """ Get the EncoderDecoderModel model for the PROTAC splitter.
+
+     Args:
+         pretrained_encoder (str): The pretrained model to use for the encoder. Default: "seyonec/ChemBERTa-zinc-base-v1"
+         pretrained_decoder (str): The pretrained model to use for the decoder. Default: "seyonec/ChemBERTa-zinc-base-v1"
+         max_length (int): The maximum length of the input sequence. Default: 512
+         tie_encoder_decoder (bool): Whether to tie the encoder and decoder weights. Default: False
+
+     Returns:
+         EncoderDecoderModel: The EncoderDecoderModel model for the PROTAC splitter
+     """
+     bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained(
+         pretrained_encoder,
+         pretrained_decoder,
+         tie_encoder_decoder=tie_encoder_decoder,
+     )
+     print(f"Number of parameters: {bert2bert.num_parameters():,}")
+     tokenizer = AutoTokenizer.from_pretrained(pretrained_encoder)
+     # Tokenizer-related configs
+     bert2bert.config.decoder_start_token_id = tokenizer.cls_token_id
+     bert2bert.config.eos_token_id = tokenizer.sep_token_id
+     bert2bert.config.pad_token_id = tokenizer.pad_token_id
+     bert2bert.config.vocab_size = bert2bert.config.encoder.vocab_size
+     # Generation configs
+     # NOTE: The full list of generation configurations can be found here: https://huggingface.co/docs/transformers/v4.33.3/en/main_classes/text_generation#transformers.GenerationConfig
+     bert2bert.encoder.config.max_length = max_length
+     bert2bert.decoder.config.max_length = max_length
+
+     def setup_gen(config):
+         config.do_sample = True
+         config.num_beams = 5
+         config.top_k = 20
+         config.max_length = 512
+         # config.max_new_tokens = 512
+         return config
+
+     bert2bert.config = setup_gen(bert2bert.config)
+     bert2bert.encoder.config = setup_gen(bert2bert.encoder.config)
+     bert2bert.decoder.config = setup_gen(bert2bert.decoder.config)
+     bert2bert.decoder.config.is_decoder = True
+     bert2bert.generation_config = setup_gen(bert2bert.generation_config)
+
+     # bert2bert.config.do_sample = True
+     # bert2bert.config.num_beams = 5
+     # bert2bert.config.top_k = 20
+     # bert2bert.config.max_length = 512
+     # bert2bert.config.max_new_tokens = 512
+
+     # bert2bert.generation_config.max_new_tokens = 512
+     # bert2bert.generation_config.min_new_tokens = 512
+
+     # bert2bert.config.max_new_tokens = 514
+     # bert2bert.config.early_stopping = True
+     # bert2bert.config.length_penalty = 2.0
+     # # bert2bert.config.no_repeat_ngram_size = 3  # Default: 0
+
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     bert2bert.to(device)
+
+     return bert2bert
+
+
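
A quick smoke test of the encoder-decoder factory; the input SMILES is a placeholder, real inputs are full PROTAC SMILES:

```python
model = get_encoder_decoder_model()
tokenizer = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")

inputs = tokenizer("CCO", return_tensors="pt").to(model.device)  # placeholder SMILES
output_ids = model.generate(inputs.input_ids, attention_mask=inputs.attention_mask)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```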
+ def get_causal_model(
+     pretrained_model: str = "seyonec/ChemBERTa-zinc-base-v1",
+     max_length: Optional[int] = 512,
+ ) -> AutoModelForCausalLM:
+     """ Get the causal language model for the PROTAC splitter.
+
+     Args:
+         pretrained_model (str): The pretrained model to use for the causal language model. Default: "seyonec/ChemBERTa-zinc-base-v1"
+         max_length (int): The maximum length of the input sequence. Default: 512
+
+     Returns:
+         AutoModelForCausalLM: The causal language model for the PROTAC splitter
+     """
+     model = AutoModelForCausalLM.from_pretrained(pretrained_model, is_decoder=True)
+     # model.is_decoder = True  # It might not be necessary, but it's good to be explicit
+
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     model.to(device)
+
+     return model
+
+
+ # REF: https://github.com/huggingface/transformers/blob/v4.44.2/src/transformers/generation/configuration_utils.py#L71
+ GENERATION_STRATEGY_PARAMS = {
+     "greedy": {"num_beams": 1, "do_sample": False},
+     "contrastive_search": {"penalty_alpha": 0.1, "top_k": 10},
+     "multinomial_sampling": {"num_beams": 1, "do_sample": True},
+     "beam_search_decoding": {"num_beams": 5, "do_sample": False, "num_return_sequences": 5},
+     "beam_search_multinomial_sampling": {"num_beams": 5, "do_sample": True, "num_return_sequences": 5},
+     "diverse_beam_search_decoding": {"num_beams": 5, "num_beam_groups": 5, "diversity_penalty": 1.0, "num_return_sequences": 5},
+ }
+
+
+ def avail_generation_strategies() -> List[str]:
+     """ Get the available generation strategies. """
+     return list(GENERATION_STRATEGY_PARAMS.keys())
+
+
+ def get_generation_config(generation_strategy: str) -> GenerationConfig:
+     """ Get the generation config for the given generation strategy. """
+     return GenerationConfig(
+         max_length=512,
+         max_new_tokens=512,
+         **GENERATION_STRATEGY_PARAMS[generation_strategy],
+     )
+
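
For reference, the strategy mapping expands into concrete `GenerationConfig` objects:

```python
cfg = get_generation_config("beam_search_decoding")
print(cfg.num_beams, cfg.do_sample, cfg.num_return_sequences)  # -> 5 False 5
```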
+ def get_pipeline(
+     model_name: str,
+     token: str,
+     is_causal_language_model: bool,
+     generation_strategy: Optional[str] = None,
+     num_return_sequences: int = 1,
+     device: Optional[Union[int, str]] = None,
+ ) -> pipeline:
+     """ Get the pipeline for the given model name and generation strategy.
+
+     Args:
+         model_name (str): The model name or path on the Hugging Face Hub.
+         token (str): The Hugging Face API token.
+         is_causal_language_model (bool): Whether the model is a causal (decoder-only) language model.
+         generation_strategy (Optional[str]): One of `avail_generation_strategies()`, or None to use the model defaults.
+         num_return_sequences (int): The number of sequences to return. Only used for causal models without an explicit generation strategy.
+         device (Optional[int | str]): The device to run the pipeline on. Defaults to CUDA if available.
+
+     Returns:
+         pipeline: The configured text-generation or text2text-generation pipeline.
+     """
+     device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
+     if is_causal_language_model:
+         print('Loading pipeline for causal language models...')
+         task = "text-generation"
+         tokenizer = AutoTokenizer.from_pretrained(model_name, token=token, padding_side='left')
+     else:
+         print('Loading pipeline for sequence-to-sequence models...')
+         task = "text2text-generation"
+         tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
+
+     kwargs = {}
+     if generation_strategy is not None:
+         kwargs["generation_config"] = get_generation_config(generation_strategy)
+     elif is_causal_language_model:
+         kwargs["num_return_sequences"] = num_return_sequences
+
+     return pipeline(
+         task,
+         model=model_name,
+         tokenizer=tokenizer,
+         token=token,
+         device=device,
+         **kwargs,
+     )
+
+
+ def run_causal_pipeline(
+     pipe: pipeline,
+     test_ds: Dataset,
+     batch_size: int,
+     smiles_column: str = 'prompt',
+ ) -> List[Dict[str, str]]:
+     """ Run the pipeline for causal language models and return the predictions.
+
+     Args:
+         pipe (pipeline): The pipeline object to use for generating predictions.
+         test_ds (Dataset): The test dataset to generate predictions for.
+         batch_size (int): The batch size to use for generating predictions.
+
+     Returns:
+         List[Dict[str, str]]: A list of dictionaries containing the predictions.
+     """
+     preds = []
+     for pred in tqdm(pipe(KeyDataset(test_ds, smiles_column), batch_size=batch_size, max_length=512), total=len(test_ds) // batch_size):
+         generated_text = [p['generated_text'] for p in pred]
+         # Remove the prompt from the generated text
+         generated_text = ['.'.join(t.split('.')[1:]) for t in generated_text]
+         # Add the predictions to the list
+         p = {f'pred_n{i}': t for i, t in enumerate(generated_text)}
+         preds.append(p)
+     return preds
+
+
+ def run_seq2seq_pipeline(
+     pipe: pipeline,
+     test_ds: Dataset,
+     batch_size: int,
+     smiles_column: str = 'text',
+ ) -> List[Dict[str, str]]:
+     """ Run the pipeline for sequence-to-sequence models and return the predictions.
+
+     Args:
+         pipe (pipeline): The pipeline object to use for generating predictions.
+         test_ds (Dataset): The test dataset to generate predictions for.
+         batch_size (int): The batch size to use for generating predictions.
+
+     Returns:
+         List[Dict[str, str]]: A list of dictionaries containing the predictions.
+     """
+     preds = []
+     for pred in tqdm(pipe(KeyDataset(test_ds, smiles_column), batch_size=batch_size, max_length=512), total=len(test_ds) // batch_size):
+         p = {f'pred_n{i}': cand['generated_text'] for i, cand in enumerate(pred)}
+         preds.append(p)
+     return preds
+
+
+ def run_pipeline(
+     pipe: pipeline,
+     test_ds: Dataset,
+     batch_size: int,
+     is_causal_language_model: bool,
+     smiles_column: str = 'text',
+ ) -> List[Dict[str, str]]:
+     """ Run the pipeline and return the predictions.
+
+     Args:
+         pipe (pipeline): The pipeline object to use for generating predictions.
+         test_ds (Dataset): The test dataset to generate predictions for.
+         batch_size (int): The batch size to use for generating predictions.
+         is_causal_language_model (bool): Whether the model is a causal language model or not.
+         smiles_column (str): The column name in the dataset that contains the SMILES strings. Default: 'text'
+
+     Returns:
+         List[Dict[str, str]]: A list of dictionaries containing the beam-size predictions in the format: [{'pred_n0': 'prediction_0', 'pred_n1': 'prediction_1', ...}, ...]
+     """
+     if is_causal_language_model:
+         return run_causal_pipeline(pipe, test_ds, batch_size, smiles_column)
+     else:
+         return run_seq2seq_pipeline(pipe, test_ds, batch_size, smiles_column)
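
End-to-end, prediction over a test split looks roughly like this (model id, token, and the toy `test_ds` are placeholders):

```python
from datasets import Dataset

test_ds = Dataset.from_dict({"text": ["CCO"]})  # placeholder; real rows are PROTAC SMILES
pipe = get_pipeline(
    model_name="ailab-bio/some-protac-splitter-model",  # placeholder repo id
    token="hf_...",                                      # placeholder token
    is_causal_language_model=False,
    generation_strategy="beam_search_decoding",
)
preds = run_pipeline(pipe, test_ds, batch_size=8, is_causal_language_model=False)
# -> [{'pred_n0': ..., 'pred_n1': ..., ...}], one dict of beam candidates per sample
```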
protac_splitter/llms/training.py ADDED
@@ -0,0 +1,869 @@
+ import os
+ from typing import Optional, Dict, Any, Callable, Tuple, Union
+ from functools import partial
+ import subprocess
+ import copy
+ import datetime
+ import logging
+ import math
+ import json
+
+ import torch
+ import numpy as np
+ import huggingface_hub as hf
+ from transformers import (
+     Trainer,
+     TrainingArguments,
+     Seq2SeqTrainer,
+     Seq2SeqTrainingArguments,
+     DataCollatorForSeq2Seq,
+     DataCollatorForLanguageModeling,
+     AutoTokenizer,
+     GenerationConfig,
+     TrainerCallback,
+     set_seed,
+ )
+ from accelerate.utils import write_basic_config
+ from accelerate import Accelerator
+
+ import optuna
+ from optuna.samplers import QMCSampler
+ from optuna.pruners import (
+     BasePruner,
+     HyperbandPruner,
+     ThresholdPruner,
+     PatientPruner,
+     MedianPruner,
+ )
+ from optuna.study._study_direction import StudyDirection
+
+ from .data_utils import load_tokenized_dataset
+ from .evaluation import decode_and_get_metrics
+ from .hf_utils import (
+     create_hf_repository,
+     delete_hf_repository,
+     repo_exists,
+     upload_single_file,
+ )
+ from .model_utils import get_encoder_decoder_model, get_causal_model
+
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Use GPU with index 0
+ # logging.basicConfig(level=logging.DEBUG)
+
+
+ class PrintStepCallback(TrainerCallback):
+
+     def on_init_end(self, args, state, control, **kwargs):
+         print(f"[{datetime.datetime.now()}] Initialization complete. Training is starting.")
+
+     def on_step_begin(self, args, state, control, **kwargs):
+         if state.global_step % args.logging_steps == 0:
+             print(f"[{datetime.datetime.now()}] Global step: {state.global_step:,}")
+
+
+ class ScoreMetric:
+
+     def __init__(self):
+         self.batch_scores = []
+
+     def update(self, scores):
+         self.batch_scores.append(scores)
+
+     def compute(self):
+         all_labels = set()
+         for scores in self.batch_scores:
+             all_labels.update(scores.keys())
+
+         aggregate_scores = {}
+         for k in all_labels:
+             scores = [s.get(k, np.nan) for s in self.batch_scores]
+             print(f"{k}: {np.nanmean(scores):.4f}")
+             aggregate_scores[k] = np.nanmean(scores)
+
+         self.batch_scores = []
+         return aggregate_scores
+
+
+ score_metric = ScoreMetric()
+ hp_score_metric = ScoreMetric()
+
+
+ class WrappedEarlyStoppingPruner(BasePruner):
+     """
+     Pruner that wraps another pruner and checks if the trial should be pruned.
+     It first evaluates the wrapped pruner and, if the wrapped pruner suggests
+     pruning, prunes. Otherwise, it evaluates an early-stopping criterion based
+     on a patience threshold with a tolerance (min_delta) and eventually prunes.
+
+     Args:
+         wrapped_pruner:
+             Wrapped pruner to check first. Pruning is only applied if this pruner recommends it.
+         patience:
+             Number of steps to wait for an improvement before pruning.
+         min_delta:
+             Minimum improvement required to reset patience.
+         n_warmup_steps:
+             Number of initial steps to skip the patience check.
+     """
+
+     def __init__(
+         self,
+         wrapped_pruner: BasePruner,
+         patience: int,
+         min_delta: float = 0.0,
+         n_warmup_steps: int = 0,
+     ) -> None:
+         if wrapped_pruner is None or not isinstance(wrapped_pruner, BasePruner):
+             raise ValueError(f"wrapped_pruner must be an instance of BasePruner but got {wrapped_pruner}.")
+         if patience < 0:
+             raise ValueError(f"patience cannot be negative but got {patience}.")
+         if min_delta < 0:
+             raise ValueError(f"min_delta cannot be negative but got {min_delta}.")
+         if n_warmup_steps < 0:
+             raise ValueError(f"n_warmup_steps cannot be negative but got {n_warmup_steps}.")
+
+         self._wrapped_pruner = wrapped_pruner
+         self._patience = patience
+         self._min_delta = min_delta
+         self._n_warmup_steps = n_warmup_steps
+
+     def prune(self, study: "optuna.study.Study", trial: "optuna.trial.FrozenTrial") -> bool:
+         step = trial.last_step
+         if step is None:
+             return False
+
+         intermediate_values = trial.intermediate_values
+         steps = np.asarray(list(intermediate_values.keys()))
+
+         # If there are insufficient steps or we are still in the warmup phase, do not prune.
+         if steps.size <= self._patience + 1 or step < self._n_warmup_steps:
+             return False
+
+         # First, check the wrapped pruner. If it suggests pruning, prune.
+         if self._wrapped_pruner.prune(study, trial):
+             return True
+
+         steps.sort()
+
+         # These are the scores reported up to `patience` steps ago...
+         steps_before_patience = steps[: -self._patience - 1]
+         scores_before_patience = np.asarray(
+             list(intermediate_values[step] for step in steps_before_patience)
+         )
+
+         # ...and these are the scores reported since then
+         steps_after_patience = steps[-self._patience - 1 :]
+         scores_after_patience = np.asarray(
+             list(intermediate_values[step] for step in steps_after_patience)
+         )
+
+         direction = study.direction
+         if direction == StudyDirection.MINIMIZE:
+             should_prune = np.nanmin(scores_before_patience) + self._min_delta < np.nanmin(
+                 scores_after_patience
+             )
+         else:
+             should_prune = np.nanmax(scores_before_patience) - self._min_delta > np.nanmax(
+                 scores_after_patience
+             )
+
+         return should_prune
+
+
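
Outside of the `Trainer` integration below, the pruner composes with Optuna directly. A minimal toy study (the objective is illustrative only, not from this repo):

```python
import optuna
from optuna.pruners import MedianPruner

pruner = WrappedEarlyStoppingPruner(
    MedianPruner(n_startup_trials=0, n_warmup_steps=10),
    patience=5,
    min_delta=0.01,
    n_warmup_steps=10,
)

def objective(trial):
    lr = trial.suggest_float("lr", 1e-6, 1e-3, log=True)
    score = 0.0
    for step in range(50):
        score += lr  # toy "metric" that improves each step
        trial.report(score, step)
        if trial.should_prune():
            raise optuna.TrialPruned()
    return score

study = optuna.create_study(direction="maximize", pruner=pruner)
study.optimize(objective, n_trials=5)
```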
+ def get_lr_scheduler_kwargs(lr_scheduler_type: str) -> Dict[str, Any]:
+     """ Returns the default learning rate scheduler kwargs for a given type.
+
+     Reference: https://huggingface.co/docs/timm/en/reference/schedulers
+
+     Args:
+         lr_scheduler_type (str): The type of the learning rate scheduler.
+
+     Returns:
+         Dict[str, Any]: The default learning rate scheduler kwargs.
+     """
+     if lr_scheduler_type == "cosine":
+         return {}
+     elif lr_scheduler_type == "cosine_with_restarts":
+         return {"num_cycles": 3}
+     elif lr_scheduler_type == "cosine_with_min_lr":
+         return {}
+     elif lr_scheduler_type == "polynomial":
+         return {"power": 1.0}
+     elif lr_scheduler_type == "reduce_lr_on_plateau":
+         return {"min_lr": 1e-6}
+     else:
+         raise ValueError(f"Unknown learning rate scheduler type: '{lr_scheduler_type}'")
+
+
+ def get_best_hyperparameters(
+     model_init: Callable,
+     tokenizer: AutoTokenizer,
+     data_collator: Union[DataCollatorForSeq2Seq, DataCollatorForLanguageModeling],
+     compute_metrics: Callable,
+     dataset_tokenized: Dict[str, Any],
+     training_args: Dict[str, Any],
+     num_optuna_trials: int,
+     lr_scheduler_type: Optional[str] = None,
+     causal_language_modeling: bool = False,
+     all_fragments_as_labels: bool = True,
+     linkers_only_as_labels: bool = False,
+ ) -> Tuple[Any, Dict[str, Any]]:
+     """Runs an Optuna hyperparameter search to find the best hyperparameters.
+
+     Args:
+         model_init (Callable): The model initialization function.
+         tokenizer (AutoTokenizer): The tokenizer.
+         data_collator (DataCollatorForSeq2Seq | DataCollatorForLanguageModeling): The data collator.
+         compute_metrics (Callable): The compute metrics function.
+         dataset_tokenized (Dict[str, Any]): The tokenized dataset.
+         training_args (Dict[str, Any]): The training arguments.
+         num_optuna_trials (int): The number of Optuna trials.
+
+     Returns:
+         Tuple[Any, Dict[str, Any]]: The best run (a `BestRun` object) and the training arguments used for the search.
+     """
+     def optuna_hp_space(trial):
+         # NOTE: Tuning the generation config is not implemented yet, please refer to this issue: https://github.com/huggingface/transformers/issues/33755
+         # Suggest hparams "shared" across all scheduler types
+         # learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-3, log=True)
+         # warmup_ratio = trial.suggest_float("warmup_ratio", 0.01, 0.1, step=0.01)
+
+         # Restrict learning rate closer to best-performing values
+         learning_rate = trial.suggest_float("learning_rate", 5e-6, 2e-4, log=True)  # Previously 1e-6 to 1e-3
+
+         # Slightly adjust warmup ratio to avoid extreme values
+         warmup_ratio = trial.suggest_float("warmup_ratio", 0.02, 0.06, step=0.01)  # Previously 0.01 to 0.1
+
+         # NOTE: We might want to use QMCSampler instead of TPESampler, which
+         # doesn't support categorical parameters. Categories can be encoded as
+         # integers and then decoded back to the original categories.
+
+         # NOTE: According to the GitHub code, the number of training and warmup
+         # steps for the scheduler types are automatically set, so we don't need
+         # to pass them in the lr_scheduler_kwargs.
+
+         if lr_scheduler_type is None:
+             lr_scheduler_types = ["cosine", "cosine_with_restarts", "reduce_lr_on_plateau"]  # "cosine_with_min_lr", "polynomial"
+             suggested_lr_sched = trial.suggest_int("lr_scheduler_type", 0, len(lr_scheduler_types) - 1)
+             suggested_lr_sched = lr_scheduler_types[suggested_lr_sched]
+             # NOTE: Use the *suggested* type here; passing the (None) outer
+             # lr_scheduler_type would raise in get_lr_scheduler_kwargs.
+             lr_scheduler_kwargs = get_lr_scheduler_kwargs(suggested_lr_sched)
+         elif lr_scheduler_type == "cosine":
+             lr_scheduler_kwargs = {
+                 "num_cycles": trial.suggest_float("num_cycles", 0.5, 10, step=0.5),
+             }
+         elif lr_scheduler_type == "cosine_with_restarts":
+             lr_scheduler_kwargs = {
+                 "num_cycles": trial.suggest_int("num_cycles", 1, 10, step=1),
+             }
+         elif lr_scheduler_type == "reduce_lr_on_plateau":
+             lr_scheduler_kwargs = {
+                 "min_lr": trial.suggest_float("min_lr", 1e-10, 1e-8, log=True),  # Previously 1e-12 to 1e-9
+                 "factor": trial.suggest_float("factor", 0.8, 0.98, step=0.01),  # Previously 0.1 to 0.99
+             }
+         else:
+             lr_scheduler_kwargs = get_lr_scheduler_kwargs(lr_scheduler_type)
+
+         return {
+             "lr_scheduler_kwargs": lr_scheduler_kwargs,
+             "lr_scheduler_type": lr_scheduler_type if lr_scheduler_type is not None else suggested_lr_sched,
+             "learning_rate": learning_rate,
+             "warmup_ratio": warmup_ratio,
+         }
+
+     if causal_language_modeling:
+         def compute_objective(metrics: Dict[str, float]):
+             # NOTE: We want to minimize the model perplexity, which is the
+             # exponential of the negative log-likelihood loss. Optuna is set up
+             # to maximize the objective, so we return the negative perplexity.
+             return -math.exp(metrics["eval_loss"])
+     else:
+         if all_fragments_as_labels:
+             def compute_objective(metrics: Dict[str, float]):
+                 # NOTE: Having a higher eval_reassembly score should also correspond
+                 # to a low eval loss, so we just focus on the reassembly score.
+                 return metrics["eval_all_ligands_equal"]
+         else:
+             if linkers_only_as_labels:
+                 def compute_objective(metrics: Dict[str, float]):
+                     return metrics["eval_linker_equal"]
+             else:
+                 def compute_objective(metrics: Dict[str, float]):
+                     return metrics["eval_e3_equal"] + metrics["eval_poi_equal"]
+
+     def hp_name(trial: Any) -> str:
+         trial_name = f"trial-number={trial.number}"
+         for hparam, value in trial.params.items():
+             # Check if the value is a float and round it to 3 decimals
+             if hparam == "learning_rate":
+                 value = f"{value:.1e}"
+             elif isinstance(value, float):
+                 value = f"{value:.3f}"
+             trial_name += f"-{hparam}={value}"
+         return trial_name
+
+     # Override the training steps
+     hp_training_args = copy.deepcopy(training_args)
+     hp_training_args["num_train_epochs"] = -1
+     hp_training_args["max_steps"] = 10_000
+     hp_training_args["eval_steps"] = 2500
+     hp_training_args["eval_delay"] = 5000  # TODO: Double check if this is needed
+     hp_training_args["logging_steps"] = 500
+     hp_training_args["save_steps"] = 5000
+     if not causal_language_modeling:
+         # Use greedy decoding for the evaluation during the HP search
+         hp_training_args["generation_config"] = GenerationConfig(
+             max_length=512,
+             max_new_tokens=512,
+             do_sample=False,
+             num_beams=1,
+         )
+
+     print("Hyperparameter search training arguments:")
+     for k, v in hp_training_args.items():
+         if 'token' in k:
+             continue
+         print(f"  - {k}: {v}")
+
+     if causal_language_modeling:
+         TrainerClass = Trainer
+         TrainingArgumentsClass = TrainingArguments
+     else:
+         TrainerClass = Seq2SeqTrainer
+         TrainingArgumentsClass = Seq2SeqTrainingArguments
+
+     # Setup a "fake" Trainer for the hyperparameter search
+     trainer = TrainerClass(
+         model_init=model_init,
+         tokenizer=tokenizer,
+         data_collator=data_collator,
+         args=TrainingArgumentsClass(**hp_training_args),
+         compute_metrics=compute_metrics,
+         train_dataset=dataset_tokenized["train"],
+         eval_dataset=dataset_tokenized["validation"],
+         callbacks=[PrintStepCallback],
+     )
+
+     # Setup the Optuna pruner and sampler
+     max_warmup_ratio = 0.1
+     pruner = WrappedEarlyStoppingPruner(
+         MedianPruner(
+             n_startup_trials=0,
+             interval_steps=1,
+             n_warmup_steps=int(max_warmup_ratio * hp_training_args["max_steps"]),
+         ),
+         patience=5,  # Check every 5000 training steps
+         min_delta=0.01,
+         n_warmup_steps=int(max_warmup_ratio * hp_training_args["max_steps"]),
+     )
+     sampler = QMCSampler(scramble=True, seed=42)
+
+     # NOTE: The Trainer will return a BestRun object, not the Optuna trial
+     best_run = trainer.hyperparameter_search(
+         direction="maximize",
+         backend="optuna",
+         hp_space=optuna_hp_space,
+         hp_name=hp_name,
+         n_trials=num_optuna_trials,
+         compute_objective=compute_objective,  # Default: Will sum over all metrics but loss
+         sampler=sampler,
+         pruner=pruner,
+     )
+
+     # Report the best hyperparameters found
+     try:
+         print("-" * 80)
+         print(f"Best trial objective: {best_run.objective:.4f}. Summary: {best_run.run_summary}")
+     except Exception as e:
+         print(e)
+         print("WARNING. Best trial objective could not be printed.")
+
+     return best_run, hp_training_args
+
+
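
Note the sign convention in `compute_objective` for causal models: the study maximizes, so the objective is the negative perplexity. Concretely:

```python
import math

metrics = {"eval_loss": 0.69}                # mean cross-entropy in nats (illustrative value)
objective = -math.exp(metrics["eval_loss"])  # perplexity ≈ 1.99, objective ≈ -1.99
```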
381
+ def train_model(
382
+ model_id: str,
383
+ ds_name: str,
384
+ ds_config: str = 'default',
385
+ learning_rate: float = 5e-5,
386
+ max_steps: int = -1,
387
+ num_train_epochs: int = 40,
388
+ batch_size: int = 128,
389
+ batch_size_tokenizer: int = 512,
390
+ gradient_accumulation_steps: int = 4,
391
+ hub_token: Optional[str] = None,
392
+ organization: Optional[str] = None,
393
+ output_dir: str = "./models/",
394
+ tokenizer: Union[AutoTokenizer, str] = "seyonec/ChemBERTa-zinc-base-v1",
395
+ pretrained_encoder: str = "seyonec/ChemBERTa-zinc-base-v1",
396
+ pretrained_decoder: str = "seyonec/ChemBERTa-zinc-base-v1",
397
+ encoder_max_length: int = 512,
398
+ decoder_max_length: int = 512,
399
+ tie_encoder_decoder: bool = False,
400
+ delete_repo_if_exists: bool = False,
401
+ delete_local_repo_if_exists: bool = False,
402
+ training_args: Optional[Dict[str, Any]] = None,
403
+ resume_from_checkpoint: Optional[str] = None,
404
+ num_optuna_trials: int = 0,
405
+ num_proc_map: int = 1,
406
+ per_device_train_batch_size: Optional[int] = None,
407
+ per_device_eval_batch_size: Optional[int] = None,
408
+ lr_scheduler_type: Optional[str] = None,
409
+ cache_dir: Optional[str] = None,
410
+ randomize_smiles: bool = False,
411
+ randomize_smiles_prob: float = 0.0,
412
+ all_fragments_as_labels: bool = True,
413
+ linkers_only_as_labels: bool = False,
414
+ warmup_ratio: Optional[float] = None,
415
+ num_cycles: Optional[int] = None,
416
+ warmup_steps: Optional[int] = None,
417
+ causal_language_modeling: bool = False,
418
+ train_size_ratio: float = 1.0,
419
+ training_args_bin: Optional[str] = None,
420
+ ):
421
+ """Trains a model on a given dataset.
422
+
423
+ Args:
424
+ model_id (str): The name of the model to be trained.
425
+ ds_name (str): The name of the dataset to be used for training.
426
+ ds_config (str, optional): The name of the dataset configuration to be used for training. Defaults to 'default'.
427
+ learning_rate (float, optional): The learning rate. Defaults to 5e-5.
428
+ max_steps (int, optional): The maximum number of training steps. Defaults to -1.
429
+ num_train_epochs (int, optional): The number of training epochs. Defaults to 40.
430
+ batch_size (int, optional): The batch size. Defaults to 128.
431
+ batch_size_tokenizer (int, optional): The batch size for the tokenizer. Defaults to 512.
432
+ gradient_accumulation_steps (int, optional): The number of gradient accumulation steps. Defaults to 4.
433
+ hub_token (Optional[str], optional): The Hugging Face token. Defaults to None.
434
+ organization (Optional[str], optional): The Hugging Face organization. Defaults to None.
435
+ output_dir (str, optional): The output directory. Defaults to "./models/".
436
+ tokenizer (AutoTokenizer | str, optional): The tokenizer. Defaults to "seyonec/ChemBERTa-zinc-base-v1".
437
+ pretrained_encoder (str, optional): The name of the pretrained encoder. Defaults to "seyonec/ChemBERTa-zinc-base-v1".
438
+ pretrained_decoder (str, optional): The name of the pretrained decoder. Defaults to "seyonec/ChemBERTa-zinc-base-v1".
439
+ encoder_max_length (int, optional): The maximum length of the encoder. Defaults to 256.
440
+ decoder_max_length (int, optional): The maximum length of the decoder. Defaults to 256.
441
+ delete_repo_if_exists (bool, optional): Whether to delete the repository first. Defaults to False.
442
+ training_args (Optional[Seq2SeqTrainingArguments], optional): The training arguments. Defaults to None.
443
+ resume_from_checkpoint (Optional[str], optional): The checkpoint to resume training from. Defaults to None.
444
+ num_optuna_trials (int, optional): The number of Optuna trials. Defaults to 0, i.e., no Optuna hyperparameter search.
445
+ """
446
+ set_seed(42)
447
+
448
+ # if torch.cuda.is_available():
449
+ # write_basic_config(mixed_precision='fp16')
450
+ accelerator = Accelerator()
451
+ accelerator.print(f"Accelerator state from the current environment:\n{accelerator.state}")
452
+
453
+ # Check if resume_from_checkpoint exists and it's a file
454
+ if resume_from_checkpoint is not None:
455
+ # Check if the checkpoint exists: it can be either a file or a directory
456
+ if not os.path.exists(resume_from_checkpoint):
457
+ raise ValueError(f"Checkpoint file '{resume_from_checkpoint}' does not exist.")
458
+
459
+ if hub_token is not None:
460
+ hf.login(token=hub_token)
461
+
462
+ # Setup output directory and Hugging Face repository
463
+ output_dir += f"/{model_id}"
464
+ if organization is not None:
465
+ hub_model_id = f"{organization}/{model_id}"
466
+ if delete_local_repo_if_exists and os.path.exists(output_dir):
467
+ subprocess.run(["rm", "-rf", output_dir])
468
+ if not os.path.exists(output_dir):
469
+ print(f"Local repository '{output_dir}' deleted.")
470
+ else:
471
+ print(f"Local repository '{output_dir}' could not be deleted.")
472
+ return
473
+ if delete_repo_if_exists and repo_exists(hub_model_id, token=hub_token):
474
+ delete_hf_repository(repo_id=hub_model_id, token=hub_token, missing_ok=True)
475
+ print(f"Repository '{hub_model_id}' deleted.")
476
+
477
+ repo_url = create_hf_repository(
478
+ repo_id=hub_model_id,
479
+ repo_type="model",
480
+ exist_ok=True,
481
+ private=True,
482
+ token=hub_token,
483
+ )
484
+ print(f"Repository '{hub_model_id}' created at URL: {repo_url}")
485
+ else:
486
+ hub_model_id = None
487
+ print(f"Hub model ID: {hub_model_id}")
488
+
489
+ if isinstance(tokenizer, str):
490
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer)
491
+ elif tokenizer is None:
492
+ tokenizer = AutoTokenizer.from_pretrained(pretrained_encoder)
493
+
494
+ # Load the tokenized dataset
495
+ print("Loading tokenized dataset.")
496
+ dataset_tokenized = load_tokenized_dataset(
497
+ ds_name,
498
+ ds_config,
499
+ tokenizer,
500
+ batch_size_tokenizer,
501
+ encoder_max_length,
502
+ decoder_max_length,
503
+ token=hub_token,
504
+ num_proc_map=num_proc_map,
505
+ cache_dir=cache_dir,
506
+ randomize_smiles=randomize_smiles,
507
+ randomize_smiles_prob=randomize_smiles_prob,
508
+ all_fragments_as_labels=all_fragments_as_labels,
509
+ linkers_only_as_labels=linkers_only_as_labels,
510
+ causal_language_modeling=causal_language_modeling,
511
+ train_size_ratio=train_size_ratio,
512
+ )
513
+ print("Dataset loaded.")
514
+
515
+ if causal_language_modeling:
516
+ # Setup the model for `model_init` in the Trainer
517
+ model_lambda = lambda: get_causal_model(
518
+ pretrained_model=pretrained_decoder,
519
+ )
520
+
521
+ # Setup the data collator, which will efficiently pad the inputs and targets
522
+ data_collator = DataCollatorForLanguageModeling(
523
+ tokenizer,
524
+ mlm=False,
525
+ pad_to_multiple_of=8, # Default: None, Original: 8
526
+ )
527
+ else:
528
+ # Precompute a "length" column for the dataset using the map function
529
+ def add_length(x):
530
+ x["length"] = len(x["input_ids"])
531
+ return x
532
+ dataset_tokenized = dataset_tokenized.map(
533
+ add_length,
534
+ num_proc=num_proc_map,
535
+ )
536
+
537
+ # Setup the model for `model_init` in the Trainer
538
+ model_lambda = lambda: get_encoder_decoder_model(
539
+ pretrained_encoder=pretrained_encoder,
540
+ pretrained_decoder=pretrained_decoder,
541
+ max_length=encoder_max_length,
542
+ tie_encoder_decoder=tie_encoder_decoder,
543
+ )
544
+
545
+ # Setup the data collator, which will efficiently pad the inputs and targets
546
+ data_collator = DataCollatorForSeq2Seq(
547
+ tokenizer,
548
+ model=model_lambda(),
549
+ pad_to_multiple_of=32, # Default: None, Original: 8
550
+ )
551
+
552
+ # Setup the training arguments
553
+ if per_device_train_batch_size is None:
554
+ per_device_train_batch_size = batch_size // gradient_accumulation_steps
555
+ if per_device_eval_batch_size is None:
556
+ per_device_eval_batch_size = batch_size // gradient_accumulation_steps
557
+ if training_args is None:
558
+ training_args = {
559
+ "output_dir": output_dir,
560
+ # Optimizer-related configs
561
+ "learning_rate": learning_rate,
562
+ "optim": "adamw_torch",
563
+ "lr_scheduler_type": "cosine" if lr_scheduler_type is None else lr_scheduler_type,
564
+ "lr_scheduler_kwargs": get_lr_scheduler_kwargs(lr_scheduler_type),
565
+ # "warmup_steps": int(0.08 * 10_000), # NOTE: ChemFormer: 8000
566
+ # "warmup_ratio": warmup_ratio,
567
+ "adam_beta1": 0.9, # NOTE: ChemFormer: 0.9
568
+ "adam_beta2": 0.999, # NOTE: ChemFormer: 0.999
569
+ "adam_epsilon": 1e-8, # Default: 1e-8
570
+ # Batch size, device, and performance optimizations configs
571
+ "batch_eval_metrics": False, # Default: False
572
+ "group_by_length": True,
573
+ "per_device_train_batch_size": per_device_train_batch_size,
574
+ "per_device_eval_batch_size": per_device_eval_batch_size,
575
+ "gradient_accumulation_steps": gradient_accumulation_steps,
576
+ "auto_find_batch_size": True,
577
+ "fp16": True if torch.cuda.is_available() else False,
578
+ "fp16_full_eval" : True, # Enable full BF16 evaluation for efficiency
579
+ "half_precision_backend" : "auto", # Let Hugging Face decide the best backend. Default: "auto"
580
+ "use_cpu": False, # Default: False
581
+ "dataloader_num_workers": 8, # Default: 0 (main process only)
582
+ "dataloader_prefetch_factor": None, # Default: None
583
+ # Evaluation and checkpointing configs
584
+ "max_steps": max_steps,
585
+ "num_train_epochs": num_train_epochs,
586
+ "save_steps": 20_000, # NOTE: 200
587
+ "save_strategy": "steps",
588
+ "eval_steps": 20_000, # NOTE: 500
589
+ "eval_delay": max(int(max(max_steps, num_train_epochs) * 0.7), 0), # Default: None
590
+ "eval_strategy": "steps", # NOTE: "evaluation_strategy" is deprecated.
591
+ "save_total_limit": 2, # This will save both the best and the last trainer checkpoint
592
+ "load_best_model_at_end": True,
593
+ "metric_for_best_model": "all_ligands_equal",
594
+ "include_inputs_for_metrics": True,
595
+ "eval_on_start": False, # Default: False
596
+ # Logging configs
597
+ "log_level": "debug",
598
+ "logging_steps": 5000,
599
+ "disable_tqdm": True,
600
+ "report_to": ["tensorboard"],
601
+ "save_only_model": False, # Default: False
602
+ # Hub information configs
603
+ "push_to_hub": hub_model_id is not None, # NOTE: Also manually done further down
604
+ "push_to_hub_model_id": model_id,
605
+ "push_to_hub_organization": organization,
606
+ "hub_model_id": hub_model_id,
607
+ "hub_token": hub_token,
608
+ "hub_strategy": "checkpoint", # NOTE: Allows to resume training from last checkpoint
609
+ "hub_private_repo": True,
610
+ # Other configs
611
+ "seed": 42,
612
+ "data_seed": 42,
613
+ }
614
+ if 'num_cycles' in training_args["lr_scheduler_kwargs"] and num_cycles is not None:
615
+ training_args["lr_scheduler_kwargs"]["num_cycles"] = num_cycles
616
+ if warmup_ratio is not None:
617
+ training_args["warmup_ratio"] = warmup_ratio
618
+ if warmup_steps is not None:
619
+ training_args["warmup_steps"] = warmup_steps
620
+
621
+ # Add Generation configs
622
+ if causal_language_modeling:
623
+ training_args["metric_for_best_model"] = "eval_loss"
624
+ else:
625
+ generation_config = GenerationConfig(
626
+ max_length=512,
627
+ max_new_tokens=512,
628
+ do_sample=True,
629
+ num_beams=5,
630
+ temperature=1.0,
631
+ )
632
+ training_args["generation_config"] = generation_config
633
+ training_args["predict_with_generate"] = True
635
+ training_args["generation_max_length"] = 512
636
+
637
+ print("Training arguments:")
638
+ for k, v in training_args.items():
639
+ if 'token' in k:
640
+ continue
641
+ print(f" - {k}: {v}")
642
+
643
+ # Modify the training arguments with Optuna hyperparameter search
644
+ if num_optuna_trials > 0:
645
+ # Setup the compute_metrics function for the hyperparameter search
646
+ hp_compute_metrics = partial(
647
+ decode_and_get_metrics,
648
+ tokenizer=tokenizer,
649
+ compute_rdkit_metrics=False,
650
+ compute_graph_metrics=False,
651
+ num_proc=num_proc_map,
652
+ causal_language_modeling=causal_language_modeling,
653
+ )
654
+
655
+ # Run the HP search (and update the training_args accordingly)
656
+ best_run, hp_training_args = get_best_hyperparameters(
657
+ model_init=model_lambda,
658
+ tokenizer=tokenizer,
659
+ data_collator=data_collator,
660
+ compute_metrics=hp_compute_metrics,
661
+ dataset_tokenized=dataset_tokenized,
662
+ training_args=copy.deepcopy(training_args),
663
+ lr_scheduler_type=lr_scheduler_type,
664
+ num_optuna_trials=num_optuna_trials,
665
+ causal_language_modeling=causal_language_modeling,
666
+ all_fragments_as_labels=all_fragments_as_labels,
667
+ linkers_only_as_labels=linkers_only_as_labels,
668
+ )
669
+ best_objective = best_run.objective
670
+ best_trial_number = best_run.run_id
671
+ best_hparams = best_run.hyperparameters
672
+
673
+ # Save the best hyperparameters to the output directory
674
+ with open(f"{output_dir}/best_hyperparameters.md", "w") as f:
675
+ f.write(f"Number of Optuna trials: {num_optuna_trials}\n\n")
676
+ f.write(f"Best trial objective: {best_objective:.4f} (best trial number: {best_trial_number})\n\n")
677
+
678
+ f.write("Best hyperparameters:\n")
679
+ for hparam, value in best_hparams.items():
680
+ f.write(f"- {hparam}: {value}\n")
681
+ f.write("\n")
682
+
683
+ f.write("Training arguments:\n")
684
+ for hparam, value in hp_training_args.items():
685
+ if "token" in hparam:
686
+ continue
687
+ elif isinstance(value, str):
688
+ if 'hf_' in value:
689
+ continue
690
+ f.write(f"- {hparam}: {value}\n")
691
+
692
+ # Open the file and remove any line that might contain the token
693
+ with open(f"{output_dir}/best_hyperparameters.md", "r") as f:
694
+ lines = f.readlines()
695
+ with open(f"{output_dir}/best_hyperparameters.md", "w") as f:
696
+ for line in lines:
697
+ if "hf_" in line:
698
+ continue
699
+ f.write(line)
700
+ print(f"Best hyperparameters saved to '{output_dir}/best_hyperparameters.md'.")
701
+
702
+ if hub_model_id is not None:
703
+ upload_single_file(
704
+ path_or_fileobj=f"{output_dir}/best_hyperparameters.md",
705
+ path_in_repo="best_hyperparameters.md",
706
+ repo_id=hub_model_id,
707
+ token=hub_token,
708
+ )
709
+
710
+ # Save the best_hparams to a JSON file
711
+ with open(f"{output_dir}/best_hyperparameters.json", "w") as f:
712
+ json.dump(best_hparams, f, indent=4)
713
+ print(f"Best hyperparameters saved to '{output_dir}/best_hyperparameters.json'.")
714
+
715
+ if hub_model_id is not None:
716
+ upload_single_file(
717
+ path_or_fileobj=f"{output_dir}/best_hyperparameters.json",
718
+ path_in_repo="best_hyperparameters.json",
719
+ repo_id=hub_model_id,
720
+ token=hub_token,
721
+ )
722
+
723
+ # Update the training arguments with the best hyperparameters
724
+ hp_specific_args = [
725
+ "num_train_epochs",
726
+ "max_steps",
727
+ "eval_steps",
728
+ "eval_delay",
729
+ "logging_steps",
730
+ "save_steps",
731
+ "generation_config",
732
+ ]
733
+ for k, v in hp_training_args.items():
734
+ # Skip the specific arguments set/modified by the HP search
735
+ if k in hp_specific_args:
736
+ continue
737
+ training_args[k] = v
738
+
739
+ # Update the num_cycles according to the original max_steps
740
+ lr_scheduler_kwargs = hp_training_args["lr_scheduler_kwargs"]
741
+
742
+ if "num_cycles" in lr_scheduler_kwargs:
743
+ hp_num_cycles = lr_scheduler_kwargs["num_cycles"]
744
+ hp_max_steps = hp_training_args["max_steps"]
745
+
746
+ # Adjust/scale the max_cycles according to the number of steps
747
+ if hp_max_steps > 0:
748
+ hp_cycle_ratio = hp_num_cycles / hp_max_steps
749
+ num_cycles = int(hp_cycle_ratio * max_steps)
750
+ training_args["lr_scheduler_kwargs"]["num_cycles"] = num_cycles
751
+ print(f"Adjusted number of cycles: {num_cycles}")
752
+
753
+ # Adjust the warmup steps according to the original max_steps
754
+ if "warmup_ratio" in hp_training_args:
755
+ hp_warmup_ratio = hp_training_args["warmup_ratio"]
756
+ hp_max_steps = hp_training_args["max_steps"]
757
+ warmup_steps = int(hp_warmup_ratio * hp_max_steps)
758
+ warmup_ratio = warmup_steps / max_steps
759
+ training_args["warmup_steps"] = warmup_steps
760
+ training_args["warmup_ratio"] = warmup_ratio
761
+
762
+ print("Training arguments updated with the best hyperparameters:")
763
+ for k, v in training_args.items():
764
+ if 'token' in k:
765
+ continue
766
+ print(f" - {k}: {v}")
767
+ print("-" * 80)
768
+ print("Starting training with the best hyperparameters.")
769
+ print("-" * 80)
770
+
771
+ # rouge = evaluate.load("rouge") # , cache_dir="/mimer/NOBACKUP/groups/naiss2023-6-290/stefano/.cache/huggingface/evaluate/")
772
+ # fpgen = Chem.rdFingerprintGenerator.GetMorganGenerator(
773
+ # radius=11,
774
+ # fpSize=1024,
775
+ # )
776
+ rouge = None
777
+ fpgen = None
778
+ compute_metrics = partial(
779
+ decode_and_get_metrics,
780
+ tokenizer=tokenizer,
781
+ rouge=rouge,
782
+ fpgen=fpgen,
783
+ compute_rdkit_metrics=False,
784
+ compute_graph_metrics=True,
785
+ num_proc=max(1, num_proc_map - 2), # NOTE: Use 2 fewer processes for the metrics, since there is timeout logic
786
+ causal_language_modeling=causal_language_modeling,
787
+ )
788
+
789
+ if training_args_bin is not None:
790
+ print(f"Loading training arguments from: {training_args_bin}.")
791
+ # Load training arguments from a binary file and update model-specific arguments
792
+ args = torch.load(training_args_bin)
793
+ args.output_dir = output_dir
794
+ args.overwrite_output_dir = delete_local_repo_if_exists
795
+ args.push_to_hub_model_id = model_id
796
+ args.push_to_hub_organization = organization
797
+ args.hub_model_id = hub_model_id
798
+ args.hub_token = hub_token
799
+ # Print all the training arguments
800
+ print("Training arguments loaded:")
801
+ for k, v in args.__dict__.items():
802
+ if 'token' in k:
803
+ continue
804
+ print(f" - {k}: {v}")
805
+ else:
806
+ if causal_language_modeling:
807
+ args = TrainingArguments(**training_args)
808
+ else:
809
+ args = Seq2SeqTrainingArguments(**training_args)
810
+
811
+ if causal_language_modeling:
812
+ TrainerClass = Trainer
813
+ else:
814
+ TrainerClass = Seq2SeqTrainer
815
+
816
+ # Setup the Trainer and start training (no Optuna hyperparameter search)
817
+ trainer = TrainerClass(
818
+ model_init=model_lambda,
819
+ tokenizer=tokenizer,
820
+ data_collator=data_collator,
821
+ args=args,
822
+ compute_metrics=compute_metrics,
823
+ train_dataset=dataset_tokenized["train"],
824
+ eval_dataset=dataset_tokenized["test"],
825
+ )
826
+ if resume_from_checkpoint is not None:
827
+ trainer.train(
828
+ resume_from_checkpoint=resume_from_checkpoint,
829
+ )
830
+ else:
831
+ trainer.train()
832
+ print("-" * 80)
833
+ print("Training completed.")
834
+ print("-" * 80)
835
+
836
+ if causal_language_modeling:
837
+ tasks = ["Text Generation"]
838
+ else:
839
+ tasks = ["Text2Text Generation", "question-answering"]
840
+
841
+ tokenizer.save_pretrained(output_dir)
842
+
843
+ if hub_model_id is not None:
844
+ print("Pushing model to Hugging Face Hub.")
845
+ print("-" * 80)
846
+ trainer.push_to_hub(
847
+ commit_message="Initial version",
848
+ model_name=hub_model_id,
849
+ license="mit",
850
+ finetuned_from=f"{pretrained_encoder}",
851
+ tasks=tasks,
852
+ tags=["PROTAC", "cheminformatics"],
853
+ dataset=[ds_name],
854
+ dataset_args=[ds_config],
855
+ )
856
+ tokenizer.push_to_hub(
857
+ repo_id=hub_model_id,
858
+ commit_message="Upload tokenizer",
859
+ private=True,
860
+ token=hub_token,
861
+ tags=["PROTAC", "cheminformatics"],
862
+ )
863
+ else:
864
+ print("Pushing model to local directory.")
865
+ print("-" * 80)
866
+ trainer.save_model(output_dir)
867
+ tokenizer.save_pretrained(output_dir)
868
+ print(f"Model saved to '{output_dir}'.")
869
+ print("All done.")
protac_splitter/llms/training_causal_model.py ADDED
@@ -0,0 +1,87 @@
1
+
2
+ import os
3
+ from transformers import TrainingArguments
6
+ from trl import SFTTrainer
7
+ from rdkit import Chem
8
+
9
+ from protac_splitter.llms.data_utils import load_tokenized_dataset
10
+ from protac_splitter.llms.model_utils import get_model
11
+
12
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Use GPU if available
13
+
14
+ # Placeholder for a scoring function that evaluates the generated SMILES
15
+ def score_function(smiles1, predicted_smiles):
16
+ """ Evaluates the generated SMILES sequence based on validity. """
17
+ mol = Chem.MolFromSmiles(predicted_smiles)
18
+ return 1 if mol else 0 # Returns 1 if valid, 0 if invalid
19
+
20
+ # Custom Trainer subclass to integrate SMILES evaluation
21
+ class CustomSFTTrainer(SFTTrainer):
22
+ def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix: str = "eval"):
23
+ if eval_dataset is None:
24
+ eval_dataset = self.eval_dataset
25
+
26
+ # Generate predictions
27
+ predictions = self.predict(eval_dataset)
28
+ generated_texts = self.tokenizer.batch_decode(predictions.predictions, skip_special_tokens=True)
29
+
30
+ total_score = 0
31
+ total_samples = len(generated_texts)
32
+
33
+ for i, example in enumerate(eval_dataset):
34
+ input_text = example["text"] # Full input: "Smiles1 Smiles2.Smiles3.Smiles4"
35
+ smiles1 = input_text.split(" ")[0] # Extract Smiles1 (the prompt)
36
+
37
+ # Remove the prompt from the generated text to get the predicted completion
38
+ predicted_completion = generated_texts[i].removeprefix(smiles1).strip()
39
+
40
+ # Compute custom score
41
+ score = score_function(smiles1, predicted_completion)
42
+ total_score += score
43
+
44
+ # Compute average score
45
+ average_score = total_score / total_samples if total_samples > 0 else 0
46
+
47
+ # Log metrics
48
+ metrics = {f"{metric_key_prefix}_average_score": average_score}
49
+ self.log(metrics)
50
+
51
+ return metrics
52
+
53
+ def train():
54
+ """ Main training function """
55
+ model = get_model() # Load the model
56
+ tokenizer = model.tokenizer # Get tokenizer from model
57
+
58
+ # Load dataset
59
+ dataset = load_tokenized_dataset()
60
+
61
+ # Training arguments
62
+ training_args = {
63
+ "output_dir": "./trained_model",
64
+ "evaluation_strategy": "steps",
65
+ "save_strategy": "steps",
66
+ "logging_steps": 100,
67
+ "save_steps": 500,
68
+ "num_train_epochs": 3,
69
+ "per_device_train_batch_size": 8,
70
+ "per_device_eval_batch_size": 8,
71
+ "learning_rate": 5e-5,
72
+ "save_total_limit": 2,
73
+ }
74
+
75
+ # Initialize custom trainer
76
+ trainer = CustomSFTTrainer(
77
+ model=model,
78
+ args=TrainingArguments(**training_args), # NOTE: newer TRL versions use an SFTConfig here
79
+ train_dataset=dataset["train"],
80
+ eval_dataset=dataset["validation"],
81
+ )
82
+
83
+ # Train model
84
+ trainer.train()
85
+
86
+ if __name__ == "__main__":
87
+ train()
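To make the custom evaluation above concrete, here is a small illustration of the prompt-stripping and scoring steps, using `score_function` from the script and arbitrary example SMILES (not real model output):

```python
prompt = "CCO"                        # hypothetical "Smiles1" query
generated = "CCO CC(=O)O.N.c1ccccc1"  # hypothetical full model output

# Strip the prompt to isolate the predicted completion (str.removeprefix needs Python >= 3.9)
completion = generated.removeprefix(prompt).strip()  # -> "CC(=O)O.N.c1ccccc1"

# 1 if RDKit parses the completion as valid SMILES, 0 otherwise
assert score_function(prompt, completion) == 1
assert score_function(prompt, "not-a-smiles") == 0
```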
protac_splitter/llms/training_mlm_model.py ADDED
@@ -0,0 +1,287 @@
1
+ """ Train a masked language model (MLM) using an encoder-decoder architecture. """
2
+ import os
3
+ from typing import Optional, Dict, Any, Union
4
+ import subprocess
5
+
6
+ import torch
7
+ import huggingface_hub as hf
8
+ from transformers import (
9
+ Trainer,
10
+ TrainingArguments,
11
+ DataCollatorForLanguageModeling,
12
+ AutoTokenizer,
13
+ )
14
+
15
+ from protac_splitter.llms.data_utils import load_tokenized_dataset
16
+ from protac_splitter.llms.hf_utils import (
17
+ create_hf_repository,
18
+ delete_hf_repository,
19
+ repo_exists,
20
+ )
21
+ from protac_splitter.llms.model_utils import get_encoder_decoder_model
22
+
23
+
24
+ def compute_metrics_for_mlm(pred) -> Dict[str, float]:
25
+ """Compute metrics for MLM predictions, i.e., perplexity."""
26
+ logits = pred.predictions[0] if isinstance(pred.predictions, tuple) else pred.predictions
27
+ labels = pred.label_ids
28
+
29
+ # Convert to torch tensors
30
+ logits = torch.tensor(logits)
31
+ labels = torch.tensor(labels)
32
+
33
+ # Compute masked loss
34
+ loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100)
35
+ loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
36
+
37
+ return {
38
+ "perplexity": torch.exp(loss).item(),
39
+ "loss": loss.item()
40
+ }
41
+
42
+
43
+ def train_mlm_model(
44
+ model_id: str,
45
+ ds_name: str,
46
+ ds_config: str = 'default',
47
+ learning_rate: float = 5e-5,
48
+ max_steps: int = -1,
49
+ num_train_epochs: int = 40,
50
+ batch_size: int = 128,
51
+ batch_size_tokenizer: int = 512,
52
+ gradient_accumulation_steps: int = 4,
53
+ hub_token: Optional[str] = None,
54
+ organization: Optional[str] = None,
55
+ output_dir: str = "./models/",
56
+ tokenizer: Union[AutoTokenizer, str] = "seyonec/ChemBERTa-zinc-base-v1",
57
+ pretrained_encoder: str = "seyonec/ChemBERTa-zinc-base-v1",
58
+ pretrained_decoder: str = "seyonec/ChemBERTa-zinc-base-v1",
59
+ encoder_max_length: int = 512,
60
+ decoder_max_length: int = 512,
61
+ tie_encoder_decoder: bool = False,
62
+ delete_repo_if_exists: bool = False,
63
+ delete_local_repo_if_exists: bool = False,
64
+ training_args: Optional[Dict[str, Any]] = None,
65
+ resume_from_checkpoint: Optional[str] = None,
66
+ num_proc_map: int = 1,
67
+ per_device_batch_size: Optional[int] = None,
68
+ lr_scheduler_type: Optional[str] = None,
69
+ mlm_probability: float = 0.15,
70
+ randomize_smiles: bool = False,
71
+ randomize_smiles_prob: float = 0.5,
72
+ randomize_smiles_repeat: int = 1,
73
+ ):
74
+ """
75
+ Trains a masked language model (MLM) using an encoder-decoder architecture.
76
+
77
+ Args:
78
+ model_id (str): The name of the model to be trained.
79
+ ds_name (str): The name of the dataset to use for training.
80
+ ds_config (str): The configuration of the dataset to use. Default: 'default'.
81
+ learning_rate (float): The learning rate for training. Default: 5e-5.
82
+ max_steps (int): The maximum number of training steps. Default: -1.
83
+ num_train_epochs (int): The number of training epochs. Default: 40.
84
+ batch_size (int): The total batch size. Default: 128.
85
+ batch_size_tokenizer (int): The batch size for the tokenizer. Default: 512.
86
+ gradient_accumulation_steps (int): The number of gradient accumulation steps. Default: 4.
87
+ hub_token (str): The Hugging Face token for authentication. Default: None.
88
+ organization (str): The organization to push the model to. Default: None.
89
+ output_dir (str): The output directory for the model. Default: "./models/".
90
+ tokenizer (AutoTokenizer | str): The tokenizer to use for training. Default: "seyonec/ChemBERTa-zinc-base-v1".
91
+ pretrained_encoder (str): The pretrained encoder model to use. Default: "seyonec/ChemBERTa-zinc-base-v1".
92
+ pretrained_decoder (str): The pretrained decoder model to use. Default: "seyonec/ChemBERTa-zinc-base-v1".
93
+ encoder_max_length (int): The maximum length of the encoder input. Default: 512.
94
+ decoder_max_length (int): The maximum length of the decoder input. Default: 512.
95
+ tie_encoder_decoder (bool): Whether to tie the encoder and decoder weights. Default: False.
96
+ delete_repo_if_exists (bool): Whether to delete the repository if it already exists. Default: False.
97
+ delete_local_repo_if_exists (bool): Whether to delete the local repository if it already exists. Default: False.
98
+ training_args (Dict[str, Any]): The training arguments for the Trainer. Default: None.
99
+ resume_from_checkpoint (str): The checkpoint to resume training from. Default: None.
101
+ num_proc_map (int): The number of processes to use for mapping. Default: 1.
102
+ per_device_batch_size (int): The batch size per device. If defined, it will overwrite batch_size. Default: None.
103
+ lr_scheduler_type (str): The learning rate scheduler type. Default: None.
104
+ mlm_probability (float): The probability of masking tokens in the input. Default: 0.15.
105
+ randomize_smiles (bool): Whether to randomize SMILES strings. Default: False.
106
+ randomize_smiles_prob (float): The probability of randomizing SMILES strings. Default: 0.5.
107
+ randomize_smiles_repeat (int): The number of times to repeat randomizing SMILES strings. Default: 1.
108
+ """
109
+ # Check if resume_from_checkpoint exists and it's a file
110
+ if resume_from_checkpoint is not None:
111
+ # Check if the checkpoint exists: it can be either a file or a directory
112
+ if not os.path.exists(resume_from_checkpoint):
113
+ raise ValueError(f"Checkpoint file '{resume_from_checkpoint}' does not exist.")
114
+
115
+ if hub_token is not None:
116
+ hf.login(token=hub_token)
117
+
118
+ # Setup output directory and Hugging Face repository
119
+ output_dir += f"/{model_id}"
120
+ if organization is not None:
121
+ hub_model_id = f"{organization}/{model_id}"
122
+ if delete_repo_if_exists and repo_exists(hub_model_id, token=hub_token):
123
+ delete_hf_repository(repo_id=hub_model_id, token=hub_token)
124
+ if not repo_exists(hub_model_id, token=hub_token):
125
+ print(f"Repository '{hub_model_id}' deleted.")
126
+ else:
127
+ print(f"Repository '{hub_model_id}' could not be deleted.")
128
+ return
129
+ if delete_local_repo_if_exists and os.path.exists(output_dir):
130
+ subprocess.run(["rm", "-rf", output_dir])
131
+ if not os.path.exists(output_dir):
132
+ print(f"Local repository '{output_dir}' deleted.")
133
+ else:
134
+ print(f"Local repository '{output_dir}' could not be deleted.")
135
+ return
136
+ repo_url = create_hf_repository(
137
+ repo_id=hub_model_id,
138
+ repo_type="model",
139
+ exist_ok=True,
140
+ private=True,
141
+ token=hub_token,
142
+ )
143
+ print(f"Repository '{hub_model_id}' created at URL: {repo_url}")
144
+ else:
145
+ hub_model_id = None
146
+ print(f"Hub model ID: {hub_model_id}")
147
+
148
+ if isinstance(tokenizer, str):
149
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer)
150
+ elif tokenizer is None:
151
+ tokenizer = AutoTokenizer.from_pretrained(pretrained_encoder)
152
+
153
+ # Use the end-of-sequence token as the pad token, required for MLM training
154
+ tokenizer.pad_token = tokenizer.eos_token
155
+
156
+ # Load the tokenized dataset
157
+ print("Loading tokenized dataset.")
158
+ dataset_tokenized = load_tokenized_dataset(
159
+ ds_name,
160
+ ds_config,
161
+ tokenizer,
162
+ batch_size_tokenizer,
163
+ encoder_max_length,
164
+ decoder_max_length,
165
+ token=hub_token,
166
+ num_proc_map=num_proc_map,
167
+ randomize_smiles=randomize_smiles,
168
+ randomize_smiles_prob=randomize_smiles_prob,
169
+ randomize_smiles_repeat=randomize_smiles_repeat,
170
+ randomize_text=True,
171
+ randomize_labels=False,
172
+ )
173
+ # Remove "labels" column from the dataset
174
+ dataset_tokenized = dataset_tokenized.remove_columns(["labels"])
175
+ print("Dataset loaded.")
176
+
177
+ # Setup the model for `model_init` in the Trainer
178
+ bert2bert = lambda: get_encoder_decoder_model(
179
+ pretrained_encoder=pretrained_encoder,
180
+ pretrained_decoder=pretrained_decoder,
181
+ max_length=encoder_max_length,
182
+ tie_encoder_decoder=tie_encoder_decoder,
183
+ )
184
+
185
+ # Setup the data collator
186
+ data_collator = DataCollatorForLanguageModeling(
187
+ tokenizer,
188
+ mlm=True,
189
+ mlm_probability=mlm_probability,
190
+ pad_to_multiple_of=8,
191
+ )
192
+
193
+ # Setup the training arguments
194
+ if per_device_batch_size is None:
195
+ per_device_batch_size = batch_size // gradient_accumulation_steps
196
+ if training_args is None:
197
+ training_args = {
198
+ "output_dir": output_dir,
199
+ # Optimizer-related configs
200
+ "learning_rate": learning_rate,
201
+ "optim": "adamw_torch",
202
+ "lr_scheduler_type": "cosine" if lr_scheduler_type is None else lr_scheduler_type,
203
+ "warmup_steps": 8000, # NOTE: ChemFormer: 8000
204
+ # "warmup_ratio": 0,
205
+ "adam_beta1": 0.9, # NOTE: ChemFormer: 0.9
206
+ "adam_beta2": 0.999, # NOTE: ChemFormer: 0.999
207
+ "adam_epsilon": 1e-8, # Default: 1e-8
208
+ # Batch size, device, and performance optimizations configs
209
+ # "torch_compile": True,
210
+ "group_by_length": True,
211
+ "per_device_train_batch_size": per_device_batch_size,
212
+ "per_device_eval_batch_size": per_device_batch_size,
213
+ "gradient_accumulation_steps": gradient_accumulation_steps,
214
+ "auto_find_batch_size": True,
215
+ "fp16": True if torch.cuda.is_available() else False,
216
+ # Evaluation and checkpointing configs
217
+ "max_steps": max_steps,
218
+ "num_train_epochs": num_train_epochs,
219
+ "save_steps": 1000, # NOTE: 200
220
+ "save_strategy": "steps",
221
+ "eval_steps": 1000, # NOTE: 500
222
+ "evaluation_strategy": "steps",
223
+ "save_total_limit": 1,
224
+ "load_best_model_at_end": True,
225
+ "metric_for_best_model": "perplexity",
226
+ "include_inputs_for_metrics": True,
227
+ # Logging configs
228
+ "log_level": "warning",
229
+ "logging_steps": 500,
230
+ "disable_tqdm": True,
231
+ "report_to": ["tensorboard"],
232
+ "save_only_model": False, # Default: False
233
+ # Hub information configs
234
+ "push_to_hub": True, # NOTE: Also manually done further down
235
+ "push_to_hub_model_id": model_id,
236
+ "push_to_hub_organization": organization,
237
+ "hub_model_id": hub_model_id,
238
+ "hub_token": hub_token,
239
+ "hub_strategy": "checkpoint", # NOTE: Allows to resume training from last checkpoint
240
+ "hub_private_repo": True,
241
+ # Other configs
242
+ "seed": 42,
243
+ "data_seed": 42,
244
+ }
245
+
246
+ # Setup the Trainer and start training (no Optuna hyperparameter search)
247
+ trainer = Trainer(
248
+ model_init=bert2bert,
249
+ tokenizer=tokenizer,
250
+ data_collator=data_collator,
251
+ args=TrainingArguments(**training_args),
252
+ compute_metrics=compute_metrics_for_mlm,
253
+ train_dataset=dataset_tokenized["train"],
254
+ eval_dataset=dataset_tokenized["validation"],
255
+ )
256
+ if resume_from_checkpoint is not None:
257
+ trainer.train(
258
+ resume_from_checkpoint=resume_from_checkpoint,
259
+ )
260
+ else:
261
+ trainer.train()
262
+ print("-" * 80)
263
+ print("Training completed.")
264
+ print("-" * 80)
265
+
266
+ if hub_model_id is not None:
267
+ print("Pushing model to Hugging Face Hub.")
268
+ print("-" * 80)
269
+ tokenizer.save_pretrained(output_dir)
270
+ trainer.push_to_hub(
271
+ commit_message="Initial version",
272
+ model_name=hub_model_id,
273
+ license="mit",
274
+ finetuned_from=f"{pretrained_encoder}",
275
+ tasks=["Text2Text Generation", "question-answering"],
276
+ tags=["PROTAC", "cheminformatics"],
277
+ dataset=[ds_name],
278
+ dataset_args=[ds_config],
279
+ )
280
+ tokenizer.push_to_hub(
281
+ repo_id=hub_model_id,
282
+ commit_message="Upload tokenizer",
283
+ private=True,
284
+ token=hub_token,
285
+ tags=["PROTAC", "cheminformatics"],
286
+ )
287
+ print("All done.")
protac_splitter/llms/training_rl_models.py ADDED
@@ -0,0 +1,406 @@
1
+ """ Train a PPO and DPO model for PROTAC-Splitter using Hugging Face
2
+ Transformers and TRL. This is work-in-progress code: it is neither tested nor
3
+ used in the package.
4
+ """
5
+ from typing import Optional, Literal
6
+ from functools import partial
7
+ import os
8
+ import subprocess
9
+
10
+ import torch
11
+ import evaluate
12
+ import huggingface_hub as hf
13
+ from tqdm import tqdm
14
+ from datasets import load_dataset
15
+ from rdkit import Chem
16
+ from transformers import (
17
+ AutoTokenizer,
18
+ TrainingArguments,
19
+ EncoderDecoderModel,
20
+ AutoConfig,
21
+ )
22
+ from trl import (
23
+ AutoModelForSeq2SeqLMWithValueHead,
24
+ PPOConfig,
25
+ PPOTrainer,
26
+ DPOTrainer,
27
+ )
28
+
29
+ from protac_splitter.llms.data_utils import (
30
+ load_trl_dataset,
31
+ data_collator_for_trl,
32
+ )
33
+
34
+ from protac_splitter.llms.hf_utils import (
35
+ create_hf_repository,
36
+ delete_hf_repository,
37
+ repo_exists,
38
+ )
39
+ from protac_splitter.llms.evaluation import decode_and_get_metrics
40
+ from protac_splitter.evaluation import check_substructs, split_prediction
41
+
42
+
43
+ def clean_text(text: str) -> str:
44
+ """ Cleans the text by removing special tokens. """
45
+ return text.replace("<s>", "").replace("</s>", "")
46
+
47
+
48
+ def reward_function(
49
+ query: str,
50
+ response: str,
51
+ ) -> float:
52
+ """ Reward function for the RL-based models.
53
+
54
+ Args:
55
+ query (str): The query SMILES string.
56
+ response (str): The response SMILES string.
57
+
58
+ Returns:
59
+ float: The reward value.
60
+ """
61
+
62
+ substructs = split_prediction(response)
63
+ if substructs is None:
64
+ return -1.0
65
+
66
+ if not check_substructs(
67
+ protac_smiles=query,
68
+ poi_smiles=substructs['poi'],
69
+ linker_smiles=substructs['linker'],
70
+ e3_smiles=substructs['e3'],
71
+ return_bond_types=False,
72
+ poi_attachment_id=1,
73
+ e3_attachment_id=2,
74
+ ):
75
+ return 0.0
76
+
77
+ return 1.0
78
+
79
+
80
+ def train_ppo_model(
81
+ model_id: str = "PROTAC-Splitter-PPO-standard_rand_recombined-ChemBERTa-zinc-base",
82
+ organization: str = 'ailab-bio',
83
+ output_dir: str = "./models/",
84
+ max_steps: int = 2000,
85
+ ppo_epochs: int = 5,
86
+ batch_size: int = 128,
87
+ hub_token: Optional[str] = None,
88
+ pretrained_model_name: str = "ailab-bio/PROTAC-Splitter-standard_rand_recombined-ChemBERTa-zinc-base",
89
+ max_length: int = 512,
90
+ delete_repo_if_exists: bool = False,
91
+ delete_local_repo_if_exists: bool = False,
92
+ ds_name: str = "ailab-bio/PROTAC-Splitter-Dataset",
93
+ ds_config: str = "standard",
94
+ ):
95
+ """ Trains a PPO model on a given dataset.
96
+
97
+ Args:
98
+ model_id (str, optional): The name of the model to be trained. Defaults to "PROTAC-Splitter-PPO-standard_rand_recombined-ChemBERTa-zinc-base".
99
+ organization (str, optional): The organization name. Defaults to 'ailab-bio'.
100
+ output_dir (str, optional): The output directory. Defaults to "./models/".
101
+ max_steps (int, optional): The maximum number of training steps. Defaults to 2000.
102
+ ppo_epochs (int, optional): The number of PPO epochs. Defaults to 5.
103
+ batch_size (int, optional): The batch size. Defaults to 128.
104
+ hub_token (Optional[str], optional): The Hugging Face token. Defaults to None.
105
+ pretrained_model_name (str, optional): The name of the pretrained model. Defaults to "ailab-bio/PROTAC-Splitter-standard_rand_recombined-ChemBERTa-zinc-base".
106
+ max_length (int, optional): The maximum length of the input sequence. Defaults to 512.
107
+ delete_repo_if_exists (bool, optional): Whether to delete the Hub repository if it already exists. Defaults to False.
+ delete_local_repo_if_exists (bool, optional): Whether to delete the local repository if it already exists. Defaults to False.
108
+ """
109
+ if ppo_epochs < 1:
110
+ raise ValueError(f"ppo_epochs must be >= 1, got {ppo_epochs}.")
111
+ if hub_token is not None:
112
+ hf.login(token=hub_token)
113
+
114
+ # Setup output directory and Hugging Face repository
115
+ output_dir += f"/{model_id}"
116
+ if organization is not None:
117
+ hub_model_id = f"{organization}/{model_id}"
118
+ if delete_repo_if_exists and repo_exists(hub_model_id, token=hub_token):
119
+ delete_hf_repository(repo_id=hub_model_id, token=hub_token)
120
+ if not repo_exists(hub_model_id, token=hub_token):
121
+ print(f"Repository '{hub_model_id}' deleted.")
122
+ else:
123
+ print(f"Repository '{hub_model_id}' could not be deleted.")
124
+ return
125
+ if delete_local_repo_if_exists and os.path.exists(output_dir):
126
+ subprocess.run(["rm", "-rf", output_dir])
127
+ if not os.path.exists(output_dir):
128
+ print(f"Local repository '{output_dir}' deleted.")
129
+ else:
130
+ print(f"Local repository '{output_dir}' could not be deleted.")
131
+ return
132
+ repo_url = create_hf_repository(
133
+ repo_id=hub_model_id,
134
+ repo_type="model",
135
+ exist_ok=True,
136
+ private=True,
137
+ token=hub_token,
138
+ )
139
+ print(f"Repository '{hub_model_id}' created at URL: {repo_url}")
140
+ else:
141
+ hub_model_id = None
142
+ print(f"Hub model ID: {hub_model_id}")
143
+
144
+ # Load pretrained model
145
+ model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(
146
+ pretrained_model_name,
147
+ max_length=max_length,
148
+ )
149
+ ref_model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(
150
+ pretrained_model_name,
151
+ max_length=max_length,
152
+ )
153
+ tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
154
+ tokenizer.pad_token = tokenizer.eos_token
155
+
156
+ # Get dataset
157
+ train_dataset = load_trl_dataset(
158
+ tokenizer=tokenizer,
159
+ token=hub_token,
160
+ max_length=max_length,
161
+ dataset_name=ds_name,
162
+ ds_config=ds_config,
163
+ ).shuffle(seed=42).flatten_indices()
164
+
165
+ # Setup PPO trainer
166
+ hub_configs = {
167
+ "repo_id": hub_model_id,
168
+ "commit_message": "Initial version",
169
+ "private": True,
170
+ }
171
+ ppo_config = PPOConfig(
172
+ # Learning parameters
173
+ learning_rate=1e-5,
174
+ steps=max_steps, # Default: 20_000
175
+ ppo_epochs=ppo_epochs, # Default: 4
176
+ batch_size=batch_size, # Default: 256
177
+ gradient_accumulation_steps=1, # Default: 1
178
+ optimize_device_cache=True,
179
+ # PPO parameters
180
+ init_kl_coef=1.0,
181
+ adap_kl_ctrl=True,
182
+ target=0.5,
183
+ horizon=1000,
184
+ cliprange=0.1,
185
+ early_stopping=True,
186
+ target_kl=0.5,
187
+ max_grad_norm=1.0,
188
+ use_score_scaling=True,
189
+ use_score_norm=True,
190
+ whiten_rewards=True,
191
+ # Logging parameters
192
+ # NOTE: Check this guide for more information about the logged metrics:
193
+ # https://huggingface.co/docs/trl/v0.10.1/logging
194
+ model_name=hub_model_id,
195
+ push_to_hub_if_best_kwargs=hub_configs,
196
+ log_with="tensorboard", # ["wandb", LoggerType.TENSORBOARD],
197
+ project_kwargs={"logging_dir": output_dir},
198
+ seed=42,
199
+ )
200
+ ppo_trainer = PPOTrainer(
201
+ model=model,
202
+ ref_model=ref_model,
203
+ num_shared_layers=0,
204
+ config=ppo_config,
205
+ tokenizer=tokenizer,
206
+ dataset=train_dataset,
207
+ data_collator=data_collator_for_trl,
208
+ # lr_scheduler=torch.optim.lr_scheduler.LRScheduler, # NOTE: It must be that, CosineAnnealingLR is not supported
209
+ )
210
+
211
+ # Training Loop
212
+ generation_kwargs = {
213
+ "do_sample": True,
214
+ "num_beams": 5,
215
+ "top_k": 20,
216
+ "max_length": 512,
217
+ "pad_token_id": tokenizer.eos_token_id,
218
+ }
219
+
220
+ for step, batch in tqdm(enumerate(ppo_trainer.dataloader), total=len(ppo_trainer.dataloader)):
221
+ query_tensors = batch["input_ids"]
222
+
223
+ # Get response from SFTModel
224
+ response_tensors = ppo_trainer.generate(query_tensors, **generation_kwargs)
225
+ batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]
226
+
227
+ # Compute reward score
228
+ rewards = [reward_function(clean_text(q), clean_text(r)) for q, r in zip(batch["query"], batch["response"])]
229
+ rewards = [torch.tensor(r) for r in rewards]
230
+
231
+ # Run PPO step
232
+ stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
233
+ ppo_trainer.log_stats(stats, batch, rewards)
234
+
235
+ # Save model and tokenizer
236
+ ppo_trainer.push_to_hub(**hub_configs)
237
+ tokenizer.push_to_hub(**hub_configs)
238
+
239
+
240
+ def train_dpo_model(
241
+ model_name: str = "ailab-bio/PROTAC-Splitter-DPO",
242
+ output_dir: str = "./models/",
243
+ beta: float = 0.1,
244
+ loss_type: Literal["sigmoid", "hinge"] = "sigmoid",
245
+ learning_rate: float = 5e-5,
246
+ max_steps: int = 2000,
247
+ num_train_epochs: int = -1,
248
+ batch_size: int = 128,
249
+ gradient_accumulation_steps: int = 4,
250
+ resume_from_checkpoint: bool = False,
251
+ hub_token: Optional[str] = None,
252
+ pretrained_model_name: str = "ailab-bio/PROTAC-Splitter_untied_80-20-split",
253
+ pretrained_ref_model_name: str = "ailab-bio/PROTAC-Splitter_untied_80-20-split",
254
+ max_length: Optional[int] = None,
255
+ delete_repo_first: bool = False,
256
+ optuna_search: bool = False,
257
+ ):
258
+ """ Trains a DPO model on a given dataset.
259
+
260
+ Args:
261
+ model_name (str, optional): The name of the model to be trained. Defaults to "ailab-bio/PROTAC-Splitter-DPO".
262
+ max_steps (int, optional): The maximum number of training steps. Defaults to 2000.
263
+ """
264
+ if hub_token is not None:
265
+ hf.login(token=hub_token)
266
+ if delete_repo_first and not resume_from_checkpoint:
267
+ delete_hf_repository(repo_id=model_name, token=hub_token)
268
+ tokenizer = AutoTokenizer.from_pretrained(
269
+ pretrained_model_name,
270
+ token=hub_token,
271
+ )
272
+ if tokenizer.pad_token is None:
273
+ tokenizer.pad_token = tokenizer.eos_token
274
+ # Get train and eval datasets
275
+ dataset = load_dataset(
276
+ "ailab-bio/PROTAC-Substructures-DPO",
277
+ token=hub_token,
278
+ )
279
+ # Setup models
280
+ def model_init():
281
+ return EncoderDecoderModel.from_pretrained(
282
+ pretrained_model_name,
283
+ token=hub_token,
284
+ )
285
+ model_ref = EncoderDecoderModel.from_pretrained(
286
+ pretrained_ref_model_name,
287
+ token=hub_token,
288
+ )
289
+ # Setup training arguments
290
+ per_device_batch_size = batch_size // gradient_accumulation_steps
291
+ training_args = TrainingArguments(
292
+ output_dir=output_dir,
293
+ # Optimizer-related configs
294
+ learning_rate=learning_rate,
295
+ optim="adamw_torch",
296
+ lr_scheduler_type="cosine", # Default: "linear"
297
+ # Batch size and device configs
298
+ per_device_train_batch_size=per_device_batch_size,
299
+ per_device_eval_batch_size=per_device_batch_size,
300
+ gradient_accumulation_steps=gradient_accumulation_steps,
301
+ auto_find_batch_size=True,
302
+ # torch_compile=True,
303
+ fp16=True,
304
+ # Evaluation and checkpointing configs
305
+ evaluation_strategy="steps", # TODO: Why is it not working? "steps",
306
+ max_steps=max_steps,
307
+ num_train_epochs=num_train_epochs,
308
+ eval_steps=100,
309
+ save_steps=200,
310
+ # eval_steps=7500,
311
+ # warmup_steps=2000,
312
+ save_strategy="steps",
313
+ save_total_limit=1,
314
+ load_best_model_at_end=True,
315
+ # metric_for_best_model="valid_smiles",
316
+ # Logging configs
317
+ log_level="info",
318
+ logging_steps=50,
319
+ disable_tqdm=True,
320
+ # Hub information configs
321
+ push_to_hub=True, # NOTE: Done manually further down
322
+ hub_token=hub_token,
323
+ hub_model_id=model_name,
324
+ hub_strategy="checkpoint", # NOTE: Allows to resume training from last checkpoint
325
+ hub_private_repo=True,
326
+ # Other configs
327
+ remove_unused_columns=False,
328
+ seed=42,
329
+ data_seed=42,
330
+ )
331
+ # Setup metrics
332
+ # TODO: The metric is not working because the predictions include rewards,
333
+ # or something like that, i.e., real values, which cannot be decoded by the
334
+ # tokenizer. Skipping for now and using the default one.
335
+ rouge = evaluate.load("rouge")
336
+ fpgen = Chem.rdFingerprintGenerator.GetMorganGenerator(
337
+ radius=8,
338
+ fpSize=2048,
339
+ )
340
+ metric = partial(
341
+ decode_and_get_metrics,
342
+ rouge=rouge,
343
+ tokenizer=tokenizer,
344
+ fpgen=fpgen,
345
+ )
346
+ # Setup trainer and start training
347
+ if max_length is None:
348
+ max_length = AutoConfig.from_pretrained(
349
+ pretrained_model_name,
350
+ token=hub_token,
351
+ ).max_length
352
+ # max_length = model.config.max_length
353
+ dpo_trainer = DPOTrainer(
354
+ model=model_init(),
355
+ ref_model=model_ref,
356
+ beta=beta,
357
+ loss_type=loss_type,
358
+ train_dataset=dataset["train"],
359
+ eval_dataset=dataset["test"],
360
+ tokenizer=tokenizer,
361
+ model_init=model_init if optuna_search else None,
362
+ # compute_metrics=metric,
363
+ max_length=max_length,
364
+ max_prompt_length=max_length,
365
+ max_target_length=max_length,
366
+ is_encoder_decoder=True,
367
+ padding_value=tokenizer.pad_token_id,
368
+ truncation_mode="keep_start",
369
+ args=training_args,
370
+ )
371
+ if optuna_search and False:
372
+ # TODO: This is not working because the training arguments do NOT
373
+ # include the beta parameter...
374
+ def optuna_hp_space(trial):
375
+ return {
376
+ "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
377
+ "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32, 64, 128]),
378
+ "beta": trial.suggest_float("beta", 0.1, 0.5),
379
+ }
380
+ best_trials = dpo_trainer.hyperparameter_search(
381
+ direction=["minimize"],
382
+ backend="optuna",
383
+ hp_space=optuna_hp_space,
384
+ n_trials=20,
385
+ # compute_objective=compute_objective,
386
+ )
387
+ print("-" * 80)
388
+ print(f"Best trials:\n{best_trials}")
389
+ print("-" * 80)
390
+ else:
391
+ if resume_from_checkpoint:
392
+ resume_from_checkpoint = "last-checkpoint"
393
+ else:
394
+ resume_from_checkpoint = None
395
+ dpo_trainer.train(
396
+ resume_from_checkpoint=resume_from_checkpoint,
397
+ )
398
+ dpo_trainer.push_to_hub(
399
+ commit_message="Initial version",
400
+ model_name=model_name,
401
+ license="mit",
402
+ finetuned_from=pretrained_model_name,
403
+ tasks=["Text2Text Generation"],
404
+ tags=["PROTAC", "cheminformatics"],
405
+ dataset="ailab-bio/PROTAC-Substructures-DPO",
406
+ )
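The reward above is a three-level signal: -1.0 for a response that cannot be split, 0.0 for a split that fails the substructure check, and 1.0 for a correct split. A sketch of the expected behavior, assuming `split_prediction` returns `None` on unparsable input (as the guard in `reward_function` implies):

```python
protac = "CC(=O)Nc1ccc(O)cc1"  # placeholder query SMILES, not a real PROTAC

print(reward_function(protac, "not-a-valid-split"))  # -> -1.0 (split fails to parse)
```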
protac_splitter/protac_cheminformatics.py ADDED
@@ -0,0 +1,120 @@
1
+ import logging
2
+ import random
3
+ from typing import List, Tuple, Callable, Any, Union, Dict, Optional, Literal
4
+ from functools import lru_cache
5
+
6
+ from rdkit import Chem
7
+ from rdkit.Chem import AllChem
8
+ from rdkit.Chem import rdchem
9
+ from rdkit import RDLogger
10
+ from rdkit.Chem import CanonSmiles
11
+
12
+ from .chemoinformatics import (
13
+ canonize,
14
+ smiles2mol,
15
+ )
16
+
17
+ RDLogger.DisableLog("rdApp.*")
18
+
19
+
20
+ @lru_cache(maxsize=None)
21
+ def get_mol(smiles: str) -> rdchem.Mol:
22
+ return Chem.MolFromSmiles(smiles)
23
+
24
+
25
+ def find_atom_idx_of_map_atoms(
26
+ mol: rdchem.Mol,
27
+ find_poi: bool = True,
28
+ find_e3: bool = True,
29
+ poi_attachment_id: int = 1,
30
+ e3_attachment_id: int = 2,
31
+ ) -> Union[int, Tuple[int, int]]:
32
+ """ Find the indices of the attachment points in the given molecule.
33
+
34
+ Args:
35
+ mol (rdkit.Chem.rdchem.Mol): The molecule.
36
+ find_poi (bool): Whether to find the POI attachment point.
37
+ find_e3 (bool): Whether to find the E3 attachment point.
38
+ poi_attachment_id (int): The label of the attachment point for the POI ligand, i.e., "[*:{poi_attachment_id}]".
39
+ e3_attachment_id (int): The label of the attachment point for the E3 binder, i.e., "[*:{e3_attachment_id}]".
40
+
41
+ Returns:
42
+ int | Tuple[int, int]: The index of the attachment point for the POI ligand if find_poi is True, the index of the attachment point for the E3 binder if find_e3 is True, or a tuple containing POI and E3 indices (in this order) if both find_poi and find_e3 are True.
43
+ """
44
+ if find_poi and find_e3:
45
+ poi_idx = None
46
+ e3_idx = None
47
+ for atom in mol.GetAtoms():
48
+ if atom.GetAtomMapNum() == poi_attachment_id:
49
+ poi_idx = atom.GetIdx()
50
+ elif atom.GetAtomMapNum() == e3_attachment_id:
51
+ e3_idx = atom.GetIdx()
52
+ if poi_idx is not None and e3_idx is not None:
53
+ break
54
+ return poi_idx, e3_idx
55
+ elif find_poi:
56
+ for atom in mol.GetAtoms():
57
+ if atom.GetAtomMapNum() == poi_attachment_id:
58
+ return atom.GetIdx()
59
+ elif find_e3:
60
+ for atom in mol.GetAtoms():
61
+ if atom.GetAtomMapNum() == e3_attachment_id:
62
+ return atom.GetIdx()
63
+
64
+
65
+ def reassemble_protac(
66
+ ligands_smiles: Optional[str] = None,
67
+ poi_smiles: Optional[str] = None,
68
+ linker_smiles: Optional[str] = None,
69
+ e3_smiles: Optional[str] = None,
70
+ e3_bond_type: Literal['single', 'double', 'triple', 'rand_uniform'] = 'single',
71
+ poi_bond_type: Literal['single', 'double', 'triple', 'rand_uniform'] = 'single',
72
+ poi_attachment_id: int = 1,
73
+ e3_attachment_id: int = 2,
74
+ rand_generator = None,
75
+ ) -> Tuple[str, Chem.rdchem.Mol]:
76
+ """ Reassemble a PROTAC molecule from its substructures. The SMILES must contain attachment points.
77
+
78
+ In case the bond type cannot be formed an error will be raised.
79
+
80
+ Example of usage:
81
+
82
+ ```python
83
+ e3_smiles = '[*:2]NC(C(=O)N1CC(O)CC1C(=O)NCc1ccc(-c2scnc2C)cc1)C(C)(C)C'
84
+ linker_smiles = '[*:2]C(=O)CCCCCCCCCC[*:1]'
85
+ poi_smiles = '[*:1]CN1CCN(c2ccc(Nc3ncc4c(C)cc(=O)n(-c5cccc(NC(=O)C=C)c5)c4n3)c(OC)c2)CC1'
86
+
87
+ merged_smiles, _ = reassemble_protac(poi_smiles=poi_smiles, linker_smiles=linker_smiles, e3_smiles=e3_smiles)
88
+ print(merged_smiles)
89
+ ```
90
+
91
+ Args:
92
+ poi_smiles (str): The SMILES notation for the POI ligand.
93
+ linker_smiles (str): The SMILES notation for the linker.
94
+ e3_smiles (str): The SMILES notation for the E3 binder.
95
+ e3_bond_type (str): The type of bond to be added between the E3 binder and the linker. Can be 'single', 'double', 'triple', or 'rand_uniform'.
96
+ poi_bond_type (str): The type of bond to be added between the POI ligand and the linker. Can be 'single', 'double', 'triple', or 'rand_uniform'.
97
+ poi_attachment_id (int): The label of the attachment point for the POI ligand, i.e., "[*:{poi_attachment_id}]".
98
+ e3_attachment_id (int): The label of the attachment point for the E3 binder, i.e., "[*:{e3_attachment_id}]".
99
+ rand_generator: A random number generator for 'rand_uniform' bond types. Defaults to None, i.e., standard library random.
100
+
101
+ Returns:
102
+ Tuple[str, Chem.rdchem.Mol]: The SMILES notation and RDKit molecule object for the reassembled PROTAC molecule.
103
+ """
104
+ if ligands_smiles is None:
105
+ if None in [poi_smiles, linker_smiles, e3_smiles]:
106
+ raise ValueError("Missing substructures SMILES: either provide ligands_smiles or all of poi_smiles, linker_smiles, and e3_smiles")
107
+ ligands_smiles = f'{e3_smiles}.{linker_smiles}.{poi_smiles}'
111
+
112
+ ligands_mol = canonize(smiles2mol(ligands_smiles))
113
+ if ligands_mol is None:
114
+ return None, None
115
+
116
+ try:
117
+ protac_mol = Chem.molzip(ligands_mol)
118
+ except ValueError as e:
119
+ logging.error(f"Failed to reassemble PROTAC: {e}")
120
+ return None, None
+
+ return Chem.MolToSmiles(protac_mol), protac_mol
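For illustration, a small usage sketch of `find_atom_idx_of_map_atoms` on the linker fragment from the `reassemble_protac` docstring above:

```python
from rdkit import Chem

linker = Chem.MolFromSmiles("[*:2]C(=O)CCCCCCCCCC[*:1]")
poi_idx, e3_idx = find_atom_idx_of_map_atoms(linker, find_poi=True, find_e3=True)
# poi_idx is the index of the [*:1] dummy atom, e3_idx that of [*:2]
print(poi_idx, e3_idx)
```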
protac_splitter/protac_splitter.py ADDED
@@ -0,0 +1,370 @@
1
+ import os
2
+ import requests
3
+ from typing import Union, Optional, Dict, List
4
+ from pathlib import Path
5
+ import logging
6
+
7
+ from datasets import Dataset
8
+ import pandas as pd
9
+
10
+ from protac_splitter.chemoinformatics import canonize
11
+ from protac_splitter.fixing_functions import fix_prediction
12
+ from protac_splitter.llms.model_utils import get_pipeline, run_pipeline
13
+ from protac_splitter.graphs.e3_clustering import get_representative_e3s_fp
14
+ from protac_splitter.graphs.edge_classifier import GraphEdgeClassifier
15
+ from protac_splitter.graphs.splitting_algorithms import split_protac_graph_based
16
+
17
+
18
+ def load_graph_edge_classifier_from_cache(
19
+ cache_dir: Union[str, Path] = "~/.cache/protac_splitter",
20
+ model_filename: str = "PROTAC-Splitter-XGBoost.joblib",
21
+ download_url: str = "https://docs.google.com/uc?export=download&id=1bb9i5_L_-re3QYPc7tSiCtVNEEbNIzAC",
22
+ ) -> GraphEdgeClassifier:
23
+ """
24
+ Loads the GraphEdgeClassifier model from a local cache directory.
25
+ If the model file is not found, downloads it from the specified URL.
26
+
27
+ Args:
28
+ cache_dir (str or Path): Directory to cache the model file.
29
+ model_filename (str): Name of the model file.
30
+ download_url (str): URL to download the model if not present.
31
+
32
+ Returns:
33
+ GraphEdgeClassifier: Loaded classifier.
34
+ """
35
+ cache_dir = Path(os.path.expanduser(cache_dir))
36
+ cache_dir.mkdir(parents=True, exist_ok=True)
37
+ model_path = cache_dir / model_filename
38
+
39
+ if not model_path.exists():
40
+ response = requests.get(download_url, stream=True)
41
+ response.raise_for_status()
42
+ expected_size = int(response.headers.get("Content-Length", -1))
43
+
44
+ with open(model_path, "wb") as f:
45
+ for chunk in response.iter_content(chunk_size=1024*1024):
46
+ if chunk:
47
+ f.write(chunk)
48
+
49
+ if expected_size != -1:
50
+ actual = model_path.stat().st_size
51
+ if actual != expected_size:
52
+ raise RuntimeError(f"Download incomplete: got {actual}, expected {expected_size}")
53
+
54
+ # Optional checksum:
55
+ # NOTE: Uncomment the following for debugging
56
+ import hashlib
57
+ h = hashlib.sha256(model_path.read_bytes()).hexdigest()
58
+ h_orig = "513621f4dc2ff7ec819a222bc7311afb8b6e6e89d6d694dd2906e695a50086dd"
59
+ if h != h_orig:
60
+ raise RuntimeError(
61
+ f"Downloaded model checksum mismatch: got {h}, expected {h_orig}. "
62
+ "Please delete the model file and try again."
63
+ )
64
+
65
+ return GraphEdgeClassifier.load(model_path)
66
+
67
+
68
+ def split_protac(
69
+ protac_smiles: Union[str, List, pd.DataFrame],
70
+ use_transformer: bool = False,
71
+ use_xgboost: bool = True,
72
+ fix_predictions: bool = True,
73
+ protac_smiles_col: str = "text",
74
+ batch_size: int = 1,
75
+ beam_size: int = 5,
76
+ device: Optional[Union[int, str]] = None,
77
+ num_proc: int = 1,
78
+ verbose: int = 0,
79
+ ) -> Union[Dict[str, str], List[Dict[str, str]]]:
80
+ """ Split a PROTAC SMILES into the two ligands and the linker.
81
+
82
+ If `use_transformer` and `use_xgboost` are both True, the Transformer model
83
+ will run first, and XGBoost will be used as a fallback for predictions that
84
+ fail re-assembly and fixing. If both `use_transformer` and `use_xgboost`
85
+ are False, a fully heuristic-based algorithm will be used for splitting.
86
+
87
+ Args:
88
+ protac_smiles (str, list, or pd.DataFrame): The PROTAC SMILES to split.
89
+ If a DataFrame is provided, it must contain a column named `protac_smiles_col`.
90
+ use_transformer (bool): Whether to use the transformer model for splitting.
91
+ use_xgboost (bool): Whether to use the XGBoost model for splitting.
92
+ fix_predictions (bool): Whether to fix the predictions using deterministic cheminformatics rules. Only used if `use_transformer` is True.
93
+ protac_smiles_col (str): The name of the column containing the PROTAC SMILES in the DataFrame.
94
+ batch_size (int): Batch size for processing. Only used if `use_transformer` is True.
95
+ beam_size (int): Number of beam search predictions to generate. Only used if `use_transformer` is True. Higher values may yield better results but increase computation time.
96
+ device (int or str, optional): Device to run the Transformer model on. Defaults to None, which attempts to run on GPU if available, otherwise CPU.
97
+ num_proc (int): Number of processes to use for parallel processing. Useful for large datasets of PROTACs to split.
98
+ verbose (int): Verbosity level.
99
+
100
+ Returns:
101
+ Union[Dict[str, str], List[Dict[str, str]]]: Depending on the input type, returns:
102
+ - If a single string is provided, returns a dictionary with format: `{protac_smiles_col: protac_smiles, "default_pred_n0": e3l.linker.warhead, "model_name": Transformer|XGBoost|Heuristic}`.
103
+ - If a list of strings is provided, returns a list of dictionaries with the same format as above.
104
+ - If a DataFrame is provided, returns a DataFrame with columns: `protac_smiles_col`, `default_pred_n0`, and `model_name`. The `default_pred_n0` column contains the predicted split strings in the format `e3.linker.warhead`.
105
+ """
106
+ if use_xgboost:
107
+ representative_e3s_fp = get_representative_e3s_fp()
108
+ xgboost_model = load_graph_edge_classifier_from_cache()
109
+
110
+ # Generate a Dataset from the input PROTAC SMILES
111
+ if isinstance(protac_smiles, str):
112
+ protac_smiles_canon = canonize(protac_smiles)
113
+ if protac_smiles_canon is None:
114
+ raise ValueError(f"Invalid PROTAC SMILES: {protac_smiles}")
115
+ ds = Dataset.from_dict({protac_smiles_col: [protac_smiles_canon]})
116
+ elif isinstance(protac_smiles, list):
117
+ # Canonize and check if all PROTAC SMILES are valid
118
+ protac_smiles_canon = [canonize(protac) for protac in protac_smiles]
119
+ if None in protac_smiles_canon:
120
+ wrong_protacs = [protac for protac, canon in zip(protac_smiles, protac_smiles_canon) if canon is None]
121
+ raise ValueError(f"Invalid PROTAC SMILES in list: {wrong_protacs}")
122
+ ds = Dataset.from_dict({protac_smiles_col: protac_smiles_canon})
123
+ elif isinstance(protac_smiles, pd.DataFrame):
124
+ # Check if the DataFrame contains a columns named `protac_smiles_col`
125
+ if protac_smiles_col not in protac_smiles.columns:
126
+ raise ValueError(f"DataFrame must contain a column named \"{protac_smiles_col}\".")
127
+ # Canonize and check if all PROTAC SMILES are valid
128
+ protac_smiles_canon = protac_smiles[protac_smiles_col].apply(canonize)
129
+ if protac_smiles_canon.isnull().any():
130
+ wrong_protacs = protac_smiles[protac_smiles_canon.isnull()]
131
+ raise ValueError(f"Invalid PROTAC SMILES in DataFrame: {wrong_protacs}")
132
+ ds = Dataset.from_pandas(protac_smiles_canon.to_frame(name=protac_smiles_col))
133
+
134
+ if use_transformer:
135
+ pipe = get_pipeline(
136
+ model_name="ailab-bio/PROTAC-Splitter-EncoderDecoder-lr_reduce-rand-smiles",
137
+ token=os.environ.get("HF_TOKEN", None),
138
+ is_causal_language_model=False,
139
+ num_return_sequences=beam_size,
140
+ device=device,
141
+ )
142
+
143
+ # preds will be a list of dictionaries, each containing the
144
+ # beam-size predictions for each input PROTAC SMILES. Format: [{'pred_n0': 'prediction_0', 'pred_n1': 'prediction_1', ...}, ...]
145
+ preds = run_pipeline(
146
+ pipe,
147
+ ds,
148
+ batch_size,
149
+ is_causal_language_model=False,
150
+ smiles_column=protac_smiles_col,
151
+ )
152
+
153
+ # Turn the predictions into a DataFrame and then into a Dataset
154
+ preds_df = pd.DataFrame(preds)
155
+ preds_df[protac_smiles_col] = ds[protac_smiles_col]
156
+ preds_ds = Dataset.from_pandas(preds_df)
157
+
158
+ def mapping_func(row: Dict[str, str]) -> Dict[str, str]:
159
+ """Fix the predictions for each row."""
160
+ protac = row[protac_smiles_col]
161
+ if fix_predictions:
162
+ preds = {k: fix_prediction(protac, v, verbose=verbose) for k, v in row.items() if k.startswith("pred_")}
163
+ else:
164
+ preds = {k: v for k, v in row.items() if k.startswith("pred_")}
165
+
166
+ # If all preds are None, we attempt to use the XGBoost model
167
+ if all(v is None for v in preds.values()):
168
+ if use_xgboost:
169
+ pred = split_protac_graph_based(
170
+ protac_smiles=protac,
171
+ use_classifier=True,
172
+ classifier=xgboost_model,
173
+ representative_e3s_fp=representative_e3s_fp,
174
+ )
+ # Guard against a failed graph-based split, mirroring the other branches
+ if all(v is None for v in pred.values()):
+ split = None
+ else:
+ split = f"{pred['e3']}.{pred['linker']}.{pred['poi']}"
+ return {
+ protac_smiles_col: protac,
+ "default_pred_n0": split,
178
+ "model_name": "XGBoost",
179
+ }
180
+ else:
181
+ # If no predictions are valid, we return None for the default prediction
182
+ return {
183
+ protac_smiles_col: protac,
184
+ "default_pred_n0": None,
185
+ "model_name": "Transformer",
186
+ }
187
+ else:
188
+ # Select the non-None prediction with the lowest beam index
189
+ # NOTE: The HF predictions comes in lists, with the first
190
+ # element being the one with the highest likelihood.
191
+ for i in range(beam_size):
192
+ key = f"pred_n{i}"
193
+ if preds[key] is not None:
194
+ return {
195
+ protac_smiles_col: protac,
196
+ "default_pred_n0": preds[key],
197
+ "model_name": "Transformer",
198
+ }
199
+
200
+ # Map the function over the Dataset to fix the predictions and/or
201
+ # replace them with the XGBoost fallback predictions if they fail.
202
+ if fix_predictions or use_xgboost:
203
+ preds_ds = preds_ds.map(
204
+ mapping_func,
205
+ num_proc=1 if use_xgboost else num_proc, # Using XGBoost in a map function might not be thread-safe
206
+ desc=f"{'Fixing predictions' if fix_predictions else ''}{' and ' if fix_predictions and use_xgboost else ''}{'Replacing predictions with XGBoost fallback' if use_xgboost else ''}",
207
+ )
208
+
209
+ elif use_xgboost:
210
+ # Use the XGBoost model only
211
+ def mapping_func(row: Dict[str, str]) -> Dict[str, str]:
212
+ """Split the PROTAC SMILES using the XGBoost model."""
213
+ protac = row[protac_smiles_col]
214
+ pred = split_protac_graph_based(
215
+ protac_smiles=protac,
216
+ use_classifier=True,
217
+ classifier=xgboost_model,
218
+ representative_e3s_fp=representative_e3s_fp,
219
+ )
220
+ if all(v is None for v in pred.values()):
221
+ split = None
222
+ else:
223
+ split = f"{pred['e3']}.{pred['linker']}.{pred['poi']}"
224
+ return {
225
+ protac_smiles_col: protac,
226
+ "default_pred_n0": split,
227
+ "model_name": "XGBoost",
228
+ }
229
+ preds_ds = ds.map(
230
+ mapping_func,
231
+ num_proc=1,
232
+ desc="Splitting PROTAC SMILES using XGBoost model",
233
+ )
234
+ else:
235
+ # If neither transformer nor XGBoost is used, we use the heuristic-based
236
+ # algorithm, that does not require any model.
237
+ def mapping_func(row: Dict[str, str]) -> Dict[str, str]:
238
+ """Split the PROTAC SMILES using the heuristic-based algorithm."""
239
+ protac = row[protac_smiles_col]
240
+ pred = split_protac_graph_based(
241
+ protac_smiles=protac,
242
+ use_classifier=False,
243
+ )
244
+ if all(v is None for v in pred.values()):
245
+ split = None
246
+ else:
247
+ split = f"{pred['e3']}.{pred['linker']}.{pred['poi']}"
248
+ return {
249
+ protac_smiles_col: protac,
250
+ "default_pred_n0": split,
251
+ "model_name": "Heuristic",
252
+ }
253
+ preds_ds = ds.map(
254
+ mapping_func,
255
+ num_proc=num_proc,
256
+ desc="Splitting PROTAC SMILES using heuristic-based algorithm",
257
+ )
258
+
259
+ if isinstance(protac_smiles, str):
260
+ # If the input was a single string, we return the first prediction
261
+ return preds_ds[0]
262
+ elif isinstance(protac_smiles, pd.DataFrame):
263
+ # If the input was a DataFrame, we return a dataframe with the predictions
264
+ return preds_ds.to_pandas()
265
+ elif isinstance(protac_smiles, list):
266
+ # Convert the Dataset to a list of dictionaries
267
+ return [row for row in preds_ds]
268
+
269
+ # if tokenizer is None:
270
+ # if verbose:
271
+ # print(f"Loading tokenizer...")
272
+ # tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
273
+
274
+ # if pipe is None:
275
+ # if verbose:
276
+ # print("Loading pipeline for \"default\" predictions...")
277
+ # pipe = pipeline(
278
+ # "text2text-generation",
279
+ # model=model_name,
280
+ # tokenizer=tokenizer,
281
+ # device="cuda" if torch.cuda.is_available() else "cpu",
282
+ # token=hf_token,
283
+ # num_return_sequences=beam_size,
284
+ # )
285
+
286
+ # if isinstance(protac_smiles, str):
287
+ # protac_smiles_canon = canonize(protac_smiles)
288
+ # if protac_smiles_canon is None:
289
+ # raise ValueError(f"Invalid PROTAC SMILES: {protac_smiles}")
290
+ # pred = pipe(protac_smiles_canon)
291
+ # pred = {f"default_pred_n{i}": pred[i]["generated_text"] for i in range(len(pred))}
292
+ # if fix_predictions:
293
+ # p_fixed = {k: fix_prediction(protac_smiles_canon, v, verbose=verbose) for k, v in pred.items()}
294
+ # # For each prediction, if the fixed prediction is not None, we
295
+ # # replace the original prediction with the fixed one.
296
+ # for k, v in p_fixed.items():
297
+ # if v is not None:
298
+ # pred[k] = v
299
+ # preds = [pred]
300
+
301
+ # if isinstance(protac_smiles, list):
302
+ # # Canonize and check if all PROTAC SMILES are valid
303
+ # protac_smiles_canon = [canonize(protac) for protac in protac_smiles]
304
+ # if None in protac_smiles_canon:
305
+ # wrong_protacs = [protac for protac, canon in zip(protac_smiles, protac_smiles_canon) if canon is None]
306
+ # raise ValueError(f"Invalid PROTAC SMILES in list: {wrong_protacs}")
307
+
308
+ # # Get the predictions for all PROTAC SMILES
309
+ # preds = pipe(protac_smiles_canon, batch_size=batch_size)
310
+ # preds = [{f"default_pred_n{i}": p["generated_text"] for i, p in enumerate(pred)} for pred in preds]
311
+
312
+ # if fix_predictions:
313
+ # for i, (protac, pred) in enumerate(zip(protac_smiles_canon, preds)):
314
+ # p_fixed = {k: fix_prediction(protac, v, verbose=verbose) for k, v in pred.items()}
315
+ # # For each prediction, if the fixed prediction is not None, we
316
+ # # replace the original prediction with the fixed one.
317
+ # for k, v in p_fixed.items():
318
+ # if v is not None:
319
+ # preds[i][k] = v
320
+
321
+ # if isinstance(protac_smiles, pd.DataFrame):
322
+ # # Check if the DataFrame contains a columns named `protac_smiles_col`
323
+ # if protac_smiles_col not in protac_smiles.columns:
324
+ # raise ValueError(f"DataFrame must contain a column named \"{protac_smiles_col}\".")
325
+
326
+ # # Canonize and check if all PROTAC SMILES are valid
327
+ # protac_smiles_canon = protac_smiles.apply(lambda x: canonize(x[protac_smiles_col]), axis=1)
328
+
329
+ # # Check if there are invalid PROTAC SMILES
330
+ # if protac_smiles_canon.isnull().any():
331
+ # wrong_protacs = protac_smiles[protac_smiles_canon.isnull()]
332
+ # raise ValueError(f"Invalid PROTAC SMILES in DataFrame: {wrong_protacs}")
333
+
334
+ # # Convert the Series to a DataFrame
335
+ # protac_smiles_canon = pd.DataFrame(protac_smiles_canon, columns=[protac_smiles_col])
336
+
337
+ # # Convert the DataFrame to a Dataset
338
+ # dataset = Dataset.from_pandas(protac_smiles_canon)
339
+ # preds = []
340
+ # for pred in tqdm(pipe(KeyDataset(dataset, protac_smiles_col), batch_size=batch_size), total=len(dataset) // batch_size, desc="Generating predictions"):
341
+ # p = {f"default_pred_n{i}": pred[i]["generated_text"] for i in range(len(pred))}
342
+ # preds.append(p)
343
+
344
+ # if fix_predictions:
345
+ # for i, (protac, pred) in tqdm(enumerate(zip(protac_smiles_canon, preds)), desc="Fixing predictions", total=len(preds)):
346
+ # p_fixed = {k: fix_prediction(protac, v, verbose=verbose) for k, v in pred.items()}
347
+ # # For each prediction, if the fixed prediction is not None, we
348
+ # # replace the original prediction with the fixed one.
349
+ # for k, v in p_fixed.items():
350
+ # if v is not None:
351
+ # pred[k] = v
352
+
353
+ # if return_check_reassembly:
354
+ # if isinstance(protac_smiles_canon, str):
355
+ # protac_smiles_list = [protac_smiles_canon]
356
+ # elif isinstance(protac_smiles_canon, list):
357
+ # protac_smiles_list = protac_smiles_canon
358
+ # elif isinstance(protac_smiles_canon, pd.DataFrame):
359
+ # protac_smiles_list = protac_smiles_canon[protac_smiles_col].tolist()
360
+
361
+ # print("Checking re-assembly...")
362
+ # for protac, pred in zip(protac_smiles_list, preds):
363
+ # for i in range(beam_size):
364
+ # pred[f"reassembly_correct_n{i}"] = check_reassembly(protac, pred[f"default_pred_n{i}"])
365
+
366
+ # # Just take the first prediction if the input was a string
367
+ # if isinstance(protac_smiles, str):
368
+ # preds = preds[0]
369
+
370
+ # return preds
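
For context before the app code below, here is a minimal usage sketch of the `split_protac` cascade implemented above. It is illustrative only: it assumes the package and its model dependencies are installed locally, and it reuses the example PROTAC SMILES from the app's input placeholder.

```python
from protac_splitter import split_protac

# Example PROTAC taken from the app's input placeholder below.
protac = "CC(C)(C)S(=O)(=O)c1cc2c(Nc3ccc4scnc4c3)ccnc2cc1OCCOCCOCCOCCOCC(=O)Nc1cccc2c1CN(C1CCC(=O)NC1=O)C2=O"

# XGBoost-only split (the app's default): fast graph-based edge classification.
result = split_protac(
    protac,
    use_transformer=False,
    use_xgboost=True,
    fix_predictions=True,
)

# For a single input string, the result is a dict holding the input SMILES,
# the dot-separated fragment SMILES, and the model that produced the split.
print(result["model_name"])
print(result["default_pred_n0"])
```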
protac_splitter_app.py ADDED
@@ -0,0 +1,351 @@
+ """
+ PROTAC Splitter Web Application
+
+ This script provides a web interface for splitting PROTAC molecules into their
+ constituent parts: E3 ligase binder, linker, and protein-of-interest (POI)
+ ligand (warhead).
+
+ The app uses the protac_splitter library to perform the splitting and offers
+ two main modes of operation:
+ 1. Single SMILES processing
+ 2. Batch processing via CSV file upload
+
+ Users can select which models to use:
+ - XGBoost model (default): Fast graph-based edge classification model
+ - Transformer model: More accurate but slower deep learning model
+ - If neither is selected, a rule-based splitting algorithm is used
+
+ Author: Stefano Ribes
+ Date: 2025-06
+ """
+
+ import logging
+ import tempfile
+ from pathlib import Path
+ from typing import Union
+
+ from PIL import Image
+ import gradio as gr
+ import pandas as pd
+ from rdkit import Chem
+ from rdkit.Chem import Draw
+
+ from protac_splitter import split_protac
+ from protac_splitter.display_utils import get_mapped_protac_img
+
+ def save_svg_to_tempfile(svg_string: str, suffix: str = ".svg") -> Union[str, Path]:
+     """
+     Write an SVG string to a temporary file and return its filesystem path.
+     """
+     # Create a named temporary file that persists after closing
+     tmp_file = tempfile.NamedTemporaryFile(mode="w", suffix=suffix, delete=False, encoding="utf-8")
+     logging.debug(f"Saving SVG to temporary file: {tmp_file.name}")
+     try:
+         tmp_file.write(svg_string)
+         tmp_file.flush()
+         return Path(tmp_file.name)
+     finally:
+         tmp_file.close()
+
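+ # Illustrative usage of the helper above (hypothetical SVG string); the app
+ # only calls it when `use_svg` is enabled in `process_single_smiles` below:
+ #   svg_path = save_svg_to_tempfile("<svg xmlns='http://www.w3.org/2000/svg'></svg>")
+ #   logging.debug(f"SVG written to {svg_path}")
+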
+ def process_single_smiles(protac_smiles: str, use_transformer: bool = False, use_xgboost: bool = True, beam_size: int = 5) -> tuple:
+     """
+     Process a single SMILES string and generate PROTAC fragment predictions.
+
+     Args:
+         protac_smiles: The SMILES string of the PROTAC molecule
+         use_transformer: Whether to use the transformer model for prediction
+         use_xgboost: Whether to use the XGBoost model for prediction
+         beam_size: Beam search width used by the Transformer model
+
+     Returns:
+         Tuple containing the input image, the fragment images, and the fragment SMILES text
+     """
+     if not protac_smiles:
+         raise gr.Error("Please provide a valid PROTAC SMILES string.", duration=5)
+
+     try:
+         results = split_protac(
+             protac_smiles,
+             use_transformer=use_transformer,
+             use_xgboost=use_xgboost,
+             fix_predictions=True,  # Always apply fixes to predictions
+             beam_size=beam_size,  # Beam search width for the Transformer model
+             verbose=1
+         )
+     except Exception as e:
+         exception_message = str(e)
+         if exception_message.startswith("Invalid PROTAC SMILES"):
+             raise gr.Error("The input SMILES string is not valid (couldn't be parsed by RDKit).", duration=5)
+         else:
+             raise gr.Error(f"An error occurred while processing the input SMILES: {exception_message}", duration=10)
+
+     valid_molecules = []
+     pred_key = 'default_pred_n0'
+     valid_molecules.append(results[pred_key])
+
+     # Generate images and corresponding SMILES text
+     images = []
+     smiles_texts = []
+     input_mol = Chem.MolFromSmiles(protac_smiles)
+
+     if input_mol is not None:
+         input_img = Draw.MolToImage(input_mol, legend="", size=(1000, 200))
+     else:
+         input_img = Image.new('RGB', (1000, 1000))
+
+     splits = {}
+     for smiles in results[pred_key].split("."):
+         mol = Chem.MolFromSmiles(smiles)
+         if mol:
+             legend = "Fragment"  # Fallback label if no attachment points are found
+             if "[*:1]" in smiles and "[*:2]" in smiles:
+                 legend = "Linker"
+                 splits['linker'] = smiles
+             elif "[*:1]" in smiles:
+                 legend = "Warhead"
+                 splits['poi'] = smiles
+             elif "[*:2]" in smiles:
+                 legend = "E3 Ligase Ligand"
+                 splits['e3'] = smiles
+
+             img = Draw.MolToImage(mol, legend="", size=(1000, 1000))
+             images.append(img)
+             smiles_texts.append(f"{legend}: {smiles}")
+     smiles_texts = "\n".join(smiles_texts)
+
+     use_svg = False
+     input_img = get_mapped_protac_img(
+         protac_smiles=protac_smiles,
+         poi_smiles=splits.get('poi', ''),
+         linker_smiles=splits.get('linker', ''),
+         e3_smiles=splits.get('e3', ''),
+         w=1000,
+         h=500,
+         legend=None,
+         useSVG=use_svg,
+     )
+
+     if use_svg:
+         input_img = save_svg_to_tempfile(input_img)
+         logging.debug(f"Returning processed image path: {input_img}")
+
+     return input_img, list(images), smiles_texts
+
+ def process_csv(
+     file: gr.File,
+     smiles_col: str,
+     use_transformer: bool = False,
+     use_xgboost: bool = True,
+     beam_size: int = 5,
+     batch_size: int = 4,
+     num_proc: int = 2,
+     # NOTE: `pr` is a progress tracker. It is not used directly in this
+     # function, but Gradio needs it to track progress. Do not remove it.
+     pr: gr.Progress = gr.Progress(track_tqdm=True),
+ ) -> Path:
+     """
+     Process a CSV file containing PROTAC SMILES.
+
+     Args:
+         file: Uploaded CSV file
+         smiles_col: Name of the column containing SMILES strings
+         use_transformer: Whether to use the transformer model for prediction
+         use_xgboost: Whether to use the XGBoost model for prediction
+         beam_size: Beam search width used by the Transformer model
+         batch_size: Batch size used when generating predictions
+         num_proc: Number of processes used for parallel processing
+
+     Returns:
+         Path to the output CSV file with predictions
+     """
+     df = pd.read_csv(file.name)
+     if smiles_col not in df.columns:
+         # Use Gradio's error message instead of raising a plain exception
+         raise gr.Error(f"Column \"{smiles_col}\" is not in the provided CSV file.", duration=5)
+
+     try:
+         results = split_protac(
+             df,
+             use_transformer=use_transformer,
+             use_xgboost=use_xgboost,
+             protac_smiles_col=smiles_col,
+             fix_predictions=True,
+             batch_size=batch_size,
+             num_proc=num_proc,
+             beam_size=beam_size,  # Beam search width for the Transformer model
+             verbose=1
+         )
+     except Exception as e:
+         exception_message = str(e)
+         if exception_message.startswith("Invalid PROTAC SMILES"):
+             raise gr.Error("One or more of the input SMILES are not valid (couldn't be parsed by RDKit).", duration=5)
+         else:
+             raise gr.Error(f"An error occurred while processing: {exception_message}", duration=10)
+
+     output_df = pd.DataFrame(results)
+
+     # Create a temporary output file
+     output_file = Path(tempfile.gettempdir()) / "split_preds.csv"
+     logging.debug(f"Saving predictions to temporary file: {output_file}")
+     output_df.to_csv(output_file, index=False)
+     logging.debug(f"Output DataFrame saved to: {output_file}")
+
+     return output_file
+
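+ # Illustrative, non-Gradio use of the same batch path (hypothetical CSV and
+ # column name; `split_protac` also accepts a DataFrame plus `protac_smiles_col`):
+ #   df = pd.read_csv("protacs.csv")
+ #   results = split_protac(df, protac_smiles_col="PROTAC SMILES", use_xgboost=True)
+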
+ def create_interface():
+     """
+     Create and return the Gradio interface for the PROTAC Splitter app.
+
+     The interface includes two tabs:
+     1. Single SMILES Input - For processing individual PROTAC SMILES
+     2. CSV Upload - For batch processing of multiple PROTAC SMILES
+
+     Returns:
+         gr.Blocks: The Gradio interface
+     """
+     with gr.Blocks() as demo:
+         header = """# PROTAC-Splitter Web Application
+
+ Upload a CSV file or enter a single SMILES string to predict PROTAC substructures.
+
+ The connections of the warhead and the E3 ligase ligand to the linker are marked with dummy atoms, _i.e._, attachment points, as follows:
+
+ - Warhead: `[*:1]`
+ - E3 Ligase ligand: `[*:2]`
+
+ """
+         gr.Markdown(header)
+
+         # Model selection section - common to both tabs
+         model_selection = """## Model Selection
+
+ You can choose which model to use for splitting PROTAC molecules:
+
+ - **XGBoost model** (default): Fast graph-based edge classification model
+ - **Transformer model**: More accurate but slower deep learning model
+ - If both are selected, the Transformer model is used first; if it fails, the XGBoost model is used as a fallback.
+ - If no model is selected, splitting is done with graph-based heuristics, with no AI model involved.
+
+ For fast splitting, we recommend using the XGBoost model only, which is efficient in most cases. The Transformer model may be more accurate, but it is slower, especially when processing large CSV files.
+ """
+         gr.Markdown(model_selection)
+         with gr.Row():
+             with gr.Column(scale=2):
+                 with gr.Row():
+                     use_xgboost = gr.Checkbox(label="Use XGBoost model", value=True)
+                     use_transformer = gr.Checkbox(label="Use Transformer model", value=False)
+
+                 # Performance configuration section
+                 performance_configs = """### Performance Configurations
+
+ Change the following parameters to optimize performance based on your machine's capabilities. This is particularly useful when processing large CSV files or when using the Transformer model.
+ For single SMILES processing, the default values should work well in most cases.
+ """
+                 gr.Markdown(performance_configs)
+             with gr.Column(scale=1):
+                 # Add a num_proc input
+                 with gr.Row():
+                     num_proc = gr.Number(
+                         label="Number of Processes",
+                         value=2,
+                         minimum=1,
+                         maximum=8,
+                         step=1,
+                         info="Number of processes to use for parallel processing. Higher values may improve performance but require more memory."
+                     )
+
+                 # Add a number input for beam_size if the Transformer model is selected
+                 with gr.Row():
+                     # Only show the beam size input if the Transformer model is selected
+                     beam_size = gr.Number(
+                         label="Beam Search Width",
+                         value=5,
+                         minimum=1,
+                         maximum=10,
+                         step=1,
+                         info="Width of the beam search for the Transformer model. Higher values may improve accuracy but increase processing time.",
+                         visible=use_transformer.value  # Hidden unless the Transformer model is selected
+                     )
+                 # Dynamically show/hide beam_size based on the Transformer model selection
+                 use_transformer.change(
+                     lambda x: gr.update(visible=x),
+                     inputs=[use_transformer],
+                     outputs=[beam_size]
+                 )
+
+                 # Add a batch size input for the Transformer model if selected
+                 with gr.Row():
+                     batch_size = gr.Number(
+                         label="Batch Size",
+                         value=4,
+                         minimum=1,
+                         maximum=64,
+                         step=1,
+                         info="Batch size for processing. Higher values may improve performance, especially on GPU machines, but require more memory.",
+                         visible=use_transformer.value  # Hidden unless the Transformer model is selected
+                     )
+                 use_transformer.change(
+                     lambda x: gr.update(visible=x),
+                     inputs=[use_transformer],
+                     outputs=[batch_size]
+                 )
+
+         # Single SMILES Input tab
+         gr.Markdown("## Specify Inputs")
+         with gr.Tab("Single SMILES Input"):
+             # Input area
+             smiles_input = gr.Textbox(
+                 label="Enter SMILES String",
+                 placeholder="E.g., CC(C)(C)S(=O)(=O)c1cc2c(Nc3ccc4scnc4c3)ccnc2cc1OCCOCCOCCOCCOCC(=O)Nc1cccc2c1CN(C1CCC(=O)NC1=O)C2=O",
+                 # value="CC(C)(C)S(=O)(=O)c1cc2c(Nc3ccc4scnc4c3)ccnc2cc1OCCOCCOCCOCCOCC(=O)Nc1cccc2c1CN(C1CCC(=O)NC1=O)C2=O",
+             )
+
+             submit_smiles = gr.Button("Process SMILES")
+
+             # Output area
+             smiles_input_image = gr.Image(label="Input PROTAC", type="filepath")  # Use type=None to allow SVG input
+             smiles_output_images = gr.Gallery(label="Valid Splits", columns=3)
+             smiles_output_texts = gr.Textbox(label="SMILES of the Splits", interactive=False, lines=3)
+
+             # Connect the button click event to the processing function
+             submit_smiles.click(
+                 process_single_smiles,
+                 inputs=[smiles_input, use_transformer, use_xgboost, beam_size],
+                 outputs=[smiles_input_image, smiles_output_images, smiles_output_texts]
+             )
+
+         # CSV file processing tab
+         with gr.Tab("Upload CSV"):
+             # File upload area
+             file_input = gr.File(label="Upload CSV File")
+             smiles_column = gr.Textbox(
+                 label="Column Name for PROTAC SMILES",
+                 placeholder="E.g., \"PROTAC SMILES\"",
+                 # value="PROTAC SMILES",
+             )
+             submit_csv = gr.Button("Process CSV")
+
+             # Output file download area
+             download_output = gr.File(label="Download Predictions")
+
+             # Connect the button click event to the processing function
+             submit_csv.click(
+                 process_csv,
+                 inputs=[file_input, smiles_column, use_transformer, use_xgboost, beam_size, batch_size, num_proc],
+                 outputs=[download_output]
+             )
+
+             # NOTE: Use a plain string here. An f-string would interpolate the
+             # `smiles_column` component object itself, not the column name.
+             csv_notes = """**Note:** The output CSV will contain the following columns:
+
+ - The original PROTAC SMILES column, as named in the uploaded file
+ - `default_pred_n0`: The predicted SMILES strings for the splits
+ - `model_name`: The model used for the prediction
+ """
+             gr.Markdown(csv_notes)
+
+     return demo
+
+ # Create the Gradio interface
+ # NOTE: `demo` must be a global variable for Gradio's hot-reload system to work.
+ # NOTE: Launch the app with `gradio scripts/protac_splitter_app.py` to develop it.
+ demo = create_interface()
+
+ if __name__ == "__main__":
+     # Set logging level to DEBUG for detailed output
+     logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
+     demo.launch()
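
As a companion to `process_single_smiles` above, the sketch below shows how a dot-separated split string can be classified by its attachment points (`[*:1]` for the warhead, `[*:2]` for the E3 ligase ligand, both for the linker). The example split string is hypothetical and for illustration only.

```python
from typing import Dict

from rdkit import Chem


def classify_fragments(split_smiles: str) -> Dict[str, str]:
    """Label each fragment of a dot-separated split by its attachment points."""
    labels = {}
    for smiles in split_smiles.split("."):
        if Chem.MolFromSmiles(smiles) is None:
            continue  # Skip fragments that RDKit cannot parse
        if "[*:1]" in smiles and "[*:2]" in smiles:
            labels["linker"] = smiles  # Bridges both attachment points
        elif "[*:1]" in smiles:
            labels["poi"] = smiles  # Warhead (POI ligand)
        elif "[*:2]" in smiles:
            labels["e3"] = smiles  # E3 ligase ligand
    return labels


# Hypothetical split string, for illustration only:
print(classify_fragments("[*:1]Nc1ccccc1.[*:1]CCOCC[*:2].[*:2]C1CCNC1"))
```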
requirements.txt ADDED
@@ -0,0 +1,138 @@
+ accelerate==1.3.0
+ aiofiles==24.1.0
+ aiohappyeyeballs==2.6.1
+ aiohttp==3.12.13
+ aiosignal==1.3.2
+ alembic==1.16.2
+ annotated-types==0.7.0
+ anyio==4.9.0
+ asttokens==3.0.0
+ attrs==25.3.0
+ cairocffi==1.7.1
+ CairoSVG==2.8.2
+ certifi==2025.6.15
+ cffi==1.17.1
+ charset-normalizer==3.4.2
+ click==8.2.1
+ colorlog==6.9.0
+ contourpy==1.3.2
+ cssselect2==0.8.0
+ cycler==0.12.1
+ datasets==3.0.0
+ decorator==5.2.1
+ defusedxml==0.7.1
+ dill==0.3.8
+ docstring_parser==0.16
+ evaluate==0.4.3
+ executing==2.2.0
+ fastapi==0.115.14
+ ffmpy==0.6.0
+ filelock==3.18.0
+ fonttools==4.58.4
+ frozenlist==1.7.0
+ fsspec==2024.6.1
+ gradio==5.35.0
+ gradio_client==1.10.4
+ groovy==0.1.2
+ h11==0.16.0
+ hf-xet==1.1.5
+ httpcore==1.0.9
+ httpx==0.28.1
+ huggingface-hub==0.33.1
+ idna==3.10
+ imbalanced-learn==0.13.0
+ imblearn==0.0
+ iniconfig==2.1.0
+ ipython==9.4.0
+ ipython_pygments_lexers==1.1.1
+ jedi==0.19.2
+ Jinja2==3.1.6
+ joblib==1.5.1
+ jsonargparse==4.40.0
+ kiwisolver==1.4.8
+ lightning-utilities==0.14.3
+ llvmlite==0.44.0
+ Mako==1.3.10
+ markdown-it-py==3.0.0
+ MarkupSafe==3.0.2
+ matplotlib==3.10.3
+ matplotlib-inline==0.1.7
+ mdurl==0.1.2
+ mpmath==1.3.0
+ multidict==6.6.3
+ multiprocess==0.70.16
+ networkx==3.1
+ numba==0.61.0
+ numpy==1.26.4
+ optuna==4.2.0
+ ordered-set==4.1.0
+ orjson==3.10.18
+ packaging==25.0
+ pandas==2.2.2
+ parso==0.8.4
+ pexpect==4.9.0
+ pillow==11.3.0
+ pluggy==1.6.0
+ prompt_toolkit==3.0.51
+ propcache==0.3.2
+ psutil==7.0.0
+ ptyprocess==0.7.0
+ pure_eval==0.2.3
+ pyarrow==20.0.0
+ pycparser==2.22
+ pydantic==2.11.7
+ pydantic_core==2.33.2
+ pydub==0.25.1
+ Pygments==2.19.2
+ PyLaTeX==1.4.2
+ pyparsing==3.2.3
+ pytest==8.4.1
+ python-dateutil==2.9.0.post0
+ python-multipart==0.0.20
+ pytz==2025.2
+ PyYAML==6.0.2
+ rdkit==2024.9.4
+ regex==2024.11.6
+ requests==2.32.4
+ rich==14.0.0
+ ruff==0.12.1
+ safehttpx==0.1.6
+ safetensors==0.5.3
+ scikit-learn==1.6.1
+ scipy==1.14.1
+ seaborn==0.13.2
+ semantic-version==2.10.0
+ setuptools==80.9.0
+ shellingham==1.5.4
+ shtab==1.7.2
+ six==1.17.0
+ sklearn-compat==0.1.3
+ sniffio==1.3.1
+ SQLAlchemy==2.0.41
+ stack-data==0.6.3
+ starlette==0.46.2
+ sympy==1.13.1
+ threadpoolctl==3.6.0
+ tinycss2==1.4.0
+ tokenizers==0.19.1
+ tomlkit==0.13.3
+ torch==2.6.0
+ torchmetrics==1.7.3
+ tqdm==4.67.1
+ traitlets==5.14.3
+ transformers==4.44.2
+ trl==0.10.1
+ typeguard==4.4.4
+ typer==0.16.0
+ typing-inspection==0.4.1
+ typing_extensions==4.14.0
+ tyro==0.9.25
+ tzdata==2025.2
+ urllib3==2.5.0
+ uvicorn==0.35.0
+ wcwidth==0.2.13
+ webencodings==0.5.1
+ websockets==15.0.1
+ xgboost==3.0.1
+ xxhash==3.5.0
+ yarl==1.20.1