Spaces:
Running
Running
| from rdkit import Chem | |
| import pandas as pd | |
| from protac_splitter.chemoinformatics import ( | |
| canonize_smiles, | |
| remove_stereo, | |
| get_mol_id, | |
| ) | |
def update_dictionary(
    dictionary: pd.DataFrame,
    substr_to_add: list,
    morgan_fp_generator = None,
    verbose: int = 0,
) -> pd.DataFrame:
    """ Updates a dictionary with a list of additional substructures.

    The dictionary is a dataframe with columns 'SMILES', 'Molecule', 'ID', and 'FP'.
    Each new substructure is canonized, skipped if its SMILES is already in the
    dictionary or if it cannot be parsed into a molecule, and otherwise appended
    together with its stereochemistry-free variant (when that variant differs).

    Args:
        dictionary: The input dictionary dataframe. It is not modified in place.
        substr_to_add: The list of additional substructures (SMILES strings).
            `None` items are ignored.
        morgan_fp_generator: Object exposing `GetFingerprint(mol)` used to fill
            the 'FP' column. If None, a Morgan generator (radius=2, fpSize=2048,
            useBondTypes=True, includeChirality=True) is created.
        verbose: If non-zero, prints parsing errors and the number of added
            substructures; if greater than 1, also prints a warning for every
            SMILES that is already in the dictionary.

    Returns:
        The updated dictionary dataframe, de-duplicated on 'SMILES' (existing
        rows win over new ones) and with a fresh 0..n-1 index.
    """
    # Canonize the SMILES strings and drop repeats. dict.fromkeys keeps the
    # first-seen order, so the order of the appended rows is deterministic.
    candidates = [canonize_smiles(smiles) for smiles in substr_to_add if smiles is not None]
    candidates = [smiles for smiles in dict.fromkeys(candidates) if smiles is not None]

    # Build the lookup of known SMILES once. The `.empty` guard avoids touching
    # the 'SMILES' column of a dataframe that may not have any columns yet.
    known_smiles = set() if dictionary.empty else set(dictionary['SMILES'])

    new_entries = []
    for smiles in candidates:
        # Skip entries already in the dictionary. This is done with `continue`
        # rather than by removing items from the list being iterated, which
        # would make the loop skip the element following each removal.
        if smiles in known_smiles:
            if verbose > 1:
                print(f'\tWARNING. SMILES already in the dictionary: {smiles}')
            continue

        try:
            mol = Chem.MolFromSmiles(smiles)
        except Exception as e:
            if verbose:
                print(e)
            mol = None
        # Skip entries that result in invalid molecules
        if mol is None:
            continue

        new_entries.append({
            'SMILES': smiles,
            'Molecule': mol,
            'ID': get_mol_id(smiles),
        })

        # Try adding its no-stereochemistry version as well
        smiles_nostereo = remove_stereo(smiles)
        if smiles_nostereo is not None and smiles_nostereo != smiles:
            mol_nostereo = Chem.MolFromSmiles(smiles_nostereo)
            if mol_nostereo is not None:
                # NOTE(review): the stored SMILES is canonized but the ID is
                # computed from the non-canonized string, as in the original
                # code - confirm this is intended.
                new_entries.append({
                    'SMILES': canonize_smiles(smiles_nostereo),
                    'Molecule': mol_nostereo,
                    'ID': get_mol_id(smiles_nostereo),
                })

    if not new_entries:
        # Nothing to add: return a normalized copy of the input. A dictionary
        # without a 'SMILES' column (e.g. a brand-new empty dataframe) cannot
        # be de-duplicated on it, so it is returned as is.
        if 'SMILES' not in dictionary.columns:
            return dictionary.reset_index(drop=True)
        return dictionary.drop_duplicates(subset='SMILES').reset_index(drop=True)

    # Different stereoisomers can share the same stereo-free SMILES, so
    # de-duplicate on the SMILES column before computing fingerprints.
    new_entries = pd.DataFrame(new_entries).drop_duplicates(subset='SMILES')

    # Add fingerprints to the new entries
    if morgan_fp_generator is None:
        # Import the submodule explicitly instead of relying on it being
        # reachable as an attribute of `Chem` through some earlier import.
        from rdkit.Chem import rdFingerprintGenerator
        morgan_fp_generator = rdFingerprintGenerator.GetMorganGenerator(
            radius=2,
            fpSize=2048,
            useBondTypes=True,
            includeChirality=True,
        )
    new_entries['FP'] = new_entries['Molecule'].apply(
        lambda mol: morgan_fp_generator.GetFingerprint(mol)
    )
    if verbose:
        print(f'Number of substructures added to the dictionary: {len(new_entries)}')

    # Return the updated dictionary; on SMILES clashes the existing rows win
    # because they come first in the concatenation.
    return pd.concat([dictionary, new_entries], axis=0).drop_duplicates(subset='SMILES').reset_index(drop=True)