| | from pathlib import Path |
| | from time import time |
| | import random |
| | from collections import defaultdict |
| | import argparse |
| | import warnings |
| |
|
| | from tqdm import tqdm |
| | import numpy as np |
| | import torch |
| | from Bio.PDB import PDBParser |
| | from Bio.PDB.Polypeptide import three_to_one, is_aa |
| | from Bio.PDB import PDBIO, Select |
| | from openbabel import openbabel |
| | from rdkit import Chem |
| | from rdkit.Chem import QED |
| | from scipy.ndimage import gaussian_filter |
| |
|
| | from geometry_utils import get_bb_transform |
| | from analysis.molecule_builder import build_molecule |
| | from analysis.metrics import rdmol_to_smiles |
| | import constants |
| | from constants import covalent_radii, dataset_params |
| | import utils |
| |
|
| | dataset_info = dataset_params['bindingmoad'] |
| | amino_acid_dict = dataset_info['aa_encoder'] |
| | atom_dict = dataset_info['atom_encoder'] |
| | atom_decoder = dataset_info['atom_decoder'] |
| |
|
| |
|
| | class Model0(Select): |
| | def accept_model(self, model): |
| | return model.id == 0 |
| |
|
| |
|
| | def read_label_file(csv_path): |
| | """ |
| | Read BindingMOAD's label file |
| | Args: |
| | csv_path: path to 'every.csv' |
| | Returns: |
| | Nested dictionary with all ligands. First level: EC number, |
| | Second level: PDB ID, Third level: list of ligands. Each ligand is |
| | represented as a tuple (ligand name, validity, SMILES string) |
| | """ |
| | ligand_dict = {} |
| |
|
| | with open(csv_path, 'r') as f: |
| | for line in f.readlines(): |
| | row = line.split(',') |
| |
|
| | |
| | if len(row[0]) > 0: |
| | curr_class = row[0] |
| | ligand_dict[curr_class] = {} |
| | continue |
| |
|
| | |
| | if len(row[2]) > 0: |
| | curr_prot = row[2] |
| | ligand_dict[curr_class][curr_prot] = [] |
| | continue |
| |
|
| | |
| | if len(row[3]) > 0: |
| | ligand_dict[curr_class][curr_prot].append( |
| | |
| | [row[3], row[4], row[9]] |
| | ) |
| |
|
| | return ligand_dict |
| |
|
| |
|
| | def compute_druglikeness(ligand_dict): |
| | """ |
| | Computes RDKit's QED value and adds it to the dictionary |
| | Args: |
| | ligand_dict: nested ligand dictionary |
| | Returns: |
| | the same ligand dictionary with additional QED values |
| | """ |
| | print("Computing QED values...") |
| | for p, m in tqdm([(p, m) for c in ligand_dict for p in ligand_dict[c] |
| | for m in ligand_dict[c][p]]): |
| | mol = Chem.MolFromSmiles(m[2]) |
| | if mol is None: |
| | mol_id = f'{p}_{m}' |
| | warnings.warn(f"Could not construct molecule {mol_id} from SMILES " |
| | f"string '{m[2]}'") |
| | continue |
| | m.append(QED.qed(mol)) |
| | return ligand_dict |
| |
|
| |
|
| | def filter_and_flatten(ligand_dict, qed_thresh, max_occurences, seed): |
| |
|
| | filtered_examples = [] |
| | all_examples = [(c, p, m) for c in ligand_dict for p in ligand_dict[c] |
| | for m in ligand_dict[c][p]] |
| |
|
| | |
| | |
| | random.seed(seed) |
| | random.shuffle(all_examples) |
| |
|
| | ligand_name_counter = defaultdict(int) |
| | print("Filtering examples...") |
| | for c, p, m in tqdm(all_examples): |
| |
|
| | ligand_name, ligand_chain, ligand_resi = m[0].split(':') |
| | if m[1] == 'valid' and len(m) > 3 and m[3] > qed_thresh: |
| | if ligand_name_counter[ligand_name] < max_occurences: |
| | filtered_examples.append( |
| | (c, p, m) |
| | ) |
| | ligand_name_counter[ligand_name] += 1 |
| |
|
| | return filtered_examples |
| |
|
| |
|
| | def split_by_ec_number(data_list, n_val, n_test, ec_level=1): |
| | """ |
| | Split dataset into training, validation and test sets based on EC numbers |
| | https://en.wikipedia.org/wiki/Enzyme_Commission_number |
| | Args: |
| | data_list: list of ligands |
| | n_val: number of validation examples |
| | n_test: number of test examples |
| | ec_level: level in the EC numbering hierarchy at which the split is |
| | made, i.e. items with matching EC numbers at this level are put in |
| | the same set |
| | Returns: |
| | dictionary with keys 'train', 'val', and 'test' |
| | """ |
| |
|
| | examples_per_class = defaultdict(int) |
| | for c, p, m in data_list: |
| | c_sub = '.'.join(c.split('.')[:ec_level]) |
| | examples_per_class[c_sub] += 1 |
| |
|
| | assert sum(examples_per_class.values()) == len(data_list) |
| |
|
| | |
| | val_classes = set() |
| | for c, num in sorted(examples_per_class.items(), key=lambda x: x[1], |
| | reverse=True): |
| | if sum([examples_per_class[x] for x in val_classes]) + num <= n_val: |
| | val_classes.add(c) |
| |
|
| | test_classes = set() |
| | for c, num in sorted(examples_per_class.items(), key=lambda x: x[1], |
| | reverse=True): |
| | |
| | if c in val_classes: |
| | continue |
| | if sum([examples_per_class[x] for x in test_classes]) + num <= n_test: |
| | test_classes.add(c) |
| |
|
| | |
| | train_classes = {x for x in examples_per_class if |
| | x not in val_classes and x not in test_classes} |
| |
|
| | |
| | data_split = {} |
| | data_split['train'] = [x for x in data_list if '.'.join( |
| | x[0].split('.')[:ec_level]) in train_classes] |
| | data_split['val'] = [x for x in data_list if '.'.join( |
| | x[0].split('.')[:ec_level]) in val_classes] |
| | data_split['test'] = [x for x in data_list if '.'.join( |
| | x[0].split('.')[:ec_level]) in test_classes] |
| |
|
| | assert len(data_split['train']) + len(data_split['val']) + \ |
| | len(data_split['test']) == len(data_list) |
| |
|
| | return data_split |
| |
|
| |
|
| | def ligand_list_to_dict(ligand_list): |
| | out_dict = defaultdict(list) |
| | for _, p, m in ligand_list: |
| | out_dict[p].append(m) |
| | return out_dict |
| |
|
| |
|
| | def process_ligand_and_pocket(pdb_struct, ligand_name, ligand_chain, |
| | ligand_resi, dist_cutoff, ca_only, |
| | compute_quaternion=False): |
| | try: |
| | residues = {obj.id[1]: obj for obj in |
| | pdb_struct[0][ligand_chain].get_residues()} |
| | except KeyError as e: |
| | raise KeyError(f'Chain {e} not found ({pdbfile}, ' |
| | f'{ligand_name}:{ligand_chain}:{ligand_resi})') |
| | ligand = residues[ligand_resi] |
| | assert ligand.get_resname() == ligand_name, \ |
| | f"{ligand.get_resname()} != {ligand_name}" |
| |
|
| | |
| | |
| | lig_atoms = [a for a in ligand.get_atoms() |
| | if (a.element.capitalize() in atom_dict or a.element != 'H')] |
| | lig_coords = np.array([a.get_coord() for a in lig_atoms]) |
| |
|
| | try: |
| | lig_one_hot = np.stack([ |
| | np.eye(1, len(atom_dict), atom_dict[a.element.capitalize()]).squeeze() |
| | for a in lig_atoms |
| | ]) |
| | except KeyError as e: |
| | raise KeyError( |
| | f'Ligand atom {e} not in atom dict ({pdbfile}, ' |
| | f'{ligand_name}:{ligand_chain}:{ligand_resi})') |
| |
|
| | |
| | pocket_residues = [] |
| | for residue in pdb_struct[0].get_residues(): |
| | res_coords = np.array([a.get_coord() for a in residue.get_atoms()]) |
| | if is_aa(residue.get_resname(), standard=True) and \ |
| | (((res_coords[:, None, :] - lig_coords[None, :, :]) ** 2).sum(-1) ** 0.5).min() < dist_cutoff: |
| | pocket_residues.append(residue) |
| |
|
| | |
| | n_xyz = np.array([res['N'].get_coord() for res in pocket_residues]) |
| | ca_xyz = np.array([res['CA'].get_coord() for res in pocket_residues]) |
| | c_xyz = np.array([res['C'].get_coord() for res in pocket_residues]) |
| |
|
| | if compute_quaternion: |
| | quaternion, c_alpha = get_bb_transform(n_xyz, ca_xyz, c_xyz) |
| | if np.any(np.isnan(quaternion)): |
| | raise ValueError( |
| | f'Invalid value in quaternion ({pdbfile}, ' |
| | f'{ligand_name}:{ligand_chain}:{ligand_resi})') |
| | else: |
| | c_alpha = ca_xyz |
| |
|
| | if ca_only: |
| | pocket_coords = c_alpha |
| | try: |
| | pocket_one_hot = np.stack([ |
| | np.eye(1, len(amino_acid_dict), |
| | amino_acid_dict[three_to_one(res.get_resname())]).squeeze() |
| | for res in pocket_residues]) |
| | except KeyError as e: |
| | raise KeyError( |
| | f'{e} not in amino acid dict ({pdbfile}, ' |
| | f'{ligand_name}:{ligand_chain}:{ligand_resi})') |
| | else: |
| | pocket_atoms = [a for res in pocket_residues for a in res.get_atoms() |
| | if (a.element.capitalize() in atom_dict or a.element != 'H')] |
| | pocket_coords = np.array([a.get_coord() for a in pocket_atoms]) |
| | try: |
| | pocket_one_hot = np.stack([ |
| | np.eye(1, len(atom_dict), atom_dict[a.element.capitalize()]).squeeze() |
| | for a in pocket_atoms |
| | ]) |
| | except KeyError as e: |
| | raise KeyError( |
| | f'Pocket atom {e} not in atom dict ({pdbfile}, ' |
| | f'{ligand_name}:{ligand_chain}:{ligand_resi})') |
| |
|
| | pocket_ids = [f'{res.parent.id}:{res.id[1]}' for res in pocket_residues] |
| |
|
| | ligand_data = { |
| | 'lig_coords': lig_coords, |
| | 'lig_one_hot': lig_one_hot, |
| | } |
| | pocket_data = { |
| | 'pocket_coords': pocket_coords, |
| | 'pocket_one_hot': pocket_one_hot, |
| | 'pocket_ids': pocket_ids, |
| | } |
| | if compute_quaternion: |
| | pocket_data['pocket_quaternion'] = quaternion |
| | return ligand_data, pocket_data |
| |
|
| |
|
| | def compute_smiles(positions, one_hot, mask): |
| | print("Computing SMILES ...") |
| |
|
| | atom_types = np.argmax(one_hot, axis=-1) |
| |
|
| | sections = np.where(np.diff(mask))[0] + 1 |
| | positions = [torch.from_numpy(x) for x in np.split(positions, sections)] |
| | atom_types = [torch.from_numpy(x) for x in np.split(atom_types, sections)] |
| |
|
| | mols_smiles = [] |
| |
|
| | pbar = tqdm(enumerate(zip(positions, atom_types)), |
| | total=len(np.unique(mask))) |
| | for i, (pos, atom_type) in pbar: |
| | mol = build_molecule(pos, atom_type, dataset_info) |
| |
|
| | |
| | try: |
| | Chem.SanitizeMol(mol) |
| | except ValueError: |
| | continue |
| |
|
| | mol = rdmol_to_smiles(mol) |
| | if mol is not None: |
| | mols_smiles.append(mol) |
| | pbar.set_description(f'{len(mols_smiles)}/{i + 1} successful') |
| |
|
| | return mols_smiles |
| |
|
| |
|
| | def get_n_nodes(lig_mask, pocket_mask, smooth_sigma=None): |
| | |
| | idx_lig, n_nodes_lig = np.unique(lig_mask, return_counts=True) |
| | idx_pocket, n_nodes_pocket = np.unique(pocket_mask, return_counts=True) |
| | assert np.all(idx_lig == idx_pocket) |
| |
|
| | joint_histogram = np.zeros((np.max(n_nodes_lig) + 1, |
| | np.max(n_nodes_pocket) + 1)) |
| |
|
| | for nlig, npocket in zip(n_nodes_lig, n_nodes_pocket): |
| | joint_histogram[nlig, npocket] += 1 |
| |
|
| | print(f'Original histogram: {np.count_nonzero(joint_histogram)}/' |
| | f'{joint_histogram.shape[0] * joint_histogram.shape[1]} bins filled') |
| |
|
| | |
| | if smooth_sigma is not None: |
| | filtered_histogram = gaussian_filter( |
| | joint_histogram, sigma=smooth_sigma, order=0, mode='constant', |
| | cval=0.0, truncate=4.0) |
| |
|
| | print(f'Smoothed histogram: {np.count_nonzero(filtered_histogram)}/' |
| | f'{filtered_histogram.shape[0] * filtered_histogram.shape[1]} bins filled') |
| |
|
| | joint_histogram = filtered_histogram |
| |
|
| | return joint_histogram |
| |
|
| |
|
| | def get_bond_length_arrays(atom_mapping): |
| | bond_arrays = [] |
| | for i in range(3): |
| | bond_dict = getattr(constants, f'bonds{i + 1}') |
| | bond_array = np.zeros((len(atom_mapping), len(atom_mapping))) |
| | for a1 in atom_mapping.keys(): |
| | for a2 in atom_mapping.keys(): |
| | if a1 in bond_dict and a2 in bond_dict[a1]: |
| | bond_len = bond_dict[a1][a2] |
| | else: |
| | bond_len = 0 |
| | bond_array[atom_mapping[a1], atom_mapping[a2]] = bond_len |
| |
|
| | assert np.all(bond_array == bond_array.T) |
| | bond_arrays.append(bond_array) |
| |
|
| | return bond_arrays |
| |
|
| |
|
| | def get_lennard_jones_rm(atom_mapping): |
| | |
| | LJ_rm = np.zeros((len(atom_mapping), len(atom_mapping))) |
| |
|
| | for a1 in atom_mapping.keys(): |
| | for a2 in atom_mapping.keys(): |
| | all_bond_lengths = [] |
| | for btype in ['bonds1', 'bonds2', 'bonds3']: |
| | bond_dict = getattr(constants, btype) |
| | if a1 in bond_dict and a2 in bond_dict[a1]: |
| | all_bond_lengths.append(bond_dict[a1][a2]) |
| |
|
| | if len(all_bond_lengths) > 0: |
| | |
| | |
| | bond_len = min(all_bond_lengths) |
| | else: |
| | |
| | bond_len = covalent_radii[a1] + covalent_radii[a2] |
| |
|
| | LJ_rm[atom_mapping[a1], atom_mapping[a2]] = bond_len |
| |
|
| | assert np.all(LJ_rm == LJ_rm.T) |
| | return LJ_rm |
| |
|
| |
|
| | def get_type_histograms(lig_one_hot, pocket_one_hot, atom_encoder, aa_encoder): |
| |
|
| | atom_decoder = list(atom_encoder.keys()) |
| | atom_counts = {k: 0 for k in atom_encoder.keys()} |
| | for a in [atom_decoder[x] for x in lig_one_hot.argmax(1)]: |
| | atom_counts[a] += 1 |
| |
|
| | aa_decoder = list(aa_encoder.keys()) |
| | aa_counts = {k: 0 for k in aa_encoder.keys()} |
| | for r in [aa_decoder[x] for x in pocket_one_hot.argmax(1)]: |
| | aa_counts[r] += 1 |
| |
|
| | return atom_counts, aa_counts |
| |
|
| |
|
| | def saveall(filename, pdb_and_mol_ids, lig_coords, lig_one_hot, lig_mask, |
| | pocket_coords, pocket_quaternion, pocket_one_hot, pocket_mask): |
| |
|
| | np.savez(filename, |
| | names=pdb_and_mol_ids, |
| | lig_coords=lig_coords, |
| | lig_one_hot=lig_one_hot, |
| | lig_mask=lig_mask, |
| | pocket_coords=pocket_coords, |
| | pocket_quaternion=pocket_quaternion, |
| | pocket_one_hot=pocket_one_hot, |
| | pocket_mask=pocket_mask |
| | ) |
| | return True |
| |
|
| |
|
| | if __name__ == '__main__': |
| | parser = argparse.ArgumentParser() |
| | parser.add_argument('basedir', type=Path) |
| | parser.add_argument('--outdir', type=Path, default=None) |
| | parser.add_argument('--qed_thresh', type=float, default=0.3) |
| | parser.add_argument('--max_occurences', type=int, default=50) |
| | parser.add_argument('--num_val', type=int, default=300) |
| | parser.add_argument('--num_test', type=int, default=300) |
| | parser.add_argument('--dist_cutoff', type=float, default=8.0) |
| | parser.add_argument('--ca_only', action='store_true') |
| | parser.add_argument('--random_seed', type=int, default=42) |
| | parser.add_argument('--make_split', action='store_true') |
| | args = parser.parse_args() |
| |
|
| | pdbdir = args.basedir / 'BindingMOAD_2020/' |
| |
|
| | |
| | if args.outdir is None: |
| | suffix = '' if 'H' in atom_dict else '_noH' |
| | suffix += '_ca_only' if args.ca_only else '_full' |
| | processed_dir = Path(args.basedir, f'processed{suffix}') |
| | else: |
| | processed_dir = args.outdir |
| |
|
| | processed_dir.mkdir(exist_ok=True, parents=True) |
| |
|
| | if args.make_split: |
| | |
| | csv_path = args.basedir / 'every.csv' |
| | ligand_dict = read_label_file(csv_path) |
| | ligand_dict = compute_druglikeness(ligand_dict) |
| | filtered_examples = filter_and_flatten( |
| | ligand_dict, args.qed_thresh, args.max_occurences, args.random_seed) |
| | print(f'{len(filtered_examples)} examples after filtering') |
| |
|
| | |
| | data_split = split_by_ec_number(filtered_examples, args.num_val, |
| | args.num_test) |
| |
|
| | else: |
| | |
| | data_split = {} |
| | for split in ['test', 'val', 'train']: |
| | with open(f'data/moad_{split}.txt', 'r') as f: |
| | pocket_ids = f.read().split(',') |
| | |
| | data_split[split] = [(None, x.split('_')[0][:4], (x.split('_')[1],)) |
| | for x in pocket_ids] |
| |
|
| | n_train_before = len(data_split['train']) |
| | n_val_before = len(data_split['val']) |
| | n_test_before = len(data_split['test']) |
| |
|
| | |
| | n_samples_after = {} |
| | for split in data_split.keys(): |
| | lig_coords = [] |
| | lig_one_hot = [] |
| | lig_mask = [] |
| | pocket_coords = [] |
| | pocket_one_hot = [] |
| | pocket_mask = [] |
| | pdb_and_mol_ids = [] |
| | receptors = [] |
| | count = 0 |
| |
|
| | pdb_sdf_dir = processed_dir / split |
| | pdb_sdf_dir.mkdir(exist_ok=True) |
| |
|
| | n_tot = len(data_split[split]) |
| | pair_dict = ligand_list_to_dict(data_split[split]) |
| |
|
| | tic = time() |
| | num_failed = 0 |
| | with tqdm(total=n_tot) as pbar: |
| | for p in pair_dict: |
| |
|
| | pdb_successful = set() |
| |
|
| | |
| | for pdbfile in sorted(pdbdir.glob(f"{p.lower()}.bio*")): |
| |
|
| | |
| | if len(pair_dict[p]) == len(pdb_successful): |
| | continue |
| |
|
| | pdb_struct = PDBParser(QUIET=True).get_structure('', pdbfile) |
| | struct_copy = pdb_struct.copy() |
| |
|
| | n_bio_successful = 0 |
| | for m in pair_dict[p]: |
| |
|
| | |
| | if m[0] in pdb_successful: |
| | continue |
| |
|
| | ligand_name, ligand_chain, ligand_resi = m[0].split(':') |
| | ligand_resi = int(ligand_resi) |
| |
|
| | try: |
| | ligand_data, pocket_data = process_ligand_and_pocket( |
| | pdb_struct, ligand_name, ligand_chain, ligand_resi, |
| | dist_cutoff=args.dist_cutoff, ca_only=args.ca_only) |
| | except (KeyError, AssertionError, FileNotFoundError, |
| | IndexError, ValueError) as e: |
| | |
| | continue |
| |
|
| | pdb_and_mol_ids.append(f"{p}_{m[0]}") |
| | receptors.append(pdbfile.name) |
| | lig_coords.append(ligand_data['lig_coords']) |
| | lig_one_hot.append(ligand_data['lig_one_hot']) |
| | lig_mask.append( |
| | count * np.ones(len(ligand_data['lig_coords']))) |
| | pocket_coords.append(pocket_data['pocket_coords']) |
| | |
| | |
| | pocket_one_hot.append(pocket_data['pocket_one_hot']) |
| | pocket_mask.append( |
| | count * np.ones(len(pocket_data['pocket_coords']))) |
| | count += 1 |
| |
|
| | pdb_successful.add(m[0]) |
| | n_bio_successful += 1 |
| |
|
| | |
| | if split in {'val', 'test'}: |
| | |
| | |
| | try: |
| | struct_copy[0][ligand_chain].detach_child((f'H_{ligand_name}', ligand_resi, ' ')) |
| | except KeyError: |
| | warnings.warn(f"Could not find ligand {(f'H_{ligand_name}', ligand_resi, ' ')} in {pdbfile}") |
| | continue |
| |
|
| | |
| | atom_types = [atom_decoder[np.argmax(i)] for i in ligand_data['lig_one_hot']] |
| | xyz_file = Path(pdb_sdf_dir, 'tmp.xyz') |
| | utils.write_xyz_file(ligand_data['lig_coords'], atom_types, xyz_file) |
| |
|
| | obConversion = openbabel.OBConversion() |
| | obConversion.SetInAndOutFormats("xyz", "sdf") |
| | mol = openbabel.OBMol() |
| | obConversion.ReadFile(mol, str(xyz_file)) |
| | xyz_file.unlink() |
| |
|
| | name = f"{p}-{pdbfile.suffix[1:]}_{m[0]}" |
| | sdf_file = Path(pdb_sdf_dir, f'{name}.sdf') |
| | obConversion.WriteFile(mol, str(sdf_file)) |
| |
|
| | |
| | with open(Path(pdb_sdf_dir, f'{name}.txt'), 'w') as f: |
| | f.write(' '.join(pocket_data['pocket_ids'])) |
| |
|
| | if split in {'val', 'test'} and n_bio_successful > 0: |
| | |
| | |
| | pdb_file_out = Path(pdb_sdf_dir, f'{p}-{pdbfile.suffix[1:]}.pdb') |
| | io = PDBIO() |
| | io.set_structure(struct_copy) |
| | io.save(str(pdb_file_out), select=Model0()) |
| |
|
| | pbar.update(len(pair_dict[p])) |
| | num_failed += (len(pair_dict[p]) - len(pdb_successful)) |
| | pbar.set_description(f'#failed: {num_failed}') |
| |
|
| |
|
| | lig_coords = np.concatenate(lig_coords, axis=0) |
| | lig_one_hot = np.concatenate(lig_one_hot, axis=0) |
| | lig_mask = np.concatenate(lig_mask, axis=0) |
| | pocket_coords = np.concatenate(pocket_coords, axis=0) |
| | pocket_one_hot = np.concatenate(pocket_one_hot, axis=0) |
| | pocket_mask = np.concatenate(pocket_mask, axis=0) |
| |
|
| | np.savez(processed_dir / f'{split}.npz', names=pdb_and_mol_ids, |
| | receptors=receptors, lig_coords=lig_coords, |
| | lig_one_hot=lig_one_hot, lig_mask=lig_mask, |
| | pocket_coords=pocket_coords, pocket_one_hot=pocket_one_hot, |
| | pocket_mask=pocket_mask) |
| |
|
| | n_samples_after[split] = len(pdb_and_mol_ids) |
| | print(f"Processing {split} set took {(time() - tic)/60.0:.2f} minutes") |
| |
|
| | |
| | |
| | |
| | with np.load(processed_dir / 'train.npz', allow_pickle=True) as data: |
| | lig_mask = data['lig_mask'] |
| | pocket_mask = data['pocket_mask'] |
| | lig_coords = data['lig_coords'] |
| | lig_one_hot = data['lig_one_hot'] |
| | pocket_one_hot = data['pocket_one_hot'] |
| |
|
| | |
| | train_smiles = compute_smiles(lig_coords, lig_one_hot, lig_mask) |
| | np.save(processed_dir / 'train_smiles.npy', train_smiles) |
| |
|
| | |
| | n_nodes = get_n_nodes(lig_mask, pocket_mask, smooth_sigma=1.0) |
| | np.save(Path(processed_dir, 'size_distribution.npy'), n_nodes) |
| |
|
| | |
| | bonds1, bonds2, bonds3 = get_bond_length_arrays(atom_dict) |
| |
|
| | |
| | rm_LJ = get_lennard_jones_rm(atom_dict) |
| |
|
| | |
| | atom_hist, aa_hist = get_type_histograms(lig_one_hot, pocket_one_hot, |
| | atom_dict, amino_acid_dict) |
| |
|
| | |
| | summary_string = '# SUMMARY\n\n' |
| | summary_string += '# Before processing\n' |
| | summary_string += f'num_samples train: {n_train_before}\n' |
| | summary_string += f'num_samples val: {n_val_before}\n' |
| | summary_string += f'num_samples test: {n_test_before}\n\n' |
| | summary_string += '# After processing\n' |
| | summary_string += f"num_samples train: {n_samples_after['train']}\n" |
| | summary_string += f"num_samples val: {n_samples_after['val']}\n" |
| | summary_string += f"num_samples test: {n_samples_after['test']}\n\n" |
| | summary_string += '# Info\n' |
| | summary_string += f"'atom_encoder': {atom_dict}\n" |
| | summary_string += f"'atom_decoder': {list(atom_dict.keys())}\n" |
| | summary_string += f"'aa_encoder': {amino_acid_dict}\n" |
| | summary_string += f"'aa_decoder': {list(amino_acid_dict.keys())}\n" |
| | summary_string += f"'bonds1': {bonds1.tolist()}\n" |
| | summary_string += f"'bonds2': {bonds2.tolist()}\n" |
| | summary_string += f"'bonds3': {bonds3.tolist()}\n" |
| | summary_string += f"'lennard_jones_rm': {rm_LJ.tolist()}\n" |
| | summary_string += f"'atom_hist': {atom_hist}\n" |
| | summary_string += f"'aa_hist': {aa_hist}\n" |
| | summary_string += f"'n_nodes': {n_nodes.tolist()}\n" |
| |
|
| | |
| | with open(processed_dir / 'summary.txt', 'w') as f: |
| | f.write(summary_string) |
| |
|
| | |
| | print(summary_string) |
| |
|