gabboud committed on
Commit
d95502a
·
1 Parent(s): 361e13e

initial commit from source repo

Browse files
app.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import warnings
import os
import subprocess
from pathlib import Path
import shutil
import spaces
from space_utils.download_weights import download_ligandmpnn_weights

# Fetch the LigandMPNN weights up front so inference below can run.
download_ligandmpnn_weights()

with gr.Blocks(title="RFD3 Test") as demo:
    out_dir = "./output/test"
    # Run inference with an argument list and shell=False (implicit): avoids
    # shell injection / word-splitting; check=True raises on a non-zero exit.
    subprocess.run(
        [
            "python",
            "run.py",
            "--pdb_path",
            "./inputs/1BC8.pdb",
            "--out_folder",
            out_dir,
        ],
        check=True,
        text=True,
    )

    # Capture the produced file listing (same output as `ls <out_dir>`).
    res = subprocess.run(
        ["ls", out_dir], check=True, text=True, capture_output=True
    )

    gr.Markdown("### Command Output")
    gr.Textbox(value=res.stdout, lines=20)


if __name__ == "__main__":
    demo.launch()
data_utils.py ADDED
@@ -0,0 +1,988 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from __future__ import print_function

import numpy as np
import torch
import torch.utils
from prody import *

# Silence ProDy's console logging for library use.
confProDy(verbosity="none")

# One-letter -> three-letter amino-acid codes; "X" maps to "UNK" (unknown).
restype_1to3 = {
    "A": "ALA", "R": "ARG", "N": "ASN", "D": "ASP", "C": "CYS",
    "Q": "GLN", "E": "GLU", "G": "GLY", "H": "HIS", "I": "ILE",
    "L": "LEU", "K": "LYS", "M": "MET", "F": "PHE", "P": "PRO",
    "S": "SER", "T": "THR", "W": "TRP", "Y": "TYR", "V": "VAL",
    "X": "UNK",
}
# One-letter amino acid -> integer class id (alphabetical; "X" = unknown = 20).
restype_str_to_int = {
    "A": 0, "C": 1, "D": 2, "E": 3, "F": 4, "G": 5, "H": 6,
    "I": 7, "K": 8, "L": 9, "M": 10, "N": 11, "P": 12, "Q": 13,
    "R": 14, "S": 15, "T": 16, "V": 17, "W": 18, "Y": 19, "X": 20,
}
# Inverse of `restype_str_to_int`.
restype_int_to_str = {
    0: "A", 1: "C", 2: "D", 3: "E", 4: "F", 5: "G", 6: "H",
    7: "I", 8: "K", 9: "L", 10: "M", 11: "N", 12: "P", 13: "Q",
    14: "R", 15: "S", 16: "T", 17: "V", 18: "W", 19: "Y", 20: "X",
}
# The 21-letter model alphabet, in integer-encoding order.
alphabet = list(restype_str_to_int)

# Chemical element symbols in atomic-number order (index 0 -> Z=1).
# NOTE(review): "Mb" (position of Z=42) looks like a typo for "Mo"
# (molybdenum), and "Uut"/"Uup"/"Uus"/"Uuo" are pre-2016 placeholder names —
# kept as-is because trained models may depend on this exact encoding.
element_list = [
    "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne",
    "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca",
    "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn",
    "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr",
    "Nb", "Mb", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn",
    "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd",
    "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb",
    "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg",
    "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th",
    "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm",
    "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds",
    "Rg", "Cn", "Uut", "Fl", "Uup", "Lv", "Uus", "Uuo",
]
element_list = [item.upper() for item in element_list]
# element_dict = dict(zip(element_list, range(1,len(element_list))))
# Atomic number -> element symbol.
# NOTE(review): range(1, len) has len-1 entries, so zip drops the final
# element ("UUO") from this mapping — confirm this off-by-one is intended.
element_dict_rev = dict(zip(range(1, len(element_list)), element_list))
204
+
205
+
206
def get_seq_rec(S: torch.Tensor, S_pred: torch.Tensor, mask: torch.Tensor):
    """
    Compute masked sequence recovery (fraction of correctly predicted residues).

    S : true sequence shape=[batch, length]
    S_pred : predicted sequence shape=[batch, length]
    mask : mask to compute average over the region shape=[batch, length]

    Returns the per-batch averaged recovery, shape=[batch].
    """
    correct = torch.eq(S, S_pred)
    masked_hits = torch.sum(correct * mask, dim=-1)
    coverage = torch.sum(mask, dim=-1)
    return masked_hits / coverage
217
+
218
+
219
def get_score(S: torch.Tensor, log_probs: torch.Tensor, mask: torch.Tensor):
    """
    Compute masked categorical cross-entropy against the true sequence.

    S : true sequence shape=[batch, length]
    log_probs : predicted log-probabilities shape=[batch, length, 21]
    mask : mask to compute average over the region shape=[batch, length]

    Returns:
        average_loss : averaged categorical cross entropy (CCE), shape=[batch]
        loss_per_residue : per-position CCE, shape=[batch, length]
    """
    one_hot = torch.nn.functional.one_hot(S, 21)
    # Pick out the log-probability of the true class at each position.
    loss_per_residue = -torch.sum(one_hot * log_probs, dim=-1)
    denom = torch.sum(mask, dim=-1) + 1e-8  # guard against empty masks
    average_loss = torch.sum(loss_per_residue * mask, dim=-1) / denom
    return average_loss, loss_per_residue
234
+
235
+
236
def write_full_PDB(
    save_path: str,
    X: np.ndarray,
    X_m: np.ndarray,
    b_factors: np.ndarray,
    R_idx: np.ndarray,
    chain_letters: np.ndarray,
    S: np.ndarray,
    other_atoms=None,
    icodes=None,
    force_hetatm=False,
):
    """
    Write a designed structure (plus optional hetero atoms) to a PDB file.

    save_path : path where the PDB will be written to
    X : protein atom xyz coordinates shape=[length, 14, 3]
    X_m : protein atom mask shape=[length, 14]
    b_factors: shape=[length, 14]
    R_idx: protein residue indices shape=[length]
    chain_letters: protein chain letters shape=[length]
    S : integer-encoded protein amino acid sequence shape=[length]
    other_atoms: other atoms parsed by prody (written alongside the protein)
    icodes: a list of insertion codes for the PDB; e.g. antibody loops
    force_hetatm: if True, copy the hetatm flags so `other_atoms` stay HETATM
    """

    # Local copies of the residue code tables (shadow the module-level ones).
    restype_1to3 = {
        "A": "ALA", "R": "ARG", "N": "ASN", "D": "ASP", "C": "CYS",
        "Q": "GLN", "E": "GLU", "G": "GLY", "H": "HIS", "I": "ILE",
        "L": "LEU", "K": "LYS", "M": "MET", "F": "PHE", "P": "PRO",
        "S": "SER", "T": "THR", "W": "TRP", "Y": "TYR", "V": "VAL",
        "X": "UNK",
    }
    restype_INTtoSTR = {
        0: "A", 1: "C", 2: "D", 3: "E", 4: "F", 5: "G", 6: "H",
        7: "I", 8: "K", 9: "L", 10: "M", 11: "N", 12: "P", 13: "Q",
        14: "R", 15: "S", 16: "T", 17: "V", 18: "W", 19: "Y", 20: "X",
    }
    # Residue name -> fixed 14-slot atom-name layout ("" marks unused slots).
    restype_name_to_atom14_names = {
        "ALA": ["N", "CA", "C", "O", "CB", "", "", "", "", "", "", "", "", ""],
        "ARG": ["N", "CA", "C", "O", "CB", "CG", "CD", "NE", "CZ", "NH1", "NH2", "", "", ""],
        "ASN": ["N", "CA", "C", "O", "CB", "CG", "OD1", "ND2", "", "", "", "", "", ""],
        "ASP": ["N", "CA", "C", "O", "CB", "CG", "OD1", "OD2", "", "", "", "", "", ""],
        "CYS": ["N", "CA", "C", "O", "CB", "SG", "", "", "", "", "", "", "", ""],
        "GLN": ["N", "CA", "C", "O", "CB", "CG", "CD", "OE1", "NE2", "", "", "", "", ""],
        "GLU": ["N", "CA", "C", "O", "CB", "CG", "CD", "OE1", "OE2", "", "", "", "", ""],
        "GLY": ["N", "CA", "C", "O", "", "", "", "", "", "", "", "", "", ""],
        "HIS": ["N", "CA", "C", "O", "CB", "CG", "ND1", "CD2", "CE1", "NE2", "", "", "", ""],
        "ILE": ["N", "CA", "C", "O", "CB", "CG1", "CG2", "CD1", "", "", "", "", "", ""],
        "LEU": ["N", "CA", "C", "O", "CB", "CG", "CD1", "CD2", "", "", "", "", "", ""],
        "LYS": ["N", "CA", "C", "O", "CB", "CG", "CD", "CE", "NZ", "", "", "", "", ""],
        "MET": ["N", "CA", "C", "O", "CB", "CG", "SD", "CE", "", "", "", "", "", ""],
        "PHE": ["N", "CA", "C", "O", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ", "", "", ""],
        "PRO": ["N", "CA", "C", "O", "CB", "CG", "CD", "", "", "", "", "", "", ""],
        "SER": ["N", "CA", "C", "O", "CB", "OG", "", "", "", "", "", "", "", ""],
        "THR": ["N", "CA", "C", "O", "CB", "OG1", "CG2", "", "", "", "", "", "", ""],
        "TRP": ["N", "CA", "C", "O", "CB", "CG", "CD1", "CD2", "CE2", "CE3", "NE1", "CZ2", "CZ3", "CH2"],
        "TYR": ["N", "CA", "C", "O", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ", "OH", "", ""],
        "VAL": ["N", "CA", "C", "O", "CB", "CG1", "CG2", "", "", "", "", "", "", ""],
        "UNK": ["", "", "", "", "", "", "", "", "", "", "", "", "", ""],
    }

    # Integer sequence -> three-letter residue names.
    S_str = [restype_1to3[AA] for AA in [restype_INTtoSTR[AA] for AA in S]]

    # Flatten the per-residue 14-slot arrays into per-atom records,
    # keeping only atoms present in the mask.
    X_list = []
    b_factor_list = []
    atom_name_list = []
    element_name_list = []
    residue_name_list = []
    residue_number_list = []
    chain_id_list = []
    icodes_list = []
    for i, AA in enumerate(S_str):
        sel = X_m[i].astype(np.int32) == 1  # atoms actually present
        total = np.sum(sel)
        tmp = np.array(restype_name_to_atom14_names[AA])[sel]
        X_list.append(X[i][sel])
        b_factor_list.append(b_factors[i][sel])
        atom_name_list.append(tmp)
        # Element symbol taken as the first character of the atom name
        # (valid for the C/N/O/S atoms in the atom14 layout).
        element_name_list += [AA[:1] for AA in list(tmp)]
        residue_name_list += total * [AA]
        residue_number_list += total * [R_idx[i]]
        chain_id_list += total * [chain_letters[i]]
        icodes_list += total * [icodes[i]]

    X_stack = np.concatenate(X_list, 0)
    b_factor_stack = np.concatenate(b_factor_list, 0)
    atom_name_stack = np.concatenate(atom_name_list, 0)

    # NOTE(review): the bare name `prody` works only if prody's star import
    # re-exports the module itself — confirm, or add an explicit `import prody`.
    protein = prody.AtomGroup()
    protein.setCoords(X_stack)
    protein.setBetas(b_factor_stack)
    protein.setNames(atom_name_stack)
    protein.setResnames(residue_name_list)
    protein.setElements(element_name_list)
    protein.setOccupancies(np.ones([X_stack.shape[0]]))
    protein.setResnums(residue_number_list)
    protein.setChids(chain_id_list)
    protein.setIcodes(icodes_list)

    if other_atoms:
        # Rebuild the hetero atoms as a fresh AtomGroup so the combined
        # structure can be written in one call.
        other_atoms_g = prody.AtomGroup()
        other_atoms_g.setCoords(other_atoms.getCoords())
        other_atoms_g.setNames(other_atoms.getNames())
        other_atoms_g.setResnames(other_atoms.getResnames())
        other_atoms_g.setElements(other_atoms.getElements())
        other_atoms_g.setOccupancies(other_atoms.getOccupancies())
        other_atoms_g.setResnums(other_atoms.getResnums())
        other_atoms_g.setChids(other_atoms.getChids())
        if force_hetatm:
            other_atoms_g.setFlags("hetatm", other_atoms.getFlags("hetatm"))
        writePDB(save_path, protein + other_atoms_g)
    else:
        writePDB(save_path, protein)
487
+
488
+
489
def get_aligned_coordinates(protein_atoms, CA_dict: dict, atom_name: str):
    """
    Extract per-residue coordinates for one atom type, aligned to CA_dict order.

    protein_atoms: prody atom group
    CA_dict: mapping between chain_residue_idx_icodes and integers
    atom_name: atom to be parsed; e.g. CA

    Returns:
        atom_coords_: [len(CA_dict), 3] float32 coordinates (zeros when absent)
        atom_coords_m: [len(CA_dict)] int32 mask (1 where the atom was found)
    """
    atom_atoms = protein_atoms.select(f"name {atom_name}")

    atom_coords_ = np.zeros([len(CA_dict), 3], np.float32)
    atom_coords_m = np.zeros([len(CA_dict)], np.int32)
    # prody `select` returns None when nothing matches (was `!= None`, and
    # the original split this into two identical checks).
    if atom_atoms is not None:
        atom_coords = atom_atoms.getCoords()
        atom_resnums = atom_atoms.getResnums()
        atom_chain_ids = atom_atoms.getChids()
        atom_icodes = atom_atoms.getIcodes()
        for i in range(len(atom_resnums)):
            code = atom_chain_ids[i] + "_" + str(atom_resnums[i]) + "_" + atom_icodes[i]
            # Direct dict membership (the original rebuilt a key list per
            # iteration — accidental O(n^2)).
            if code in CA_dict:
                atom_coords_[CA_dict[code], :] = atom_coords[i]
                atom_coords_m[CA_dict[code]] = 1
    return atom_coords_, atom_coords_m
512
+
513
+
514
def parse_PDB(
    input_path: str,
    device: str = "cpu",
    chains=None,
    parse_all_atoms: bool = False,
    parse_atoms_with_zero_occupancy: bool = False,
):
    """
    Parse a PDB file into the tensor dictionary consumed by `featurize`.

    input_path : path for the input PDB
    device: device for the torch.Tensor
    chains: a list specifying which chains need to be parsed; e.g. ["A", "B"];
        None or [] parses every chain (default was a mutable `[]` literal)
    parse_all_atoms: if False parse only N,CA,C,O otherwise all 37 atoms
    parse_atoms_with_zero_occupancy: if True atoms with zero occupancy will be parsed

    Returns:
        output_dict: tensors X, mask, Y, Y_t, Y_m, R_idx, chain_labels,
            chain_letters, mask_c, chain_list, S, xyz_37, xyz_37_m
        backbone, other_atoms, water_atoms: prody selections
        CA_icodes: per-residue insertion codes
    """
    chains = chains or []

    # Element symbols in atomic-number order; kept identical to the
    # module-level table so atom-type codes stay consistent.
    element_list = [
        "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne",
        "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca",
        "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn",
        "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr",
        "Nb", "Mb", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn",
        "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd",
        "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb",
        "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg",
        "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th",
        "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm",
        "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds",
        "Rg", "Cn", "Uut", "Fl", "Uup", "Lv", "Uus", "Uuo",
    ]
    element_list = [item.upper() for item in element_list]
    # NOTE: zip(range(1, len), list) pairs only the first len-1 symbols, so
    # the final entry has no code; unknown elements fall back to 0 below.
    element_dict = dict(zip(element_list, range(1, len(element_list))))

    restype_3to1 = {
        "ALA": "A", "ARG": "R", "ASN": "N", "ASP": "D", "CYS": "C",
        "GLN": "Q", "GLU": "E", "GLY": "G", "HIS": "H", "ILE": "I",
        "LEU": "L", "LYS": "K", "MET": "M", "PHE": "F", "PRO": "P",
        "SER": "S", "THR": "T", "TRP": "W", "TYR": "Y", "VAL": "V",
    }
    restype_STRtoINT = {
        "A": 0, "C": 1, "D": 2, "E": 3, "F": 4, "G": 5, "H": 6,
        "I": 7, "K": 8, "L": 9, "M": 10, "N": 11, "P": 12, "Q": 13,
        "R": 14, "S": 15, "T": 16, "V": 17, "W": 18, "Y": 19, "X": 20,
    }

    # Atom name -> slot in the 37-atom representation.
    atom_order = {
        "N": 0, "CA": 1, "C": 2, "CB": 3, "O": 4, "CG": 5, "CG1": 6,
        "CG2": 7, "OG": 8, "OG1": 9, "SG": 10, "CD": 11, "CD1": 12,
        "CD2": 13, "ND1": 14, "ND2": 15, "OD1": 16, "OD2": 17, "SD": 18,
        "CE": 19, "CE1": 20, "CE2": 21, "CE3": 22, "NE": 23, "NE1": 24,
        "NE2": 25, "OE1": 26, "OE2": 27, "CH2": 28, "NH1": 29, "NH2": 30,
        "OH": 31, "CZ": 32, "CZ2": 33, "CZ3": 34, "NZ": 35, "OXT": 36,
    }

    if not parse_all_atoms:
        atom_types = ["N", "CA", "C", "O"]
    else:
        atom_types = [
            "N", "CA", "C", "CB", "O", "CG", "CG1", "CG2", "OG", "OG1",
            "SG", "CD", "CD1", "CD2", "ND1", "ND2", "OD1", "OD2", "SD",
            "CE", "CE1", "CE2", "CE3", "NE", "NE1", "NE2", "OE1", "OE2",
            "CH2", "NH1", "NH2", "OH", "CZ", "CZ2", "CZ3", "NZ",
        ]

    atoms = parsePDB(input_path)
    if not parse_atoms_with_zero_occupancy:
        atoms = atoms.select("occupancy > 0")
    if chains:
        # Build a prody selection string like "chain A or chain B".
        str_out = ""
        for item in chains:
            str_out += " chain " + item + " or"
        atoms = atoms.select(str_out[1:-3])

    protein_atoms = atoms.select("protein")
    backbone = protein_atoms.select("backbone")
    other_atoms = atoms.select("not protein and not water")
    water_atoms = atoms.select("water")

    CA_atoms = protein_atoms.select("name CA")
    CA_resnums = CA_atoms.getResnums()
    CA_chain_ids = CA_atoms.getChids()
    CA_icodes = CA_atoms.getIcodes()

    # Map "<chain>_<resnum>_<icode>" -> residue row index.
    CA_dict = {}
    for i in range(len(CA_resnums)):
        code = CA_chain_ids[i] + "_" + str(CA_resnums[i]) + "_" + CA_icodes[i]
        CA_dict[code] = i

    xyz_37 = np.zeros([len(CA_dict), 37, 3], np.float32)
    xyz_37_m = np.zeros([len(CA_dict), 37], np.int32)
    for atom_name in atom_types:
        xyz, xyz_m = get_aligned_coordinates(protein_atoms, CA_dict, atom_name)
        xyz_37[:, atom_order[atom_name], :] = xyz
        xyz_37_m[:, atom_order[atom_name]] = xyz_m

    N = xyz_37[:, atom_order["N"], :]
    CA = xyz_37[:, atom_order["CA"], :]
    C = xyz_37[:, atom_order["C"], :]
    O = xyz_37[:, atom_order["O"], :]

    N_m = xyz_37_m[:, atom_order["N"]]
    CA_m = xyz_37_m[:, atom_order["CA"]]
    C_m = xyz_37_m[:, atom_order["C"]]
    O_m = xyz_37_m[:, atom_order["O"]]

    mask = N_m * CA_m * C_m * O_m  # must all 4 atoms exist

    # Virtual CB placed from ideal backbone geometry.
    b = CA - N
    c = C - CA
    a = np.cross(b, c, axis=-1)
    CB = -0.58273431 * a + 0.56802827 * b - 0.54067466 * c + CA

    chain_labels = np.array(CA_atoms.getChindices(), dtype=np.int32)
    R_idx = np.array(CA_resnums, dtype=np.int32)
    S = CA_atoms.getResnames()
    # Non-standard residues become "X" (was `in list(restype_3to1)` — O(n)).
    S = [restype_3to1[AA] if AA in restype_3to1 else "X" for AA in list(S)]
    S = np.array([restype_STRtoINT[AA] for AA in list(S)], np.int32)
    X = np.concatenate([N[:, None], CA[:, None], C[:, None], O[:, None]], 1)

    try:
        Y = np.array(other_atoms.getCoords(), dtype=np.float32)
        Y_t = list(other_atoms.getElements())
        # Membership tested against the dict (the original tested the list,
        # which would KeyError for the one symbol zip dropped from the dict).
        Y_t = np.array(
            [
                element_dict[y_t.upper()] if y_t.upper() in element_dict else 0
                for y_t in Y_t
            ],
            dtype=np.int32,
        )
        Y_m = (Y_t != 1) * (Y_t != 0)  # drop hydrogens and unknown elements

        Y = Y[Y_m, :]
        Y_t = Y_t[Y_m]
        Y_m = Y_m[Y_m]
    except AttributeError:
        # `other_atoms` is None when the structure has no ligand/hetero atoms
        # (was a bare `except:`, which also hid unrelated errors).
        Y = np.zeros([1, 3], np.float32)
        Y_t = np.zeros([1], np.int32)
        Y_m = np.zeros([1], np.int32)

    output_dict = {}
    output_dict["X"] = torch.tensor(X, device=device, dtype=torch.float32)
    output_dict["mask"] = torch.tensor(mask, device=device, dtype=torch.int32)
    output_dict["Y"] = torch.tensor(Y, device=device, dtype=torch.float32)
    output_dict["Y_t"] = torch.tensor(Y_t, device=device, dtype=torch.int32)
    output_dict["Y_m"] = torch.tensor(Y_m, device=device, dtype=torch.int32)

    output_dict["R_idx"] = torch.tensor(R_idx, device=device, dtype=torch.int32)
    output_dict["chain_labels"] = torch.tensor(
        chain_labels, device=device, dtype=torch.int32
    )

    output_dict["chain_letters"] = CA_chain_ids

    # One boolean mask per chain, in sorted chain-letter order.
    mask_c = []
    chain_list = sorted(set(output_dict["chain_letters"]))
    for chain in chain_list:
        mask_c.append(
            torch.tensor(
                [chain == item for item in output_dict["chain_letters"]],
                device=device,
                dtype=bool,
            )
        )

    output_dict["mask_c"] = mask_c
    output_dict["chain_list"] = chain_list

    output_dict["S"] = torch.tensor(S, device=device, dtype=torch.int32)

    output_dict["xyz_37"] = torch.tensor(xyz_37, device=device, dtype=torch.float32)
    output_dict["xyz_37_m"] = torch.tensor(xyz_37_m, device=device, dtype=torch.int32)

    return output_dict, backbone, other_atoms, CA_icodes, water_atoms
888
+
889
+
890
def get_nearest_neighbours(CB, mask, Y, Y_t, Y_m, number_of_ligand_atoms):
    """
    For each residue CB atom, select the `number_of_ligand_atoms` closest
    context atoms.

    CB: [A, 3] residue (virtual) CB coordinates
    mask: [A] residue validity mask
    Y, Y_t, Y_m: [B, 3], [B], [B] context-atom coordinates, types, mask
    number_of_ligand_atoms: fixed output width per residue

    Returns per-residue (Y, Y_t, Y_m) of the fixed width (zero-padded when
    fewer context atoms exist) and the distance to the closest context atom.
    """
    device = CB.device
    num_res = CB.shape[0]

    # Pairwise squared distances; invalid residue/atom pairs pushed to 1000.
    pair_mask = mask[:, None] * Y_m[None, :]
    sq_dist = torch.sum((CB[:, None, :] - Y[None, :, :]) ** 2, -1)
    sq_dist = sq_dist * pair_mask + (1 - pair_mask) * 1000.0

    nn_idx = torch.argsort(sq_dist, -1)[:, :number_of_ligand_atoms]
    D_AB_closest = torch.sqrt(torch.gather(sq_dist, 1, nn_idx)[:, 0])

    gathered_Y = torch.gather(
        Y[None].repeat(num_res, 1, 1), 1, nn_idx[:, :, None].repeat(1, 1, 3)
    )
    gathered_Y_t = torch.gather(Y_t[None].repeat(num_res, 1), 1, nn_idx)
    gathered_Y_m = torch.gather(Y_m[None].repeat(num_res, 1), 1, nn_idx)

    # Zero-pad out to the requested width when fewer context atoms exist.
    Y_out = torch.zeros(
        [num_res, number_of_ligand_atoms, 3], dtype=torch.float32, device=device
    )
    Y_t_out = torch.zeros(
        [num_res, number_of_ligand_atoms], dtype=torch.int32, device=device
    )
    Y_m_out = torch.zeros(
        [num_res, number_of_ligand_atoms], dtype=torch.int32, device=device
    )
    width = gathered_Y.shape[1]
    Y_out[:, :width] = gathered_Y
    Y_t_out[:, :width] = gathered_Y_t
    Y_m_out[:, :width] = gathered_Y_m

    return Y_out, Y_t_out, Y_m_out, D_AB_closest
924
+
925
+
926
def featurize(
    input_dict,
    cutoff_for_score=8.0,
    use_atom_context=True,
    number_of_ligand_atoms=16,
    model_type="protein_mpnn",
):
    """
    Turn a parsed-structure dict into batched model features.

    Adds a leading batch dimension of 1 to every tensor, renumbers residue
    indices so repeated numbers (insertion codes) become unique, and builds
    model-type-specific extras (ligand context for "ligand_mpnn", membrane
    labels for the membrane variants).
    """
    feats = {}

    if model_type == "ligand_mpnn":
        mask = input_dict["mask"]
        coords = input_dict["X"]
        N = coords[:, 0, :]
        CA = coords[:, 1, :]
        C = coords[:, 2, :]
        # Virtual CB from ideal backbone geometry.
        b = CA - N
        c = C - CA
        a = torch.cross(b, c, axis=-1)
        CB = -0.58273431 * a + 0.56802827 * b - 0.54067466 * c + CA
        Y, Y_t, Y_m, D_XY = get_nearest_neighbours(
            CB,
            mask,
            input_dict["Y"],
            input_dict["Y_t"],
            input_dict["Y_m"],
            number_of_ligand_atoms,
        )
        # Residues whose closest context atom is within the score cutoff.
        feats["mask_XY"] = ((D_XY < cutoff_for_score) * mask * Y_m[:, 0]).unsqueeze(0)
        if "side_chain_mask" in input_dict:
            feats["side_chain_mask"] = input_dict["side_chain_mask"].unsqueeze(0)
        feats["Y"] = Y.unsqueeze(0)
        feats["Y_t"] = Y_t.unsqueeze(0)
        # Zeroing Y_m hides all atom context from the model when disabled.
        feats["Y_m"] = Y_m.unsqueeze(0) if use_atom_context else 0.0 * Y_m.unsqueeze(0)
    elif model_type in ("per_residue_label_membrane_mpnn", "global_label_membrane_mpnn"):
        feats["membrane_per_residue_labels"] = input_dict[
            "membrane_per_residue_labels"
        ].unsqueeze(0)

    # Renumber residues: each repeat of the previous index bumps the offset
    # (the offset is cumulative and never reset).
    renumbered = []
    bump = 0
    previous = -100000
    for r in list(input_dict["R_idx"]):
        if previous == r:
            bump += 1
        renumbered.append(r + bump)
        previous = r
    feats["R_idx"] = torch.tensor(renumbered, device=r.device).unsqueeze(0)
    feats["R_idx_original"] = input_dict["R_idx"].unsqueeze(0)
    feats["chain_labels"] = input_dict["chain_labels"].unsqueeze(0)
    feats["S"] = input_dict["S"].unsqueeze(0)
    feats["chain_mask"] = input_dict["chain_mask"].unsqueeze(0)
    feats["mask"] = input_dict["mask"].unsqueeze(0)

    feats["X"] = input_dict["X"].unsqueeze(0)

    if "xyz_37" in input_dict:
        feats["xyz_37"] = input_dict["xyz_37"].unsqueeze(0)
        feats["xyz_37_m"] = input_dict["xyz_37_m"].unsqueeze(0)

    return feats
inputs/1BC8.pdb ADDED
The diff for this file is too large to render. See raw diff
 
model_utils.py ADDED
@@ -0,0 +1,1772 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import print_function
2
+
3
+ import itertools
4
+ import sys
5
+
6
+ import numpy as np
7
+ import torch
8
+
9
+
10
+ class ProteinMPNN(torch.nn.Module):
11
    def __init__(
        self,
        num_letters=21,
        node_features=128,
        edge_features=128,
        hidden_dim=128,
        num_encoder_layers=3,
        num_decoder_layers=3,
        vocab=21,
        k_neighbors=48,
        augment_eps=0.0,
        dropout=0.0,
        device=None,
        atom_context_num=0,
        model_type="protein_mpnn",
        ligand_mpnn_use_side_chain_context=False,
    ):
        """
        Build the message-passing network for the chosen model variant.

        num_letters: output alphabet size (20 amino acids + unknown)
        node_features / edge_features / hidden_dim: feature widths
        num_encoder_layers / num_decoder_layers: MPNN depth
        vocab: sequence-embedding vocabulary size
        k_neighbors: number of nearest residue neighbors in the graph
        augment_eps: scale of noise used by the feature modules
        dropout: dropout rate used throughout
        device: forwarded to the ligand feature module only
        atom_context_num: number of ligand context atoms (ligand_mpnn only)
        model_type: "protein_mpnn", "ligand_mpnn", "soluble_mpnn",
            "per_residue_label_membrane_mpnn", or "global_label_membrane_mpnn"
        ligand_mpnn_use_side_chain_context: include side-chain atoms as context
        """
        super(ProteinMPNN, self).__init__()

        self.model_type = model_type
        self.node_features = node_features
        self.edge_features = edge_features
        self.hidden_dim = hidden_dim

        if self.model_type == "ligand_mpnn":
            # Ligand-aware featurization plus extra context-encoder layers
            # that mix ligand-atom information into residue representations.
            self.features = ProteinFeaturesLigand(
                node_features,
                edge_features,
                top_k=k_neighbors,
                augment_eps=augment_eps,
                device=device,
                atom_context_num=atom_context_num,
                use_side_chains=ligand_mpnn_use_side_chain_context,
            )
            self.W_v = torch.nn.Linear(node_features, hidden_dim, bias=True)
            self.W_c = torch.nn.Linear(hidden_dim, hidden_dim, bias=True)

            self.W_nodes_y = torch.nn.Linear(hidden_dim, hidden_dim, bias=True)
            self.W_edges_y = torch.nn.Linear(hidden_dim, hidden_dim, bias=True)

            self.V_C = torch.nn.Linear(hidden_dim, hidden_dim, bias=False)
            self.V_C_norm = torch.nn.LayerNorm(hidden_dim)

            self.context_encoder_layers = torch.nn.ModuleList(
                [
                    DecLayer(hidden_dim, hidden_dim * 2, dropout=dropout)
                    for _ in range(2)
                ]
            )

            self.y_context_encoder_layers = torch.nn.ModuleList(
                [DecLayerJ(hidden_dim, hidden_dim, dropout=dropout) for _ in range(2)]
            )
        elif self.model_type == "protein_mpnn" or self.model_type == "soluble_mpnn":
            # Backbone-only featurization; soluble_mpnn shares the same graph.
            self.features = ProteinFeatures(
                node_features, edge_features, top_k=k_neighbors, augment_eps=augment_eps
            )
        elif (
            self.model_type == "per_residue_label_membrane_mpnn"
            or self.model_type == "global_label_membrane_mpnn"
        ):
            # Membrane variants add a 3-class label channel to the features.
            self.W_v = torch.nn.Linear(node_features, hidden_dim, bias=True)
            self.features = ProteinFeaturesMembrane(
                node_features,
                edge_features,
                top_k=k_neighbors,
                augment_eps=augment_eps,
                num_classes=3,
            )
        else:
            # Unknown model type: report and abort rather than continue
            # with an unconfigured feature module.
            print("Choose --model_type flag from currently available models")
            sys.exit()

        self.W_e = torch.nn.Linear(edge_features, hidden_dim, bias=True)
        self.W_s = torch.nn.Embedding(vocab, hidden_dim)

        self.dropout = torch.nn.Dropout(dropout)

        # Encoder layers
        self.encoder_layers = torch.nn.ModuleList(
            [
                EncLayer(hidden_dim, hidden_dim * 2, dropout=dropout)
                for _ in range(num_encoder_layers)
            ]
        )

        # Decoder layers
        self.decoder_layers = torch.nn.ModuleList(
            [
                DecLayer(hidden_dim, hidden_dim * 3, dropout=dropout)
                for _ in range(num_decoder_layers)
            ]
        )

        self.W_out = torch.nn.Linear(hidden_dim, num_letters, bias=True)

        # Xavier-initialize every weight matrix (vectors/biases keep defaults).
        for p in self.parameters():
            if p.dim() > 1:
                torch.nn.init.xavier_uniform_(p)
110
+
111
    def encode(self, feature_dict):
        """Encode the input structure into graph node/edge embeddings.

        Dispatches on ``self.model_type``: the ligand variant additionally runs
        a ligand-atom context encoder and mixes its output back into the
        residue nodes; membrane variants project real node features.

        Args:
            feature_dict: dict of input tensors (see inline comments). Only
                "S" and "mask" are read directly here; everything else is
                consumed by ``self.features()``.

        Returns:
            h_V: [B,L,hidden] encoded node features.
            h_E: [B,L,K,hidden] encoded edge features.
            E_idx: [B,L,K] k-NN neighbor indices of the residue graph.
        """
        # xyz_37 = feature_dict["xyz_37"] #[B,L,37,3] - xyz coordinates for all atoms if needed
        # xyz_37_m = feature_dict["xyz_37_m"] #[B,L,37] - mask for all coords
        # Y = feature_dict["Y"] #[B,L,num_context_atoms,3] - for ligandMPNN coords
        # Y_t = feature_dict["Y_t"] #[B,L,num_context_atoms] - element type
        # Y_m = feature_dict["Y_m"] #[B,L,num_context_atoms] - mask
        # X = feature_dict["X"] #[B,L,4,3] - backbone xyz coordinates for N,CA,C,O
        S_true = feature_dict["S"]  # [B,L] - integer protein sequence encoded using "restype_STRtoINT"
        # R_idx = feature_dict["R_idx"] #[B,L] - primary sequence residue index
        mask = feature_dict["mask"]  # [B,L] - mask for missing regions - should be removed! all ones most of the time
        # chain_labels = feature_dict["chain_labels"] #[B,L] - integer labels for chain letters

        B, L = S_true.shape
        device = S_true.device

        if self.model_type == "ligand_mpnn":
            V, E, E_idx, Y_nodes, Y_edges, Y_m = self.features(feature_dict)
            # nodes start at zero; all structural signal enters through the edges
            h_V = torch.zeros((E.shape[0], E.shape[1], E.shape[-1]), device=device)
            h_E = self.W_e(E)
            h_E_context = self.W_v(V)

            # neighbor validity mask: both endpoints of an edge must be valid
            mask_attend = gather_nodes(mask.unsqueeze(-1), E_idx).squeeze(-1)
            mask_attend = mask.unsqueeze(-1) * mask_attend
            for layer in self.encoder_layers:
                h_V, h_E = layer(h_V, h_E, E_idx, mask, mask_attend)

            # ligand-context branch: encode the ligand atom graph, then
            # inject its summary into a copy of the residue nodes
            h_V_C = self.W_c(h_V)
            Y_m_edges = Y_m[:, :, :, None] * Y_m[:, :, None, :]
            Y_nodes = self.W_nodes_y(Y_nodes)
            Y_edges = self.W_edges_y(Y_edges)
            for i in range(len(self.context_encoder_layers)):
                Y_nodes = self.y_context_encoder_layers[i](
                    Y_nodes, Y_edges, Y_m, Y_m_edges
                )
                h_E_context_cat = torch.cat([h_E_context, Y_nodes], -1)
                h_V_C = self.context_encoder_layers[i](
                    h_V_C, h_E_context_cat, mask, Y_m
                )

            # residual update of residue nodes with the ligand context
            h_V_C = self.V_C(h_V_C)
            h_V = h_V + self.V_C_norm(self.dropout(h_V_C))
        elif self.model_type == "protein_mpnn" or self.model_type == "soluble_mpnn":
            E, E_idx = self.features(feature_dict)
            h_V = torch.zeros((E.shape[0], E.shape[1], E.shape[-1]), device=device)
            h_E = self.W_e(E)

            mask_attend = gather_nodes(mask.unsqueeze(-1), E_idx).squeeze(-1)
            mask_attend = mask.unsqueeze(-1) * mask_attend
            for layer in self.encoder_layers:
                h_V, h_E = layer(h_V, h_E, E_idx, mask, mask_attend)
        elif (
            self.model_type == "per_residue_label_membrane_mpnn"
            or self.model_type == "global_label_membrane_mpnn"
        ):
            # membrane variants supply real node features (labels) -> project them
            V, E, E_idx = self.features(feature_dict)
            h_V = self.W_v(V)
            h_E = self.W_e(E)

            mask_attend = gather_nodes(mask.unsqueeze(-1), E_idx).squeeze(-1)
            mask_attend = mask.unsqueeze(-1) * mask_attend
            for layer in self.encoder_layers:
                h_V, h_E = layer(h_V, h_E, E_idx, mask, mask_attend)

        return h_V, h_E, E_idx
179
+
180
    def sample(self, feature_dict):
        """Autoregressively sample sequences for the encoded structure.

        Residues are decoded in a randomized order (fixed positions first);
        with symmetry constraints, tied positions are decoded as a single
        step sharing one sampled amino acid. Fixed positions
        (chain_mask == 0) always keep their input identity.

        Returns:
            dict with "S" [B_decoder,L] sampled sequences,
            "sampling_probs" [B_decoder,L,20], "log_probs" [B_decoder,L,21],
            and "decoding_order" [B_decoder,L].
        """
        # xyz_37 = feature_dict["xyz_37"] #[B,L,37,3] - xyz coordinates for all atoms if needed
        # xyz_37_m = feature_dict["xyz_37_m"] #[B,L,37] - mask for all coords
        # Y = feature_dict["Y"] #[B,L,num_context_atoms,3] - for ligandMPNN coords
        # Y_t = feature_dict["Y_t"] #[B,L,num_context_atoms] - element type
        # Y_m = feature_dict["Y_m"] #[B,L,num_context_atoms] - mask
        # X = feature_dict["X"] #[B,L,4,3] - backbone xyz coordinates for N,CA,C,O
        B_decoder = feature_dict["batch_size"]
        S_true = feature_dict["S"]  # [B,L] - integer protein sequence encoded using "restype_STRtoINT"
        # R_idx = feature_dict["R_idx"] #[B,L] - primary sequence residue index
        mask = feature_dict["mask"]  # [B,L] - mask for missing regions - should be removed! all ones most of the time
        chain_mask = feature_dict["chain_mask"]  # [B,L] - 0.0 fixed; 1.0 will be designed
        bias = feature_dict["bias"]  # [B,L,21] - amino acid bias per position
        # chain_labels = feature_dict["chain_labels"] #[B,L] - integer labels for chain letters
        randn = feature_dict["randn"]  # [B,L] - random numbers for decoding order; only the first entry is used since decoding within a batch needs to match for symmetry
        temperature = feature_dict["temperature"]  # float - sampling temperature; prob = softmax(logits/temperature)
        symmetry_list_of_lists = feature_dict["symmetry_residues"]  # e.g. [[0, 1, 14], [10,11,14,15], [20, 21]]
        symmetry_weights_list_of_lists = feature_dict["symmetry_weights"]  # e.g. [[1.0, 1.0, 1.0], [-2.0,1.1,0.2,1.1], [2.3, 1.1]]

        B, L = S_true.shape
        device = S_true.device

        h_V, h_E, E_idx = self.encode(feature_dict)

        chain_mask = mask * chain_mask  # update chain_M to include missing regions
        # fixed positions get small sort keys -> decoded first.
        # NOTE(review): the per-step gathers below assume B == 1 (upstream
        # usage); confirm before calling with a true batch dimension.
        decoding_order = torch.argsort(
            (chain_mask + 0.0001) * (torch.abs(randn))
        )
        if len(symmetry_list_of_lists[0]) == 0 and len(symmetry_list_of_lists) == 1:
            # --- no symmetry: standard per-position autoregressive decoding ---
            E_idx = E_idx.repeat(B_decoder, 1, 1)
            permutation_matrix_reverse = torch.nn.functional.one_hot(
                decoding_order, num_classes=L
            ).float()
            # order_mask_backward[b, q, p] == 1 iff p is decoded before q
            order_mask_backward = torch.einsum(
                "ij, biq, bjp->bqp",
                (1 - torch.triu(torch.ones(L, L, device=device))),
                permutation_matrix_reverse,
                permutation_matrix_reverse,
            )
            mask_attend = torch.gather(order_mask_backward, 2, E_idx).unsqueeze(-1)
            mask_1D = mask.view([B, L, 1, 1])
            mask_bw = mask_1D * mask_attend          # attend to already-decoded neighbors
            mask_fw = mask_1D * (1.0 - mask_attend)  # encoder-only context otherwise

            # repeat for decoding
            S_true = S_true.repeat(B_decoder, 1)
            h_V = h_V.repeat(B_decoder, 1, 1)
            h_E = h_E.repeat(B_decoder, 1, 1, 1)
            chain_mask = chain_mask.repeat(B_decoder, 1)
            mask = mask.repeat(B_decoder, 1)
            bias = bias.repeat(B_decoder, 1, 1)

            all_probs = torch.zeros(
                (B_decoder, L, 20), device=device, dtype=torch.float32
            )
            all_log_probs = torch.zeros(
                (B_decoder, L, 21), device=device, dtype=torch.float32
            )
            h_S = torch.zeros_like(h_V, device=device)
            # 20 is the "X"/unknown token used to initialize undecoded positions
            S = 20 * torch.ones((B_decoder, L), dtype=torch.int64, device=device)
            # per-decoder-layer cache of node states, updated one position at a time
            h_V_stack = [h_V] + [
                torch.zeros_like(h_V, device=device)
                for _ in range(len(self.decoder_layers))
            ]

            h_EX_encoder = cat_neighbors_nodes(torch.zeros_like(h_S), h_E, E_idx)
            h_EXV_encoder = cat_neighbors_nodes(h_V, h_EX_encoder, E_idx)
            h_EXV_encoder_fw = mask_fw * h_EXV_encoder

            for t_ in range(L):
                t = decoding_order[:, t_]  # [B]
                chain_mask_t = torch.gather(chain_mask, 1, t[:, None])[:, 0]  # [B]
                mask_t = torch.gather(mask, 1, t[:, None])[:, 0]  # [B]
                bias_t = torch.gather(bias, 1, t[:, None, None].repeat(1, 1, 21))[
                    :, 0, :
                ]  # [B,21]

                E_idx_t = torch.gather(
                    E_idx, 1, t[:, None, None].repeat(1, 1, E_idx.shape[-1])
                )
                h_E_t = torch.gather(
                    h_E,
                    1,
                    t[:, None, None, None].repeat(1, 1, h_E.shape[-2], h_E.shape[-1]),
                )
                h_ES_t = cat_neighbors_nodes(h_S, h_E_t, E_idx_t)
                h_EXV_encoder_t = torch.gather(
                    h_EXV_encoder_fw,
                    1,
                    t[:, None, None, None].repeat(
                        1, 1, h_EXV_encoder_fw.shape[-2], h_EXV_encoder_fw.shape[-1]
                    ),
                )

                mask_bw_t = torch.gather(
                    mask_bw,
                    1,
                    t[:, None, None, None].repeat(
                        1, 1, mask_bw.shape[-2], mask_bw.shape[-1]
                    ),
                )

                # run every decoder layer for this single position, caching states
                for l, layer in enumerate(self.decoder_layers):
                    h_ESV_decoder_t = cat_neighbors_nodes(h_V_stack[l], h_ES_t, E_idx_t)
                    h_V_t = torch.gather(
                        h_V_stack[l],
                        1,
                        t[:, None, None].repeat(1, 1, h_V_stack[l].shape[-1]),
                    )
                    h_ESV_t = mask_bw_t * h_ESV_decoder_t + h_EXV_encoder_t
                    h_V_stack[l + 1].scatter_(
                        1,
                        t[:, None, None].repeat(1, 1, h_V.shape[-1]),
                        layer(h_V_t, h_ESV_t, mask_V=mask_t),
                    )

                h_V_t = torch.gather(
                    h_V_stack[-1],
                    1,
                    t[:, None, None].repeat(1, 1, h_V_stack[-1].shape[-1]),
                )[:, 0]
                logits = self.W_out(h_V_t)  # [B,21]
                log_probs = torch.nn.functional.log_softmax(logits, dim=-1)  # [B,21]

                probs = torch.nn.functional.softmax(
                    (logits + bias_t) / temperature, dim=-1
                )  # [B,21]
                probs_sample = probs[:, :20] / torch.sum(
                    probs[:, :20], dim=-1, keepdim=True
                )  # hard omit X #[B,20]
                S_t = torch.multinomial(probs_sample, 1)[:, 0]  # [B]

                all_probs.scatter_(
                    1,
                    t[:, None, None].repeat(1, 1, 20),
                    (chain_mask_t[:, None, None] * probs_sample[:, None, :]).float(),
                )
                all_log_probs.scatter_(
                    1,
                    t[:, None, None].repeat(1, 1, 21),
                    (chain_mask_t[:, None, None] * log_probs[:, None, :]).float(),
                )
                # keep the native residue at fixed positions
                S_true_t = torch.gather(S_true, 1, t[:, None])[:, 0]
                S_t = (S_t * chain_mask_t + S_true_t * (1.0 - chain_mask_t)).long()
                h_S.scatter_(
                    1,
                    t[:, None, None].repeat(1, 1, h_S.shape[-1]),
                    self.W_s(S_t)[:, None, :],
                )
                S.scatter_(1, t[:, None], S_t[:, None])

            output_dict = {
                "S": S,
                "sampling_probs": all_probs,
                "log_probs": all_log_probs,
                "decoding_order": decoding_order,
            }
        else:
            # --- symmetric design: tied positions share one sampled residue ---
            # weights for symmetric design
            symmetry_weights = torch.ones([L], device=device, dtype=torch.float32)
            for i1, item_list in enumerate(symmetry_list_of_lists):
                for i2, item in enumerate(item_list):
                    symmetry_weights[item] = symmetry_weights_list_of_lists[i1][i2]

            # group the decoding order so all members of a symmetry class decode together
            new_decoding_order = []
            for t_dec in list(decoding_order[0,].cpu().data.numpy()):
                if t_dec not in list(itertools.chain(*new_decoding_order)):
                    list_a = [item for item in symmetry_list_of_lists if t_dec in item]
                    if list_a:
                        new_decoding_order.append(list_a[0])
                    else:
                        new_decoding_order.append([t_dec])

            decoding_order = torch.tensor(
                list(itertools.chain(*new_decoding_order)), device=device
            )[None,].repeat(B, 1)

            permutation_matrix_reverse = torch.nn.functional.one_hot(
                decoding_order, num_classes=L
            ).float()
            order_mask_backward = torch.einsum(
                "ij, biq, bjp->bqp",
                (1 - torch.triu(torch.ones(L, L, device=device))),
                permutation_matrix_reverse,
                permutation_matrix_reverse,
            )
            mask_attend = torch.gather(order_mask_backward, 2, E_idx).unsqueeze(-1)
            mask_1D = mask.view([B, L, 1, 1])
            mask_bw = mask_1D * mask_attend
            mask_fw = mask_1D * (1.0 - mask_attend)

            # repeat for decoding
            S_true = S_true.repeat(B_decoder, 1)
            h_V = h_V.repeat(B_decoder, 1, 1)
            h_E = h_E.repeat(B_decoder, 1, 1, 1)
            E_idx = E_idx.repeat(B_decoder, 1, 1)
            mask_fw = mask_fw.repeat(B_decoder, 1, 1, 1)
            mask_bw = mask_bw.repeat(B_decoder, 1, 1, 1)
            chain_mask = chain_mask.repeat(B_decoder, 1)
            mask = mask.repeat(B_decoder, 1)
            bias = bias.repeat(B_decoder, 1, 1)

            all_probs = torch.zeros(
                (B_decoder, L, 20), device=device, dtype=torch.float32
            )
            all_log_probs = torch.zeros(
                (B_decoder, L, 21), device=device, dtype=torch.float32
            )
            h_S = torch.zeros_like(h_V, device=device)
            S = 20 * torch.ones((B_decoder, L), dtype=torch.int64, device=device)
            h_V_stack = [h_V] + [
                torch.zeros_like(h_V, device=device)
                for _ in range(len(self.decoder_layers))
            ]

            h_EX_encoder = cat_neighbors_nodes(torch.zeros_like(h_S), h_E, E_idx)
            h_EXV_encoder = cat_neighbors_nodes(h_V, h_EX_encoder, E_idx)
            h_EXV_encoder_fw = mask_fw * h_EXV_encoder

            for t_list in new_decoding_order:
                total_logits = 0.0
                for t in t_list:
                    chain_mask_t = chain_mask[:, t]  # [B]
                    mask_t = mask[:, t]  # [B]
                    bias_t = bias[:, t]  # [B, 21]

                    E_idx_t = E_idx[:, t : t + 1]
                    h_E_t = h_E[:, t : t + 1]
                    h_ES_t = cat_neighbors_nodes(h_S, h_E_t, E_idx_t)
                    h_EXV_encoder_t = h_EXV_encoder_fw[:, t : t + 1]
                    for l, layer in enumerate(self.decoder_layers):
                        h_ESV_decoder_t = cat_neighbors_nodes(
                            h_V_stack[l], h_ES_t, E_idx_t
                        )
                        h_V_t = h_V_stack[l][:, t : t + 1]
                        h_ESV_t = (
                            mask_bw[:, t : t + 1] * h_ESV_decoder_t + h_EXV_encoder_t
                        )
                        h_V_stack[l + 1][:, t : t + 1, :] = layer(
                            h_V_t, h_ESV_t, mask_V=mask_t[:, None]
                        )

                    h_V_t = h_V_stack[-1][:, t]
                    logits = self.W_out(h_V_t)  # [B,21]
                    log_probs = torch.nn.functional.log_softmax(
                        logits, dim=-1
                    )  # [B,21]
                    all_log_probs[:, t] = (
                        chain_mask_t[:, None] * log_probs
                    ).float()  # [B,21]
                    # accumulate weighted logits across the tied set
                    total_logits += symmetry_weights[t] * logits

                # NOTE(review): bias_t here is the bias of the LAST member of the
                # tied set (loop variable leak) -- confirm against upstream intent
                probs = torch.nn.functional.softmax(
                    (total_logits + bias_t) / temperature, dim=-1
                )  # [B,21]
                probs_sample = probs[:, :20] / torch.sum(
                    probs[:, :20], dim=-1, keepdim=True
                )  # hard omit X #[B,20]
                S_t = torch.multinomial(probs_sample, 1)[:, 0]  # [B]
                for t in t_list:
                    chain_mask_t = chain_mask[:, t]  # [B]
                    all_probs[:, t] = (
                        chain_mask_t[:, None] * probs_sample
                    ).float()  # [B,20]
                    S_true_t = S_true[:, t]  # [B]
                    # NOTE(review): S_t is re-assigned every iteration, so later
                    # tied positions see earlier positions' fixed identities mixed
                    # in -- verify this matches the intended tying semantics
                    S_t = (S_t * chain_mask_t + S_true_t * (1.0 - chain_mask_t)).long()
                    h_S[:, t] = self.W_s(S_t)
                    S[:, t] = S_t

            output_dict = {
                "S": S,
                "sampling_probs": all_probs,
                "log_probs": all_log_probs,
                "decoding_order": decoding_order.repeat(B_decoder, 1),
            }
        return output_dict
470
+
471
    def single_aa_score(self, feature_dict, use_sequence: bool):
        """Score every position independently with one decoder pass per residue.

        For each position ``idx`` the decoding order is rigged so that either
        only ``idx`` is hidden (use_sequence=True: condition on the rest of the
        sequence) or only ``idx`` is revealed last with everything else hidden
        (use_sequence=False: backbone-only scoring).

        Args:
            feature_dict: input features ("S", "mask", "chain_mask", "randn",
                "batch_size", plus whatever encode() needs).
            use_sequence: False -> use backbone info only.

        Returns:
            dict with "S" [B_decoder,L], per-position "log_probs" and "logits"
            [B_decoder,L,21], and "decoding_order" [B_decoder,L,L] (one order
            per scored position).
        """
        B_decoder = feature_dict["batch_size"]
        S_true_enc = feature_dict["S"]  # [B,L] sequence to score
        mask_enc = feature_dict["mask"]  # [B,L] validity mask
        chain_mask_enc = feature_dict["chain_mask"]  # [B,L] (only its length is used here)
        randn = feature_dict["randn"]  # random keys for the decoding order
        B, L = S_true_enc.shape
        device = S_true_enc.device

        # encode once; per-position loop reuses clones of the encoder output
        h_V_enc, h_E_enc, E_idx_enc = self.encode(feature_dict)
        log_probs_out = torch.zeros([B_decoder, L, 21], device=device).float()
        logits_out = torch.zeros([B_decoder, L, 21], device=device).float()
        decoding_order_out = torch.zeros([B_decoder, L, L], device=device).float()

        for idx in range(L):
            h_V = torch.clone(h_V_enc)
            E_idx = torch.clone(E_idx_enc)
            mask = torch.clone(mask_enc)
            S_true = torch.clone(S_true_enc)
            if not use_sequence:
                # only idx gets a large key -> everything else decodes first as hidden
                order_mask = torch.zeros(chain_mask_enc.shape[1], device=device).float()
                order_mask[idx] = 1.
            else:
                # idx gets a small key -> decoded first, i.e. conditioned on all others
                order_mask = torch.ones(chain_mask_enc.shape[1], device=device).float()
                order_mask[idx] = 0.
            decoding_order = torch.argsort(
                (order_mask + 0.0001) * (torch.abs(randn))
            )  # [numbers will be smaller for places where chain_M = 0.0 and higher for places where chain_M = 1.0]
            E_idx = E_idx.repeat(B_decoder, 1, 1)
            permutation_matrix_reverse = torch.nn.functional.one_hot(
                decoding_order, num_classes=L
            ).float()
            order_mask_backward = torch.einsum(
                "ij, biq, bjp->bqp",
                (1 - torch.triu(torch.ones(L, L, device=device))),
                permutation_matrix_reverse,
                permutation_matrix_reverse,
            )
            mask_attend = torch.gather(order_mask_backward, 2, E_idx).unsqueeze(-1)
            mask_1D = mask.view([B, L, 1, 1])
            mask_bw = mask_1D * mask_attend
            mask_fw = mask_1D * (1.0 - mask_attend)
            S_true = S_true.repeat(B_decoder, 1)
            h_V = h_V.repeat(B_decoder, 1, 1)
            h_E = h_E_enc.repeat(B_decoder, 1, 1, 1)
            mask = mask.repeat(B_decoder, 1)

            h_S = self.W_s(S_true)
            h_ES = cat_neighbors_nodes(h_S, h_E, E_idx)

            # Build encoder embeddings
            h_EX_encoder = cat_neighbors_nodes(torch.zeros_like(h_S), h_E, E_idx)
            h_EXV_encoder = cat_neighbors_nodes(h_V, h_EX_encoder, E_idx)

            h_EXV_encoder_fw = mask_fw * h_EXV_encoder
            for layer in self.decoder_layers:
                # Masked positions attend to encoder information, unmasked see.
                h_ESV = cat_neighbors_nodes(h_V, h_ES, E_idx)
                h_ESV = mask_bw * h_ESV + h_EXV_encoder_fw
                h_V = layer(h_V, h_ESV, mask)

            logits = self.W_out(h_V)
            log_probs = torch.nn.functional.log_softmax(logits, dim=-1)

            # keep only the scored position's row from this pass
            log_probs_out[:,idx,:] = log_probs[:,idx,:]
            logits_out[:,idx,:] = logits[:,idx,:]
            decoding_order_out[:,idx,:] = decoding_order

        output_dict = {
            "S": S_true,
            "log_probs": log_probs_out,
            "logits": logits_out,
            "decoding_order": decoding_order_out,
        }
        return output_dict
558
+
559
+
560
    def score(self, feature_dict, use_sequence: bool):
        """Score the input sequence in a single (teacher-forced) decoder pass.

        Builds a causal attention mask from a random decoding order
        (optionally grouped by symmetry classes) and runs the decoder once
        over the whole sequence.

        Args:
            feature_dict: input features ("S", "mask", "chain_mask", "randn",
                "batch_size", "symmetry_residues", plus encode() inputs).
            use_sequence: if False, the decoder sees encoder context only
                (backbone-only scoring).

        Returns:
            dict with "S" [B_decoder,L], "log_probs"/"logits"
            [B_decoder,L,21], and the "decoding_order" used.
        """
        B_decoder = feature_dict["batch_size"]
        S_true = feature_dict["S"]  # [B,L] integer sequence to score
        mask = feature_dict["mask"]  # [B,L] validity mask
        chain_mask = feature_dict["chain_mask"]  # [B,L] designed-position mask
        randn = feature_dict["randn"]  # random keys for the decoding order
        symmetry_list_of_lists = feature_dict["symmetry_residues"]
        B, L = S_true.shape
        device = S_true.device

        h_V, h_E, E_idx = self.encode(feature_dict)

        chain_mask = mask * chain_mask  # update chain_M to include missing regions
        decoding_order = torch.argsort(
            (chain_mask + 0.0001) * (torch.abs(randn))
        )  # [numbers will be smaller for places where chain_M = 0.0 and higher for places where chain_M = 1.0]
        if len(symmetry_list_of_lists[0]) == 0 and len(symmetry_list_of_lists) == 1:
            # no symmetry: causal mask straight from the random decoding order
            E_idx = E_idx.repeat(B_decoder, 1, 1)
            permutation_matrix_reverse = torch.nn.functional.one_hot(
                decoding_order, num_classes=L
            ).float()
            order_mask_backward = torch.einsum(
                "ij, biq, bjp->bqp",
                (1 - torch.triu(torch.ones(L, L, device=device))),
                permutation_matrix_reverse,
                permutation_matrix_reverse,
            )
            mask_attend = torch.gather(order_mask_backward, 2, E_idx).unsqueeze(-1)
            mask_1D = mask.view([B, L, 1, 1])
            mask_bw = mask_1D * mask_attend
            mask_fw = mask_1D * (1.0 - mask_attend)
        else:
            # symmetry: regroup the order so tied residues decode together
            new_decoding_order = []
            for t_dec in list(decoding_order[0,].cpu().data.numpy()):
                if t_dec not in list(itertools.chain(*new_decoding_order)):
                    list_a = [item for item in symmetry_list_of_lists if t_dec in item]
                    if list_a:
                        new_decoding_order.append(list_a[0])
                    else:
                        new_decoding_order.append([t_dec])

            decoding_order = torch.tensor(
                list(itertools.chain(*new_decoding_order)), device=device
            )[None,].repeat(B, 1)

            permutation_matrix_reverse = torch.nn.functional.one_hot(
                decoding_order, num_classes=L
            ).float()
            order_mask_backward = torch.einsum(
                "ij, biq, bjp->bqp",
                (1 - torch.triu(torch.ones(L, L, device=device))),
                permutation_matrix_reverse,
                permutation_matrix_reverse,
            )
            mask_attend = torch.gather(order_mask_backward, 2, E_idx).unsqueeze(-1)
            mask_1D = mask.view([B, L, 1, 1])
            mask_bw = mask_1D * mask_attend
            mask_fw = mask_1D * (1.0 - mask_attend)

            # this branch computed masks at batch B; expand them for decoding
            E_idx = E_idx.repeat(B_decoder, 1, 1)
            mask_fw = mask_fw.repeat(B_decoder, 1, 1, 1)
            mask_bw = mask_bw.repeat(B_decoder, 1, 1, 1)
            decoding_order = decoding_order.repeat(B_decoder, 1)

        S_true = S_true.repeat(B_decoder, 1)
        h_V = h_V.repeat(B_decoder, 1, 1)
        h_E = h_E.repeat(B_decoder, 1, 1, 1)
        mask = mask.repeat(B_decoder, 1)

        h_S = self.W_s(S_true)
        h_ES = cat_neighbors_nodes(h_S, h_E, E_idx)

        # Build encoder embeddings
        h_EX_encoder = cat_neighbors_nodes(torch.zeros_like(h_S), h_E, E_idx)
        h_EXV_encoder = cat_neighbors_nodes(h_V, h_EX_encoder, E_idx)

        h_EXV_encoder_fw = mask_fw * h_EXV_encoder
        if not use_sequence:
            # backbone-only: decoder never sees sequence embeddings
            for layer in self.decoder_layers:
                h_V = layer(h_V, h_EXV_encoder_fw, mask)
        else:
            for layer in self.decoder_layers:
                # Masked positions attend to encoder information, unmasked see.
                h_ESV = cat_neighbors_nodes(h_V, h_ES, E_idx)
                h_ESV = mask_bw * h_ESV + h_EXV_encoder_fw
                h_V = layer(h_V, h_ESV, mask)

        logits = self.W_out(h_V)
        log_probs = torch.nn.functional.log_softmax(logits, dim=-1)

        output_dict = {
            "S": S_true,
            "log_probs": log_probs,
            "logits": logits,
            "decoding_order": decoding_order,
        }
        return output_dict
667
+
668
+
669
+ class ProteinFeaturesLigand(torch.nn.Module):
670
    def __init__(
        self,
        edge_features,
        node_features,
        num_positional_embeddings=16,
        num_rbf=16,
        top_k=30,
        augment_eps=0.0,
        device=None,
        atom_context_num=16,
        use_side_chains=False,
    ):
        """Extract protein features (backbone graph + ligand atom context).

        Args:
            edge_features: edge embedding dimension.
            node_features: node embedding dimension.
                NOTE(review): the caller in ProteinMPNN.__init__ passes
                (node_features, edge_features) positionally, i.e. swapped
                relative to this signature; harmless only while the two
                dimensions are equal -- confirm before changing either.
            num_positional_embeddings: size of the residue-offset encoding.
            num_rbf: number of radial-basis bins per distance map.
            top_k: number of k-NN neighbors per residue.
            augment_eps: std of Gaussian noise added to coordinates (training).
            device: device for the constant lookup tables.
            atom_context_num: number of ligand context atoms kept per residue.
            use_side_chains: include fixed residues' side-chain atoms as
                additional ligand-like context.
        """
        super(ProteinFeaturesLigand, self).__init__()

        self.use_side_chains = use_side_chains

        self.edge_features = edge_features
        self.node_features = node_features
        self.top_k = top_k
        self.augment_eps = augment_eps
        self.num_rbf = num_rbf
        self.num_positional_embeddings = num_positional_embeddings

        self.embeddings = PositionalEncodings(num_positional_embeddings)
        # 25 backbone atom-pair distance maps (N, Ca, C, O, virtual Cb), each RBF-expanded
        edge_in = num_positional_embeddings + num_rbf * 25
        self.edge_embedding = torch.nn.Linear(edge_in, edge_features, bias=False)
        self.norm_edges = torch.nn.LayerNorm(edge_features)

        # ligand-node input: 5 backbone->atom RBFs + 64-dim element type + 4 angle feats
        self.node_project_down = torch.nn.Linear(
            5 * num_rbf + 64 + 4, node_features, bias=True
        )
        self.norm_nodes = torch.nn.LayerNorm(node_features)

        # 147 = 119 (element one-hot) + 19 (group) + 9 (period), see forward()
        self.type_linear = torch.nn.Linear(147, 64)

        self.y_nodes = torch.nn.Linear(147, node_features, bias=False)
        self.y_edges = torch.nn.Linear(num_rbf, node_features, bias=False)

        self.norm_y_edges = torch.nn.LayerNorm(node_features)
        self.norm_y_nodes = torch.nn.LayerNorm(node_features)

        self.atom_context_num = atom_context_num

        # the last 32 atoms in the 37 atom representation
        # (element numbers: C=6, N=7, O=8, S=16)
        self.side_chain_atom_types = torch.tensor(
            [
                6, 6, 6, 8, 8, 16, 6, 6, 6, 7, 7, 8, 8, 16, 6, 6,
                6, 6, 7, 7, 7, 8, 8, 6, 7, 7, 8, 6, 6, 6, 7, 8,
            ],
            device=device,
        )

        # row 0: atomic number (0..118; 0 = padding/unknown)
        # row 1: periodic-table group (1..18; lanthanides/actinides mapped to 3)
        # row 2: periodic-table period (1..7; 0 = padding)
        self.periodic_table_features = torch.tensor(
            [
                [
                    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                    20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
                    40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
                    60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
                    80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
                    100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118,
                ],
                [
                    0, 1, 18,
                    1, 2, 13, 14, 15, 16, 17, 18,
                    1, 2, 13, 14, 15, 16, 17, 18,
                    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
                    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
                    1, 2,
                    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
                    4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
                    1, 2,
                    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
                    4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
                ],
                [
                    0,
                    1, 1,
                    2, 2, 2, 2, 2, 2, 2, 2,
                    3, 3, 3, 3, 3, 3, 3, 3,
                    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
                    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
                    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
                    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
                    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
                    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
                ],
            ],
            dtype=torch.long,
            device=device,
        )
1122
+
1123
+ def _make_angle_features(self, A, B, C, Y):
1124
+ v1 = A - B
1125
+ v2 = C - B
1126
+ e1 = torch.nn.functional.normalize(v1, dim=-1)
1127
+ e1_v2_dot = torch.einsum("bli, bli -> bl", e1, v2)[..., None]
1128
+ u2 = v2 - e1 * e1_v2_dot
1129
+ e2 = torch.nn.functional.normalize(u2, dim=-1)
1130
+ e3 = torch.cross(e1, e2, dim=-1)
1131
+ R_residue = torch.cat(
1132
+ (e1[:, :, :, None], e2[:, :, :, None], e3[:, :, :, None]), dim=-1
1133
+ )
1134
+
1135
+ local_vectors = torch.einsum(
1136
+ "blqp, blyq -> blyp", R_residue, Y - B[:, :, None, :]
1137
+ )
1138
+
1139
+ rxy = torch.sqrt(local_vectors[..., 0] ** 2 + local_vectors[..., 1] ** 2 + 1e-8)
1140
+ f1 = local_vectors[..., 0] / rxy
1141
+ f2 = local_vectors[..., 1] / rxy
1142
+ rxyz = torch.norm(local_vectors, dim=-1) + 1e-8
1143
+ f3 = rxy / rxyz
1144
+ f4 = local_vectors[..., 2] / rxyz
1145
+
1146
+ f = torch.cat([f1[..., None], f2[..., None], f3[..., None], f4[..., None]], -1)
1147
+ return f
1148
+
1149
+ def _dist(self, X, mask, eps=1e-6):
1150
+ mask_2D = torch.unsqueeze(mask, 1) * torch.unsqueeze(mask, 2)
1151
+ dX = torch.unsqueeze(X, 1) - torch.unsqueeze(X, 2)
1152
+ D = mask_2D * torch.sqrt(torch.sum(dX**2, 3) + eps)
1153
+ D_max, _ = torch.max(D, -1, keepdim=True)
1154
+ D_adjust = D + (1.0 - mask_2D) * D_max
1155
+ D_neighbors, E_idx = torch.topk(
1156
+ D_adjust, np.minimum(self.top_k, X.shape[1]), dim=-1, largest=False
1157
+ )
1158
+ return D_neighbors, E_idx
1159
+
1160
+ def _rbf(self, D):
1161
+ device = D.device
1162
+ D_min, D_max, D_count = 2.0, 22.0, self.num_rbf
1163
+ D_mu = torch.linspace(D_min, D_max, D_count, device=device)
1164
+ D_mu = D_mu.view([1, 1, 1, -1])
1165
+ D_sigma = (D_max - D_min) / D_count
1166
+ D_expand = torch.unsqueeze(D, -1)
1167
+ RBF = torch.exp(-(((D_expand - D_mu) / D_sigma) ** 2))
1168
+ return RBF
1169
+
1170
+ def _get_rbf(self, A, B, E_idx):
1171
+ D_A_B = torch.sqrt(
1172
+ torch.sum((A[:, :, None, :] - B[:, None, :, :]) ** 2, -1) + 1e-6
1173
+ ) # [B, L, L]
1174
+ D_A_B_neighbors = gather_edges(D_A_B[:, :, :, None], E_idx)[
1175
+ :, :, :, 0
1176
+ ] # [B,L,K]
1177
+ RBF_A_B = self._rbf(D_A_B_neighbors)
1178
+ return RBF_A_B
1179
+
1180
+ def forward(self, input_features):
1181
+ Y = input_features["Y"]
1182
+ Y_m = input_features["Y_m"]
1183
+ Y_t = input_features["Y_t"]
1184
+ X = input_features["X"]
1185
+ mask = input_features["mask"]
1186
+ R_idx = input_features["R_idx"]
1187
+ chain_labels = input_features["chain_labels"]
1188
+
1189
+ if self.augment_eps > 0:
1190
+ X = X + self.augment_eps * torch.randn_like(X)
1191
+ Y = Y + self.augment_eps * torch.randn_like(Y)
1192
+
1193
+ B, L, _, _ = X.shape
1194
+
1195
+ Ca = X[:, :, 1, :]
1196
+ N = X[:, :, 0, :]
1197
+ C = X[:, :, 2, :]
1198
+ O = X[:, :, 3, :]
1199
+
1200
+ b = Ca - N
1201
+ c = C - Ca
1202
+ a = torch.cross(b, c, dim=-1)
1203
+ Cb = -0.58273431 * a + 0.56802827 * b - 0.54067466 * c + Ca # shift from CA
1204
+
1205
+ D_neighbors, E_idx = self._dist(Ca, mask)
1206
+
1207
+ RBF_all = []
1208
+ RBF_all.append(self._rbf(D_neighbors)) # Ca-Ca
1209
+ RBF_all.append(self._get_rbf(N, N, E_idx)) # N-N
1210
+ RBF_all.append(self._get_rbf(C, C, E_idx)) # C-C
1211
+ RBF_all.append(self._get_rbf(O, O, E_idx)) # O-O
1212
+ RBF_all.append(self._get_rbf(Cb, Cb, E_idx)) # Cb-Cb
1213
+ RBF_all.append(self._get_rbf(Ca, N, E_idx)) # Ca-N
1214
+ RBF_all.append(self._get_rbf(Ca, C, E_idx)) # Ca-C
1215
+ RBF_all.append(self._get_rbf(Ca, O, E_idx)) # Ca-O
1216
+ RBF_all.append(self._get_rbf(Ca, Cb, E_idx)) # Ca-Cb
1217
+ RBF_all.append(self._get_rbf(N, C, E_idx)) # N-C
1218
+ RBF_all.append(self._get_rbf(N, O, E_idx)) # N-O
1219
+ RBF_all.append(self._get_rbf(N, Cb, E_idx)) # N-Cb
1220
+ RBF_all.append(self._get_rbf(Cb, C, E_idx)) # Cb-C
1221
+ RBF_all.append(self._get_rbf(Cb, O, E_idx)) # Cb-O
1222
+ RBF_all.append(self._get_rbf(O, C, E_idx)) # O-C
1223
+ RBF_all.append(self._get_rbf(N, Ca, E_idx)) # N-Ca
1224
+ RBF_all.append(self._get_rbf(C, Ca, E_idx)) # C-Ca
1225
+ RBF_all.append(self._get_rbf(O, Ca, E_idx)) # O-Ca
1226
+ RBF_all.append(self._get_rbf(Cb, Ca, E_idx)) # Cb-Ca
1227
+ RBF_all.append(self._get_rbf(C, N, E_idx)) # C-N
1228
+ RBF_all.append(self._get_rbf(O, N, E_idx)) # O-N
1229
+ RBF_all.append(self._get_rbf(Cb, N, E_idx)) # Cb-N
1230
+ RBF_all.append(self._get_rbf(C, Cb, E_idx)) # C-Cb
1231
+ RBF_all.append(self._get_rbf(O, Cb, E_idx)) # O-Cb
1232
+ RBF_all.append(self._get_rbf(C, O, E_idx)) # C-O
1233
+ RBF_all = torch.cat(tuple(RBF_all), dim=-1)
1234
+
1235
+ offset = R_idx[:, :, None] - R_idx[:, None, :]
1236
+ offset = gather_edges(offset[:, :, :, None], E_idx)[:, :, :, 0] # [B, L, K]
1237
+
1238
+ d_chains = (
1239
+ (chain_labels[:, :, None] - chain_labels[:, None, :]) == 0
1240
+ ).long() # find self vs non-self interaction
1241
+ E_chains = gather_edges(d_chains[:, :, :, None], E_idx)[:, :, :, 0]
1242
+ E_positional = self.embeddings(offset.long(), E_chains)
1243
+ E = torch.cat((E_positional, RBF_all), -1)
1244
+ E = self.edge_embedding(E)
1245
+ E = self.norm_edges(E)
1246
+
1247
+ if self.use_side_chains:
1248
+ xyz_37 = input_features["xyz_37"]
1249
+ xyz_37_m = input_features["xyz_37_m"]
1250
+ E_idx_sub = E_idx[:, :, :16] # [B, L, 15]
1251
+ mask_residues = input_features["chain_mask"]
1252
+ xyz_37_m = xyz_37_m * (1 - mask_residues[:, :, None])
1253
+ R_m = gather_nodes(xyz_37_m[:, :, 5:], E_idx_sub)
1254
+
1255
+ X_sidechain = xyz_37[:, :, 5:, :].view(B, L, -1)
1256
+ R = gather_nodes(X_sidechain, E_idx_sub).view(
1257
+ B, L, E_idx_sub.shape[2], -1, 3
1258
+ )
1259
+ R_t = self.side_chain_atom_types[None, None, None, :].repeat(
1260
+ B, L, E_idx_sub.shape[2], 1
1261
+ )
1262
+
1263
+ # Side chain atom context
1264
+ R = R.view(B, L, -1, 3) # coordinates
1265
+ R_m = R_m.view(B, L, -1) # mask
1266
+ R_t = R_t.view(B, L, -1) # atom types
1267
+
1268
+ # Ligand atom context
1269
+ Y = torch.cat((R, Y), 2) # [B, L, atoms, 3]
1270
+ Y_m = torch.cat((R_m, Y_m), 2) # [B, L, atoms]
1271
+ Y_t = torch.cat((R_t, Y_t), 2) # [B, L, atoms]
1272
+
1273
+ Cb_Y_distances = torch.sum((Cb[:, :, None, :] - Y) ** 2, -1)
1274
+ mask_Y = mask[:, :, None] * Y_m
1275
+ Cb_Y_distances_adjusted = Cb_Y_distances * mask_Y + (1.0 - mask_Y) * 10000.0
1276
+ _, E_idx_Y = torch.topk(
1277
+ Cb_Y_distances_adjusted, self.atom_context_num, dim=-1, largest=False
1278
+ )
1279
+
1280
+ Y = torch.gather(Y, 2, E_idx_Y[:, :, :, None].repeat(1, 1, 1, 3))
1281
+ Y_t = torch.gather(Y_t, 2, E_idx_Y)
1282
+ Y_m = torch.gather(Y_m, 2, E_idx_Y)
1283
+
1284
+ Y_t = Y_t.long()
1285
+ Y_t_g = self.periodic_table_features[1][Y_t] # group; 19 categories including 0
1286
+ Y_t_p = self.periodic_table_features[2][Y_t] # period; 8 categories including 0
1287
+
1288
+ Y_t_g_1hot_ = torch.nn.functional.one_hot(Y_t_g, 19) # [B, L, M, 19]
1289
+ Y_t_p_1hot_ = torch.nn.functional.one_hot(Y_t_p, 8) # [B, L, M, 8]
1290
+ Y_t_1hot_ = torch.nn.functional.one_hot(Y_t, 120) # [B, L, M, 120]
1291
+
1292
+ Y_t_1hot_ = torch.cat(
1293
+ [Y_t_1hot_, Y_t_g_1hot_, Y_t_p_1hot_], -1
1294
+ ) # [B, L, M, 147]
1295
+ Y_t_1hot = self.type_linear(Y_t_1hot_.float())
1296
+
1297
+ D_N_Y = self._rbf(
1298
+ torch.sqrt(torch.sum((N[:, :, None, :] - Y) ** 2, -1) + 1e-6)
1299
+ ) # [B, L, M, num_bins]
1300
+ D_Ca_Y = self._rbf(
1301
+ torch.sqrt(torch.sum((Ca[:, :, None, :] - Y) ** 2, -1) + 1e-6)
1302
+ )
1303
+ D_C_Y = self._rbf(torch.sqrt(torch.sum((C[:, :, None, :] - Y) ** 2, -1) + 1e-6))
1304
+ D_O_Y = self._rbf(torch.sqrt(torch.sum((O[:, :, None, :] - Y) ** 2, -1) + 1e-6))
1305
+ D_Cb_Y = self._rbf(
1306
+ torch.sqrt(torch.sum((Cb[:, :, None, :] - Y) ** 2, -1) + 1e-6)
1307
+ )
1308
+
1309
+ f_angles = self._make_angle_features(N, Ca, C, Y) # [B, L, M, 4]
1310
+
1311
+ D_all = torch.cat(
1312
+ (D_N_Y, D_Ca_Y, D_C_Y, D_O_Y, D_Cb_Y, Y_t_1hot, f_angles), dim=-1
1313
+ ) # [B,L,M,5*num_bins+5]
1314
+ V = self.node_project_down(D_all) # [B, L, M, node_features]
1315
+ V = self.norm_nodes(V)
1316
+
1317
+ Y_edges = self._rbf(
1318
+ torch.sqrt(
1319
+ torch.sum((Y[:, :, :, None, :] - Y[:, :, None, :, :]) ** 2, -1) + 1e-6
1320
+ )
1321
+ ) # [B, L, M, M, num_bins]
1322
+
1323
+ Y_edges = self.y_edges(Y_edges)
1324
+ Y_nodes = self.y_nodes(Y_t_1hot_.float())
1325
+
1326
+ Y_edges = self.norm_y_edges(Y_edges)
1327
+ Y_nodes = self.norm_y_nodes(Y_nodes)
1328
+
1329
+ return V, E, E_idx, Y_nodes, Y_edges, Y_m
1330
+
1331
+
1332
class ProteinFeatures(torch.nn.Module):
    """Featurize a protein backbone into k-NN graph edge features.

    Edges carry relative-position encodings plus 25 RBF-encoded inter-atom
    distance tracks (all ordered pairs over N, Ca, C, O and a virtual Cb).
    """

    def __init__(
        self,
        edge_features,
        node_features,
        num_positional_embeddings=16,
        num_rbf=16,
        top_k=48,
        augment_eps=0.0,
    ):
        """Extract protein features"""
        super(ProteinFeatures, self).__init__()
        self.edge_features = edge_features
        self.node_features = node_features
        self.top_k = top_k
        self.augment_eps = augment_eps
        self.num_rbf = num_rbf
        self.num_positional_embeddings = num_positional_embeddings

        # 25 distance tracks feed the edge MLP: Ca-Ca plus 24 ordered pairs.
        self.embeddings = PositionalEncodings(num_positional_embeddings)
        edge_in = num_positional_embeddings + num_rbf * 25
        self.edge_embedding = torch.nn.Linear(edge_in, edge_features, bias=False)
        self.norm_edges = torch.nn.LayerNorm(edge_features)

    def _dist(self, X, mask, eps=1e-6):
        """Top-k nearest neighbors by Ca distance; masked residues excluded."""
        pair_mask = mask.unsqueeze(1) * mask.unsqueeze(2)
        diff = X.unsqueeze(1) - X.unsqueeze(2)
        D = pair_mask * torch.sqrt((diff**2).sum(3) + eps)
        # Push masked pairs beyond the max observed distance so they never
        # win the top-k selection.
        D_max, _ = D.max(-1, keepdim=True)
        D_adjust = D + (1.0 - pair_mask) * D_max
        k = np.minimum(self.top_k, X.shape[1])
        return torch.topk(D_adjust, k, dim=-1, largest=False)

    def _rbf(self, D):
        """Lift distances onto num_rbf Gaussian bases spanning [2, 22] A."""
        lo, hi, count = 2.0, 22.0, self.num_rbf
        centers = torch.linspace(lo, hi, count, device=D.device).view(1, 1, 1, -1)
        width = (hi - lo) / count
        z = (D.unsqueeze(-1) - centers) / width
        return torch.exp(-(z**2))

    def _get_rbf(self, A, B, E_idx):
        """RBF-encode A_i -> B_j distances, gathered to the K neighbors."""
        D_A_B = torch.sqrt(
            ((A[:, :, None, :] - B[:, None, :, :]) ** 2).sum(-1) + 1e-6
        )  # [B, L, L]
        D_A_B_neighbors = gather_edges(D_A_B[..., None], E_idx)[..., 0]  # [B,L,K]
        return self._rbf(D_A_B_neighbors)

    def forward(self, input_features):
        """Return (E, E_idx): edge features and neighbor indices."""
        X = input_features["X"]
        mask = input_features["mask"]
        R_idx = input_features["R_idx"]
        chain_labels = input_features["chain_labels"]

        if self.augment_eps > 0:
            X = X + self.augment_eps * torch.randn_like(X)

        N = X[:, :, 0, :]
        Ca = X[:, :, 1, :]
        C = X[:, :, 2, :]
        O = X[:, :, 3, :]
        # Virtual Cb from ideal backbone geometry (fixed empirical constants).
        b = Ca - N
        c = C - Ca
        a = torch.cross(b, c, dim=-1)
        Cb = -0.58273431 * a + 0.56802827 * b - 0.54067466 * c + Ca

        D_neighbors, E_idx = self._dist(Ca, mask)

        # Ca-Ca uses the masked neighbor distances directly; the remaining 24
        # ordered atom pairs follow in the exact layout edge_embedding expects.
        atoms = {"N": N, "Ca": Ca, "C": C, "O": O, "Cb": Cb}
        pair_order = [
            ("N", "N"), ("C", "C"), ("O", "O"), ("Cb", "Cb"),
            ("Ca", "N"), ("Ca", "C"), ("Ca", "O"), ("Ca", "Cb"),
            ("N", "C"), ("N", "O"), ("N", "Cb"), ("Cb", "C"),
            ("Cb", "O"), ("O", "C"), ("N", "Ca"), ("C", "Ca"),
            ("O", "Ca"), ("Cb", "Ca"), ("C", "N"), ("O", "N"),
            ("Cb", "N"), ("C", "Cb"), ("O", "Cb"), ("C", "O"),
        ]
        rbf_feats = [self._rbf(D_neighbors)]  # Ca-Ca
        rbf_feats.extend(
            self._get_rbf(atoms[a1], atoms[a2], E_idx) for a1, a2 in pair_order
        )
        RBF_all = torch.cat(rbf_feats, dim=-1)

        # Relative residue index, gathered per neighbor.
        offset = R_idx[:, :, None] - R_idx[:, None, :]
        offset = gather_edges(offset[..., None], E_idx)[..., 0]  # [B, L, K]

        # 1 where both residues share a chain, 0 across chains.
        same_chain = (
            (chain_labels[:, :, None] - chain_labels[:, None, :]) == 0
        ).long()
        E_chains = gather_edges(same_chain[..., None], E_idx)[..., 0]
        E_positional = self.embeddings(offset.long(), E_chains)

        E = torch.cat((E_positional, RBF_all), -1)
        E = self.norm_edges(self.edge_embedding(E))
        return E, E_idx
1448
+
1449
+
1450
class ProteinFeaturesMembrane(torch.nn.Module):
    """Backbone graph featurizer with per-residue membrane-class node features.

    Edge features match ProteinFeatures (positional encodings + 25 RBF
    distance tracks); node features embed a one-hot membrane label.
    """

    def __init__(
        self,
        edge_features,
        node_features,
        num_positional_embeddings=16,
        num_rbf=16,
        top_k=48,
        augment_eps=0.0,
        num_classes=3,
    ):
        """Extract protein features"""
        super(ProteinFeaturesMembrane, self).__init__()
        self.edge_features = edge_features
        self.node_features = node_features
        self.top_k = top_k
        self.augment_eps = augment_eps
        self.num_rbf = num_rbf
        self.num_positional_embeddings = num_positional_embeddings
        self.num_classes = num_classes

        self.embeddings = PositionalEncodings(num_positional_embeddings)
        edge_in = num_positional_embeddings + num_rbf * 25
        self.edge_embedding = torch.nn.Linear(edge_in, edge_features, bias=False)
        self.norm_edges = torch.nn.LayerNorm(edge_features)

        # Node features: linear embedding of the one-hot membrane class.
        self.node_embedding = torch.nn.Linear(
            self.num_classes, node_features, bias=False
        )
        self.norm_nodes = torch.nn.LayerNorm(node_features)

    def _dist(self, X, mask, eps=1e-6):
        """Top-k nearest neighbors by Ca distance; masked residues excluded."""
        pair_mask = mask.unsqueeze(1) * mask.unsqueeze(2)
        diff = X.unsqueeze(1) - X.unsqueeze(2)
        D = pair_mask * torch.sqrt((diff**2).sum(3) + eps)
        # Masked pairs are shifted past the max distance so top-k skips them.
        D_max, _ = D.max(-1, keepdim=True)
        D_adjust = D + (1.0 - pair_mask) * D_max
        k = np.minimum(self.top_k, X.shape[1])
        return torch.topk(D_adjust, k, dim=-1, largest=False)

    def _rbf(self, D):
        """Lift distances onto num_rbf Gaussian bases spanning [2, 22] A."""
        lo, hi, count = 2.0, 22.0, self.num_rbf
        centers = torch.linspace(lo, hi, count, device=D.device).view(1, 1, 1, -1)
        width = (hi - lo) / count
        z = (D.unsqueeze(-1) - centers) / width
        return torch.exp(-(z**2))

    def _get_rbf(self, A, B, E_idx):
        """RBF-encode A_i -> B_j distances, gathered to the K neighbors."""
        D_A_B = torch.sqrt(
            ((A[:, :, None, :] - B[:, None, :, :]) ** 2).sum(-1) + 1e-6
        )  # [B, L, L]
        D_A_B_neighbors = gather_edges(D_A_B[..., None], E_idx)[..., 0]  # [B,L,K]
        return self._rbf(D_A_B_neighbors)

    def forward(self, input_features):
        """Return (V, E, E_idx): node features, edge features, neighbors."""
        X = input_features["X"]
        mask = input_features["mask"]
        R_idx = input_features["R_idx"]
        chain_labels = input_features["chain_labels"]
        membrane_per_residue_labels = input_features["membrane_per_residue_labels"]

        if self.augment_eps > 0:
            X = X + self.augment_eps * torch.randn_like(X)

        N = X[:, :, 0, :]
        Ca = X[:, :, 1, :]
        C = X[:, :, 2, :]
        O = X[:, :, 3, :]
        # Virtual Cb from ideal backbone geometry (fixed empirical constants).
        b = Ca - N
        c = C - Ca
        a = torch.cross(b, c, dim=-1)
        Cb = -0.58273431 * a + 0.56802827 * b - 0.54067466 * c + Ca

        D_neighbors, E_idx = self._dist(Ca, mask)

        # Ca-Ca uses the masked neighbor distances; the remaining 24 ordered
        # pairs follow in the exact layout edge_embedding expects.
        atoms = {"N": N, "Ca": Ca, "C": C, "O": O, "Cb": Cb}
        pair_order = [
            ("N", "N"), ("C", "C"), ("O", "O"), ("Cb", "Cb"),
            ("Ca", "N"), ("Ca", "C"), ("Ca", "O"), ("Ca", "Cb"),
            ("N", "C"), ("N", "O"), ("N", "Cb"), ("Cb", "C"),
            ("Cb", "O"), ("O", "C"), ("N", "Ca"), ("C", "Ca"),
            ("O", "Ca"), ("Cb", "Ca"), ("C", "N"), ("O", "N"),
            ("Cb", "N"), ("C", "Cb"), ("O", "Cb"), ("C", "O"),
        ]
        rbf_feats = [self._rbf(D_neighbors)]  # Ca-Ca
        rbf_feats.extend(
            self._get_rbf(atoms[a1], atoms[a2], E_idx) for a1, a2 in pair_order
        )
        RBF_all = torch.cat(rbf_feats, dim=-1)

        offset = R_idx[:, :, None] - R_idx[:, None, :]
        offset = gather_edges(offset[..., None], E_idx)[..., 0]  # [B, L, K]

        # 1 for same-chain pairs, 0 across chains.
        same_chain = (
            (chain_labels[:, :, None] - chain_labels[:, None, :]) == 0
        ).long()
        E_chains = gather_edges(same_chain[..., None], E_idx)[..., 0]
        E_positional = self.embeddings(offset.long(), E_chains)

        E = torch.cat((E_positional, RBF_all), -1)
        E = self.norm_edges(self.edge_embedding(E))

        # Per-residue membrane class -> embedded node features.
        labels_1hot = torch.nn.functional.one_hot(
            membrane_per_residue_labels, self.num_classes
        ).float()
        V = self.norm_nodes(self.node_embedding(labels_1hot))

        return V, E, E_idx
1580
+
1581
+
1582
class DecLayerJ(torch.nn.Module):
    """Decoder message-passing layer with an extra leading neighbor axis.

    Same computation as DecLayer, but h_V carries one more dimension, so the
    node state is broadcast across h_E's second-to-last axis.
    """

    def __init__(self, num_hidden, num_in, dropout=0.1, num_heads=None, scale=30):
        super(DecLayerJ, self).__init__()
        self.num_hidden = num_hidden
        self.num_in = num_in
        self.scale = scale
        self.dropout1 = torch.nn.Dropout(dropout)
        self.dropout2 = torch.nn.Dropout(dropout)
        self.norm1 = torch.nn.LayerNorm(num_hidden)
        self.norm2 = torch.nn.LayerNorm(num_hidden)

        # Three-layer message MLP over concatenated [h_V_i, h_E_ij].
        self.W1 = torch.nn.Linear(num_hidden + num_in, num_hidden, bias=True)
        self.W2 = torch.nn.Linear(num_hidden, num_hidden, bias=True)
        self.W3 = torch.nn.Linear(num_hidden, num_hidden, bias=True)
        self.act = torch.nn.GELU()
        self.dense = PositionWiseFeedForward(num_hidden, num_hidden * 4)

    def forward(self, h_V, h_E, mask_V=None, mask_attend=None):
        """Update node states h_V from edge inputs h_E (one decoder step)."""
        # Broadcast each node state over its neighbor slots and build
        # per-edge messages (note the 4-way expand: the only difference
        # from DecLayer).
        expanded = h_V.unsqueeze(-2).expand(-1, -1, -1, h_E.size(-2), -1)
        stacked = torch.cat([expanded, h_E], -1)
        msg = self.W3(self.act(self.W2(self.act(self.W1(stacked)))))
        if mask_attend is not None:
            msg = msg * mask_attend.unsqueeze(-1)

        # Scaled-sum aggregation, residual, and norm.
        h_V = self.norm1(h_V + self.dropout1(msg.sum(dim=-2) / self.scale))

        # Position-wise feed-forward with second residual + norm.
        h_V = self.norm2(h_V + self.dropout2(self.dense(h_V)))

        if mask_V is not None:
            h_V = h_V * mask_V.unsqueeze(-1)
        return h_V
1623
+
1624
+
1625
class PositionWiseFeedForward(torch.nn.Module):
    """Two-layer GELU MLP applied independently at every position."""

    def __init__(self, num_hidden, num_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.W_in = torch.nn.Linear(num_hidden, num_ff, bias=True)
        self.W_out = torch.nn.Linear(num_ff, num_hidden, bias=True)
        self.act = torch.nn.GELU()

    def forward(self, h_V):
        """Project up to num_ff, apply GELU, project back to num_hidden."""
        return self.W_out(self.act(self.W_in(h_V)))
1636
+
1637
+
1638
class PositionalEncodings(torch.nn.Module):
    """Embed relative sequence offsets, clipped to +/- max_relative_feature.

    Pairs with mask == 0 (e.g. different chains) are routed to a dedicated
    extra bucket before the linear projection.
    """

    def __init__(self, num_embeddings, max_relative_feature=32):
        super(PositionalEncodings, self).__init__()
        self.num_embeddings = num_embeddings
        self.max_relative_feature = max_relative_feature
        # Buckets: 2m+1 clipped offsets plus one "masked pair" bucket.
        self.linear = torch.nn.Linear(2 * max_relative_feature + 1 + 1, num_embeddings)

    def forward(self, offset, mask):
        """Map integer offsets (with mask) to learned embeddings."""
        m = self.max_relative_feature
        clipped = torch.clip(offset + m, 0, 2 * m)
        # mask==1 keeps the clipped bucket; mask==0 selects bucket 2m+1.
        bucket = clipped * mask + (1 - mask) * (2 * m + 1)
        one_hot = torch.nn.functional.one_hot(bucket, 2 * m + 1 + 1)
        return self.linear(one_hot.float())
1652
+
1653
+
1654
class DecLayer(torch.nn.Module):
    """Decoder message-passing layer: node update from precomputed edges.

    Per-edge messages are an MLP over [h_V_i, h_E_ij], summed over neighbors
    and scaled, followed by a position-wise feed-forward block.
    """

    def __init__(self, num_hidden, num_in, dropout=0.1, num_heads=None, scale=30):
        super(DecLayer, self).__init__()
        self.num_hidden = num_hidden
        self.num_in = num_in
        self.scale = scale
        self.dropout1 = torch.nn.Dropout(dropout)
        self.dropout2 = torch.nn.Dropout(dropout)
        self.norm1 = torch.nn.LayerNorm(num_hidden)
        self.norm2 = torch.nn.LayerNorm(num_hidden)

        # Three-layer message MLP over concatenated [h_V_i, h_E_ij].
        self.W1 = torch.nn.Linear(num_hidden + num_in, num_hidden, bias=True)
        self.W2 = torch.nn.Linear(num_hidden, num_hidden, bias=True)
        self.W3 = torch.nn.Linear(num_hidden, num_hidden, bias=True)
        self.act = torch.nn.GELU()
        self.dense = PositionWiseFeedForward(num_hidden, num_hidden * 4)

    def forward(self, h_V, h_E, mask_V=None, mask_attend=None):
        """Update node states h_V from edge inputs h_E (one decoder step)."""
        # Broadcast each node state across its K neighbor slots.
        expanded = h_V.unsqueeze(-2).expand(-1, -1, h_E.size(-2), -1)
        stacked = torch.cat([expanded, h_E], -1)
        msg = self.W3(self.act(self.W2(self.act(self.W1(stacked)))))
        if mask_attend is not None:
            msg = msg * mask_attend.unsqueeze(-1)

        # Scaled-sum aggregation, residual, and norm.
        h_V = self.norm1(h_V + self.dropout1(msg.sum(dim=-2) / self.scale))

        # Position-wise feed-forward with second residual + norm.
        h_V = self.norm2(h_V + self.dropout2(self.dense(h_V)))

        if mask_V is not None:
            h_V = h_V * mask_V.unsqueeze(-1)
        return h_V
1693
+
1694
+
1695
class EncLayer(torch.nn.Module):
    """Encoder layer updating both node states (h_V) and edge states (h_E).

    The node update aggregates MLP messages over [h_V_i, h_E_ij, h_V_j];
    the edge update then re-reads the freshly updated h_V, so statement
    order in forward() is significant.
    """

    def __init__(self, num_hidden, num_in, dropout=0.1, num_heads=None, scale=30):
        super(EncLayer, self).__init__()
        self.num_hidden = num_hidden
        self.num_in = num_in
        self.scale = scale  # divisor applied to the summed neighbor messages
        self.dropout1 = torch.nn.Dropout(dropout)
        self.dropout2 = torch.nn.Dropout(dropout)
        self.dropout3 = torch.nn.Dropout(dropout)
        self.norm1 = torch.nn.LayerNorm(num_hidden)
        self.norm2 = torch.nn.LayerNorm(num_hidden)
        self.norm3 = torch.nn.LayerNorm(num_hidden)

        # W1-W3: message MLP for the node update; W11-W13: for the edge update.
        self.W1 = torch.nn.Linear(num_hidden + num_in, num_hidden, bias=True)
        self.W2 = torch.nn.Linear(num_hidden, num_hidden, bias=True)
        self.W3 = torch.nn.Linear(num_hidden, num_hidden, bias=True)
        self.W11 = torch.nn.Linear(num_hidden + num_in, num_hidden, bias=True)
        self.W12 = torch.nn.Linear(num_hidden, num_hidden, bias=True)
        self.W13 = torch.nn.Linear(num_hidden, num_hidden, bias=True)
        self.act = torch.nn.GELU()
        self.dense = PositionWiseFeedForward(num_hidden, num_hidden * 4)

    def forward(self, h_V, h_E, E_idx, mask_V=None, mask_attend=None):
        """Parallel computation of full transformer layer"""

        # --- Node update: messages over [h_V_i, h_E_ij, h_V_j]. ---
        h_EV = cat_neighbors_nodes(h_V, h_E, E_idx)
        h_V_expand = h_V.unsqueeze(-2).expand(-1, -1, h_EV.size(-2), -1)
        h_EV = torch.cat([h_V_expand, h_EV], -1)
        h_message = self.W3(self.act(self.W2(self.act(self.W1(h_EV)))))
        if mask_attend is not None:
            h_message = mask_attend.unsqueeze(-1) * h_message
        # Scaled-sum aggregation over neighbors, then residual + norm.
        dh = torch.sum(h_message, -2) / self.scale
        h_V = self.norm1(h_V + self.dropout1(dh))

        # Position-wise feed-forward with second residual + norm.
        dh = self.dense(h_V)
        h_V = self.norm2(h_V + self.dropout2(dh))
        if mask_V is not None:
            mask_V = mask_V.unsqueeze(-1)
            h_V = mask_V * h_V

        # --- Edge update: rebuilds the neighborhood features from the
        # freshly updated h_V, then applies a residual + norm to h_E. ---
        h_EV = cat_neighbors_nodes(h_V, h_E, E_idx)
        h_V_expand = h_V.unsqueeze(-2).expand(-1, -1, h_EV.size(-2), -1)
        h_EV = torch.cat([h_V_expand, h_EV], -1)
        h_message = self.W13(self.act(self.W12(self.act(self.W11(h_EV)))))
        h_E = self.norm3(h_E + self.dropout3(h_message))
        return h_V, h_E
1741
+
1742
+
1743
+ # The following gather functions
1744
def gather_edges(edges, neighbor_idx):
    """Select neighbor columns from a dense edge tensor.

    edges: [B, N, N, C]; neighbor_idx: [B, N, K] -> returns [B, N, K, C].
    """
    idx = neighbor_idx[..., None].expand(-1, -1, -1, edges.size(-1))
    return torch.gather(edges, 2, idx)
1749
+
1750
+
1751
def gather_nodes(nodes, neighbor_idx):
    """Gather per-neighbor node features.

    nodes: [B, N, C]; neighbor_idx: [B, N, K] -> returns [B, N, K, C].
    """
    B, N, K = neighbor_idx.shape
    # Flatten the neighbor axis into the node axis so torch.gather can
    # index along dim 1, then restore the [B, N, K, C] layout.
    flat = neighbor_idx.reshape(B, N * K, 1).expand(-1, -1, nodes.size(2))
    gathered = torch.gather(nodes, 1, flat)
    return gathered.view(B, N, K, -1)
1760
+
1761
+
1762
def gather_nodes_t(nodes, neighbor_idx):
    """Gather node features for a single neighbor list.

    nodes: [B, N, C]; neighbor_idx: [B, K] -> returns [B, K, C].
    """
    idx = neighbor_idx[:, :, None].expand(-1, -1, nodes.size(2))
    return torch.gather(nodes, 1, idx)
1767
+
1768
+
1769
def cat_neighbors_nodes(h_nodes, h_neighbors, E_idx):
    """Concatenate edge features with the gathered neighbor node features."""
    gathered = gather_nodes(h_nodes, E_idx)
    return torch.cat([h_neighbors, gathered], -1)
openfold/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ #from . import model
2
+ #from . import utils
3
+ #from . import np
4
+ #from . import resources
5
+
6
+ #__all__ = ["model", "utils", "np", "data", "resources"]
openfold/config.py ADDED
@@ -0,0 +1,558 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import ml_collections as mlc
3
+
4
+
5
def set_inf(c, inf):
    """Recursively overwrite every key named "inf" in a nested config.

    Descends into ml_collections ConfigDict children; any other value whose
    key is exactly "inf" is replaced with the given number.
    """
    for key, value in c.items():
        if isinstance(value, mlc.ConfigDict):
            set_inf(value, inf)
        elif key == "inf":
            c[key] = inf
11
+
12
+
13
def enforce_config_constraints(config):
    """Validate cross-option constraints on a model config.

    Raises ValueError when two mutually exclusive boolean settings are both
    enabled (currently template averaging vs. template offloading).
    """

    def lookup(dotted):
        # Resolve "a.b.c" by successive subscripting from the config root.
        node = config
        for part in dotted.split('.'):
            node = node[part]
        return node

    mutually_exclusive_bools = [
        (
            "model.template.average_templates",
            "model.template.offload_templates"
        )
    ]

    for s1, s2 in mutually_exclusive_bools:
        if lookup(s1) and lookup(s2):
            raise ValueError(f"Only one of {s1} and {s2} may be set at a time")
34
+
35
+
36
def model_config(name, train=False, low_prec=False):
    """Build a config for a named AlphaFold model preset.

    Starts from a deep copy of the module-level base `config` and applies
    per-preset overrides (transcribed from the AF2 supplementary tables),
    then training/low-precision adjustments. Raises ValueError on an
    unknown name. Returns the resulting ConfigDict.
    """
    c = copy.deepcopy(config)
    if name == "initial_training":
        # AF2 Suppl. Table 4, "initial training" setting
        pass
    elif name == "finetuning":
        # AF2 Suppl. Table 4, "finetuning" setting
        c.data.train.max_extra_msa = 5120
        c.data.train.crop_size = 384
        c.data.train.max_msa_clusters = 512
        c.loss.violation.weight = 1.
        c.loss.experimentally_resolved.weight = 0.01
    elif name == "finetuning_ptm":
        # Finetuning overrides plus the pTM prediction head.
        c.data.train.max_extra_msa = 5120
        c.data.train.crop_size = 384
        c.data.train.max_msa_clusters = 512
        c.loss.violation.weight = 1.
        c.loss.experimentally_resolved.weight = 0.01
        c.model.heads.tm.enabled = True
        c.loss.tm.weight = 0.1
    elif name == "model_1":
        # AF2 Suppl. Table 5, Model 1.1.1
        c.data.train.max_extra_msa = 5120
        c.data.predict.max_extra_msa = 5120
        c.data.common.reduce_max_clusters_by_max_templates = True
        c.data.common.use_templates = True
        c.data.common.use_template_torsion_angles = True
        c.model.template.enabled = True
    elif name == "model_2":
        # AF2 Suppl. Table 5, Model 1.1.2
        c.data.common.reduce_max_clusters_by_max_templates = True
        c.data.common.use_templates = True
        c.data.common.use_template_torsion_angles = True
        c.model.template.enabled = True
    elif name == "model_3":
        # AF2 Suppl. Table 5, Model 1.2.1
        c.data.train.max_extra_msa = 5120
        c.data.predict.max_extra_msa = 5120
        c.model.template.enabled = False
    elif name == "model_4":
        # AF2 Suppl. Table 5, Model 1.2.2
        c.data.train.max_extra_msa = 5120
        c.data.predict.max_extra_msa = 5120
        c.model.template.enabled = False
    elif name == "model_5":
        # AF2 Suppl. Table 5, Model 1.2.3
        c.model.template.enabled = False
    # The *_ptm variants mirror the corresponding base model and additionally
    # enable the pTM head with loss weight 0.1.
    elif name == "model_1_ptm":
        c.data.train.max_extra_msa = 5120
        c.data.predict.max_extra_msa = 5120
        c.data.common.reduce_max_clusters_by_max_templates = True
        c.data.common.use_templates = True
        c.data.common.use_template_torsion_angles = True
        c.model.template.enabled = True
        c.model.heads.tm.enabled = True
        c.loss.tm.weight = 0.1
    elif name == "model_2_ptm":
        c.data.common.reduce_max_clusters_by_max_templates = True
        c.data.common.use_templates = True
        c.data.common.use_template_torsion_angles = True
        c.model.template.enabled = True
        c.model.heads.tm.enabled = True
        c.loss.tm.weight = 0.1
    elif name == "model_3_ptm":
        c.data.train.max_extra_msa = 5120
        c.data.predict.max_extra_msa = 5120
        c.model.template.enabled = False
        c.model.heads.tm.enabled = True
        c.loss.tm.weight = 0.1
    elif name == "model_4_ptm":
        c.data.train.max_extra_msa = 5120
        c.data.predict.max_extra_msa = 5120
        c.model.template.enabled = False
        c.model.heads.tm.enabled = True
        c.loss.tm.weight = 0.1
    elif name == "model_5_ptm":
        c.model.template.enabled = False
        c.model.heads.tm.enabled = True
        c.loss.tm.weight = 0.1
    else:
        raise ValueError("Invalid model name")

    if train:
        # Training disables inference-only memory optimizations.
        c.globals.blocks_per_ckpt = 1
        c.globals.chunk_size = None
        c.globals.use_lma = False
        c.globals.offload_inference = False
        c.model.template.average_templates = False
        c.model.template.offload_templates = False
    if low_prec:
        c.globals.eps = 1e-4
        # If we want exact numerical parity with the original, inf can't be
        # a global constant
        set_inf(c, 1e4)

    enforce_config_constraints(c)

    return c
134
+
135
+
136
# Shared ml_collections FieldReferences: changing one of these updates every
# entry in the config below that references it.
# NOTE(review): names follow AF2 conventions (c_z = pair channels, c_m = MSA
# channels, c_t = template, c_e = extra-MSA, c_s = single) — verify against
# the model code before relying on these descriptions.
c_z = mlc.FieldReference(128, field_type=int)
c_m = mlc.FieldReference(256, field_type=int)
c_t = mlc.FieldReference(64, field_type=int)
c_e = mlc.FieldReference(64, field_type=int)
c_s = mlc.FieldReference(384, field_type=int)
blocks_per_ckpt = mlc.FieldReference(None, field_type=int)
chunk_size = mlc.FieldReference(4, field_type=int)
aux_distogram_bins = mlc.FieldReference(64, field_type=int)
tm_enabled = mlc.FieldReference(False, field_type=bool)
eps = mlc.FieldReference(1e-8, field_type=float)
templates_enabled = mlc.FieldReference(True, field_type=bool)
embed_template_torsion_angles = mlc.FieldReference(True, field_type=bool)
tune_chunk_size = mlc.FieldReference(True, field_type=bool)

# Placeholder strings naming dynamic dimensions in the feature-shape spec
# ("data.common.feat" below).
NUM_RES = "num residues placeholder"
NUM_MSA_SEQ = "msa placeholder"
NUM_EXTRA_SEQ = "extra msa placeholder"
NUM_TEMPLATES = "num templates placeholder"
155
+ config = mlc.ConfigDict(
156
+ {
157
+ "data": {
158
+ "common": {
159
+ "feat": {
160
+ "aatype": [NUM_RES],
161
+ "all_atom_mask": [NUM_RES, None],
162
+ "all_atom_positions": [NUM_RES, None, None],
163
+ "alt_chi_angles": [NUM_RES, None],
164
+ "atom14_alt_gt_exists": [NUM_RES, None],
165
+ "atom14_alt_gt_positions": [NUM_RES, None, None],
166
+ "atom14_atom_exists": [NUM_RES, None],
167
+ "atom14_atom_is_ambiguous": [NUM_RES, None],
168
+ "atom14_gt_exists": [NUM_RES, None],
169
+ "atom14_gt_positions": [NUM_RES, None, None],
170
+ "atom37_atom_exists": [NUM_RES, None],
171
+ "backbone_rigid_mask": [NUM_RES],
172
+ "backbone_rigid_tensor": [NUM_RES, None, None],
173
+ "bert_mask": [NUM_MSA_SEQ, NUM_RES],
174
+ "chi_angles_sin_cos": [NUM_RES, None, None],
175
+ "chi_mask": [NUM_RES, None],
176
+ "extra_deletion_value": [NUM_EXTRA_SEQ, NUM_RES],
177
+ "extra_has_deletion": [NUM_EXTRA_SEQ, NUM_RES],
178
+ "extra_msa": [NUM_EXTRA_SEQ, NUM_RES],
179
+ "extra_msa_mask": [NUM_EXTRA_SEQ, NUM_RES],
180
+ "extra_msa_row_mask": [NUM_EXTRA_SEQ],
181
+ "is_distillation": [],
182
+ "msa_feat": [NUM_MSA_SEQ, NUM_RES, None],
183
+ "msa_mask": [NUM_MSA_SEQ, NUM_RES],
184
+ "msa_row_mask": [NUM_MSA_SEQ],
185
+ "no_recycling_iters": [],
186
+ "pseudo_beta": [NUM_RES, None],
187
+ "pseudo_beta_mask": [NUM_RES],
188
+ "residue_index": [NUM_RES],
189
+ "residx_atom14_to_atom37": [NUM_RES, None],
190
+ "residx_atom37_to_atom14": [NUM_RES, None],
191
+ "resolution": [],
192
+ "rigidgroups_alt_gt_frames": [NUM_RES, None, None, None],
193
+ "rigidgroups_group_exists": [NUM_RES, None],
194
+ "rigidgroups_group_is_ambiguous": [NUM_RES, None],
195
+ "rigidgroups_gt_exists": [NUM_RES, None],
196
+ "rigidgroups_gt_frames": [NUM_RES, None, None, None],
197
+ "seq_length": [],
198
+ "seq_mask": [NUM_RES],
199
+ "target_feat": [NUM_RES, None],
200
+ "template_aatype": [NUM_TEMPLATES, NUM_RES],
201
+ "template_all_atom_mask": [NUM_TEMPLATES, NUM_RES, None],
202
+ "template_all_atom_positions": [
203
+ NUM_TEMPLATES, NUM_RES, None, None,
204
+ ],
205
+ "template_alt_torsion_angles_sin_cos": [
206
+ NUM_TEMPLATES, NUM_RES, None, None,
207
+ ],
208
+ "template_backbone_rigid_mask": [NUM_TEMPLATES, NUM_RES],
209
+ "template_backbone_rigid_tensor": [
210
+ NUM_TEMPLATES, NUM_RES, None, None,
211
+ ],
212
+ "template_mask": [NUM_TEMPLATES],
213
+ "template_pseudo_beta": [NUM_TEMPLATES, NUM_RES, None],
214
+ "template_pseudo_beta_mask": [NUM_TEMPLATES, NUM_RES],
215
+ "template_sum_probs": [NUM_TEMPLATES, None],
216
+ "template_torsion_angles_mask": [
217
+ NUM_TEMPLATES, NUM_RES, None,
218
+ ],
219
+ "template_torsion_angles_sin_cos": [
220
+ NUM_TEMPLATES, NUM_RES, None, None,
221
+ ],
222
+ "true_msa": [NUM_MSA_SEQ, NUM_RES],
223
+ "use_clamped_fape": [],
224
+ },
225
+ "masked_msa": {
226
+ "profile_prob": 0.1,
227
+ "same_prob": 0.1,
228
+ "uniform_prob": 0.1,
229
+ },
230
+ "max_recycling_iters": 3,
231
+ "msa_cluster_features": True,
232
+ "reduce_msa_clusters_by_max_templates": False,
233
+ "resample_msa_in_recycling": True,
234
+ "template_features": [
235
+ "template_all_atom_positions",
236
+ "template_sum_probs",
237
+ "template_aatype",
238
+ "template_all_atom_mask",
239
+ ],
240
+ "unsupervised_features": [
241
+ "aatype",
242
+ "residue_index",
243
+ "msa",
244
+ "num_alignments",
245
+ "seq_length",
246
+ "between_segment_residues",
247
+ "deletion_matrix",
248
+ "no_recycling_iters",
249
+ ],
250
+ "use_templates": templates_enabled,
251
+ "use_template_torsion_angles": embed_template_torsion_angles,
252
+ },
253
+ "supervised": {
254
+ "clamp_prob": 0.9,
255
+ "supervised_features": [
256
+ "all_atom_mask",
257
+ "all_atom_positions",
258
+ "resolution",
259
+ "use_clamped_fape",
260
+ "is_distillation",
261
+ ],
262
+ },
263
+ "predict": {
264
+ "fixed_size": True,
265
+ "subsample_templates": False, # We want top templates.
266
+ "masked_msa_replace_fraction": 0.15,
267
+ "max_msa_clusters": 512,
268
+ "max_extra_msa": 1024,
269
+ "max_template_hits": 4,
270
+ "max_templates": 4,
271
+ "crop": False,
272
+ "crop_size": None,
273
+ "supervised": False,
274
+ "uniform_recycling": False,
275
+ },
276
+ "eval": {
277
+ "fixed_size": True,
278
+ "subsample_templates": False, # We want top templates.
279
+ "masked_msa_replace_fraction": 0.15,
280
+ "max_msa_clusters": 128,
281
+ "max_extra_msa": 1024,
282
+ "max_template_hits": 4,
283
+ "max_templates": 4,
284
+ "crop": False,
285
+ "crop_size": None,
286
+ "supervised": True,
287
+ "uniform_recycling": False,
288
+ },
289
+ "train": {
290
+ "fixed_size": True,
291
+ "subsample_templates": True,
292
+ "masked_msa_replace_fraction": 0.15,
293
+ "max_msa_clusters": 128,
294
+ "max_extra_msa": 1024,
295
+ "max_template_hits": 4,
296
+ "max_templates": 4,
297
+ "shuffle_top_k_prefiltered": 20,
298
+ "crop": True,
299
+ "crop_size": 256,
300
+ "supervised": True,
301
+ "clamp_prob": 0.9,
302
+ "max_distillation_msa_clusters": 1000,
303
+ "uniform_recycling": True,
304
+ "distillation_prob": 0.75,
305
+ },
306
+ "data_module": {
307
+ "use_small_bfd": False,
308
+ "data_loaders": {
309
+ "batch_size": 1,
310
+ "num_workers": 16,
311
+ },
312
+ },
313
+ },
314
+ # Recurring FieldReferences that can be changed globally here
315
+ "globals": {
316
+ "blocks_per_ckpt": blocks_per_ckpt,
317
+ "chunk_size": chunk_size,
318
+ "use_lma": False,
319
+ "offload_inference": False,
320
+ "c_z": c_z,
321
+ "c_m": c_m,
322
+ "c_t": c_t,
323
+ "c_e": c_e,
324
+ "c_s": c_s,
325
+ "eps": eps,
326
+ },
327
+ "model": {
328
+ "_mask_trans": False,
329
+ "input_embedder": {
330
+ "tf_dim": 22,
331
+ "msa_dim": 49,
332
+ "c_z": c_z,
333
+ "c_m": c_m,
334
+ "relpos_k": 32,
335
+ },
336
+ "recycling_embedder": {
337
+ "c_z": c_z,
338
+ "c_m": c_m,
339
+ "min_bin": 3.25,
340
+ "max_bin": 20.75,
341
+ "no_bins": 15,
342
+ "inf": 1e8,
343
+ },
344
+ "template": {
345
+ "distogram": {
346
+ "min_bin": 3.25,
347
+ "max_bin": 50.75,
348
+ "no_bins": 39,
349
+ },
350
+ "template_angle_embedder": {
351
+ # DISCREPANCY: c_in is supposed to be 51.
352
+ "c_in": 57,
353
+ "c_out": c_m,
354
+ },
355
+ "template_pair_embedder": {
356
+ "c_in": 88,
357
+ "c_out": c_t,
358
+ },
359
+ "template_pair_stack": {
360
+ "c_t": c_t,
361
+ # DISCREPANCY: c_hidden_tri_att here is given in the supplement
362
+ # as 64. In the code, it's 16.
363
+ "c_hidden_tri_att": 16,
364
+ "c_hidden_tri_mul": 64,
365
+ "no_blocks": 2,
366
+ "no_heads": 4,
367
+ "pair_transition_n": 2,
368
+ "dropout_rate": 0.25,
369
+ "blocks_per_ckpt": blocks_per_ckpt,
370
+ "tune_chunk_size": tune_chunk_size,
371
+ "inf": 1e9,
372
+ },
373
+ "template_pointwise_attention": {
374
+ "c_t": c_t,
375
+ "c_z": c_z,
376
+ # DISCREPANCY: c_hidden here is given in the supplement as 64.
377
+ # It's actually 16.
378
+ "c_hidden": 16,
379
+ "no_heads": 4,
380
+ "inf": 1e5, # 1e9,
381
+ },
382
+ "inf": 1e5, # 1e9,
383
+ "eps": eps, # 1e-6,
384
+ "enabled": templates_enabled,
385
+ "embed_angles": embed_template_torsion_angles,
386
+ "use_unit_vector": False,
387
+ # Approximate template computation, saving memory.
388
+ # In our experiments, results are equivalent to or better than
389
+ # the stock implementation. Should be enabled for all new
390
+ # training runs.
391
+ "average_templates": False,
392
+ # Offload template embeddings to CPU memory. Vastly reduced
393
+ # memory consumption at the cost of a modest increase in
394
+ # runtime. Useful for inference on very long sequences.
395
+ # Mutually exclusive with average_templates.
396
+ "offload_templates": False,
397
+ },
398
+ "extra_msa": {
399
+ "extra_msa_embedder": {
400
+ "c_in": 25,
401
+ "c_out": c_e,
402
+ },
403
+ "extra_msa_stack": {
404
+ "c_m": c_e,
405
+ "c_z": c_z,
406
+ "c_hidden_msa_att": 8,
407
+ "c_hidden_opm": 32,
408
+ "c_hidden_mul": 128,
409
+ "c_hidden_pair_att": 32,
410
+ "no_heads_msa": 8,
411
+ "no_heads_pair": 4,
412
+ "no_blocks": 4,
413
+ "transition_n": 4,
414
+ "msa_dropout": 0.15,
415
+ "pair_dropout": 0.25,
416
+ "clear_cache_between_blocks": False,
417
+ "tune_chunk_size": tune_chunk_size,
418
+ "inf": 1e9,
419
+ "eps": eps, # 1e-10,
420
+ "ckpt": blocks_per_ckpt is not None,
421
+ },
422
+ "enabled": True,
423
+ },
424
+ "evoformer_stack": {
425
+ "c_m": c_m,
426
+ "c_z": c_z,
427
+ "c_hidden_msa_att": 32,
428
+ "c_hidden_opm": 32,
429
+ "c_hidden_mul": 128,
430
+ "c_hidden_pair_att": 32,
431
+ "c_s": c_s,
432
+ "no_heads_msa": 8,
433
+ "no_heads_pair": 4,
434
+ "no_blocks": 48,
435
+ "transition_n": 4,
436
+ "msa_dropout": 0.15,
437
+ "pair_dropout": 0.25,
438
+ "blocks_per_ckpt": blocks_per_ckpt,
439
+ "clear_cache_between_blocks": False,
440
+ "tune_chunk_size": tune_chunk_size,
441
+ "inf": 1e9,
442
+ "eps": eps, # 1e-10,
443
+ },
444
+ "structure_module": {
445
+ "c_s": c_s,
446
+ "c_z": c_z,
447
+ "c_ipa": 16,
448
+ "c_resnet": 128,
449
+ "no_heads_ipa": 12,
450
+ "no_qk_points": 4,
451
+ "no_v_points": 8,
452
+ "dropout_rate": 0.1,
453
+ "no_blocks": 8,
454
+ "no_transition_layers": 1,
455
+ "no_resnet_blocks": 2,
456
+ "no_angles": 7,
457
+ "trans_scale_factor": 10,
458
+ "epsilon": eps, # 1e-12,
459
+ "inf": 1e5,
460
+ },
461
+ "heads": {
462
+ "lddt": {
463
+ "no_bins": 50,
464
+ "c_in": c_s,
465
+ "c_hidden": 128,
466
+ },
467
+ "distogram": {
468
+ "c_z": c_z,
469
+ "no_bins": aux_distogram_bins,
470
+ },
471
+ "tm": {
472
+ "c_z": c_z,
473
+ "no_bins": aux_distogram_bins,
474
+ "enabled": tm_enabled,
475
+ },
476
+ "masked_msa": {
477
+ "c_m": c_m,
478
+ "c_out": 23,
479
+ },
480
+ "experimentally_resolved": {
481
+ "c_s": c_s,
482
+ "c_out": 37,
483
+ },
484
+ },
485
+ },
486
+ "relax": {
487
+ "max_iterations": 0, # no max
488
+ "tolerance": 2.39,
489
+ "stiffness": 10.0,
490
+ "max_outer_iterations": 20,
491
+ "exclude_residues": [],
492
+ },
493
+ "loss": {
494
+ "distogram": {
495
+ "min_bin": 2.3125,
496
+ "max_bin": 21.6875,
497
+ "no_bins": 64,
498
+ "eps": eps, # 1e-6,
499
+ "weight": 0.3,
500
+ },
501
+ "experimentally_resolved": {
502
+ "eps": eps, # 1e-8,
503
+ "min_resolution": 0.1,
504
+ "max_resolution": 3.0,
505
+ "weight": 0.0,
506
+ },
507
+ "fape": {
508
+ "backbone": {
509
+ "clamp_distance": 10.0,
510
+ "loss_unit_distance": 10.0,
511
+ "weight": 0.5,
512
+ },
513
+ "sidechain": {
514
+ "clamp_distance": 10.0,
515
+ "length_scale": 10.0,
516
+ "weight": 0.5,
517
+ },
518
+ "eps": 1e-4,
519
+ "weight": 1.0,
520
+ },
521
+ "lddt": {
522
+ "min_resolution": 0.1,
523
+ "max_resolution": 3.0,
524
+ "cutoff": 15.0,
525
+ "no_bins": 50,
526
+ "eps": eps, # 1e-10,
527
+ "weight": 0.01,
528
+ },
529
+ "masked_msa": {
530
+ "eps": eps, # 1e-8,
531
+ "weight": 2.0,
532
+ },
533
+ "supervised_chi": {
534
+ "chi_weight": 0.5,
535
+ "angle_norm_weight": 0.01,
536
+ "eps": eps, # 1e-6,
537
+ "weight": 1.0,
538
+ },
539
+ "violation": {
540
+ "violation_tolerance_factor": 12.0,
541
+ "clash_overlap_tolerance": 1.5,
542
+ "eps": eps, # 1e-6,
543
+ "weight": 0.0,
544
+ },
545
+ "tm": {
546
+ "max_bin": 31,
547
+ "no_bins": 64,
548
+ "min_resolution": 0.1,
549
+ "max_resolution": 3.0,
550
+ "eps": eps, # 1e-8,
551
+ "weight": 0.,
552
+ "enabled": tm_enabled,
553
+ },
554
+ "eps": eps,
555
+ },
556
+ "ema": {"decay": 0.999},
557
+ }
558
+ )
openfold/data/__init__.py ADDED
File without changes
openfold/data/data_modules.py ADDED
@@ -0,0 +1,721 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from functools import partial
3
+ import json
4
+ import logging
5
+ import os
6
+ import pickle
7
+ from typing import Optional, Sequence, List, Any
8
+
9
+ import ml_collections as mlc
10
+ import numpy as np
11
+ import pytorch_lightning as pl
12
+ import torch
13
+ from torch.utils.data import RandomSampler
14
+
15
+ from openfold.data import (
16
+ data_pipeline,
17
+ feature_pipeline,
18
+ mmcif_parsing,
19
+ templates,
20
+ )
21
+ from openfold.utils.tensor_utils import tensor_tree_map, dict_multimap
22
+
23
+
24
class OpenFoldSingleDataset(torch.utils.data.Dataset):
    """A map-style dataset over one directory of structures and alignments.

    Each item is identified by a chain name ({PDB_ID}_{CHAIN_ID} or simply
    {PDB_ID}). __getitem__ locates the matching structure file
    (.cif/.core/.pdb in train/eval mode, .fasta in predict mode), runs the
    data pipeline on it and, unless _output_raw is set, the feature
    pipeline as well.
    """
    def __init__(self,
        data_dir: str,
        alignment_dir: str,
        template_mmcif_dir: str,
        max_template_date: str,
        config: mlc.ConfigDict,
        kalign_binary_path: str = '/usr/bin/kalign',
        max_template_hits: int = 4,
        obsolete_pdbs_file_path: Optional[str] = None,
        template_release_dates_cache_path: Optional[str] = None,
        shuffle_top_k_prefiltered: Optional[int] = None,
        treat_pdb_as_distillation: bool = True,
        mapping_path: Optional[str] = None,
        mode: str = "train",
        alignment_index: Optional[Any] = None,
        _output_raw: bool = False,
        _structure_index: Optional[Any] = None,
    ):
        """
        Args:
            data_dir:
                A path to a directory containing mmCIF files (in train
                mode) or FASTA files (in inference mode).
            alignment_dir:
                A path to a directory containing only data in the format
                output by an AlignmentRunner
                (defined in openfold.features.alignment_runner).
                I.e. a directory of directories named {PDB_ID}_{CHAIN_ID}
                or simply {PDB_ID}, each containing .a3m, .sto, and .hhr
                files.
            template_mmcif_dir:
                Path to a directory containing template mmCIF files.
            config:
                A dataset config object. See openfold.config
            kalign_binary_path:
                Path to kalign binary.
            max_template_hits:
                An upper bound on how many templates are considered. During
                training, the templates ultimately used are subsampled
                from this total quantity.
            template_release_dates_cache_path:
                Path to the output of scripts/generate_mmcif_cache.
            obsolete_pdbs_file_path:
                Path to the file containing replacements for obsolete PDBs.
            shuffle_top_k_prefiltered:
                Whether to uniformly shuffle the top k template hits before
                parsing max_template_hits of them. Can be used to
                approximate DeepMind's training-time template subsampling
                scheme much more performantly.
            treat_pdb_as_distillation:
                Whether to assume that .pdb files in the data_dir are from
                the self-distillation set (and should be subjected to
                special distillation set preprocessing steps).
            mode:
                "train", "val", or "predict"
        """
        super(OpenFoldSingleDataset, self).__init__()
        self.data_dir = data_dir
        self.alignment_dir = alignment_dir
        self.config = config
        self.treat_pdb_as_distillation = treat_pdb_as_distillation
        self.mode = mode
        self.alignment_index = alignment_index
        self._output_raw = _output_raw
        self._structure_index = _structure_index

        # Extensions probed (in order) when no structure index is supplied
        self.supported_exts = [".cif", ".core", ".pdb"]

        valid_modes = ["train", "eval", "predict"]
        if mode not in valid_modes:
            raise ValueError(f'mode must be one of {valid_modes}')

        if template_release_dates_cache_path is None:
            logging.warning(
                "Template release dates cache does not exist. Remember to run "
                "scripts/generate_mmcif_cache.py before running OpenFold"
            )

        # Establish the fixed ordering of chain ids served by this dataset
        if alignment_index is not None:
            self._chain_ids = list(alignment_index.keys())
        elif mapping_path is None:
            self._chain_ids = list(os.listdir(alignment_dir))
        else:
            with open(mapping_path, "r") as f:
                self._chain_ids = [l.strip() for l in f.readlines()]

        self._chain_id_to_idx_dict = {
            chain: i for i, chain in enumerate(self._chain_ids)
        }

        template_featurizer = templates.TemplateHitFeaturizer(
            mmcif_dir=template_mmcif_dir,
            max_template_date=max_template_date,
            max_hits=max_template_hits,
            kalign_binary_path=kalign_binary_path,
            release_dates_path=template_release_dates_cache_path,
            obsolete_pdbs_path=obsolete_pdbs_file_path,
            _shuffle_top_k_prefiltered=shuffle_top_k_prefiltered,
        )

        self.data_pipeline = data_pipeline.DataPipeline(
            template_featurizer=template_featurizer,
        )

        if not self._output_raw:
            self.feature_pipeline = feature_pipeline.FeaturePipeline(config)

    def _parse_mmcif(self, path, file_id, chain_id, alignment_dir, alignment_index):
        """Parse one mmCIF file and run the data pipeline on it."""
        with open(path, 'r') as f:
            mmcif_string = f.read()

        mmcif_object = mmcif_parsing.parse(
            file_id=file_id, mmcif_string=mmcif_string
        )

        # Crash if an error is encountered. Any parsing errors should have
        # been dealt with at the alignment stage.
        if mmcif_object.mmcif_object is None:
            raise list(mmcif_object.errors.values())[0]

        mmcif_object = mmcif_object.mmcif_object

        data = self.data_pipeline.process_mmcif(
            mmcif=mmcif_object,
            alignment_dir=alignment_dir,
            chain_id=chain_id,
            alignment_index=alignment_index
        )

        return data

    def chain_id_to_idx(self, chain_id):
        """Map a chain name to its integer dataset index."""
        return self._chain_id_to_idx_dict[chain_id]

    def idx_to_chain_id(self, idx):
        """Map an integer dataset index back to its chain name."""
        return self._chain_ids[idx]

    def __getitem__(self, idx):
        name = self.idx_to_chain_id(idx)
        alignment_dir = os.path.join(self.alignment_dir, name)

        alignment_index = None
        if self.alignment_index is not None:
            # With an alignment index, alignments live in one flat directory
            alignment_dir = self.alignment_dir
            alignment_index = self.alignment_index[name]

        if self.mode == 'train' or self.mode == 'eval':
            spl = name.rsplit('_', 1)
            if len(spl) == 2:
                file_id, chain_id = spl
            else:
                file_id, = spl
                chain_id = None

            path = os.path.join(self.data_dir, file_id)
            structure_index_entry = None
            if self._structure_index is not None:
                structure_index_entry = self._structure_index[name]
                assert len(structure_index_entry["files"]) == 1
                filename, _, _ = structure_index_entry["files"][0]
                ext = os.path.splitext(filename)[1]
            else:
                ext = None
                for e in self.supported_exts:
                    if os.path.exists(path + e):
                        ext = e
                        break

                if ext is None:
                    raise ValueError("Invalid file type")

            path += ext
            if ext == ".cif":
                data = self._parse_mmcif(
                    path, file_id, chain_id, alignment_dir, alignment_index,
                )
            elif ext == ".core":
                data = self.data_pipeline.process_core(
                    path, alignment_dir, alignment_index,
                )
            elif ext == ".pdb":
                data = self.data_pipeline.process_pdb(
                    pdb_path=path,
                    alignment_dir=alignment_dir,
                    is_distillation=self.treat_pdb_as_distillation,
                    chain_id=chain_id,
                    alignment_index=alignment_index,
                    # BUG FIX: the original indexed self._structure_index[name]
                    # unconditionally, raising TypeError whenever no structure
                    # index was provided. Reuse the guarded lookup from above.
                    _structure_index=structure_index_entry,
                )
            else:
                raise ValueError("Extension branch missing")
        else:
            # BUG FIX: the FASTA path was previously built relative to the
            # CWD; anchor it at data_dir, which the docstring states holds
            # the FASTA files in inference mode (layout: one subdirectory
            # per chain name — TODO confirm against the data preparation
            # scripts).
            path = os.path.join(self.data_dir, name, name + ".fasta")
            data = self.data_pipeline.process_fasta(
                fasta_path=path,
                alignment_dir=alignment_dir,
                alignment_index=alignment_index,
            )

        if self._output_raw:
            return data

        feats = self.feature_pipeline.process_features(
            data, self.mode
        )

        # Tag every recycling copy of the sample with its dataset index
        feats["batch_idx"] = torch.tensor(
            [idx for _ in range(feats["aatype"].shape[-1])],
            dtype=torch.int64,
            device=feats["aatype"].device,
        )

        return feats

    def __len__(self):
        return len(self._chain_ids)
237
+
238
+
239
def deterministic_train_filter(
    chain_data_cache_entry: Any,
    max_resolution: float = 9.,
    max_single_aa_prop: float = 0.8,
) -> bool:
    """Return True iff the chain passes the deterministic (hard) training
    filters: resolution no worse than max_resolution, and no single amino
    acid making up more than max_single_aa_prop of the sequence.
    """
    # Resolution filter (skipped when the cache has no resolution entry)
    resolution = chain_data_cache_entry.get("resolution", None)
    if resolution is not None and resolution > max_resolution:
        return False

    # Low-complexity filter: reject sequences dominated by one residue type
    seq = chain_data_cache_entry["seq"]
    tallies = {}
    for residue in seq:
        tallies[residue] = tallies.get(residue, 0) + 1

    dominant_prop = max(tallies.values()) / len(seq)
    return dominant_prop <= max_single_aa_prop
260
+
261
+
262
def get_stochastic_train_filter_prob(
    chain_data_cache_entry: Any,
) -> float:
    # BUG FIX: the return annotation previously claimed List[float], but the
    # function returns the scalar product of the individual probabilities.
    """Compute the probability with which a chain is kept by the stochastic
    training filters.

    Two factors are multiplied together:
      * 1 / cluster_size (when a positive cluster size is cached), which
        down-weights chains from large sequence clusters, and
      * a length term, chain_length / 512 clamped to [256/512, 1],
        which down-weights short chains.

    Returns:
        The keep probability in (0, 1].
    """
    probabilities = []

    cluster_size = chain_data_cache_entry.get("cluster_size", None)
    if cluster_size is not None and cluster_size > 0:
        probabilities.append(1 / cluster_size)

    chain_length = len(chain_data_cache_entry["seq"])
    probabilities.append((1 / 512) * (max(min(chain_length, 512), 256)))

    # Product of all factors. With at most a handful of factors in (0, 1],
    # float underflow is not a practical concern.
    out = 1
    for p in probabilities:
        out *= p

    return out
281
+
282
+
283
class OpenFoldDataset(torch.utils.data.Dataset):
    """
    Implements the stochastic filters applied during AlphaFold's training.
    Because samples are selected from constituent datasets randomly, the
    length of an OpenFoldFilteredDataset is arbitrary. Samples are selected
    and filtered once at initialization.
    """
    def __init__(self,
        datasets: Sequence[OpenFoldSingleDataset],
        probabilities: Sequence[int],
        epoch_len: int,
        chain_data_cache_paths: List[str],
        generator: torch.Generator = None,
        _roll_at_init: bool = True,
    ):
        self.datasets = datasets
        self.probabilities = probabilities
        self.epoch_len = epoch_len
        self.generator = generator

        # One chain-data cache (resolution, sequence, cluster size, ...)
        # per constituent dataset
        self.chain_data_caches = []
        for cache_path in chain_data_cache_paths:
            with open(cache_path, "r") as fp:
                self.chain_data_caches.append(json.load(fp))

        def shuffled_idx_stream(dataset_len):
            # Endlessly yield dataset indices, uniformly reshuffling each
            # time the full index set has been exhausted
            while True:
                uniform = [1. for _ in range(dataset_len)]
                order = torch.multinomial(
                    torch.tensor(uniform),
                    num_samples=dataset_len,
                    replacement=False,
                    generator=self.generator,
                )
                for shuffled_idx in order:
                    yield shuffled_idx

        def filtered_sample_stream(dataset_idx):
            # Endlessly yield indices that survive both the deterministic
            # and the stochastic training filters
            cache_budget = int(epoch_len * probabilities[dataset_idx])
            dataset = self.datasets[dataset_idx]
            idx_iter = shuffled_idx_stream(len(dataset))
            chain_data_cache = self.chain_data_caches[dataset_idx]
            while True:
                keep_weights = []
                candidates = []
                for _ in range(cache_budget):
                    candidate = next(idx_iter)
                    chain_id = dataset.idx_to_chain_id(candidate)
                    entry = chain_data_cache[chain_id]
                    if not deterministic_train_filter(entry):
                        continue

                    # Keep with probability p, reject with 1 - p
                    p = get_stochastic_train_filter_prob(entry)
                    keep_weights.append([1. - p, p])
                    candidates.append(candidate)

                decisions = torch.multinomial(
                    torch.tensor(keep_weights),
                    num_samples=1,
                    generator=self.generator,
                )
                decisions = decisions.squeeze()

                kept = [i for i, keep in zip(candidates, decisions) if keep]

                for datapoint_idx in kept:
                    yield datapoint_idx

        self._samples = [
            filtered_sample_stream(i) for i in range(len(self.datasets))
        ]

        if _roll_at_init:
            self.reroll()

    def __getitem__(self, idx):
        dataset_idx, datapoint_idx = self.datapoints[idx]
        return self.datasets[dataset_idx][datapoint_idx]

    def __len__(self):
        return self.epoch_len

    def reroll(self):
        """Resample which (dataset, datapoint) pair backs each epoch slot."""
        dataset_choices = torch.multinomial(
            torch.tensor(self.probabilities),
            num_samples=self.epoch_len,
            replacement=True,
            generator=self.generator,
        )

        self.datapoints = []
        for dataset_idx in dataset_choices:
            sample_stream = self._samples[dataset_idx]
            self.datapoints.append((dataset_idx, next(sample_stream)))
379
+
380
+
381
class OpenFoldBatchCollator:
    """Collate per-sample feature dicts into one batch by stacking every
    leaf tensor along a new leading (batch) dimension."""
    def __call__(self, prots):
        return dict_multimap(
            lambda tensors: torch.stack(tensors, dim=0), prots
        )
385
+
386
+
387
class OpenFoldDataLoader(torch.utils.data.DataLoader):
    """
    DataLoader that augments each emitted batch with stochastic batch-level
    properties ("use_clamped_fape" and "no_recycling_iters") and trims the
    trailing recycling dimension of every feature to the sampled number of
    recycling iterations.
    """
    def __init__(self, *args, config, stage="train", generator=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.config = config
        self.stage = stage

        if generator is None:
            generator = torch.Generator()

        self.generator = generator
        self._prep_batch_properties_probs()

    def _prep_batch_properties_probs(self):
        """Build one categorical distribution per batch-level property."""
        keyed_probs = []
        stage_cfg = self.config[self.stage]

        max_iters = self.config.common.max_recycling_iters
        if stage_cfg.supervised:
            # FAPE clamping is toggled per batch with probability clamp_prob
            clamp_prob = self.config.supervised.clamp_prob
            keyed_probs.append(
                ("use_clamped_fape", [1 - clamp_prob, clamp_prob])
            )

        if stage_cfg.uniform_recycling:
            # Sample the number of recycling iterations uniformly
            recycling_probs = [
                1. / (max_iters + 1) for _ in range(max_iters + 1)
            ]
        else:
            # Otherwise, always run the maximum number of iterations
            recycling_probs = [
                0. for _ in range(max_iters + 1)
            ]
            recycling_probs[-1] = 1.

        keyed_probs.append(
            ("no_recycling_iters", recycling_probs)
        )

        keys, probs = zip(*keyed_probs)
        # Right-pad shorter distributions with zero-probability bins so
        # they stack into a single rectangular tensor
        max_len = max(len(p) for p in probs)
        padding = [[0.] * (max_len - len(p)) for p in probs]

        self.prop_keys = keys
        self.prop_probs_tensor = torch.tensor(
            [p + pad for p, pad in zip(probs, padding)],
            dtype=torch.float32,
        )

    def _add_batch_properties(self, batch):
        # One draw per property, shared by the whole batch
        samples = torch.multinomial(
            self.prop_probs_tensor,
            num_samples=1,  # 1 per row
            replacement=True,
            generator=self.generator
        )

        aatype = batch["aatype"]
        batch_dims = aatype.shape[:-2]
        recycling_dim = aatype.shape[-1]
        no_recycling = recycling_dim
        for i, key in enumerate(self.prop_keys):
            sample = int(samples[i][0])
            sample_tensor = torch.tensor(
                sample,
                device=aatype.device,
                requires_grad=False
            )
            orig_shape = sample_tensor.shape
            # Broadcast the sampled scalar over batch and recycling dims
            sample_tensor = sample_tensor.view(
                (1,) * len(batch_dims) + sample_tensor.shape + (1,)
            )
            sample_tensor = sample_tensor.expand(
                batch_dims + orig_shape + (recycling_dim,)
            )
            batch[key] = sample_tensor

            if key == "no_recycling_iters":
                no_recycling = sample

        # Trim every feature's trailing recycling dimension to the number
        # of iterations actually sampled
        resample_recycling = lambda t: t[..., :no_recycling + 1]
        batch = tensor_tree_map(resample_recycling, batch)

        return batch

    def __iter__(self):
        it = super().__iter__()

        def _batch_prop_gen(iterator):
            for batch in iterator:
                yield self._add_batch_properties(batch)

        return _batch_prop_gen(it)
478
+
479
+
480
class OpenFoldDataModule(pl.LightningDataModule):
    """
    PyTorch Lightning data module for OpenFold.

    Wires together the training (plus optional distillation), validation
    and prediction datasets and wraps them in OpenFoldDataLoaders that
    inject batch-level stochastic properties.
    """
    def __init__(self,
        config: mlc.ConfigDict,
        template_mmcif_dir: str,
        max_template_date: str,
        train_data_dir: Optional[str] = None,
        train_alignment_dir: Optional[str] = None,
        train_chain_data_cache_path: Optional[str] = None,
        distillation_data_dir: Optional[str] = None,
        distillation_alignment_dir: Optional[str] = None,
        distillation_chain_data_cache_path: Optional[str] = None,
        val_data_dir: Optional[str] = None,
        val_alignment_dir: Optional[str] = None,
        predict_data_dir: Optional[str] = None,
        predict_alignment_dir: Optional[str] = None,
        kalign_binary_path: str = '/usr/bin/kalign',
        train_mapping_path: Optional[str] = None,
        distillation_mapping_path: Optional[str] = None,
        obsolete_pdbs_file_path: Optional[str] = None,
        template_release_dates_cache_path: Optional[str] = None,
        batch_seed: Optional[int] = None,
        train_epoch_len: int = 50000,
        _distillation_structure_index_path: Optional[str] = None,
        alignment_index_path: Optional[str] = None,
        distillation_alignment_index_path: Optional[str] = None,
        **kwargs
    ):
        super(OpenFoldDataModule, self).__init__()

        self.config = config
        self.template_mmcif_dir = template_mmcif_dir
        self.max_template_date = max_template_date
        self.train_data_dir = train_data_dir
        self.train_alignment_dir = train_alignment_dir
        self.train_chain_data_cache_path = train_chain_data_cache_path
        self.distillation_data_dir = distillation_data_dir
        self.distillation_alignment_dir = distillation_alignment_dir
        self.distillation_chain_data_cache_path = (
            distillation_chain_data_cache_path
        )
        self.val_data_dir = val_data_dir
        self.val_alignment_dir = val_alignment_dir
        self.predict_data_dir = predict_data_dir
        self.predict_alignment_dir = predict_alignment_dir
        self.kalign_binary_path = kalign_binary_path
        self.train_mapping_path = train_mapping_path
        self.distillation_mapping_path = distillation_mapping_path
        self.template_release_dates_cache_path = (
            template_release_dates_cache_path
        )
        self.obsolete_pdbs_file_path = obsolete_pdbs_file_path
        self.batch_seed = batch_seed
        self.train_epoch_len = train_epoch_len

        if self.train_data_dir is None and self.predict_data_dir is None:
            raise ValueError(
                'At least one of train_data_dir or predict_data_dir must be '
                'specified'
            )

        # Training mode iff a training data directory was provided
        self.training_mode = self.train_data_dir is not None

        if self.training_mode and train_alignment_dir is None:
            raise ValueError(
                'In training mode, train_alignment_dir must be specified'
            )
        elif not self.training_mode and predict_alignment_dir is None:
            raise ValueError(
                'In inference mode, predict_alignment_dir must be specified'
            )
        elif val_data_dir is not None and val_alignment_dir is None:
            raise ValueError(
                'If val_data_dir is specified, val_alignment_dir must '
                'be specified as well'
            )

        # An ad-hoc measure for our particular filesystem restrictions
        self._distillation_structure_index = None
        if _distillation_structure_index_path is not None:
            with open(_distillation_structure_index_path, "r") as fp:
                self._distillation_structure_index = json.load(fp)

        self.alignment_index = None
        if alignment_index_path is not None:
            with open(alignment_index_path, "r") as fp:
                self.alignment_index = json.load(fp)

        self.distillation_alignment_index = None
        if distillation_alignment_index_path is not None:
            with open(distillation_alignment_index_path, "r") as fp:
                self.distillation_alignment_index = json.load(fp)

    def setup(self):
        """Instantiate the datasets for the current mode."""
        # Most of the arguments are the same for the three datasets
        dataset_gen = partial(OpenFoldSingleDataset,
            template_mmcif_dir=self.template_mmcif_dir,
            max_template_date=self.max_template_date,
            config=self.config,
            kalign_binary_path=self.kalign_binary_path,
            template_release_dates_cache_path=
                self.template_release_dates_cache_path,
            obsolete_pdbs_file_path=
                self.obsolete_pdbs_file_path,
        )

        if self.training_mode:
            train_dataset = dataset_gen(
                data_dir=self.train_data_dir,
                alignment_dir=self.train_alignment_dir,
                mapping_path=self.train_mapping_path,
                max_template_hits=self.config.train.max_template_hits,
                shuffle_top_k_prefiltered=
                    self.config.train.shuffle_top_k_prefiltered,
                treat_pdb_as_distillation=False,
                mode="train",
                alignment_index=self.alignment_index,
            )

            distillation_dataset = None
            if self.distillation_data_dir is not None:
                distillation_dataset = dataset_gen(
                    data_dir=self.distillation_data_dir,
                    alignment_dir=self.distillation_alignment_dir,
                    mapping_path=self.distillation_mapping_path,
                    max_template_hits=self.config.train.max_template_hits,
                    treat_pdb_as_distillation=True,
                    mode="train",
                    alignment_index=self.distillation_alignment_index,
                    _structure_index=self._distillation_structure_index,
                )

            if distillation_dataset is not None:
                datasets = [train_dataset, distillation_dataset]
                # Sample distillation data with probability d_prob
                # (a redundant duplicate assignment of d_prob was removed)
                d_prob = self.config.train.distillation_prob
                probabilities = [1. - d_prob, d_prob]
                chain_data_cache_paths = [
                    self.train_chain_data_cache_path,
                    self.distillation_chain_data_cache_path,
                ]
            else:
                datasets = [train_dataset]
                probabilities = [1.]
                chain_data_cache_paths = [
                    self.train_chain_data_cache_path,
                ]

            # BUG FIX: previously, generator was only bound when batch_seed
            # was provided, causing an UnboundLocalError just below when it
            # was not. Default to None (unseeded sampling).
            generator = None
            if self.batch_seed is not None:
                generator = torch.Generator()
                generator = generator.manual_seed(self.batch_seed + 1)

            self.train_dataset = OpenFoldDataset(
                datasets=datasets,
                probabilities=probabilities,
                epoch_len=self.train_epoch_len,
                chain_data_cache_paths=chain_data_cache_paths,
                generator=generator,
                _roll_at_init=False,
            )

            if self.val_data_dir is not None:
                self.eval_dataset = dataset_gen(
                    data_dir=self.val_data_dir,
                    alignment_dir=self.val_alignment_dir,
                    mapping_path=None,
                    max_template_hits=self.config.eval.max_template_hits,
                    mode="eval",
                )
            else:
                self.eval_dataset = None
        else:
            self.predict_dataset = dataset_gen(
                data_dir=self.predict_data_dir,
                alignment_dir=self.predict_alignment_dir,
                mapping_path=None,
                max_template_hits=self.config.predict.max_template_hits,
                mode="predict",
            )

    def _gen_dataloader(self, stage):
        """Build an OpenFoldDataLoader for the given stage."""
        generator = torch.Generator()
        if self.batch_seed is not None:
            generator = generator.manual_seed(self.batch_seed)

        dataset = None
        if stage == "train":
            dataset = self.train_dataset
            # Resample the epoch's (dataset, datapoint) assignments
            dataset.reroll()
        elif stage == "eval":
            dataset = self.eval_dataset
        elif stage == "predict":
            dataset = self.predict_dataset
        else:
            raise ValueError("Invalid stage")

        batch_collator = OpenFoldBatchCollator()

        dl = OpenFoldDataLoader(
            dataset,
            config=self.config,
            stage=stage,
            generator=generator,
            batch_size=self.config.data_module.data_loaders.batch_size,
            num_workers=self.config.data_module.data_loaders.num_workers,
            collate_fn=batch_collator,
        )

        return dl

    def train_dataloader(self):
        return self._gen_dataloader("train")

    def val_dataloader(self):
        if self.eval_dataset is not None:
            return self._gen_dataloader("eval")
        return None

    def predict_dataloader(self):
        return self._gen_dataloader("predict")
701
+
702
+
703
class DummyDataset(torch.utils.data.Dataset):
    """Debugging dataset that endlessly serves deep copies of one pickled
    batch, so each consumer can mutate its copy safely."""
    def __init__(self, batch_path):
        # Load the single batch once at construction time
        with open(batch_path, "rb") as handle:
            self.batch = pickle.load(handle)

    def __getitem__(self, idx):
        # Every index returns an independent copy of the same batch
        return copy.deepcopy(self.batch)

    def __len__(self):
        # Arbitrary fixed epoch length
        return 1000
713
+
714
+
715
class DummyDataLoader(pl.LightningDataModule):
    """Lightning data module that trains on copies of a single pickled
    batch; useful for overfitting/debugging runs."""
    def __init__(self, batch_path):
        super().__init__()
        self.dataset = DummyDataset(batch_path)

    def train_dataloader(self):
        return torch.utils.data.DataLoader(self.dataset)
openfold/data/data_pipeline.py ADDED
@@ -0,0 +1,826 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import os
17
+ import datetime
18
+ from multiprocessing import cpu_count
19
+ from typing import Mapping, Optional, Sequence, Any
20
+
21
+ import numpy as np
22
+
23
+ from openfold.data import templates, parsers, mmcif_parsing
24
+ from openfold.data.tools import jackhmmer, hhblits, hhsearch
25
+ from openfold.data.tools.utils import to_date
26
+ from openfold.np import residue_constants, protein
27
+
28
+
29
+ FeatureDict = Mapping[str, np.ndarray]
30
+
31
def empty_template_feats(n_res) -> FeatureDict:
    """Return zero-template placeholder features for a length-n_res query."""
    specs = {
        "template_aatype": ((0, n_res), np.int64),
        "template_all_atom_positions": ((0, n_res, 37, 3), np.float32),
        "template_sum_probs": ((0, 1), np.float32),
        "template_all_atom_mask": ((0, n_res, 37), np.float32),
    }
    return {
        name: np.zeros(shape, dtype=dtype)
        for name, (shape, dtype) in specs.items()
    }
39
+
40
+
41
def make_template_features(
    input_sequence: str,
    # FIX: annotated Mapping, not Sequence — the body calls hits.values()
    hits: Mapping[str, Any],
    template_featurizer: Any,
    query_pdb_code: Optional[str] = None,
    query_release_date: Optional[str] = None,
) -> FeatureDict:
    """Featurize template hits for a query sequence.

    Args:
        input_sequence: Query amino-acid sequence.
        hits: Mapping from alignment-output name to a list of template hits.
        template_featurizer: Object exposing get_templates(), or None.
        query_pdb_code: Optional PDB code of the query.
        query_release_date: Optional release date forwarded to the featurizer.

    Returns:
        Template feature dict; empty template features when there are no
        hits or no featurizer.
    """
    # Flatten the per-file hit lists into one list
    hits_cat = sum(hits.values(), [])
    if len(hits_cat) == 0 or template_featurizer is None:
        template_features = empty_template_feats(len(input_sequence))
    else:
        templates_result = template_featurizer.get_templates(
            query_sequence=input_sequence,
            query_pdb_code=query_pdb_code,
            query_release_date=query_release_date,
            hits=hits_cat,
        )
        template_features = templates_result.features

        # The template featurizer doesn't format empty template features
        # properly. This is a quick fix.
        if template_features["template_aatype"].shape[0] == 0:
            template_features = empty_template_feats(len(input_sequence))

    return template_features
66
+
67
+
68
def unify_template_features(
    template_feature_list: Sequence[FeatureDict]
) -> FeatureDict:
    """Merge per-chain template features into one multi-chain feature dict.

    Sequence-indexed features are zero-padded out to the total concatenated
    sequence length and placed in the slice belonging to their chain; the
    template dimension is then concatenated across chains.
    """
    out_dicts = []
    # Per-chain sequence lengths, in input order
    seq_lens = [fd["template_aatype"].shape[1] for fd in template_feature_list]
    for i, fd in enumerate(template_feature_list):
        out_dict = {}
        n_templates, n_res = fd["template_aatype"].shape[:2]
        for k,v in fd.items():
            # Features indexed by residue along dim 1 must be padded/placed
            seq_keys = [
                "template_aatype",
                "template_all_atom_positions",
                "template_all_atom_mask",
            ]
            if(k in seq_keys):
                new_shape = list(v.shape)
                assert(new_shape[1] == n_res)
                new_shape[1] = sum(seq_lens)
                new_array = np.zeros(new_shape, dtype=v.dtype)

                if(k == "template_aatype"):
                    # Positions belonging to other chains read as gap ('-')
                    new_array[..., residue_constants.HHBLITS_AA_TO_ID['-']] = 1

                # Write this chain's values into its slice of the long axis
                offset = sum(seq_lens[:i])
                new_array[:, offset:offset + seq_lens[i]] = v
                out_dict[k] = new_array
            else:
                out_dict[k] = v

        # Track which chain each template row originated from
        chain_indices = np.array(n_templates * [i])
        out_dict["template_chain_index"] = chain_indices

        # Chains contributing no templates are dropped entirely
        if(n_templates != 0):
            out_dicts.append(out_dict)

    # NOTE(review): if every chain has zero templates, out_dicts is empty and
    # out_dicts[0] raises IndexError — callers appear to guarantee at least
    # one template; confirm before using with all-empty inputs.
    out_dict = {
        k: np.concatenate([od[k] for od in out_dicts]) for k in out_dicts[0]
    }

    return out_dict
108
+
109
+
110
def make_sequence_features(
    sequence: str, description: str, num_res: int
) -> FeatureDict:
    """Construct a feature dict of sequence features."""
    one_hot = residue_constants.sequence_to_onehot(
        sequence=sequence,
        mapping=residue_constants.restype_order_with_x,
        map_unknown_to_x=True,
    )
    return {
        "aatype": one_hot,
        "between_segment_residues": np.zeros((num_res,), dtype=np.int32),
        "domain_name": np.array(
            [description.encode("utf-8")], dtype=np.object_
        ),
        "residue_index": np.arange(num_res, dtype=np.int32),
        "seq_length": np.full((num_res,), num_res, dtype=np.int32),
        "sequence": np.array([sequence.encode("utf-8")], dtype=np.object_),
    }
130
+
131
+
132
def make_mmcif_features(
    mmcif_object: mmcif_parsing.MmcifObject, chain_id: str
) -> FeatureDict:
    """Assemble sequence + structure features for one chain of an mmCIF."""
    seqres = mmcif_object.chain_to_seqres[chain_id]
    desc = "_".join([mmcif_object.file_id, chain_id])

    feats = dict(
        make_sequence_features(
            sequence=seqres,
            description=desc,
            num_res=len(seqres),
        )
    )

    # Atom coordinates and per-atom presence mask for this chain
    coords, mask = mmcif_parsing.get_atom_coords(
        mmcif_object=mmcif_object, chain_id=chain_id
    )
    feats["all_atom_positions"] = coords
    feats["all_atom_mask"] = mask

    feats["resolution"] = np.array(
        [mmcif_object.header["resolution"]], dtype=np.float32
    )
    feats["release_date"] = np.array(
        [mmcif_object.header["release_date"].encode("utf-8")], dtype=np.object_
    )
    # Experimental structures are never distillation data
    feats["is_distillation"] = np.array(0., dtype=np.float32)

    return feats
166
+
167
+
168
def _aatype_to_str_sequence(aatype):
    """Decode an integer aatype array into a one-letter residue string."""
    return ''.join(residue_constants.restypes_with_x[idx] for idx in aatype)
173
+
174
+
175
def make_protein_features(
    protein_object: protein.Protein,
    description: str,
    _is_distillation: bool = False,
) -> FeatureDict:
    """Sequence + atom features for a parsed protein.Protein object.

    Args:
        protein_object: Parsed protein structure.
        description: Name recorded in the "domain_name" feature.
        _is_distillation: Value written to the "is_distillation" feature.
    """
    aatype = protein_object.aatype
    feats = dict(
        make_sequence_features(
            sequence=_aatype_to_str_sequence(aatype),
            description=description,
            num_res=len(aatype),
        )
    )

    feats["all_atom_positions"] = (
        protein_object.atom_positions.astype(np.float32)
    )
    feats["all_atom_mask"] = protein_object.atom_mask.astype(np.float32)

    # PDB/ProteinNet entries carry no resolution info here
    feats["resolution"] = np.array([0.]).astype(np.float32)
    feats["is_distillation"] = np.array(
        1. if _is_distillation else 0.
    ).astype(np.float32)

    return feats
203
+
204
+
205
def make_pdb_features(
    protein_object: protein.Protein,
    description: str,
    is_distillation: bool = True,
    confidence_threshold: float = 50.,
) -> FeatureDict:
    """Features for a protein from a PDB file (distillation set by default).

    Args:
        protein_object: Parsed protein structure.
        description: Name recorded in the "domain_name" feature.
        is_distillation: Whether this entry belongs to the distillation set.
            Controls both the "is_distillation" feature and confidence
            masking below.
        confidence_threshold: Minimum per-atom B-factor (pLDDT for predicted
            structures) for a residue to be kept when is_distillation=True.
    """
    # BUG FIX: previously _is_distillation was hard-coded to True, so the
    # "is_distillation" feature ignored the is_distillation argument.
    pdb_feats = make_protein_features(
        protein_object, description, _is_distillation=is_distillation
    )

    if is_distillation:
        # Mask out residues in which no atom clears the confidence threshold
        high_confidence = protein_object.b_factors > confidence_threshold
        high_confidence = np.any(high_confidence, axis=-1)
        pdb_feats["all_atom_mask"] *= high_confidence[..., None]

    return pdb_feats
221
+
222
+
223
def make_msa_features(
    msas: Sequence[Sequence[str]],
    deletion_matrices: Sequence[parsers.DeletionMatrix],
) -> FeatureDict:
    """Constructs a feature dict of MSA features.

    Duplicate sequences (across all MSAs) are kept only the first time they
    appear; each kept row is integer-encoded with the HHblits alphabet.
    """
    if not msas:
        raise ValueError("At least one MSA must be provided.")

    int_msa = []
    deletion_matrix = []
    seen = set()
    for msa_index, msa in enumerate(msas):
        if not msa:
            raise ValueError(
                f"MSA {msa_index} must contain at least one sequence."
            )
        for sequence_index, sequence in enumerate(msa):
            if sequence in seen:
                continue
            seen.add(sequence)
            int_msa.append(
                [residue_constants.HHBLITS_AA_TO_ID[res] for res in sequence]
            )
            deletion_matrix.append(
                deletion_matrices[msa_index][sequence_index]
            )

    num_res = len(msas[0][0])
    num_alignments = len(int_msa)
    return {
        "deletion_matrix_int": np.array(deletion_matrix, dtype=np.int32),
        "msa": np.array(int_msa, dtype=np.int32),
        "num_alignments": np.array(
            [num_alignments] * num_res, dtype=np.int32
        ),
    }
257
+
258
+
259
class AlignmentRunner:
    """Runs alignment tools and saves the results"""
    def __init__(
        self,
        jackhmmer_binary_path: Optional[str] = None,
        hhblits_binary_path: Optional[str] = None,
        hhsearch_binary_path: Optional[str] = None,
        uniref90_database_path: Optional[str] = None,
        mgnify_database_path: Optional[str] = None,
        bfd_database_path: Optional[str] = None,
        uniclust30_database_path: Optional[str] = None,
        pdb70_database_path: Optional[str] = None,
        use_small_bfd: Optional[bool] = None,
        no_cpus: Optional[int] = None,
        uniref_max_hits: int = 10000,
        mgnify_max_hits: int = 5000,
    ):
        """
        Args:
            jackhmmer_binary_path:
                Path to jackhmmer binary
            hhblits_binary_path:
                Path to hhblits binary
            hhsearch_binary_path:
                Path to hhsearch binary
            uniref90_database_path:
                Path to uniref90 database. If provided, jackhmmer_binary_path
                must also be provided
            mgnify_database_path:
                Path to mgnify database. If provided, jackhmmer_binary_path
                must also be provided
            bfd_database_path:
                Path to BFD database. Depending on the value of use_small_bfd,
                one of hhblits_binary_path or jackhmmer_binary_path must be
                provided.
            uniclust30_database_path:
                Path to uniclust30. Searched alongside BFD if use_small_bfd is
                false.
            pdb70_database_path:
                Path to pdb70 database.
            use_small_bfd:
                Whether to search the BFD database alone with jackhmmer or
                in conjunction with uniclust30 with hhblits.
            no_cpus:
                The number of CPUs available for alignment. By default, all
                CPUs are used.
            uniref_max_hits:
                Max number of uniref hits
            mgnify_max_hits:
                Max number of mgnify hits
        """
        # Pair each tool binary with the databases it would search, so that
        # a database given without its binary can be rejected up front.
        db_map = {
            "jackhmmer": {
                "binary": jackhmmer_binary_path,
                "dbs": [
                    uniref90_database_path,
                    mgnify_database_path,
                    bfd_database_path if use_small_bfd else None,
                ],
            },
            "hhblits": {
                "binary": hhblits_binary_path,
                "dbs": [
                    bfd_database_path if not use_small_bfd else None,
                ],
            },
            "hhsearch": {
                "binary": hhsearch_binary_path,
                "dbs": [
                    pdb70_database_path,
                ],
            },
        }

        for name, dic in db_map.items():
            binary, dbs = dic["binary"], dic["dbs"]
            if(binary is None and not all([x is None for x in dbs])):
                raise ValueError(
                    f"{name} DBs provided but {name} binary is None"
                )

        # Template search (hhsearch) consumes the uniref90 MSA as its query,
        # so it cannot run without the uniref90 database (see run()).
        if(not all([x is None for x in db_map["hhsearch"]["dbs"]])
            and uniref90_database_path is None):
            raise ValueError(
                """uniref90_database_path must be specified in order to perform
                template search"""
            )

        self.uniref_max_hits = uniref_max_hits
        self.mgnify_max_hits = mgnify_max_hits
        self.use_small_bfd = use_small_bfd

        # Default to all available CPUs
        if(no_cpus is None):
            no_cpus = cpu_count()

        # Each runner below is only constructed when its binary/database
        # pair was supplied; run() checks for None before using them.
        self.jackhmmer_uniref90_runner = None
        if(jackhmmer_binary_path is not None and
            uniref90_database_path is not None
        ):
            self.jackhmmer_uniref90_runner = jackhmmer.Jackhmmer(
                binary_path=jackhmmer_binary_path,
                database_path=uniref90_database_path,
                n_cpu=no_cpus,
            )

        self.jackhmmer_small_bfd_runner = None
        self.hhblits_bfd_uniclust_runner = None
        if(bfd_database_path is not None):
            if use_small_bfd:
                # Small BFD is searched with jackhmmer alone
                self.jackhmmer_small_bfd_runner = jackhmmer.Jackhmmer(
                    binary_path=jackhmmer_binary_path,
                    database_path=bfd_database_path,
                    n_cpu=no_cpus,
                )
            else:
                # Full BFD is searched with hhblits, together with
                # uniclust30 when available
                dbs = [bfd_database_path]
                if(uniclust30_database_path is not None):
                    dbs.append(uniclust30_database_path)
                self.hhblits_bfd_uniclust_runner = hhblits.HHBlits(
                    binary_path=hhblits_binary_path,
                    databases=dbs,
                    n_cpu=no_cpus,
                )

        self.jackhmmer_mgnify_runner = None
        if(mgnify_database_path is not None):
            self.jackhmmer_mgnify_runner = jackhmmer.Jackhmmer(
                binary_path=jackhmmer_binary_path,
                database_path=mgnify_database_path,
                n_cpu=no_cpus,
            )

        self.hhsearch_pdb70_runner = None
        if(pdb70_database_path is not None):
            self.hhsearch_pdb70_runner = hhsearch.HHSearch(
                binary_path=hhsearch_binary_path,
                databases=[pdb70_database_path],
                n_cpu=no_cpus,
            )

    def run(
        self,
        fasta_path: str,
        output_dir: str,
    ):
        """Runs alignment tools on a sequence

        Writes each tool's output to a conventionally-named file in
        output_dir (uniref90_hits.a3m, pdb70_hits.hhr, mgnify_hits.a3m,
        small_bfd_hits.sto / bfd_uniclust_hits.a3m). Tools whose runner was
        not configured are skipped.
        """
        if(self.jackhmmer_uniref90_runner is not None):
            jackhmmer_uniref90_result = self.jackhmmer_uniref90_runner.query(
                fasta_path
            )[0]
            # Convert the Stockholm output to A3M, capped at uniref_max_hits
            uniref90_msa_as_a3m = parsers.convert_stockholm_to_a3m(
                jackhmmer_uniref90_result["sto"],
                max_sequences=self.uniref_max_hits
            )
            uniref90_out_path = os.path.join(output_dir, "uniref90_hits.a3m")
            with open(uniref90_out_path, "w") as f:
                f.write(uniref90_msa_as_a3m)

            # Template search is nested here because hhsearch queries the
            # uniref90 MSA; it is skipped when the uniref90 search is.
            if(self.hhsearch_pdb70_runner is not None):
                hhsearch_result = self.hhsearch_pdb70_runner.query(
                    uniref90_msa_as_a3m
                )
                pdb70_out_path = os.path.join(output_dir, "pdb70_hits.hhr")
                with open(pdb70_out_path, "w") as f:
                    f.write(hhsearch_result)

        if(self.jackhmmer_mgnify_runner is not None):
            jackhmmer_mgnify_result = self.jackhmmer_mgnify_runner.query(
                fasta_path
            )[0]
            mgnify_msa_as_a3m = parsers.convert_stockholm_to_a3m(
                jackhmmer_mgnify_result["sto"],
                max_sequences=self.mgnify_max_hits
            )
            mgnify_out_path = os.path.join(output_dir, "mgnify_hits.a3m")
            with open(mgnify_out_path, "w") as f:
                f.write(mgnify_msa_as_a3m)

        # BFD: exactly one of the two search strategies runs
        if(self.use_small_bfd and self.jackhmmer_small_bfd_runner is not None):
            jackhmmer_small_bfd_result = self.jackhmmer_small_bfd_runner.query(
                fasta_path
            )[0]
            bfd_out_path = os.path.join(output_dir, "small_bfd_hits.sto")
            with open(bfd_out_path, "w") as f:
                f.write(jackhmmer_small_bfd_result["sto"])
        elif(self.hhblits_bfd_uniclust_runner is not None):
            hhblits_bfd_uniclust_result = (
                self.hhblits_bfd_uniclust_runner.query(fasta_path)
            )
            if output_dir is not None:
                bfd_out_path = os.path.join(output_dir, "bfd_uniclust_hits.a3m")
                with open(bfd_out_path, "w") as f:
                    f.write(hhblits_bfd_uniclust_result["a3m"])
452
+
453
+
454
class DataPipeline:
    """Assembles input features."""
    def __init__(
        self,
        template_featurizer: Optional[templates.TemplateHitFeaturizer],
    ):
        # May be None, in which case empty template features are produced
        self.template_featurizer = template_featurizer

    def _parse_msa_data(
        self,
        alignment_dir: str,
        alignment_index: Optional[Any] = None,
    ) -> Mapping[str, Any]:
        """Parse all .a3m/.sto alignments for one chain.

        With an alignment_index, reads (name, start, size) slices out of a
        single packed database file; otherwise scans alignment_dir for loose
        alignment files. Returns {file name: {"msa", "deletion_matrix"}}.
        """
        msa_data = {}
        if(alignment_index is not None):
            # Packed-database mode: one file, many byte ranges
            fp = open(os.path.join(alignment_dir, alignment_index["db"]), "rb")

            def read_msa(start, size):
                fp.seek(start)
                msa = fp.read(size).decode("utf-8")
                return msa

            for (name, start, size) in alignment_index["files"]:
                ext = os.path.splitext(name)[-1]

                if(ext == ".a3m"):
                    msa, deletion_matrix = parsers.parse_a3m(
                        read_msa(start, size)
                    )
                    data = {"msa": msa, "deletion_matrix": deletion_matrix}
                elif(ext == ".sto"):
                    msa, deletion_matrix, _ = parsers.parse_stockholm(
                        read_msa(start, size)
                    )
                    data = {"msa": msa, "deletion_matrix": deletion_matrix}
                else:
                    # Non-alignment entries (e.g. .hhr) are skipped here
                    continue

                msa_data[name] = data

            fp.close()
        else:
            # Loose-file mode: parse every alignment file in the directory
            for f in os.listdir(alignment_dir):
                path = os.path.join(alignment_dir, f)
                ext = os.path.splitext(f)[-1]

                if(ext == ".a3m"):
                    with open(path, "r") as fp:
                        msa, deletion_matrix = parsers.parse_a3m(fp.read())
                    data = {"msa": msa, "deletion_matrix": deletion_matrix}
                elif(ext == ".sto"):
                    with open(path, "r") as fp:
                        msa, deletion_matrix, _ = parsers.parse_stockholm(
                            fp.read()
                        )
                    data = {"msa": msa, "deletion_matrix": deletion_matrix}
                else:
                    continue

                msa_data[f] = data

        return msa_data

    def _parse_template_hits(
        self,
        alignment_dir: str,
        alignment_index: Optional[Any] = None
    ) -> Mapping[str, Any]:
        """Parse all .hhr template-hit files, keyed by file name.

        Mirrors _parse_msa_data's two modes (packed index vs. loose files).
        """
        all_hits = {}
        if(alignment_index is not None):
            fp = open(os.path.join(alignment_dir, alignment_index["db"]), 'rb')

            def read_template(start, size):
                fp.seek(start)
                return fp.read(size).decode("utf-8")

            for (name, start, size) in alignment_index["files"]:
                ext = os.path.splitext(name)[-1]

                if(ext == ".hhr"):
                    hits = parsers.parse_hhr(read_template(start, size))
                    all_hits[name] = hits

            fp.close()
        else:
            for f in os.listdir(alignment_dir):
                path = os.path.join(alignment_dir, f)
                ext = os.path.splitext(f)[-1]

                if(ext == ".hhr"):
                    with open(path, "r") as fp:
                        hits = parsers.parse_hhr(fp.read())
                    all_hits[f] = hits

        return all_hits

    def _get_msas(self,
        alignment_dir: str,
        input_sequence: Optional[str] = None,
        alignment_index: Optional[str] = None,
    ):
        """Return (msas, deletion_matrices) tuples parsed from alignment_dir.

        Falls back to a single-sequence "MSA" built from input_sequence when
        the directory contains no alignments.
        """
        msa_data = self._parse_msa_data(alignment_dir, alignment_index)
        if(len(msa_data) == 0):
            if(input_sequence is None):
                raise ValueError(
                    """
                    If the alignment dir contains no MSAs, an input sequence
                    must be provided.
                    """
                )
            # Degenerate MSA: the query alone, with zero deletions
            msa_data["dummy"] = {
                "msa": [input_sequence],
                "deletion_matrix": [[0 for _ in input_sequence]],
            }

        msas, deletion_matrices = zip(*[
            (v["msa"], v["deletion_matrix"]) for v in msa_data.values()
        ])

        return msas, deletion_matrices

    def _process_msa_feats(
        self,
        alignment_dir: str,
        input_sequence: Optional[str] = None,
        alignment_index: Optional[str] = None
    ) -> Mapping[str, Any]:
        """Parse alignments and convert them into MSA feature arrays."""
        msas, deletion_matrices = self._get_msas(
            alignment_dir, input_sequence, alignment_index
        )
        msa_features = make_msa_features(
            msas=msas,
            deletion_matrices=deletion_matrices,
        )

        return msa_features

    def process_fasta(
        self,
        fasta_path: str,
        alignment_dir: str,
        alignment_index: Optional[str] = None,
    ) -> FeatureDict:
        """Assembles features for a single sequence in a FASTA file"""
        with open(fasta_path) as f:
            fasta_str = f.read()
        input_seqs, input_descs = parsers.parse_fasta(fasta_str)
        if len(input_seqs) != 1:
            raise ValueError(
                f"More than one input sequence found in {fasta_path}."
            )
        input_sequence = input_seqs[0]
        input_description = input_descs[0]
        num_res = len(input_sequence)

        hits = self._parse_template_hits(alignment_dir, alignment_index)
        template_features = make_template_features(
            input_sequence,
            hits,
            self.template_featurizer,
        )

        sequence_features = make_sequence_features(
            sequence=input_sequence,
            description=input_description,
            num_res=num_res,
        )

        msa_features = self._process_msa_feats(alignment_dir, input_sequence, alignment_index)

        return {
            **sequence_features,
            **msa_features,
            **template_features
        }

    def process_mmcif(
        self,
        mmcif: mmcif_parsing.MmcifObject,  # parsing is expensive, so no path
        alignment_dir: str,
        chain_id: Optional[str] = None,
        alignment_index: Optional[str] = None,
    ) -> FeatureDict:
        """
        Assembles features for a specific chain in an mmCIF object.

        If chain_id is None, it is assumed that there is only one chain
        in the object. Otherwise, a ValueError is thrown.
        """
        if chain_id is None:
            # Take the first chain present in the structure
            chains = mmcif.structure.get_chains()
            chain = next(chains, None)
            if chain is None:
                raise ValueError("No chains in mmCIF file")
            chain_id = chain.id

        mmcif_feats = make_mmcif_features(mmcif, chain_id)

        input_sequence = mmcif.chain_to_seqres[chain_id]
        hits = self._parse_template_hits(alignment_dir, alignment_index)
        template_features = make_template_features(
            input_sequence,
            hits,
            self.template_featurizer,
            # Release date lets the featurizer exclude post-dated templates
            query_release_date=to_date(mmcif.header["release_date"])
        )

        msa_features = self._process_msa_feats(alignment_dir, input_sequence, alignment_index)

        return {**mmcif_feats, **template_features, **msa_features}

    def process_pdb(
        self,
        pdb_path: str,
        alignment_dir: str,
        is_distillation: bool = True,
        chain_id: Optional[str] = None,
        _structure_index: Optional[str] = None,
        alignment_index: Optional[str] = None,
    ) -> FeatureDict:
        """
        Assembles features for a protein in a PDB file.
        """
        if(_structure_index is not None):
            # Packed-database mode: slice the PDB text out of a shared file
            db_dir = os.path.dirname(pdb_path)
            db = _structure_index["db"]
            db_path = os.path.join(db_dir, db)
            fp = open(db_path, "rb")
            _, offset, length = _structure_index["files"][0]
            fp.seek(offset)
            pdb_str = fp.read(length).decode("utf-8")
            fp.close()
        else:
            with open(pdb_path, 'r') as f:
                pdb_str = f.read()

        protein_object = protein.from_pdb_string(pdb_str, chain_id)
        input_sequence = _aatype_to_str_sequence(protein_object.aatype)
        # File stem (upper-cased) serves as the domain name
        description = os.path.splitext(os.path.basename(pdb_path))[0].upper()
        pdb_feats = make_pdb_features(
            protein_object,
            description,
            is_distillation=is_distillation
        )

        hits = self._parse_template_hits(alignment_dir, alignment_index)
        template_features = make_template_features(
            input_sequence,
            hits,
            self.template_featurizer,
        )

        msa_features = self._process_msa_feats(alignment_dir, input_sequence, alignment_index)

        return {**pdb_feats, **template_features, **msa_features}

    def process_core(
        self,
        core_path: str,
        alignment_dir: str,
        alignment_index: Optional[str] = None,
    ) -> FeatureDict:
        """
        Assembles features for a protein in a ProteinNet .core file.
        """
        with open(core_path, 'r') as f:
            core_str = f.read()

        protein_object = protein.from_proteinnet_string(core_str)
        input_sequence = _aatype_to_str_sequence(protein_object.aatype)
        description = os.path.splitext(os.path.basename(core_path))[0].upper()
        core_feats = make_protein_features(protein_object, description)

        hits = self._parse_template_hits(alignment_dir, alignment_index)
        template_features = make_template_features(
            input_sequence,
            hits,
            self.template_featurizer,
        )

        # NOTE(review): unlike the other process_* methods, alignment_index
        # is not forwarded here — presumably intentional for .core inputs;
        # confirm.
        msa_features = self._process_msa_feats(alignment_dir, input_sequence)

        return {**core_feats, **template_features, **msa_features}

    def process_multiseq_fasta(self,
        fasta_path: str,
        super_alignment_dir: str,
        ri_gap: int = 200,
    ) -> FeatureDict:
        """
        Assembles features for a multi-sequence FASTA. Uses Minkyung Baek's
        hack from Twitter (a.k.a. AlphaFold-Gap).
        """
        with open(fasta_path, 'r') as f:
            fasta_str = f.read()

        input_seqs, input_descs = parsers.parse_fasta(fasta_str)

        # No whitespace allowed
        input_descs = [i.split()[0] for i in input_descs]

        # Stitch all of the sequences together
        input_sequence = ''.join(input_seqs)
        input_description = '-'.join(input_descs)
        num_res = len(input_sequence)

        sequence_features = make_sequence_features(
            sequence=input_sequence,
            description=input_description,
            num_res=num_res,
        )

        # Insert a residue-index gap of ri_gap after each chain so the model
        # treats the chains as discontinuous
        seq_lens = [len(s) for s in input_seqs]
        total_offset = 0
        for sl in seq_lens:
            total_offset += sl
            sequence_features["residue_index"][total_offset:] += ri_gap

        # Per-chain alignments live in super_alignment_dir/<description>
        msa_list = []
        deletion_mat_list = []
        for seq, desc in zip(input_seqs, input_descs):
            alignment_dir = os.path.join(
                super_alignment_dir, desc
            )
            msas, deletion_mats = self._get_msas(
                alignment_dir, seq, None
            )
            msa_list.append(msas)
            deletion_mat_list.append(deletion_mats)

        # Pad each chain's MSA rows with gaps so every row spans the full
        # stitched sequence
        final_msa = []
        final_deletion_mat = []
        msa_it = enumerate(zip(msa_list, deletion_mat_list))
        for i, (msas, deletion_mats) in msa_it:
            prec, post = sum(seq_lens[:i]), sum(seq_lens[i + 1:])
            msas = [
                [prec * '-' + seq + post * '-' for seq in msa] for msa in msas
            ]
            deletion_mats = [
                [prec * [0] + dml + post * [0] for dml in deletion_mat]
                for deletion_mat in deletion_mats
            ]

            assert(len(msas[0][-1]) == len(input_sequence))

            final_msa.extend(msas)
            final_deletion_mat.extend(deletion_mats)

        msa_features = make_msa_features(
            msas=final_msa,
            deletion_matrices=final_deletion_mat,
        )

        # Template features are computed per chain, then merged
        template_feature_list = []
        for seq, desc in zip(input_seqs, input_descs):
            alignment_dir = os.path.join(
                super_alignment_dir, desc
            )
            hits = self._parse_template_hits(alignment_dir, alignment_index=None)
            template_features = make_template_features(
                seq,
                hits,
                self.template_featurizer,
            )
            template_feature_list.append(template_features)

        template_features = unify_template_features(template_feature_list)

        return {
            **sequence_features,
            **msa_features,
            **template_features,
        }
openfold/data/data_transforms.py ADDED
@@ -0,0 +1,1212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import itertools
17
+ from functools import reduce, wraps
18
+ from operator import add
19
+
20
+ import numpy as np
21
+ import torch
22
+
23
+ from openfold.config import NUM_RES, NUM_EXTRA_SEQ, NUM_TEMPLATES, NUM_MSA_SEQ
24
+ from openfold.np import residue_constants as rc
25
+ from openfold.utils.rigid_utils import Rotation, Rigid
26
+ from openfold.utils.tensor_utils import (
27
+ tree_map,
28
+ tensor_tree_map,
29
+ batched_gather,
30
+ )
31
+
32
+
33
+ MSA_FEATURE_NAMES = [
34
+ "msa",
35
+ "deletion_matrix",
36
+ "msa_mask",
37
+ "msa_row_mask",
38
+ "bert_mask",
39
+ "true_msa",
40
+ ]
41
+
42
+
43
def cast_to_64bit_ints(protein):
    """Promote every int32 tensor in the feature dict to int64, in place."""
    # We keep all ints as int64
    for key, tensor in protein.items():
        if tensor.dtype == torch.int32:
            protein[key] = tensor.type(torch.int64)
    return protein
50
+
51
+
52
def make_one_hot(x, num_classes):
    """One-hot encode integer tensor `x` into a trailing float `num_classes` axis."""
    out_shape = tuple(x.shape) + (num_classes,)
    x_one_hot = torch.zeros(out_shape, device=x.device)
    x_one_hot.scatter_(-1, x[..., None], 1)
    return x_one_hot
56
+
57
+
58
def make_seq_mask(protein):
    """Attach an all-ones float residue mask (zero-padded later by make_fixed_size)."""
    residue_shape = protein["aatype"].shape
    protein["seq_mask"] = torch.ones(residue_shape, dtype=torch.float32)
    return protein
63
+
64
+
65
def make_template_mask(protein):
    """Mark every template row as present with a ones mask."""
    n_templ = protein["template_aatype"].shape[0]
    protein["template_mask"] = torch.ones(n_templ, dtype=torch.float32)
    return protein
70
+
71
+
72
def curry1(f):
    """Supply all arguments but the first.

    ``curry1(f)(a, b)`` returns a one-argument callable ``x -> f(x, a, b)``,
    which is how the transform pipeline pre-binds configuration.
    """
    @wraps(f)
    def fc(*args, **kwargs):
        def partially_applied(x):
            return f(x, *args, **kwargs)
        return partially_applied

    return fc
79
+
80
+
81
def make_all_atom_aatype(protein):
    """Alias `aatype` under the key expected by the atom-level transforms."""
    aatype = protein["aatype"]
    protein["all_atom_aatype"] = aatype
    return protein
84
+
85
+
86
def fix_templates_aatype(protein):
    """Convert one-hot template aatypes to indices, remapped from HHblits
    residue ordering to this project's ordering."""
    num_templates = protein["template_aatype"].shape[0]
    if num_templates > 0:
        # One-hot -> index along the residue-type axis.
        template_indices = torch.argmax(protein["template_aatype"], dim=-1)
        protein["template_aatype"] = template_indices
        # Lookup table from hhblits ordering to ours, one row per template.
        mapping = torch.tensor(
            rc.MAP_HHBLITS_AATYPE_TO_OUR_AATYPE,
            dtype=torch.int64,
            device=protein["aatype"].device,
        ).expand(num_templates, -1)
        protein["template_aatype"] = torch.gather(
            mapping, 1, index=protein["template_aatype"]
        )

    return protein
103
+
104
+
105
def correct_msa_restypes(protein):
    """Correct MSA restype to have the same order as rc.

    Remaps every MSA index from the HHblits residue ordering to this
    project's ordering, and permutes the columns of any "*profile*"
    feature accordingly.
    """
    new_order_list = rc.MAP_HHBLITS_AATYPE_TO_OUR_AATYPE
    # One mapping column per residue position; gather along dim 0 so that
    # result[i][j] = new_order[msa[i][j]][j].
    new_order = torch.tensor(
        [new_order_list] * protein["msa"].shape[1],
        device=protein["msa"].device,
    ).transpose(0, 1)
    protein["msa"] = torch.gather(new_order, 0, protein["msa"])

    # Permutation matrix applying the same reordering to profile columns.
    perm_matrix = np.zeros((22, 22), dtype=np.float32)
    perm_matrix[range(len(new_order_list)), new_order_list] = 1.0

    for k in protein:
        if "profile" in k:
            # BUG FIX: the original used `protein[k].shape.as_list()` (a
            # TensorFlow API; torch.Size has no .as_list()) and `torch.dot`,
            # which only accepts 1-D tensors and was handed a NumPy matrix.
            num_dim = protein[k].shape[-1]
            assert num_dim in [
                20,
                21,
                22,
            ], "num_dim for %s out of expected range: %s" % (k, num_dim)
            protein[k] = torch.matmul(
                protein[k],
                torch.tensor(
                    perm_matrix[:num_dim, :num_dim],
                    dtype=protein[k].dtype,
                    device=protein[k].device,
                ),
            )

    return protein
128
+
129
+
130
def squeeze_features(protein):
    """Remove singleton and repeated dimensions in protein features."""
    # One-hot aatype -> index representation.
    protein["aatype"] = torch.argmax(protein["aatype"], dim=-1)

    squeezable = (
        "domain_name",
        "msa",
        "num_alignments",
        "seq_length",
        "sequence",
        "superfamily",
        "deletion_matrix",
        "resolution",
        "between_segment_residues",
        "residue_index",
        "template_all_atom_mask",
    )
    for key in squeezable:
        if key not in protein:
            continue
        value = protein[key]
        final_dim = value.shape[-1]
        if isinstance(final_dim, int) and final_dim == 1:
            # Features may arrive either as torch tensors or numpy arrays.
            if torch.is_tensor(value):
                protein[key] = value.squeeze(dim=-1)
            else:
                protein[key] = np.squeeze(value, axis=-1)

    # These are stored once per residue but are scalar per example.
    for key in ("seq_length", "num_alignments"):
        if key in protein:
            protein[key] = protein[key][0]

    return protein
159
+
160
+
161
@curry1
def randomly_replace_msa_with_unknown(protein, replace_proportion):
    """Replace a portion of the MSA with 'X'."""
    X_IDX = 20
    GAP_IDX = 21
    # Draw replacement positions for the MSA, but never overwrite gaps.
    replace_msa = torch.logical_and(
        torch.rand(protein["msa"].shape) < replace_proportion,
        protein["msa"] != GAP_IDX,
    )
    protein["msa"] = torch.where(
        replace_msa,
        torch.full_like(protein["msa"], X_IDX),
        protein["msa"],
    )

    # Independently replace positions in the target sequence.
    replace_seq = torch.rand(protein["aatype"].shape) < replace_proportion
    protein["aatype"] = torch.where(
        replace_seq,
        torch.full_like(protein["aatype"], X_IDX),
        protein["aatype"],
    )
    return protein
181
+
182
+
183
@curry1
def sample_msa(protein, max_seq, keep_extra, seed=None):
    """Sample MSA randomly; the remaining sequences are stored as `extra_*`."""
    num_seq = protein["msa"].shape[0]
    g = torch.Generator(device=protein["msa"].device)
    if seed is not None:
        g.manual_seed(seed)
    # Row 0 (the target sequence) always stays first; shuffle the rest.
    shuffled_rest = torch.randperm(num_seq - 1, generator=g) + 1
    index_order = torch.cat(
        (torch.tensor([0], device=shuffled_rest.device), shuffled_rest),
        dim=0,
    )
    num_sel = min(max_seq, num_seq)
    sel_seq = index_order[:num_sel]
    not_sel_seq = index_order[num_sel:]

    for k in MSA_FEATURE_NAMES:
        if k not in protein:
            continue
        if keep_extra:
            protein["extra_" + k] = torch.index_select(
                protein[k], 0, not_sel_seq
            )
        protein[k] = torch.index_select(protein[k], 0, sel_seq)

    return protein
209
+
210
+
211
@curry1
def add_distillation_flag(protein, distillation):
    """Record whether this example comes from the distillation set."""
    protein['is_distillation'] = distillation
    return protein
215
+
216
@curry1
def sample_msa_distillation(protein, max_seq):
    """Subsample the MSA (dropping extras) only for distillation examples."""
    if protein["is_distillation"] == 1:
        return sample_msa(max_seq, keep_extra=False)(protein)
    return protein
221
+
222
+
223
@curry1
def crop_extra_msa(protein, max_extra_msa):
    """Keep at most `max_extra_msa` randomly-chosen extra MSA rows."""
    num_seq = protein["extra_msa"].shape[0]
    keep = torch.randperm(num_seq)[: min(max_extra_msa, num_seq)]
    for k in MSA_FEATURE_NAMES:
        extra_key = "extra_" + k
        if extra_key in protein:
            protein[extra_key] = torch.index_select(
                protein[extra_key], 0, keep
            )

    return protein
235
+
236
+
237
def delete_extra_msa(protein):
    """Drop all `extra_*` MSA features from the example."""
    for k in MSA_FEATURE_NAMES:
        protein.pop("extra_" + k, None)
    return protein
242
+
243
+
244
# Not used in inference
@curry1
def block_delete_msa(protein, config):
    """Delete random contiguous blocks of MSA rows (training augmentation).

    The original body relied on TensorFlow-style calls that are invalid in
    PyTorch (`torch.range(block_num_seq)` with the wrong signature,
    `Uniform(...).sample(nb)` with an int sample shape, float block starts
    used as indices, `torch.unique(torch.sort(...))` applied to a tuple, and
    `torch.gather` without a `dim`), so it crashed whenever it executed.
    This is a working port with the same intent: delete `num_blocks` random
    blocks of `msa_fraction_per_block * num_seq` consecutive sequences each,
    while always keeping the target sequence (row 0).
    """
    num_seq = protein["msa"].shape[0]
    block_num_seq = int(num_seq * config.msa_fraction_per_block)

    if config.randomize_num_blocks:
        nb = int(torch.randint(0, config.num_blocks + 1, (1,)).item())
    else:
        nb = config.num_blocks

    # Nothing to delete: degenerate configs or a single-row MSA.
    if nb == 0 or block_num_seq == 0 or num_seq <= 1:
        return protein

    del_block_starts = torch.randint(0, num_seq, (nb,))
    del_blocks = del_block_starts[:, None] + torch.arange(block_num_seq)
    del_blocks = torch.clip(del_blocks, 0, num_seq - 1).reshape(-1)

    keep_mask = torch.ones(num_seq, dtype=torch.bool)
    keep_mask[del_blocks] = False
    keep_mask[0] = True  # never delete the target sequence

    keep_indices = torch.nonzero(keep_mask, as_tuple=True)[0]
    for k in MSA_FEATURE_NAMES:
        if k in protein:
            protein[k] = torch.index_select(protein[k], 0, keep_indices)

    return protein
277
+
278
+
279
@curry1
def nearest_neighbor_clusters(protein, gap_agreement_weight=0.0):
    """Assign each extra MSA sequence to its closest sampled MSA row."""
    device = protein["msa"].device
    # Per-class weights for the agreement score: amino acids count fully,
    # gaps with `gap_agreement_weight`, the mask class not at all.
    weights = torch.cat(
        [
            torch.ones(21, device=device),
            gap_agreement_weight * torch.ones(1, device=device),
            torch.zeros(1, device=device),
        ],
        0,
    )

    # Agreement score is a weighted Hamming distance over masked one-hots.
    msa_one_hot = make_one_hot(protein["msa"], 23)
    sample_one_hot = protein["msa_mask"][:, :, None] * msa_one_hot
    extra_msa_one_hot = make_one_hot(protein["extra_msa"], 23)
    extra_one_hot = protein["extra_msa_mask"][:, :, None] * extra_msa_one_hot

    num_seq, num_res, _ = sample_one_hot.shape
    extra_num_seq, _, _ = extra_one_hot.shape

    # Equivalent to einsum('mrc,nrc,c->mn', sample_one_hot, extra_one_hot,
    # weights), computed as one flattened matmul to avoid a memory or
    # computation blowup.
    flat_extra = torch.reshape(extra_one_hot, [extra_num_seq, num_res * 23])
    flat_weighted_sample = torch.reshape(
        sample_one_hot * weights, [num_seq, num_res * 23]
    )
    agreement = torch.matmul(
        flat_extra, flat_weighted_sample.transpose(0, 1)
    )

    # Closest cluster center for every extra sequence.
    protein["extra_cluster_assignment"] = torch.argmax(agreement, dim=1).to(
        torch.int64
    )

    return protein
314
+
315
+
316
def unsorted_segment_sum(data, segment_ids, num_segments):
    """
    Computes the sum along segments of a tensor. Similar to
    tf.unsorted_segment_sum, but only supports 1-D indices.

    :param data: A tensor whose segments are to be summed.
    :param segment_ids: The 1-D segment indices tensor.
    :param num_segments: The number of segments.
    :return: A tensor of same data type as the data argument.
    """
    assert (
        segment_ids.dim() == 1 and segment_ids.shape[0] == data.shape[0]
    )
    # Broadcast the 1-D ids over all trailing dims of `data`.
    trailing = (1,) * (data.dim() - 1)
    expanded_ids = segment_ids.view(segment_ids.shape[0], *trailing)
    expanded_ids = expanded_ids.expand(data.shape)

    out = torch.zeros(
        num_segments, *data.shape[1:], device=segment_ids.device
    ).scatter_add_(0, expanded_ids, data.float())
    return out.type(data.dtype)
341
+
342
+
343
@curry1
def summarize_clusters(protein):
    """Produce profile and deletion_matrix_mean within each cluster."""
    num_seq = protein["msa"].shape[0]

    def cluster_sum(x):
        # Sum extra-MSA rows into their assigned cluster centers.
        return unsorted_segment_sum(
            x, protein["extra_cluster_assignment"], num_seq
        )

    mask = protein["extra_msa_mask"]
    # The +msa_mask term counts the cluster center itself; the epsilon
    # guards the division below.
    mask_counts = 1e-6 + protein["msa_mask"] + cluster_sum(mask)

    msa_sum = cluster_sum(mask[:, :, None] * make_one_hot(protein["extra_msa"], 23))
    msa_sum = msa_sum + make_one_hot(protein["msa"], 23)  # Original sequence
    protein["cluster_profile"] = msa_sum / mask_counts[:, :, None]

    del_sum = cluster_sum(mask * protein["extra_deletion_matrix"])
    del_sum = del_sum + protein["deletion_matrix"]  # Original sequence
    protein["cluster_deletion_mean"] = del_sum / mask_counts

    return protein
367
+
368
+
369
def make_msa_mask(protein):
    """Mask features are all ones, but will later be zero-padded."""
    msa_shape = protein["msa"].shape
    protein["msa_mask"] = torch.ones(msa_shape, dtype=torch.float32)
    protein["msa_row_mask"] = torch.ones(msa_shape[0], dtype=torch.float32)
    return protein
376
+
377
+
378
def pseudo_beta_fn(aatype, all_atom_positions, all_atom_mask):
    """Create pseudo beta features: CB positions, CA for glycine."""
    ca_idx = rc.atom_order["CA"]
    cb_idx = rc.atom_order["CB"]
    is_gly = torch.eq(aatype, rc.restype_order["G"])

    # Broadcast the glycine flag over the xyz axis.
    gly_tile = torch.tile(is_gly[..., None], [1] * len(is_gly.shape) + [3])
    pseudo_beta = torch.where(
        gly_tile,
        all_atom_positions[..., ca_idx, :],
        all_atom_positions[..., cb_idx, :],
    )

    if all_atom_mask is None:
        return pseudo_beta

    pseudo_beta_mask = torch.where(
        is_gly, all_atom_mask[..., ca_idx], all_atom_mask[..., cb_idx]
    )
    return pseudo_beta, pseudo_beta_mask
396
+
397
+
398
@curry1
def make_pseudo_beta(protein, prefix=""):
    """Create pseudo-beta (alpha for glycine) position and mask."""
    assert prefix in ["", "template_"]
    if prefix:
        aatype_key = "template_aatype"
        mask_key = "template_all_atom_mask"
    else:
        aatype_key = "aatype"
        mask_key = "all_atom_mask"

    beta, beta_mask = pseudo_beta_fn(
        protein[aatype_key],
        protein[prefix + "all_atom_positions"],
        protein[mask_key],
    )
    protein[prefix + "pseudo_beta"] = beta
    protein[prefix + "pseudo_beta_mask"] = beta_mask
    return protein
411
+
412
+
413
@curry1
def add_constant_field(protein, key, value):
    """Store `value` as a constant tensor feature under `key`."""
    device = protein["msa"].device
    protein[key] = torch.tensor(value, device=device)
    return protein
417
+
418
+
419
def shaped_categorical(probs, epsilon=1e-10):
    """Sample class indices from `probs` along the last axis, keeping the
    leading shape. `epsilon` keeps zero-probability rows valid."""
    num_classes = probs.shape[-1]
    flat = torch.reshape(probs + epsilon, [-1, num_classes])
    samples = torch.distributions.categorical.Categorical(flat).sample()
    return torch.reshape(samples, probs.shape[:-1])
427
+
428
+
429
def make_hhblits_profile(protein):
    """Compute the HHblits MSA profile if not already present."""
    if "hhblits_profile" not in protein:
        # Mean one-hot over all MSA rows -> per-residue residue-type profile.
        protein["hhblits_profile"] = make_one_hot(protein["msa"], 22).mean(dim=0)
    return protein
439
+
440
+
441
@curry1
def make_masked_msa(protein, config, replace_fraction):
    """Create data for BERT on raw MSA.

    Randomly masks `replace_fraction` of MSA positions; each masked position
    is replaced by a sample from a mixture of a uniform amino acid, the
    HHblits profile, the original residue, and a dedicated [MASK] class.
    Stores the mask as `bert_mask`, the untouched MSA as `true_msa`, and the
    corrupted MSA back under `msa`.
    """
    # Add a random amino acid uniformly (zero weight on gap/X classes).
    random_aa = torch.tensor(
        [0.05] * 20 + [0.0, 0.0],
        dtype=torch.float32,
        device=protein["aatype"].device
    )

    # Mixture of uniform / profile / identity replacement distributions.
    categorical_probs = (
        config.uniform_prob * random_aa
        + config.profile_prob * protein["hhblits_profile"]
        + config.same_prob * make_one_hot(protein["msa"], 22)
    )

    # Put all remaining probability on [MASK] which is a new column.
    # F.pad takes (left, right) pairs starting from the LAST dim, so
    # pad_shapes[1] = 1 appends one class column on the right.
    pad_shapes = list(
        reduce(add, [(0, 0) for _ in range(len(categorical_probs.shape))])
    )
    pad_shapes[1] = 1
    mask_prob = (
        1.0 - config.profile_prob - config.same_prob - config.uniform_prob
    )
    assert mask_prob >= 0.0

    categorical_probs = torch.nn.functional.pad(
        categorical_probs, pad_shapes, value=mask_prob
    )

    # Choose which positions to corrupt.
    sh = protein["msa"].shape
    mask_position = torch.rand(sh) < replace_fraction

    bert_msa = shaped_categorical(categorical_probs)
    bert_msa = torch.where(mask_position, bert_msa, protein["msa"])

    # Mix real and masked MSA
    protein["bert_mask"] = mask_position.to(torch.float32)
    protein["true_msa"] = protein["msa"]
    protein["msa"] = bert_msa

    return protein
483
+
484
+
485
@curry1
def make_fixed_size(
    protein,
    shape_schema,
    msa_cluster_size,
    extra_msa_size,
    num_res=0,
    num_templates=0,
):
    """Guess at the MSA and sequence dimension to make fixed size.

    Zero-pads every feature so that each symbolic dimension in its
    `shape_schema` entry reaches the corresponding fixed size.
    """
    # Map each symbolic dimension name to its requested padded size.
    pad_size_map = {
        NUM_RES: num_res,
        NUM_MSA_SEQ: msa_cluster_size,
        NUM_EXTRA_SEQ: extra_msa_size,
        NUM_TEMPLATES: num_templates,
    }

    for k, v in protein.items():
        # Don't transfer this to the accelerator.
        if k == "extra_cluster_assignment":
            continue
        shape = list(v.shape)
        schema = shape_schema[k]
        msg = "Rank mismatch between shape and shape schema for"
        assert len(shape) == len(schema), f"{msg} {k}: {shape} vs {schema}"
        # Dimensions with no schema entry (or a requested size of 0) keep
        # their current size (`or s1` falls through for None/0).
        pad_size = [
            pad_size_map.get(s2, None) or s1 for (s1, s2) in zip(shape, schema)
        ]

        # F.pad expects (left, right) pairs ordered from the LAST dimension,
        # hence the reverse before flattening.
        padding = [(0, p - v.shape[i]) for i, p in enumerate(pad_size)]
        padding.reverse()
        padding = list(itertools.chain(*padding))
        if padding:
            protein[k] = torch.nn.functional.pad(v, padding)
            protein[k] = torch.reshape(protein[k], pad_size)

    return protein
522
+
523
+
524
@curry1
def make_msa_feat(protein):
    """Create and concatenate MSA features.

    Produces `target_feat` (break flag + one-hot aatype) and `msa_feat`
    (one-hot MSA + deletion features, plus cluster statistics when present),
    and derives `extra_has_deletion` / `extra_deletion_value` for the extra
    MSA when available.
    """
    # Whether there is a domain break. Always zero for chains, but keeping for
    # compatibility with domain datasets.
    has_break = torch.clip(
        protein["between_segment_residues"].to(torch.float32), 0, 1
    )
    aatype_1hot = make_one_hot(protein["aatype"], 21)

    target_feat = [
        torch.unsqueeze(has_break, dim=-1),
        aatype_1hot,  # Everyone gets the original sequence.
    ]

    msa_1hot = make_one_hot(protein["msa"], 23)
    has_deletion = torch.clip(protein["deletion_matrix"], 0.0, 1.0)
    # Squash unbounded deletion counts into (0, 1) via arctan.
    deletion_value = torch.atan(protein["deletion_matrix"] / 3.0) * (
        2.0 / np.pi
    )

    msa_feat = [
        msa_1hot,
        torch.unsqueeze(has_deletion, dim=-1),
        torch.unsqueeze(deletion_value, dim=-1),
    ]

    # Cluster statistics exist only when nearest_neighbor_clusters /
    # summarize_clusters ran earlier in the pipeline.
    if "cluster_profile" in protein:
        deletion_mean_value = torch.atan(
            protein["cluster_deletion_mean"] / 3.0
        ) * (2.0 / np.pi)
        msa_feat.extend(
            [
                protein["cluster_profile"],
                torch.unsqueeze(deletion_mean_value, dim=-1),
            ]
        )

    if "extra_deletion_matrix" in protein:
        protein["extra_has_deletion"] = torch.clip(
            protein["extra_deletion_matrix"], 0.0, 1.0
        )
        protein["extra_deletion_value"] = torch.atan(
            protein["extra_deletion_matrix"] / 3.0
        ) * (2.0 / np.pi)

    protein["msa_feat"] = torch.cat(msa_feat, dim=-1)
    protein["target_feat"] = torch.cat(target_feat, dim=-1)
    return protein
573
+
574
+
575
@curry1
def select_feat(protein, feature_list):
    """Keep only the features named in `feature_list`."""
    selected = {}
    for name, value in protein.items():
        if name in feature_list:
            selected[name] = value
    return selected
578
+
579
+
580
@curry1
def crop_templates(protein, max_templates):
    """Truncate every template feature to its first `max_templates` entries."""
    template_keys = [k for k in protein if k.startswith("template_")]
    for k in template_keys:
        protein[k] = protein[k][:max_templates]
    return protein
586
+
587
+
588
def make_atom14_masks(protein):
    """Construct denser atom positions (14 dimensions instead of 37).

    Builds per-restype lookup tables between the compact atom14 layout and
    the full atom37 layout, then gathers them for this protein's sequence.

    Adds to `protein`:
        atom14_atom_exists:      [num_res, 14] atom-present mask per residue
        residx_atom14_to_atom37: [num_res, 14] atom37 index of each atom14 slot
        residx_atom37_to_atom14: [num_res, 37] atom14 index of each atom37 slot
        atom37_atom_exists:      [num_res, 37] atom-present mask per residue
    """
    # Per-restype tables, one entry per standard residue type.
    restype_atom14_to_atom37 = []
    restype_atom37_to_atom14 = []
    restype_atom14_mask = []

    for rt in rc.restypes:
        atom_names = rc.restype_name_to_atom14_names[rc.restype_1to3[rt]]
        # Empty atom14 slots ("" names) map to atom37 index 0.
        restype_atom14_to_atom37.append(
            [(rc.atom_order[name] if name else 0) for name in atom_names]
        )
        atom_name_to_idx14 = {name: i for i, name in enumerate(atom_names)}
        # atom37 slots the residue does not have map to atom14 index 0.
        restype_atom37_to_atom14.append(
            [
                (atom_name_to_idx14[name] if name in atom_name_to_idx14 else 0)
                for name in rc.atom_types
            ]
        )

        restype_atom14_mask.append(
            [(1.0 if name else 0.0) for name in atom_names]
        )

    # Add dummy mapping for restype 'UNK'
    restype_atom14_to_atom37.append([0] * 14)
    restype_atom37_to_atom14.append([0] * 37)
    restype_atom14_mask.append([0.0] * 14)

    restype_atom14_to_atom37 = torch.tensor(
        restype_atom14_to_atom37,
        dtype=torch.int32,
        device=protein["aatype"].device,
    )
    restype_atom37_to_atom14 = torch.tensor(
        restype_atom37_to_atom14,
        dtype=torch.int32,
        device=protein["aatype"].device,
    )
    restype_atom14_mask = torch.tensor(
        restype_atom14_mask,
        dtype=torch.float32,
        device=protein["aatype"].device,
    )
    protein_aatype = protein['aatype'].to(torch.long)

    # create the mapping for (residx, atom14) --> atom37, i.e. an array
    # with shape (num_res, 14) containing the atom37 indices for this protein
    residx_atom14_to_atom37 = restype_atom14_to_atom37[protein_aatype]
    residx_atom14_mask = restype_atom14_mask[protein_aatype]

    protein["atom14_atom_exists"] = residx_atom14_mask
    protein["residx_atom14_to_atom37"] = residx_atom14_to_atom37.long()

    # create the gather indices for mapping back
    residx_atom37_to_atom14 = restype_atom37_to_atom14[protein_aatype]
    protein["residx_atom37_to_atom14"] = residx_atom37_to_atom14.long()

    # create the corresponding mask
    restype_atom37_mask = torch.zeros(
        [21, 37], dtype=torch.float32, device=protein["aatype"].device
    )
    for restype, restype_letter in enumerate(rc.restypes):
        restype_name = rc.restype_1to3[restype_letter]
        atom_names = rc.residue_atoms[restype_name]
        for atom_name in atom_names:
            atom_type = rc.atom_order[atom_name]
            restype_atom37_mask[restype, atom_type] = 1

    residx_atom37_mask = restype_atom37_mask[protein_aatype]
    protein["atom37_atom_exists"] = residx_atom37_mask

    return protein
660
+
661
+
662
def make_atom14_masks_np(batch):
    """NumPy wrapper around make_atom14_masks: ndarrays in, ndarrays out."""
    as_tensors = tree_map(
        lambda n: torch.tensor(n, device=batch["aatype"].device),
        batch,
        np.ndarray,
    )
    out = make_atom14_masks(as_tensors)
    return tensor_tree_map(lambda t: np.array(t), out)
671
+
672
+
673
def make_atom14_positions(protein):
    """Constructs denser atom positions (14 dimensions instead of 37).

    Gathers ground-truth positions/masks into the atom14 layout and also
    produces "alternative" ground truth where chemically symmetric atom
    pairs (e.g. in ASP, GLU, PHE, TYR) are name-swapped, plus a mask of
    which atoms are ambiguous. Requires make_atom14_masks to have run.
    """
    residx_atom14_mask = protein["atom14_atom_exists"]
    residx_atom14_to_atom37 = protein["residx_atom14_to_atom37"]

    # Create a mask for known ground truth positions.
    residx_atom14_gt_mask = residx_atom14_mask * batched_gather(
        protein["all_atom_mask"],
        residx_atom14_to_atom37,
        dim=-1,
        no_batch_dims=len(protein["all_atom_mask"].shape[:-1]),
    )

    # Gather the ground truth positions.
    residx_atom14_gt_positions = residx_atom14_gt_mask[..., None] * (
        batched_gather(
            protein["all_atom_positions"],
            residx_atom14_to_atom37,
            dim=-2,
            no_batch_dims=len(protein["all_atom_positions"].shape[:-2]),
        )
    )

    protein["atom14_atom_exists"] = residx_atom14_mask
    protein["atom14_gt_exists"] = residx_atom14_gt_mask
    protein["atom14_gt_positions"] = residx_atom14_gt_positions

    # As the atom naming is ambiguous for 7 of the 20 amino acids, provide
    # alternative ground truth coordinates where the naming is swapped
    restype_3 = [rc.restype_1to3[res] for res in rc.restypes]
    restype_3 += ["UNK"]

    # Matrices for renaming ambiguous atoms: identity for unambiguous
    # restypes, pair-swapping permutations for the ambiguous ones.
    all_matrices = {
        res: torch.eye(
            14,
            dtype=protein["all_atom_mask"].dtype,
            device=protein["all_atom_mask"].device,
        )
        for res in restype_3
    }
    for resname, swap in rc.residue_atom_renaming_swaps.items():
        correspondences = torch.arange(
            14, device=protein["all_atom_mask"].device
        )
        for source_atom_swap, target_atom_swap in swap.items():
            source_index = rc.restype_name_to_atom14_names[resname].index(
                source_atom_swap
            )
            target_index = rc.restype_name_to_atom14_names[resname].index(
                target_atom_swap
            )
            correspondences[source_index] = target_index
            correspondences[target_index] = source_index
        # Turn the index correspondence into a 14x14 permutation matrix.
        renaming_matrix = protein["all_atom_mask"].new_zeros((14, 14))
        for index, correspondence in enumerate(correspondences):
            renaming_matrix[index, correspondence] = 1.0
        all_matrices[resname] = renaming_matrix

    renaming_matrices = torch.stack(
        [all_matrices[restype] for restype in restype_3]
    )

    # Pick the transformation matrices for the given residue sequence
    # shape (num_res, 14, 14).
    renaming_transform = renaming_matrices[protein["aatype"]]

    # Apply it to the ground truth positions. shape (num_res, 14, 3).
    alternative_gt_positions = torch.einsum(
        "...rac,...rab->...rbc", residx_atom14_gt_positions, renaming_transform
    )
    protein["atom14_alt_gt_positions"] = alternative_gt_positions

    # Create the mask for the alternative ground truth (differs from the
    # ground truth mask, if only one of the atoms in an ambiguous pair has a
    # ground truth position).
    alternative_gt_mask = torch.einsum(
        "...ra,...rab->...rb", residx_atom14_gt_mask, renaming_transform
    )
    protein["atom14_alt_gt_exists"] = alternative_gt_mask

    # Create an ambiguous atoms mask. shape: (21, 14).
    restype_atom14_is_ambiguous = protein["all_atom_mask"].new_zeros((21, 14))
    for resname, swap in rc.residue_atom_renaming_swaps.items():
        for atom_name1, atom_name2 in swap.items():
            restype = rc.restype_order[rc.restype_3to1[resname]]
            atom_idx1 = rc.restype_name_to_atom14_names[resname].index(
                atom_name1
            )
            atom_idx2 = rc.restype_name_to_atom14_names[resname].index(
                atom_name2
            )
            restype_atom14_is_ambiguous[restype, atom_idx1] = 1
            restype_atom14_is_ambiguous[restype, atom_idx2] = 1

    # From this create an ambiguous_mask for the given sequence.
    protein["atom14_atom_is_ambiguous"] = restype_atom14_is_ambiguous[
        protein["aatype"]
    ]

    return protein
774
+
775
+
776
def atom37_to_frames(protein, eps=1e-8):
    """Compute the 8 rigid-group frames per residue from atom37 coordinates.

    For every residue this builds the backbone frame, the psi frame, and up
    to four chi frames (8 groups total, unused slots masked out), plus
    "alternative" frames for residues with ambiguous atom naming.

    Adds to `protein`:
        rigidgroups_gt_frames:          [*, N_res, 8, 4, 4] frames as 4x4
        rigidgroups_gt_exists:          [*, N_res, 8] frame has all GT atoms
        rigidgroups_group_exists:       [*, N_res, 8] group defined for restype
        rigidgroups_group_is_ambiguous: [*, N_res, 8] naming-ambiguous flag
        rigidgroups_alt_gt_frames:      [*, N_res, 8, 4, 4] name-swapped frames
    """
    aatype = protein["aatype"]
    all_atom_positions = protein["all_atom_positions"]
    all_atom_mask = protein["all_atom_mask"]

    batch_dims = len(aatype.shape[:-1])

    # Names of the 3 atoms defining each rigid group, per restype.
    # Group 0 is the backbone frame, group 3 the psi frame, 4-7 the chis.
    restype_rigidgroup_base_atom_names = np.full([21, 8, 3], "", dtype=object)
    restype_rigidgroup_base_atom_names[:, 0, :] = ["C", "CA", "N"]
    restype_rigidgroup_base_atom_names[:, 3, :] = ["CA", "C", "O"]

    for restype, restype_letter in enumerate(rc.restypes):
        resname = rc.restype_1to3[restype_letter]
        for chi_idx in range(4):
            if rc.chi_angles_mask[restype][chi_idx]:
                names = rc.chi_angles_atoms[resname][chi_idx]
                restype_rigidgroup_base_atom_names[
                    restype, chi_idx + 4, :
                ] = names[1:]

    # Which of the 8 groups exist at all for each restype.
    restype_rigidgroup_mask = all_atom_mask.new_zeros(
        (*aatype.shape[:-1], 21, 8),
    )
    restype_rigidgroup_mask[..., 0] = 1
    restype_rigidgroup_mask[..., 3] = 1
    restype_rigidgroup_mask[..., :20, 4:] = all_atom_mask.new_tensor(
        rc.chi_angles_mask
    )

    # Atom names -> atom37 indices ("" placeholders map to 0).
    lookuptable = rc.atom_order.copy()
    lookuptable[""] = 0
    lookup = np.vectorize(lambda x: lookuptable[x])
    restype_rigidgroup_base_atom37_idx = lookup(
        restype_rigidgroup_base_atom_names,
    )
    restype_rigidgroup_base_atom37_idx = aatype.new_tensor(
        restype_rigidgroup_base_atom37_idx,
    )
    restype_rigidgroup_base_atom37_idx = (
        restype_rigidgroup_base_atom37_idx.view(
            *((1,) * batch_dims), *restype_rigidgroup_base_atom37_idx.shape
        )
    )

    # Per-residue base-atom indices for this sequence.
    residx_rigidgroup_base_atom37_idx = batched_gather(
        restype_rigidgroup_base_atom37_idx,
        aatype,
        dim=-3,
        no_batch_dims=batch_dims,
    )

    base_atom_pos = batched_gather(
        all_atom_positions,
        residx_rigidgroup_base_atom37_idx,
        dim=-2,
        no_batch_dims=len(all_atom_positions.shape[:-2]),
    )

    # Frames from the three base atoms of each group.
    gt_frames = Rigid.from_3_points(
        p_neg_x_axis=base_atom_pos[..., 0, :],
        origin=base_atom_pos[..., 1, :],
        p_xy_plane=base_atom_pos[..., 2, :],
        eps=eps,
    )

    group_exists = batched_gather(
        restype_rigidgroup_mask,
        aatype,
        dim=-2,
        no_batch_dims=batch_dims,
    )

    # A frame's ground truth exists only if all three base atoms do.
    gt_atoms_exist = batched_gather(
        all_atom_mask,
        residx_rigidgroup_base_atom37_idx,
        dim=-1,
        no_batch_dims=len(all_atom_mask.shape[:-1]),
    )
    gt_exists = torch.min(gt_atoms_exist, dim=-1)[0] * group_exists

    # Fix the backbone frame's handedness by flipping the x and z axes of
    # group 0 only.
    rots = torch.eye(3, dtype=all_atom_mask.dtype, device=aatype.device)
    rots = torch.tile(rots, (*((1,) * batch_dims), 8, 1, 1))
    rots[..., 0, 0, 0] = -1
    rots[..., 0, 2, 2] = -1
    rots = Rotation(rot_mats=rots)

    gt_frames = gt_frames.compose(Rigid(rots, None))

    # Alternative frames: a 180-degree rotation of the last chi group of
    # restypes with ambiguous atom naming.
    restype_rigidgroup_is_ambiguous = all_atom_mask.new_zeros(
        *((1,) * batch_dims), 21, 8
    )
    restype_rigidgroup_rots = torch.eye(
        3, dtype=all_atom_mask.dtype, device=aatype.device
    )
    restype_rigidgroup_rots = torch.tile(
        restype_rigidgroup_rots,
        (*((1,) * batch_dims), 21, 8, 1, 1),
    )

    for resname, _ in rc.residue_atom_renaming_swaps.items():
        restype = rc.restype_order[rc.restype_3to1[resname]]
        chi_idx = int(sum(rc.chi_angles_mask[restype]) - 1)
        restype_rigidgroup_is_ambiguous[..., restype, chi_idx + 4] = 1
        restype_rigidgroup_rots[..., restype, chi_idx + 4, 1, 1] = -1
        restype_rigidgroup_rots[..., restype, chi_idx + 4, 2, 2] = -1

    residx_rigidgroup_is_ambiguous = batched_gather(
        restype_rigidgroup_is_ambiguous,
        aatype,
        dim=-2,
        no_batch_dims=batch_dims,
    )

    residx_rigidgroup_ambiguity_rot = batched_gather(
        restype_rigidgroup_rots,
        aatype,
        dim=-4,
        no_batch_dims=batch_dims,
    )

    residx_rigidgroup_ambiguity_rot = Rotation(
        rot_mats=residx_rigidgroup_ambiguity_rot
    )
    alt_gt_frames = gt_frames.compose(
        Rigid(residx_rigidgroup_ambiguity_rot, None)
    )

    gt_frames_tensor = gt_frames.to_tensor_4x4()
    alt_gt_frames_tensor = alt_gt_frames.to_tensor_4x4()

    protein["rigidgroups_gt_frames"] = gt_frames_tensor
    protein["rigidgroups_gt_exists"] = gt_exists
    protein["rigidgroups_group_exists"] = group_exists
    protein["rigidgroups_group_is_ambiguous"] = residx_rigidgroup_is_ambiguous
    protein["rigidgroups_alt_gt_frames"] = alt_gt_frames_tensor

    return protein
913
+
914
+
915
def get_chi_atom_indices():
    """Returns atom indices needed to compute chi angles for all residue types.

    Returns:
        A nested list of shape [residue_types=21][chis=4][atoms=4]. The
        residue types are in the order specified in rc.restypes + unknown
        residue type at the end. For chi angles which are not defined on the
        residue, the position indices default to 0.
    """
    chi_atom_indices = []
    for one_letter in rc.restypes:
        resname = rc.restype_1to3[one_letter]
        per_chi = [
            [rc.atom_order[atom] for atom in chi_atoms]
            for chi_atoms in rc.chi_angles_atoms[resname]
        ]
        # Pad residues with fewer than four chi angles with dummy indices.
        per_chi += [[0, 0, 0, 0]] * (4 - len(per_chi))
        chi_atom_indices.append(per_chi)

    chi_atom_indices.append([[0, 0, 0, 0]] * 4)  # For UNKNOWN residue.

    return chi_atom_indices
940
+
941
+
942
+ @curry1
943
+ def atom37_to_torsion_angles(
944
+ protein,
945
+ prefix="",
946
+ ):
947
+ """
948
+ Convert coordinates to torsion angles.
949
+
950
+ This function is extremely sensitive to floating point imprecisions
951
+ and should be run with double precision whenever possible.
952
+
953
+ Args:
954
+ Dict containing:
955
+ * (prefix)aatype:
956
+ [*, N_res] residue indices
957
+ * (prefix)all_atom_positions:
958
+ [*, N_res, 37, 3] atom positions (in atom37
959
+ format)
960
+ * (prefix)all_atom_mask:
961
+ [*, N_res, 37] atom position mask
962
+ Returns:
963
+ The same dictionary updated with the following features:
964
+
965
+ "(prefix)torsion_angles_sin_cos" ([*, N_res, 7, 2])
966
+ Torsion angles
967
+ "(prefix)alt_torsion_angles_sin_cos" ([*, N_res, 7, 2])
968
+ Alternate torsion angles (accounting for 180-degree symmetry)
969
+ "(prefix)torsion_angles_mask" ([*, N_res, 7])
970
+ Torsion angles mask
971
+ """
972
+ aatype = protein[prefix + "aatype"]
973
+ all_atom_positions = protein[prefix + "all_atom_positions"]
974
+ all_atom_mask = protein[prefix + "all_atom_mask"]
975
+
976
+ aatype = torch.clamp(aatype, max=20)
977
+
978
+ pad = all_atom_positions.new_zeros(
979
+ [*all_atom_positions.shape[:-3], 1, 37, 3]
980
+ )
981
+ prev_all_atom_positions = torch.cat(
982
+ [pad, all_atom_positions[..., :-1, :, :]], dim=-3
983
+ )
984
+
985
+ pad = all_atom_mask.new_zeros([*all_atom_mask.shape[:-2], 1, 37])
986
+ prev_all_atom_mask = torch.cat([pad, all_atom_mask[..., :-1, :]], dim=-2)
987
+
988
+ pre_omega_atom_pos = torch.cat(
989
+ [prev_all_atom_positions[..., 1:3, :], all_atom_positions[..., :2, :]],
990
+ dim=-2,
991
+ )
992
+ phi_atom_pos = torch.cat(
993
+ [prev_all_atom_positions[..., 2:3, :], all_atom_positions[..., :3, :]],
994
+ dim=-2,
995
+ )
996
+ psi_atom_pos = torch.cat(
997
+ [all_atom_positions[..., :3, :], all_atom_positions[..., 4:5, :]],
998
+ dim=-2,
999
+ )
1000
+
1001
+ pre_omega_mask = torch.prod(
1002
+ prev_all_atom_mask[..., 1:3], dim=-1
1003
+ ) * torch.prod(all_atom_mask[..., :2], dim=-1)
1004
+ phi_mask = prev_all_atom_mask[..., 2] * torch.prod(
1005
+ all_atom_mask[..., :3], dim=-1, dtype=all_atom_mask.dtype
1006
+ )
1007
+ psi_mask = (
1008
+ torch.prod(all_atom_mask[..., :3], dim=-1, dtype=all_atom_mask.dtype)
1009
+ * all_atom_mask[..., 4]
1010
+ )
1011
+
1012
+ chi_atom_indices = torch.as_tensor(
1013
+ get_chi_atom_indices(), device=aatype.device
1014
+ )
1015
+
1016
+ atom_indices = chi_atom_indices[..., aatype, :, :]
1017
+ chis_atom_pos = batched_gather(
1018
+ all_atom_positions, atom_indices, -2, len(atom_indices.shape[:-2])
1019
+ )
1020
+
1021
+ chi_angles_mask = list(rc.chi_angles_mask)
1022
+ chi_angles_mask.append([0.0, 0.0, 0.0, 0.0])
1023
+ chi_angles_mask = all_atom_mask.new_tensor(chi_angles_mask)
1024
+
1025
+ chis_mask = chi_angles_mask[aatype, :]
1026
+
1027
+ chi_angle_atoms_mask = batched_gather(
1028
+ all_atom_mask,
1029
+ atom_indices,
1030
+ dim=-1,
1031
+ no_batch_dims=len(atom_indices.shape[:-2]),
1032
+ )
1033
+ chi_angle_atoms_mask = torch.prod(
1034
+ chi_angle_atoms_mask, dim=-1, dtype=chi_angle_atoms_mask.dtype
1035
+ )
1036
+ chis_mask = chis_mask * chi_angle_atoms_mask
1037
+
1038
+ torsions_atom_pos = torch.cat(
1039
+ [
1040
+ pre_omega_atom_pos[..., None, :, :],
1041
+ phi_atom_pos[..., None, :, :],
1042
+ psi_atom_pos[..., None, :, :],
1043
+ chis_atom_pos,
1044
+ ],
1045
+ dim=-3,
1046
+ )
1047
+
1048
+ torsion_angles_mask = torch.cat(
1049
+ [
1050
+ pre_omega_mask[..., None],
1051
+ phi_mask[..., None],
1052
+ psi_mask[..., None],
1053
+ chis_mask,
1054
+ ],
1055
+ dim=-1,
1056
+ )
1057
+
1058
+ torsion_frames = Rigid.from_3_points(
1059
+ torsions_atom_pos[..., 1, :],
1060
+ torsions_atom_pos[..., 2, :],
1061
+ torsions_atom_pos[..., 0, :],
1062
+ eps=1e-8,
1063
+ )
1064
+
1065
+ fourth_atom_rel_pos = torsion_frames.invert().apply(
1066
+ torsions_atom_pos[..., 3, :]
1067
+ )
1068
+
1069
+ torsion_angles_sin_cos = torch.stack(
1070
+ [fourth_atom_rel_pos[..., 2], fourth_atom_rel_pos[..., 1]], dim=-1
1071
+ )
1072
+
1073
+ denom = torch.sqrt(
1074
+ torch.sum(
1075
+ torch.square(torsion_angles_sin_cos),
1076
+ dim=-1,
1077
+ dtype=torsion_angles_sin_cos.dtype,
1078
+ keepdims=True,
1079
+ )
1080
+ + 1e-8
1081
+ )
1082
+ torsion_angles_sin_cos = torsion_angles_sin_cos / denom
1083
+
1084
+ torsion_angles_sin_cos = torsion_angles_sin_cos * all_atom_mask.new_tensor(
1085
+ [1.0, 1.0, -1.0, 1.0, 1.0, 1.0, 1.0],
1086
+ )[((None,) * len(torsion_angles_sin_cos.shape[:-2])) + (slice(None), None)]
1087
+
1088
+ chi_is_ambiguous = torsion_angles_sin_cos.new_tensor(
1089
+ rc.chi_pi_periodic,
1090
+ )[aatype, ...]
1091
+
1092
+ mirror_torsion_angles = torch.cat(
1093
+ [
1094
+ all_atom_mask.new_ones(*aatype.shape, 3),
1095
+ 1.0 - 2.0 * chi_is_ambiguous,
1096
+ ],
1097
+ dim=-1,
1098
+ )
1099
+
1100
+ alt_torsion_angles_sin_cos = (
1101
+ torsion_angles_sin_cos * mirror_torsion_angles[..., None]
1102
+ )
1103
+
1104
+ protein[prefix + "torsion_angles_sin_cos"] = torsion_angles_sin_cos
1105
+ protein[prefix + "alt_torsion_angles_sin_cos"] = alt_torsion_angles_sin_cos
1106
+ protein[prefix + "torsion_angles_mask"] = torsion_angles_mask
1107
+
1108
+ return protein
1109
+
1110
+
1111
def get_backbone_frames(protein):
    """Expose the backbone (group 0) rigid frame and its mask as features.

    Slices rigid-group index 0 out of the per-residue ground-truth frame
    tensors and stores the results under dedicated keys. Mutates and
    returns `protein`.
    """
    # DISCREPANCY: AlphaFold uses tensor_7s here. I don't know why.
    gt_frames = protein["rigidgroups_gt_frames"]
    gt_exists = protein["rigidgroups_gt_exists"]
    protein["backbone_rigid_tensor"] = gt_frames[..., 0, :, :]
    protein["backbone_rigid_mask"] = gt_exists[..., 0]
    return protein
1119
+
1120
+
1121
def get_chi_angles(protein):
    """Extract the four side-chain (chi) torsions from the full torsion set.

    Torsion index order is [pre-omega, phi, psi, chi1..chi4], so indices
    3: select the chi angles. Both outputs are cast to the dtype of
    "all_atom_mask". Mutates and returns `protein`.
    """
    target_dtype = protein["all_atom_mask"].dtype
    sin_cos = protein["torsion_angles_sin_cos"]
    torsion_mask = protein["torsion_angles_mask"]
    protein["chi_angles_sin_cos"] = sin_cos[..., 3:, :].to(target_dtype)
    protein["chi_mask"] = torsion_mask[..., 3:].to(target_dtype)
    return protein
1129
+
1130
+
1131
@curry1
def random_crop_to_size(
    protein,
    crop_size,
    max_templates,
    shape_schema,
    subsample_templates=False,
    seed=None,
):
    """Crop randomly to `crop_size`, or keep as is if shorter than that.

    Args:
        protein: Dict of feature tensors.
        crop_size: Maximum number of residues to keep.
        max_templates: Maximum number of templates to keep.
        shape_schema: Dict mapping feature names to per-dimension schema
            entries; dims equal to NUM_RES are cropped along the residue axis.
        subsample_templates: If True, randomly permute and subsample the
            template dimension as well.
        seed: Optional seed so every ensemble iteration crops identically.

    Returns:
        The same dict, cropped in place along residue/template dimensions,
        with "seq_length" updated to the cropped size.
    """
    # We want each ensemble to be cropped the same way
    g = torch.Generator(device=protein["seq_length"].device)
    if seed is not None:
        g.manual_seed(seed)

    seq_length = protein["seq_length"]

    if "template_mask" in protein:
        num_templates = protein["template_mask"].shape[-1]
    else:
        num_templates = 0

    # No need to subsample templates if there aren't any
    subsample_templates = subsample_templates and num_templates

    num_res_crop_size = min(int(seq_length), crop_size)

    def _randint(lower, upper):
        # Uniform integer in [lower, upper] drawn from the shared generator.
        return int(torch.randint(
            lower,
            upper + 1,
            (1,),
            device=protein["seq_length"].device,
            generator=g,
        )[0])

    if subsample_templates:
        templates_crop_start = _randint(0, num_templates)
        templates_select_indices = torch.randperm(
            num_templates, device=protein["seq_length"].device, generator=g
        )
    else:
        templates_crop_start = 0

    num_templates_crop_size = min(
        num_templates - templates_crop_start, max_templates
    )

    n = seq_length - num_res_crop_size
    if "use_clamped_fape" in protein and protein["use_clamped_fape"] == 1.:
        right_anchor = n
    else:
        x = _randint(0, n)
        right_anchor = n - x

    num_res_crop_start = _randint(0, right_anchor)

    for k, v in protein.items():
        if k not in shape_schema or (
            "template" not in k and NUM_RES not in shape_schema[k]
        ):
            continue

        # randomly permute the templates before cropping them.
        if k.startswith("template") and subsample_templates:
            v = v[templates_select_indices]

        slices = []
        for i, (dim_size, dim) in enumerate(zip(shape_schema[k], v.shape)):
            is_num_res = dim_size == NUM_RES
            if i == 0 and k.startswith("template"):
                # Leading template dimension: crop templates, not residues.
                # BUGFIX: use dedicated locals instead of clobbering the
                # `crop_size` argument as the original code did.
                dim_crop_size = num_templates_crop_size
                dim_crop_start = templates_crop_start
            else:
                dim_crop_start = num_res_crop_start if is_num_res else 0
                dim_crop_size = num_res_crop_size if is_num_res else dim
            slices.append(
                slice(dim_crop_start, dim_crop_start + dim_crop_size)
            )
        # BUGFIX: index with a tuple of slices. Indexing a tensor with a
        # plain list of slices is deprecated and raises in recent PyTorch.
        protein[k] = v[tuple(slices)]

    protein["seq_length"] = protein["seq_length"].new_tensor(num_res_crop_size)

    return protein
openfold/data/errors.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """General-purpose errors used throughout the data pipeline"""
17
class Error(Exception):
    """Base class for exceptions raised by the data pipeline."""
19
+
20
+
21
class MultipleChainsError(Error):
    """An error indicating that multiple chains were found for a given ID.

    Raised e.g. by mmcif_parsing.get_atom_coords when a chain id does not
    resolve to exactly one chain in the parsed structure.
    """
openfold/data/feature_pipeline.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import copy
17
+ from typing import Mapping, Tuple, List, Optional, Dict, Sequence
18
+
19
+ import ml_collections
20
+ import numpy as np
21
+ import torch
22
+
23
+ from openfold.data import input_pipeline
24
+
25
+
26
+ FeatureDict = Mapping[str, np.ndarray]
27
+ TensorDict = Dict[str, torch.Tensor]
28
+
29
+
30
def np_to_tensor_dict(
    np_example: Mapping[str, np.ndarray],
    features: Sequence[str],
) -> TensorDict:
    """Creates dict of tensors from a dict of NumPy arrays.

    Args:
        np_example: A dict of NumPy feature arrays.
        features: Names of the features to keep; all others are dropped.

    Returns:
        A dict mapping each retained feature name to a torch.Tensor.
    """
    wanted = set(features)
    tensor_dict = {}
    for name, array in np_example.items():
        if name in wanted:
            tensor_dict[name] = torch.tensor(array)
    return tensor_dict
49
+
50
+
51
def make_data_config(
    config: ml_collections.ConfigDict,
    mode: str,
    num_res: int,
) -> Tuple[ml_collections.ConfigDict, List[str]]:
    """Resolve the data config for `mode` and collect the feature names to load.

    Deep-copies `config` (the caller's object is untouched), fills in a
    missing crop size with the actual sequence length, and assembles the
    feature-name list implied by the config flags.
    """
    cfg = copy.deepcopy(config)
    mode_cfg = cfg[mode]

    # crop_size == None means "no crop": use the full sequence length.
    with cfg.unlocked():
        if mode_cfg.crop_size is None:
            mode_cfg.crop_size = num_res

    feature_names = cfg.common.unsupervised_features
    if cfg.common.use_templates:
        feature_names += cfg.common.template_features
    if mode_cfg.supervised:
        feature_names += cfg.supervised.supervised_features

    return cfg, feature_names
71
+
72
+
73
def np_example_to_features(
    np_example: FeatureDict,
    config: ml_collections.ConfigDict,
    mode: str,
):
    """Run the full featurization pipeline on a raw NumPy example.

    Shallow-copies the input, renames the legacy integer deletion matrix to
    its float form, converts the configured features to tensors, and applies
    the configured transform pipeline under torch.no_grad().
    """
    np_example = dict(np_example)
    num_res = int(np_example["seq_length"][0])
    cfg, feature_names = make_data_config(config, mode=mode, num_res=num_res)

    if "deletion_matrix_int" in np_example:
        int_matrix = np_example.pop("deletion_matrix_int")
        np_example["deletion_matrix"] = int_matrix.astype(np.float32)

    tensor_dict = np_to_tensor_dict(
        np_example=np_example, features=feature_names
    )
    with torch.no_grad():
        features = input_pipeline.process_tensors_from_config(
            tensor_dict,
            cfg.common,
            cfg[mode],
        )

    # Return a plain shallow copy of the resulting feature dict.
    return dict(features)
98
+
99
+
100
class FeaturePipeline:
    """Thin wrapper binding a data config to the featurization entry point."""

    def __init__(
        self,
        config: ml_collections.ConfigDict,
    ):
        # Stored as-is; np_example_to_features deep-copies before mutating.
        self.config = config

    def process_features(
        self,
        raw_features: FeatureDict,
        mode: str = "train",
    ) -> FeatureDict:
        """Featurize `raw_features` for the given mode using the stored config."""
        return np_example_to_features(
            np_example=raw_features,
            config=self.config,
            mode=mode,
        )
openfold/data/input_pipeline.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from functools import partial
17
+
18
+ import torch
19
+
20
+ from openfold.data import data_transforms
21
+
22
+
23
def nonensembled_transform_fns(common_cfg, mode_cfg):
    """Input pipeline data transformers that are not ensembled.

    Returns the ordered list of transform callables applied exactly once
    per example (before any per-recycling-iteration transforms).
    """
    transforms = [
        data_transforms.cast_to_64bit_ints,
        data_transforms.correct_msa_restypes,
        data_transforms.squeeze_features,
        data_transforms.randomly_replace_msa_with_unknown(0.0),
        data_transforms.make_seq_mask,
        data_transforms.make_msa_mask,
        data_transforms.make_hhblits_profile,
    ]

    if common_cfg.use_templates:
        transforms += [
            data_transforms.fix_templates_aatype,
            data_transforms.make_template_mask,
            data_transforms.make_pseudo_beta("template_"),
        ]
        if common_cfg.use_template_torsion_angles:
            transforms.append(
                data_transforms.atom37_to_torsion_angles("template_")
            )

    transforms.append(data_transforms.make_atom14_masks)

    if mode_cfg.supervised:
        # Ground-truth-dependent transforms, only needed when labels exist.
        transforms += [
            data_transforms.make_atom14_positions,
            data_transforms.atom37_to_frames,
            data_transforms.atom37_to_torsion_angles(""),
            data_transforms.make_pseudo_beta(""),
            data_transforms.get_backbone_frames,
            data_transforms.get_chi_angles,
        ]

    return transforms
68
+
69
+
70
def ensembled_transform_fns(common_cfg, mode_cfg, ensemble_seed):
    """Input pipeline data transformers that can be ensembled and averaged.

    Built fresh for every ensemble/recycling iteration; `ensemble_seed`
    makes the stochastic transforms reproducible across iterations where
    the config requires it.
    """
    fns = []

    if "max_distillation_msa_clusters" in mode_cfg:
        fns.append(
            data_transforms.sample_msa_distillation(
                mode_cfg.max_distillation_msa_clusters
            )
        )

    if common_cfg.reduce_msa_clusters_by_max_templates:
        pad_msa_clusters = mode_cfg.max_msa_clusters - mode_cfg.max_templates
    else:
        pad_msa_clusters = mode_cfg.max_msa_clusters

    max_msa_clusters = pad_msa_clusters
    max_extra_msa = mode_cfg.max_extra_msa

    # Fix the MSA sampling seed only when the MSA must be identical across
    # recycling iterations.
    if not common_cfg.resample_msa_in_recycling:
        msa_seed = ensemble_seed
    else:
        msa_seed = None

    fns.append(
        data_transforms.sample_msa(
            max_msa_clusters,
            keep_extra=True,
            seed=msa_seed,
        )
    )

    if "masked_msa" in common_cfg:
        # Masked MSA should come *before* MSA clustering so that
        # the clustering and full MSA profile do not leak information about
        # the masked locations and secret corrupted locations.
        fns.append(
            data_transforms.make_masked_msa(
                common_cfg.masked_msa, mode_cfg.masked_msa_replace_fraction
            )
        )

    if common_cfg.msa_cluster_features:
        fns.append(data_transforms.nearest_neighbor_clusters())
        fns.append(data_transforms.summarize_clusters())

    # Crop after creating the cluster profiles.
    if max_extra_msa:
        fns.append(data_transforms.crop_extra_msa(max_extra_msa))
    else:
        fns.append(data_transforms.delete_extra_msa)

    fns.append(data_transforms.make_msa_feat())

    crop_feats = dict(common_cfg.feat)

    if mode_cfg.fixed_size:
        fns.append(data_transforms.select_feat(list(crop_feats)))
        fns.append(
            data_transforms.random_crop_to_size(
                mode_cfg.crop_size,
                mode_cfg.max_templates,
                crop_feats,
                mode_cfg.subsample_templates,
                seed=ensemble_seed + 1,
            )
        )
        fns.append(
            data_transforms.make_fixed_size(
                crop_feats,
                pad_msa_clusters,
                mode_cfg.max_extra_msa,
                mode_cfg.crop_size,
                mode_cfg.max_templates,
            )
        )
    else:
        fns.append(data_transforms.crop_templates(mode_cfg.max_templates))

    return fns
151
+
152
+
153
def process_tensors_from_config(tensors, common_cfg, mode_cfg):
    """Based on the config, apply filters and transformations to the data.

    Applies the non-ensembled transforms once, then maps the ensembled
    transforms over each recycling iteration, stacking every feature along
    a new trailing dimension.

    Args:
        tensors: Dict of feature tensors.
        common_cfg: Config section shared across modes.
        mode_cfg: Mode-specific (train/eval/predict) config section.

    Returns:
        Dict of tensors with a trailing recycling/ensemble dimension.
    """
    ensemble_seed = torch.Generator().seed()

    def wrap_ensemble_fn(data, i):
        """Function to be mapped over the ensemble dimension."""
        d = data.copy()
        fns = ensembled_transform_fns(
            common_cfg,
            mode_cfg,
            ensemble_seed,
        )
        fn = compose(fns)
        d["ensemble_index"] = i
        return fn(d)

    # (Removed dead code: the original computed an unused `no_templates`
    # flag from "template_aatype" here.)
    nonensembled = nonensembled_transform_fns(
        common_cfg,
        mode_cfg,
    )

    tensors = compose(nonensembled)(tensors)

    if "no_recycling_iters" in tensors:
        num_recycling = int(tensors["no_recycling_iters"])
    else:
        num_recycling = common_cfg.max_recycling_iters

    # One ensemble slot per recycling iteration, plus the initial pass.
    tensors = map_fn(
        lambda x: wrap_ensemble_fn(tensors, x), torch.arange(num_recycling + 1)
    )

    return tensors
191
+
192
+
193
@data_transforms.curry1
def compose(x, fs):
    """Apply each transform in `fs` to `x` in order, threading the result."""
    result = x
    for transform in fs:
        result = transform(result)
    return result
198
+
199
+
200
def map_fn(fun, x):
    """Apply `fun` to each element of `x` and stack the resulting dicts.

    Every call to `fun` must return a dict with the same keys; for each key
    the per-element tensors are stacked along a new final dimension.
    """
    outputs = [fun(elem) for elem in x]
    return {
        key: torch.stack([out[key] for out in outputs], dim=-1)
        for key in outputs[0].keys()
    }
openfold/data/mmcif_parsing.py ADDED
@@ -0,0 +1,485 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Parses the mmCIF file format."""
17
+ import collections
18
+ import dataclasses
19
+ import io
20
+ import json
21
+ import logging
22
+ import os
23
+ from typing import Any, Mapping, Optional, Sequence, Tuple
24
+
25
+ from Bio import PDB
26
+ from Bio.Data import SCOPData
27
+ import numpy as np
28
+
29
+ from openfold.data.errors import MultipleChainsError
30
+ import openfold.np.residue_constants as residue_constants
31
+
32
+
33
+ # Type aliases:
34
+ ChainId = str
35
+ PdbHeader = Mapping[str, Any]
36
+ PdbStructure = PDB.Structure.Structure
37
+ SeqRes = str
38
+ MmCIFDict = Mapping[str, Sequence[str]]
39
+
40
+
41
# One residue entry from an _entity_poly_seq loop (see _get_protein_chains).
@dataclasses.dataclass(frozen=True)
class Monomer:
    id: str   # Residue name (mon_id), e.g. a three-letter amino acid code.
    num: int  # Position in the mmCIF (SEQRES) numbering scheme.
45
+
46
+
47
+ # Note - mmCIF format provides no guarantees on the type of author-assigned
48
+ # sequence numbers. They need not be integers.
49
@dataclasses.dataclass(frozen=True)
class AtomSite:
    """One row of the mmCIF _atom_site loop (built by _get_atom_site_list)."""

    residue_name: str     # _atom_site.label_comp_id
    author_chain_id: str  # _atom_site.auth_asym_id
    mmcif_chain_id: str   # _atom_site.label_asym_id
    author_seq_num: str   # _atom_site.auth_seq_id
    mmcif_seq_num: int    # _atom_site.label_seq_id; NOTE(review): the parser
                          # supplies str values — parse() calls int() on it.
    insertion_code: str   # _atom_site.pdbx_PDB_ins_code
    hetatm_atom: str      # _atom_site.group_PDB ("ATOM"/"HETATM")
    model_num: int        # _atom_site.pdbx_PDB_model_num; compared as the
                          # string "1" in parse().
59
+
60
+
61
+ # Used to map SEQRES index to a residue in the structure.
62
@dataclasses.dataclass(frozen=True)
class ResiduePosition:
    chain_id: str        # Author-assigned chain id.
    residue_number: int  # Author-assigned residue number.
    insertion_code: str  # Insertion code; " " when unset (see parse()).
67
+
68
+
69
@dataclasses.dataclass(frozen=True)
class ResidueAtPosition:
    """A SEQRES residue, possibly absent from the resolved structure."""

    position: Optional[ResiduePosition]  # None when the residue is unresolved.
    name: str                            # Residue name (e.g. three-letter code).
    is_missing: bool                     # True if absent from the structure.
    hetflag: str                         # Biopython hetero flag: " ", "W"
                                         # (water), or "H_<resname>".
75
+
76
+
77
@dataclasses.dataclass(frozen=True)
class MmcifObject:
    """Representation of a parsed mmCIF file.

    Contains:
      file_id: A meaningful name, e.g. a pdb_id. Should be unique amongst all
        files being processed.
      header: Biopython header.
      structure: Biopython structure.
      chain_to_seqres: Dict mapping chain_id to 1 letter amino acid sequence. E.g.
        {'A': 'ABCDEFG'}
      seqres_to_structure: Dict; for each chain_id contains a mapping between
        SEQRES index and a ResidueAtPosition. e.g. {'A': {0: ResidueAtPosition,
                                                          1: ResidueAtPosition,
                                                          ...}}
      raw_string: The raw string used to construct the MmcifObject.
    """

    file_id: str
    header: PdbHeader
    structure: PdbStructure
    chain_to_seqres: Mapping[ChainId, SeqRes]
    seqres_to_structure: Mapping[ChainId, Mapping[int, ResidueAtPosition]]
    # NOTE(review): despite the name, parse() stores the parsed _mmcif_dict
    # here, not the original file string.
    raw_string: Any
101
+
102
+
103
@dataclasses.dataclass(frozen=True)
class ParsingResult:
    """Returned by the parse function.

    Contains:
      mmcif_object: A MmcifObject, may be None if no chain could be successfully
        parsed.
      errors: A dict mapping (file_id, chain_id) to any exception generated.
    """

    mmcif_object: Optional[MmcifObject]
    # Keyed by (file_id, chain_id); chain_id is "" for file-level errors.
    errors: Mapping[Tuple[str, str], Any]
115
+
116
+
117
class ParseError(Exception):
    """An error indicating that an mmCIF file could not be parsed."""
119
+
120
+
121
def mmcif_loop_to_list(
    prefix: str, parsed_info: MmCIFDict
) -> Sequence[Mapping[str, str]]:
    """Extracts loop associated with a prefix from mmCIF data as a list.

    Reference for loop_ in mmCIF:
    http://mmcif.wwpdb.org/docs/tutorials/mechanics/pdbx-mmcif-syntax.html

    Args:
      prefix: Prefix shared by each of the data items in the loop.
        e.g. '_entity_poly_seq.', where the data items are _entity_poly_seq.num,
        _entity_poly_seq.mon_id. Should include the trailing period.
      parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a Biopython
        parser.

    Returns:
      Returns a list of dicts; each dict represents 1 entry from an mmCIF loop.
    """
    selected = [
        (key, values)
        for key, values in parsed_info.items()
        if key.startswith(prefix)
    ]
    cols = [key for key, _ in selected]
    data = [values for _, values in selected]

    # Every column in a loop must have the same number of rows.
    assert all([len(xs) == len(data[0]) for xs in data]), (
        "mmCIF error: Not all loops are the same length: %s" % cols
    )

    # Transpose columns into one dict per row.
    return [dict(zip(cols, xs)) for xs in zip(*data)]
151
+
152
+
153
def mmcif_loop_to_dict(
    prefix: str,
    index: str,
    parsed_info: MmCIFDict,
) -> Mapping[str, Mapping[str, str]]:
    """Extracts loop associated with a prefix from mmCIF data as a dictionary.

    Args:
      prefix: Prefix shared by each of the data items in the loop.
        e.g. '_entity_poly_seq.', where the data items are _entity_poly_seq.num,
        _entity_poly_seq.mon_id. Should include the trailing period.
      index: Which item of loop data should serve as the key.
      parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a Biopython
        parser.

    Returns:
      Returns a dict of dicts; each dict represents 1 entry from an mmCIF loop,
      indexed by the index column.
    """
    return {
        entry[index]: entry
        for entry in mmcif_loop_to_list(prefix, parsed_info)
    }
174
+
175
+
176
def parse(
    *, file_id: str, mmcif_string: str, catch_all_errors: bool = True
) -> ParsingResult:
    """Entry point, parses an mmcif_string.

    Args:
      file_id: A string identifier for this file. Should be unique within the
        collection of files being processed.
      mmcif_string: Contents of an mmCIF file.
      catch_all_errors: If True, all exceptions are caught and error messages are
        returned as part of the ParsingResult. If False exceptions will be allowed
        to propagate.

    Returns:
      A ParsingResult.
    """
    errors = {}
    try:
        parser = PDB.MMCIFParser(QUIET=True)
        handle = io.StringIO(mmcif_string)
        full_structure = parser.get_structure("", handle)
        first_model_structure = _get_first_model(full_structure)
        # Extract the _mmcif_dict from the parser, which contains useful fields not
        # reflected in the Biopython structure.
        parsed_info = parser._mmcif_dict  # pylint:disable=protected-access

        # Ensure all values are lists, even if singletons.
        for key, value in parsed_info.items():
            if not isinstance(value, list):
                parsed_info[key] = [value]

        header = _get_header(parsed_info)

        # Determine the protein chains, and their start numbers according to the
        # internal mmCIF numbering scheme (likely but not guaranteed to be 1).
        valid_chains = _get_protein_chains(parsed_info=parsed_info)
        if not valid_chains:
            return ParsingResult(
                None, {(file_id, ""): "No protein chains found in this file."}
            )
        seq_start_num = {
            chain_id: min([monomer.num for monomer in seq])
            for chain_id, seq in valid_chains.items()
        }

        # Loop over the atoms for which we have coordinates. Populate two mappings:
        # -mmcif_to_author_chain_id (maps internal mmCIF chain ids to chain ids used
        # the authors / Biopython).
        # -seq_to_structure_mappings (maps idx into sequence to ResidueAtPosition).
        mmcif_to_author_chain_id = {}
        seq_to_structure_mappings = {}
        for atom in _get_atom_site_list(parsed_info):
            # String comparison: mmCIF dict values are parsed as strings.
            if atom.model_num != "1":
                # We only process the first model at the moment.
                continue

            mmcif_to_author_chain_id[atom.mmcif_chain_id] = atom.author_chain_id

            if atom.mmcif_chain_id in valid_chains:
                hetflag = " "
                if atom.hetatm_atom == "HETATM":
                    # Water atoms are assigned a special hetflag of W in Biopython. We
                    # need to do the same, so that this hetflag can be used to fetch
                    # a residue from the Biopython structure by id.
                    if atom.residue_name in ("HOH", "WAT"):
                        hetflag = "W"
                    else:
                        hetflag = "H_" + atom.residue_name
                insertion_code = atom.insertion_code
                if not _is_set(atom.insertion_code):
                    insertion_code = " "
                position = ResiduePosition(
                    chain_id=atom.author_chain_id,
                    residue_number=int(atom.author_seq_num),
                    insertion_code=insertion_code,
                )
                # Zero-based index into the SEQRES sequence for this chain.
                seq_idx = (
                    int(atom.mmcif_seq_num) - seq_start_num[atom.mmcif_chain_id]
                )
                current = seq_to_structure_mappings.get(
                    atom.author_chain_id, {}
                )
                current[seq_idx] = ResidueAtPosition(
                    position=position,
                    name=atom.residue_name,
                    is_missing=False,
                    hetflag=hetflag,
                )
                seq_to_structure_mappings[atom.author_chain_id] = current

        # Add missing residue information to seq_to_structure_mappings.
        for chain_id, seq_info in valid_chains.items():
            author_chain = mmcif_to_author_chain_id[chain_id]
            current_mapping = seq_to_structure_mappings[author_chain]
            for idx, monomer in enumerate(seq_info):
                if idx not in current_mapping:
                    current_mapping[idx] = ResidueAtPosition(
                        position=None,
                        name=monomer.id,
                        is_missing=True,
                        hetflag=" ",
                    )

        # Build the one-letter sequence per author chain; unknown residue
        # names map to "X".
        author_chain_to_sequence = {}
        for chain_id, seq_info in valid_chains.items():
            author_chain = mmcif_to_author_chain_id[chain_id]
            seq = []
            for monomer in seq_info:
                code = SCOPData.protein_letters_3to1.get(monomer.id, "X")
                seq.append(code if len(code) == 1 else "X")
            seq = "".join(seq)
            author_chain_to_sequence[author_chain] = seq

        mmcif_object = MmcifObject(
            file_id=file_id,
            header=header,
            structure=first_model_structure,
            chain_to_seqres=author_chain_to_sequence,
            seqres_to_structure=seq_to_structure_mappings,
            raw_string=parsed_info,
        )

        return ParsingResult(mmcif_object=mmcif_object, errors=errors)
    except Exception as e:  # pylint:disable=broad-except
        errors[(file_id, "")] = e
        if not catch_all_errors:
            raise
        return ParsingResult(mmcif_object=None, errors=errors)
304
+
305
+
306
def _get_first_model(structure: PdbStructure) -> PdbStructure:
    """Returns the first model in a Biopython structure."""
    # Only the first model is used downstream (see parse()).
    return next(structure.get_models())
309
+
310
+
311
# NOTE(review): not referenced anywhere in this module's visible code —
# the name suggests a length threshold for treating a chain as a peptide,
# presumably used by downstream filtering; confirm before removing.
_MIN_LENGTH_OF_CHAIN_TO_BE_COUNTED_AS_PEPTIDE = 21
312
+
313
+
314
def get_release_date(parsed_info: MmCIFDict) -> str:
    """Returns the oldest revision date.

    Dates are ISO-format strings, so lexicographic `min` is chronological.
    """
    return min(parsed_info["_pdbx_audit_revision_history.revision_date"])
318
+
319
+
320
def _get_header(parsed_info: MmCIFDict) -> PdbHeader:
    """Returns a basic header containing method, release date and resolution."""
    experiments = mmcif_loop_to_list("_exptl.", parsed_info)
    methods = [experiment["_exptl.method"].lower() for experiment in experiments]
    header = {"structure_method": ",".join(methods)}

    # Note: The release_date here corresponds to the oldest revision. We prefer to
    # use this for dataset filtering over the deposition_date.
    if "_pdbx_audit_revision_history.revision_date" in parsed_info:
        header["release_date"] = get_release_date(parsed_info)
    else:
        logging.warning(
            "Could not determine release_date: %s", parsed_info["_entry.id"]
        )

    header["resolution"] = 0.00
    # When several resolution fields are present, later keys in this tuple
    # overwrite earlier ones; unparsable values are logged and skipped.
    for res_key in (
        "_refine.ls_d_res_high",
        "_em_3d_reconstruction.resolution",
        "_reflns.d_resolution_high",
    ):
        if res_key in parsed_info:
            try:
                raw_resolution = parsed_info[res_key][0]
                header["resolution"] = float(raw_resolution)
            except ValueError:
                logging.info(
                    "Invalid resolution format: %s", parsed_info[res_key]
                )

    return header
354
+
355
+
356
def _get_atom_site_list(parsed_info: MmCIFDict) -> Sequence[AtomSite]:
    """Returns list of atom sites; contains data not present in the structure."""
    # Column order must match the AtomSite field order.
    columns = (
        parsed_info["_atom_site.label_comp_id"],
        parsed_info["_atom_site.auth_asym_id"],
        parsed_info["_atom_site.label_asym_id"],
        parsed_info["_atom_site.auth_seq_id"],
        parsed_info["_atom_site.label_seq_id"],
        parsed_info["_atom_site.pdbx_PDB_ins_code"],
        parsed_info["_atom_site.group_PDB"],
        parsed_info["_atom_site.pdbx_PDB_model_num"],
    )
    # Transpose the per-column lists into one AtomSite per row.
    return [AtomSite(*row) for row in zip(*columns)]
371
+
372
+
373
def _get_protein_chains(
    *, parsed_info: Mapping[str, Any]
) -> Mapping[ChainId, Sequence[Monomer]]:
    """Extracts polymer information for protein chains only.

    Args:
      parsed_info: _mmcif_dict produced by the Biopython parser.

    Returns:
      A dict mapping mmcif chain id to a list of Monomers.
    """
    # Get polymer information for each entity in the structure.
    polymers = collections.defaultdict(list)
    for entry in mmcif_loop_to_list("_entity_poly_seq.", parsed_info):
        polymers[entry["_entity_poly_seq.entity_id"]].append(
            Monomer(
                id=entry["_entity_poly_seq.mon_id"],
                num=int(entry["_entity_poly_seq.num"]),
            )
        )

    # Chemical compositions let us identify which polymers are proteins.
    chem_comps = mmcif_loop_to_dict("_chem_comp.", "_chem_comp.id", parsed_info)

    # Map each entity to its mmCIF chain ids, so the result can be keyed on
    # chain id rather than entity.
    entity_to_mmcif_chains = collections.defaultdict(list)
    for struct_asym in mmcif_loop_to_list("_struct_asym.", parsed_info):
        entity_to_mmcif_chains[struct_asym["_struct_asym.entity_id"]].append(
            struct_asym["_struct_asym.id"]
        )

    # Keep only polymers with at least one peptide-like component; this
    # rejects pure DNA/RNA chains.
    valid_chains = {}
    for entity_id, seq_info in polymers.items():
        has_peptide = any(
            "peptide" in chem_comps[monomer.id]["_chem_comp.type"]
            for monomer in seq_info
        )
        if has_peptide:
            for chain_id in entity_to_mmcif_chains[entity_id]:
                valid_chains[chain_id] = seq_info
    return valid_chains
425
+
426
+
427
+ def _is_set(data: str) -> bool:
428
+ """Returns False if data is a special mmCIF character indicating 'unset'."""
429
+ return data not in (".", "?")
430
+
431
+
432
def get_atom_coords(
    mmcif_object: MmcifObject,
    chain_id: str,
    _zero_center_positions: bool = False
) -> Tuple[np.ndarray, np.ndarray]:
    """Extracts atom37-format coordinates and mask for one chain.

    Args:
        mmcif_object: A parsed structure (see `parse`).
        chain_id: Author chain id to extract.
        _zero_center_positions: If True, subtract the mean of all observed
            atom positions so the chain is centered at the origin.

    Returns:
        Tuple of (positions, mask):
            positions: [num_res, atom_type_num, 3] float32 coordinates.
            mask: [num_res, atom_type_num] float32; 1.0 where the atom was
                observed in the structure, 0.0 otherwise.

    Raises:
        MultipleChainsError: If `chain_id` does not match exactly one chain.
    """
    # Locate the right chain
    chains = list(mmcif_object.structure.get_chains())
    relevant_chains = [c for c in chains if c.id == chain_id]
    if len(relevant_chains) != 1:
        raise MultipleChainsError(
            f"Expected exactly one chain in structure with id {chain_id}."
        )
    chain = relevant_chains[0]

    # Extract the coordinates
    num_res = len(mmcif_object.chain_to_seqres[chain_id])
    all_atom_positions = np.zeros(
        [num_res, residue_constants.atom_type_num, 3], dtype=np.float32
    )
    all_atom_mask = np.zeros(
        [num_res, residue_constants.atom_type_num], dtype=np.float32
    )
    for res_index in range(num_res):
        pos = np.zeros([residue_constants.atom_type_num, 3], dtype=np.float32)
        mask = np.zeros([residue_constants.atom_type_num], dtype=np.float32)
        res_at_position = mmcif_object.seqres_to_structure[chain_id][res_index]
        if not res_at_position.is_missing:
            # Fetch the residue from Biopython by its
            # (hetflag, residue number, insertion code) id tuple.
            res = chain[
                (
                    res_at_position.hetflag,
                    res_at_position.position.residue_number,
                    res_at_position.position.insertion_code,
                )
            ]
            for atom in res.get_atoms():
                atom_name = atom.get_name()
                x, y, z = atom.get_coord()
                if atom_name in residue_constants.atom_order.keys():
                    pos[residue_constants.atom_order[atom_name]] = [x, y, z]
                    mask[residue_constants.atom_order[atom_name]] = 1.0
                elif atom_name.upper() == "SE" and res.get_resname() == "MSE":
                    # Put the coords of the selenium atom in the sulphur column
                    pos[residue_constants.atom_order["SD"]] = [x, y, z]
                    mask[residue_constants.atom_order["SD"]] = 1.0

        all_atom_positions[res_index] = pos
        all_atom_mask[res_index] = mask

    if _zero_center_positions:
        # Center using only observed atoms so padding zeros don't bias the mean.
        binary_mask = all_atom_mask.astype(bool)
        translation_vec = all_atom_positions[binary_mask].mean(axis=0)
        all_atom_positions[binary_mask] -= translation_vec

    return all_atom_positions, all_atom_mask
openfold/data/parsers.py ADDED
@@ -0,0 +1,388 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Functions for parsing various file formats."""
17
+ import collections
18
+ import dataclasses
19
+ import re
20
+ import string
21
+ from typing import Dict, Iterable, List, Optional, Sequence, Tuple
22
+
23
+
24
+ DeletionMatrix = Sequence[Sequence[int]]
25
+
26
+
27
@dataclasses.dataclass(frozen=True)
class TemplateHit:
    """A single template hit parsed from an HHSearch/HHBlits .hhr file.

    Attributes:
        index: Rank of the hit within the .hhr file.
        name: Raw name line of the hit (PDB id, chain and description).
        aligned_cols: Number of columns aligned between query and hit.
        sum_probs: Sum of per-column posterior probabilities.
        query: Aligned query sequence (may contain gaps).
        hit_sequence: Aligned hit sequence (may contain gaps).
        indices_query: Per-column index into the original query (-1 for gaps).
        indices_hit: Per-column index into the original hit (-1 for gaps).
    """

    index: int
    name: str
    aligned_cols: int
    sum_probs: float
    query: str
    hit_sequence: str
    indices_query: List[int]
    indices_hit: List[int]
39
+
40
+
41
def parse_fasta(fasta_string: str) -> Tuple[Sequence[str], Sequence[str]]:
    """Parses a FASTA-formatted string.

    Arguments:
        fasta_string: The string contents of a FASTA file.

    Returns:
        A tuple of two parallel lists:
          * The amino-acid sequences.
          * The sequence descriptions taken from the '>' comment lines, in
            the same order as the sequences.
    """
    sequences: List[str] = []
    descriptions: List[str] = []
    current = -1
    for raw_line in fasta_string.splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue  # Skip blank lines.
        if stripped.startswith(">"):
            # A new record begins; drop the leading '>' from the description.
            descriptions.append(stripped[1:])
            sequences.append("")
            current += 1
        else:
            # Continuation of the current record's sequence.
            sequences[current] += stripped

    return sequences, descriptions
68
+
69
+
70
def parse_stockholm(
    stockholm_string: str,
) -> Tuple[Sequence[str], DeletionMatrix, Sequence[str]]:
    """Parses sequences and deletion matrix from a Stockholm alignment.

    Args:
        stockholm_string: The string contents of a stockholm file. The first
            sequence in the file should be the query sequence.

    Returns:
        A tuple of:
          * Sequences aligned to the query (may contain duplicates).
          * The deletion matrix: ``deletion_matrix[i][j]`` is the number of
            residues deleted from aligned sequence i at residue position j.
          * The names of the targets matched, including the jackhmmer
            subsequence suffix.
    """
    # Accumulate each sequence's fragments, preserving first-seen order.
    name_to_sequence: Dict[str, str] = collections.OrderedDict()
    for raw_line in stockholm_string.splitlines():
        stripped = raw_line.strip()
        if not stripped or stripped.startswith(("#", "//")):
            continue
        seq_name, fragment = stripped.split()
        name_to_sequence[seq_name] = name_to_sequence.get(seq_name, "") + fragment

    msa: List[str] = []
    deletion_matrix: List[List[int]] = []
    query = ""
    keep_columns: List[int] = []

    for i, sequence in enumerate(name_to_sequence.values()):
        if i == 0:
            # The first sequence is the query; only its non-gap columns are kept.
            query = sequence
            keep_columns = [c for c, res in enumerate(query) if res != "-"]

        # Drop the columns that are gaps in the query.
        msa.append("".join(sequence[c] for c in keep_columns))

        # Count deletions w.r.t. the query: residues aligned to query gaps
        # accumulate until the next kept (query-residue) column.
        deletion_vec: List[int] = []
        deletion_count = 0
        for seq_res, query_res in zip(sequence, query):
            if seq_res == "-" and query_res == "-":
                continue
            if query_res == "-":
                deletion_count += 1
            else:
                deletion_vec.append(deletion_count)
                deletion_count = 0
        deletion_matrix.append(deletion_vec)

    return msa, deletion_matrix, list(name_to_sequence.keys())
128
+
129
+
130
def parse_a3m(a3m_string: str) -> Tuple[Sequence[str], DeletionMatrix]:
    """Parses sequences and deletion matrix from an a3m alignment.

    Args:
        a3m_string: The string contents of a a3m file. The first sequence in
            the file should be the query sequence.

    Returns:
        A tuple of:
          * Sequences aligned to the query (may contain duplicates).
          * The deletion matrix: ``deletion_matrix[i][j]`` is the number of
            residues deleted from aligned sequence i at residue position j.
    """
    sequences, _ = parse_fasta(a3m_string)

    # In a3m, lowercase letters mark insertions relative to the query; each
    # run of lowercase characters is recorded as a deletion count at the
    # following aligned (uppercase or gap) column.
    deletion_matrix = []
    for msa_sequence in sequences:
        deletion_vec = []
        deletion_count = 0
        for symbol in msa_sequence:
            if symbol.islower():
                deletion_count += 1
            else:
                deletion_vec.append(deletion_count)
                deletion_count = 0
        deletion_matrix.append(deletion_vec)

    # Strip the lowercase insertion characters to obtain the aligned
    # (deletion-free) MSA rows.
    strip_lowercase = str.maketrans("", "", string.ascii_lowercase)
    aligned_sequences = [s.translate(strip_lowercase) for s in sequences]
    return aligned_sequences, deletion_matrix
162
+
163
+
164
def _convert_sto_seq_to_a3m(
    query_non_gaps: Sequence[bool], sto_seq: str
) -> Iterable[str]:
    """Converts one Stockholm-aligned sequence into a3m form.

    Columns where the query has a residue are yielded unchanged; columns
    where the query is gapped are yielded lowercase (a3m insertion notation),
    with the gap characters themselves dropped.
    """
    for keep_column, residue in zip(query_non_gaps, sto_seq):
        if keep_column:
            yield residue
        elif residue != "-":
            yield residue.lower()
172
+
173
+
174
def convert_stockholm_to_a3m(
    stockholm_format: str, max_sequences: Optional[int] = None
) -> str:
    """Converts MSA in Stockholm format to the A3M format."""
    descriptions = {}
    sequences = {}
    reached_max_sequences = False

    # First pass: accumulate the aligned sequence fragments per name, capping
    # the number of distinct sequences at max_sequences when given.
    for line in stockholm_format.splitlines():
        reached_max_sequences = bool(
            max_sequences and len(sequences) >= max_sequences
        )
        if not line.strip() or line.startswith(("#", "//")):
            # Blank lines, markup and end symbols are not alignment rows.
            continue
        seqname, aligned_seq = line.split(maxsplit=1)
        if seqname not in sequences:
            if reached_max_sequences:
                continue
            sequences[seqname] = ""
        sequences[seqname] += aligned_seq

    # Second pass: pick up the '#=GS <name> DE <description>' rows.
    for line in stockholm_format.splitlines():
        if line[:4] == "#=GS":
            # Example: '#=GS UniRef90_Q9H5Z4/4-78 DE [subseq from] cDNA ...'
            columns = line.split(maxsplit=3)
            seqname, feature = columns[1:3]
            value = columns[3] if len(columns) == 4 else ""
            if feature != "DE":
                continue
            if reached_max_sequences and seqname not in sequences:
                continue
            descriptions[seqname] = value
            if len(descriptions) == len(sequences):
                break

    # Convert each sto row to a3m; the query is assumed to be the first row.
    query_sequence = next(iter(sequences.values()))
    query_non_gaps = [res != "-" for res in query_sequence]
    a3m_sequences = {
        seqname: "".join(_convert_sto_seq_to_a3m(query_non_gaps, sto_sequence))
        for seqname, sto_sequence in sequences.items()
    }

    fasta_chunks = (
        f">{k} {descriptions.get(k, '')}\n{a3m_sequences[k]}"
        for k in a3m_sequences
    )
    return "\n".join(fasta_chunks) + "\n"  # Include terminating newline.
226
+
227
+
228
def _get_hhr_line_regex_groups(
    regex_pattern: str, line: str
) -> Sequence[Optional[str]]:
    """Matches `regex_pattern` at the start of `line`, returning the groups.

    Raises:
        RuntimeError: If the pattern does not match the line.
    """
    parsed = re.match(regex_pattern, line)
    if parsed is None:
        raise RuntimeError(f"Could not parse query line {line}")
    return parsed.groups()
235
+
236
+
237
def _update_hhr_residue_indices_list(
    sequence: str, start_index: int, indices_list: List[int]
):
    """Appends per-column residue indices for one alignment fragment.

    For each column of `sequence`, appends that residue's index in the
    original (ungapped) sequence, counting from `start_index`; gap columns
    are recorded as -1.
    """
    next_index = start_index
    for residue in sequence:
        if residue == "-":
            indices_list.append(-1)
        else:
            indices_list.append(next_index)
            next_index += 1
248
+
249
+
250
def _parse_hhr_hit(detailed_lines: Sequence[str]) -> TemplateHit:
    """Parses one detailed HMM-HMM comparison section of a .hhr file.

    This works on .hhr files generated from both HHBlits and HHSearch.

    Args:
        detailed_lines: A list of lines from a single comparison section
            between 2 sequences (which each have their own HMM's).

    Returns:
        A TemplateHit with the information from that comparison section.

    Raises:
        RuntimeError: If a certain line cannot be processed.
    """
    # The first two lines carry the hit number and the hit name.
    number_of_hit = int(detailed_lines[0].split()[-1])
    name_hit = detailed_lines[1][1:]

    # The third line is the summary line.
    summary_pattern = (
        "Probab=(.*)[\t ]*E-value=(.*)[\t ]*Score=(.*)[\t ]*Aligned_cols=(.*)[\t"
        " ]*Identities=(.*)%[\t ]*Similarity=(.*)[\t ]*Sum_probs=(.*)[\t "
        "]*Template_Neff=(.*)"
    )
    summary_match = re.match(summary_pattern, detailed_lines[2])
    if summary_match is None:
        raise RuntimeError(
            "Could not parse section: %s. Expected this: \n%s to contain summary."
            % (detailed_lines, detailed_lines[2])
        )
    (prob_true, e_value, _, aligned_cols, _, _, sum_probs, neff) = [
        float(x) for x in summary_match.groups()
    ]

    # The remaining lines hold the detailed comparison in a fixed-width,
    # human-readable format. Each block starts with a query sequence line,
    # which is parsed with a regexp to deduce the block's fixed width.
    query = ""
    hit_sequence = ""
    indices_query = []
    indices_hit = []
    length_block = None

    for line in detailed_lines[3:]:
        is_query_line = (
            line.startswith("Q ")
            and not line.startswith("Q ss_dssp")
            and not line.startswith("Q ss_pred")
            and not line.startswith("Q Consensus")
        )
        if is_query_line:
            # The first 17 characters must be 'Q <query_name> '; everything
            # after that is parsed.
            #         start      sequence   end   total_sequence_length
            patt = r"[\t ]*([0-9]*) ([A-Z-]*)[\t ]*([0-9]*) \([0-9]*\)"
            groups = _get_hhr_line_regex_groups(patt, line[17:])

            # Derive the block length from the start/end indices and check it
            # against the actual length of the parsed fragment.
            start = int(groups[0]) - 1  # Make index zero based.
            delta_query = groups[1]
            end = int(groups[2])
            num_insertions = sum(1 for x in delta_query if x == "-")
            length_block = end - start + num_insertions
            assert length_block == len(delta_query)

            # Update the query sequence and indices list.
            query += delta_query
            _update_hhr_residue_indices_list(delta_query, start, indices_query)

        elif line.startswith("T "):
            is_hit_line = (
                not line.startswith("T ss_dssp")
                and not line.startswith("T ss_pred")
                and not line.startswith("T Consensus")
            )
            if is_hit_line:
                # The first 17 characters must be 'T <hit_name> '; everything
                # after that is parsed.
                #         start      sequence   end   total_sequence_length
                patt = r"[\t ]*([0-9]*) ([A-Z-]*)[\t ]*[0-9]* \([0-9]*\)"
                groups = _get_hhr_line_regex_groups(patt, line[17:])
                start = int(groups[0]) - 1  # Make index zero based.
                delta_hit_sequence = groups[1]
                assert length_block == len(delta_hit_sequence)

                # Update the hit sequence and indices list.
                hit_sequence += delta_hit_sequence
                _update_hhr_residue_indices_list(
                    delta_hit_sequence, start, indices_hit
                )

    return TemplateHit(
        index=number_of_hit,
        name=name_hit,
        aligned_cols=int(aligned_cols),
        sum_probs=sum_probs,
        query=query,
        hit_sequence=hit_sequence,
        indices_query=indices_query,
        indices_hit=indices_hit,
    )
+ )
354
+
355
+
356
def parse_hhr(hhr_string: str) -> Sequence[TemplateHit]:
    """Parses the content of an entire HHR file.

    An .hhr file starts with a results table, followed by a sequence of hit
    "paragraphs", each beginning with a line 'No <hit number>'. Each
    paragraph is parsed into one TemplateHit.
    """
    lines = hhr_string.splitlines()
    block_starts = [i for i, line in enumerate(lines) if line.startswith("No ")]

    hits = []
    if block_starts:
        # A sentinel marks the end of the final paragraph.
        block_starts.append(len(lines))
        for begin, end in zip(block_starts[:-1], block_starts[1:]):
            hits.append(_parse_hhr_hit(lines[begin:end]))
    return hits
374
+
375
+
376
def parse_e_values_from_tblout(tblout: str) -> Dict[str, float]:
    """Parses a target-name -> e-value mapping from a Jackhmmer tblout string.

    The query itself is always present in the result with an e-value of 0.

    As per http://eddylab.org/software/hmmer/Userguide.pdf fields are
    space-delimited. Relevant fields are (1) target name: and
    (5) E-value (full sequence) (numbering from 1).
    """
    e_values = {"query": 0}
    # Skip '#' comment lines. Blank lines are also skipped explicitly: the
    # previous `line[0] != "#"` test raised IndexError on an empty line.
    lines = [
        line
        for line in tblout.splitlines()
        if line.strip() and not line.startswith("#")
    ]
    for line in lines:
        fields = line.split()
        e_value = fields[4]
        target_name = fields[0]
        e_values[target_name] = float(e_value)
    return e_values
openfold/data/templates.py ADDED
@@ -0,0 +1,1108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Functions for getting templates and calculating template features."""
17
+ import dataclasses
18
+ import datetime
19
+ import glob
20
+ import json
21
+ import logging
22
+ import os
23
+ import re
24
+ from typing import Any, Dict, Mapping, Optional, Sequence, Tuple
25
+
26
+ import numpy as np
27
+
28
+ from openfold.data import parsers, mmcif_parsing
29
+ from openfold.data.errors import Error
30
+ from openfold.data.tools import kalign
31
+ from openfold.data.tools.utils import to_date
32
+ from openfold.np import residue_constants
33
+
34
+
35
class NoChainsError(Error):
    """Raised when a template mmCIF contains no chains."""


class SequenceNotInTemplateError(Error):
    """Raised when a template mmCIF does not contain the expected sequence."""


class NoAtomDataInTemplateError(Error):
    """Raised when a template mmCIF contains no atom positions."""


class TemplateAtomMaskAllZerosError(Error):
    """Raised when every atom position in a template mmCIF is masked."""


class QueryToTemplateAlignError(Error):
    """Raised when the query cannot be aligned to the template."""


class CaDistanceError(Error):
    """Raised when a CA atom distance exceeds a threshold."""


# Prefilter exceptions.
class PrefilterError(Exception):
    """A base class for template prefilter exceptions."""


class DateError(PrefilterError):
    """Raised when the hit date is after the max allowed date."""


class PdbIdError(PrefilterError):
    """Raised when the hit PDB ID is identical to the query's."""


class AlignRatioError(PrefilterError):
    """Raised when the hit's align ratio to the query is too small."""


class DuplicateError(PrefilterError):
    """Raised when the hit is an exact subsequence of the query."""


class LengthError(PrefilterError):
    """Raised when the hit is too short."""
82
+
83
+
84
# dtypes of the per-template feature arrays assembled for the model.
TEMPLATE_FEATURES = {
    "template_aatype": np.int64,
    "template_all_atom_mask": np.float32,
    "template_all_atom_positions": np.float32,
    # `np.object` was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin `object` is the exact equivalent dtype specifier.
    "template_domain_names": object,
    "template_sequence": object,
    "template_sum_probs": np.float32,
}
92
+
93
+
94
def _get_pdb_id_and_chain(hit: parsers.TemplateHit) -> Tuple[str, str]:
    """Extracts the lowercased PDB id and the chain id from an HHSearch hit.

    Raises:
        ValueError: If the hit name does not start with '<PDBID>_<chain>'.
    """
    # PDB ID: 4 letters. Chain ID: 1+ alphanumeric letters or "." if unknown.
    id_match = re.match(r"[a-zA-Z\d]{4}_[a-zA-Z0-9.]+", hit.name)
    if id_match is None:
        raise ValueError(f"hit.name did not start with PDBID_chain: {hit.name}")
    pdb_id, chain_id = id_match.group(0).split("_")
    return pdb_id.lower(), chain_id
102
+
103
+
104
def _is_after_cutoff(
    pdb_id: str,
    release_dates: Mapping[str, datetime.datetime],
    release_date_cutoff: Optional[datetime.datetime],
) -> bool:
    """Checks whether a template was released after the cutoff date.

    Args:
        pdb_id: 4 letter pdb code.
        release_dates: Dictionary mapping (uppercase) PDB ids to their
            structure release dates.
        release_date_cutoff: Max release date that is valid for this query.

    Returns:
        True if the template release date is after the cutoff, False
        otherwise — including when the id is absent from `release_dates`.

    Raises:
        ValueError: If `release_date_cutoff` is None.
    """
    if release_date_cutoff is None:
        raise ValueError("The release_date_cutoff must not be None.")
    key = pdb_id.upper()
    if key in release_dates:
        return release_dates[key] > release_date_cutoff
    # This is only a quick prefilter to reduce the number of mmCIF files we
    # need to parse, so treating an unknown id as "not after cutoff" is safe.
    logging.info(
        "Template structure not in release dates dict: %s", pdb_id
    )
    return False
131
+
132
+
133
def _parse_obsolete(obsolete_file_path: str) -> Mapping[str, str]:
    """Parses the PDB data file that lists which PDB ids are obsolete.

    Returns a mapping from each obsolete (lowercased) PDB id to the id that
    replaced it. Obsolete entries with no replacement are skipped.
    """
    result = {}
    with open(obsolete_file_path) as f:
        for raw_line in f:
            line = raw_line.strip()
            # Fixed-width row, e.g.:  'OBSLTE    31-JUL-94 116L     216L'
            # Rows shorter than 31 chars carry no replacement id.
            if line.startswith("OBSLTE") and len(line) > 30:
                from_id = line[20:24].lower()
                to_id = line[29:33].lower()
                result[from_id] = to_id
    return result
147
+
148
+
149
def generate_release_dates_cache(mmcif_dir: str, out_path: str):
    """Writes a JSON cache mapping mmCIF file ids to their release dates.

    Args:
        mmcif_dir: Directory containing the .cif files to scan.
        out_path: Path of the JSON file to write.
    """
    dates = {}
    for f in os.listdir(mmcif_dir):
        if not f.endswith(".cif"):
            continue
        path = os.path.join(mmcif_dir, f)
        with open(path, "r") as fp:
            mmcif_string = fp.read()

        file_id = os.path.splitext(f)[0]
        mmcif = mmcif_parsing.parse(
            file_id=file_id, mmcif_string=mmcif_string
        )
        if mmcif.mmcif_object is None:
            logging.info(f"Failed to parse {f}. Skipping...")
            continue

        dates[file_id] = mmcif.mmcif_object.header["release_date"]

    # BUG FIX: the output file must be opened for writing. The original
    # opened it with mode "r" and then called fp.write, which raises
    # io.UnsupportedOperation (or FileNotFoundError if the file is missing).
    with open(out_path, "w") as fp:
        json.dump(dates, fp)
172
+
173
+
174
def _parse_release_dates(path: str) -> Mapping[str, datetime.datetime]:
    """Parses a release-dates JSON file into a PDB-id -> date mapping.

    The JSON is expected to map each pdb id to a dict that contains a
    "release_date" entry; ids are uppercased in the returned mapping.
    """
    with open(path, "r") as fp:
        data = json.load(fp)

    # NOTE(review): this expects entries shaped {pdb: {"release_date": ...}},
    # whereas generate_release_dates_cache writes {pdb: <date value>}
    # directly — confirm which cache format is canonical.
    return {
        pdb.upper(): to_date(v)
        for pdb, d in data.items()
        for k, v in d.items()
        if k == "release_date"
    }
185
+
186
+
187
def _assess_hhsearch_hit(
    hit: parsers.TemplateHit,
    hit_pdb_code: str,
    query_sequence: str,
    query_pdb_code: Optional[str],
    release_dates: Mapping[str, datetime.datetime],
    release_date_cutoff: datetime.datetime,
    max_subsequence_ratio: float = 0.95,
    min_align_ratio: float = 0.1,
) -> bool:
    """Determines if a template is valid, without parsing its mmCIF file.

    Args:
        hit: HhrHit for the template.
        hit_pdb_code: The 4 letter pdb code of the template hit. This might
            be different from the value in the actual hit since the original
            pdb might have become obsolete.
        query_sequence: Amino acid sequence of the query.
        query_pdb_code: 4 letter pdb code of the query.
        release_dates: Dictionary mapping pdb codes to their structure
            release dates.
        release_date_cutoff: Max release date that is valid for this query.
        max_subsequence_ratio: Exclude any exact matches with this much
            overlap.
        min_align_ratio: Minimum overlap between the template and query.

    Returns:
        True if the hit passed the prefilter. Raises an exception otherwise.

    Raises:
        DateError: If the hit date was after the max allowed date.
        PdbIdError: If the hit PDB ID was identical to the query.
        AlignRatioError: If the hit align ratio to the query was too small.
        DuplicateError: If the hit was an exact subsequence of the query.
        LengthError: If the hit was too short.
    """
    align_ratio = hit.aligned_cols / len(query_sequence)

    template_sequence = hit.hit_sequence.replace("-", "")
    length_ratio = float(len(template_sequence)) / len(query_sequence)

    # A near-complete exact subsequence of the query usually indicates a
    # duplicate PDB entry rather than an informative template.
    is_duplicate = (
        template_sequence in query_sequence
        and length_ratio > max_subsequence_ratio
    )

    if _is_after_cutoff(hit_pdb_code, release_dates, release_date_cutoff):
        date = release_dates[hit_pdb_code.upper()]
        raise DateError(
            f"Date ({date}) > max template date "
            f"({release_date_cutoff})."
        )

    if query_pdb_code is not None:
        if query_pdb_code.lower() == hit_pdb_code.lower():
            raise PdbIdError("PDB code identical to Query PDB code.")

    if align_ratio <= min_align_ratio:
        raise AlignRatioError(
            "Proportion of residues aligned to query too small. "
            f"Align ratio: {align_ratio}."
        )

    if is_duplicate:
        raise DuplicateError(
            "Template is an exact subsequence of query with large "
            f"coverage. Length ratio: {length_ratio}."
        )

    if len(template_sequence) < 10:
        raise LengthError(
            f"Template too short. Length: {len(template_sequence)}."
        )

    return True
264
+
265
+
266
def _find_template_in_pdb(
    template_chain_id: str,
    template_sequence: str,
    mmcif_object: mmcif_parsing.MmcifObject,
) -> Tuple[str, str, int]:
    """Tries to find the template chain in the given pdb file.

    Three strategies are attempted in order:
      1. An exact match on both the chain ID and the sequence.
      2. An exact match on the sequence only.
      3. A fuzzy sequence match, treating X as a wildcard.
    The first strategy that succeeds determines the returned chain.

    Args:
        template_chain_id: The template chain ID.
        template_sequence: The template chain sequence.
        mmcif_object: The PDB object to search for the template in.

    Returns:
        A tuple with:
        * The chain sequence that was found to match the template in the PDB
          object.
        * The ID of the chain that is being returned.
        * The offset where the template sequence starts in the chain sequence.

    Raises:
        SequenceNotInTemplateError: If no match is found after the steps
            described above.
    """
    pdb_id = mmcif_object.file_id

    # 1. Exact match in both the chain ID and the (sub)sequence.
    chain_sequence = mmcif_object.chain_to_seqres.get(template_chain_id)
    if chain_sequence and (template_sequence in chain_sequence):
        logging.info(
            "Found an exact template match %s_%s.", pdb_id, template_chain_id
        )
        return (
            chain_sequence,
            template_chain_id,
            chain_sequence.find(template_sequence),
        )

    # 2. Exact match in the (sub)sequence only, in any chain.
    for chain_id, chain_sequence in mmcif_object.chain_to_seqres.items():
        if chain_sequence and (template_sequence in chain_sequence):
            logging.info("Found a sequence-only match %s_%s.", pdb_id, chain_id)
            return (
                chain_sequence,
                chain_id,
                chain_sequence.find(template_sequence),
            )

    # 3. Fuzzy match with X as a wildcard. Unnamed groups (?:_) avoid the
    # regex engine's 100 named-group limit.
    fuzzy_pattern = re.compile(
        "".join(
            "." if aa == "X" else "(?:%s|X)" % aa for aa in template_sequence
        )
    )
    for chain_id, chain_sequence in mmcif_object.chain_to_seqres.items():
        match = fuzzy_pattern.search(chain_sequence)
        if match:
            logging.info(
                "Found a fuzzy sequence-only match %s_%s.", pdb_id, chain_id
            )
            return chain_sequence, chain_id, match.start()

    # No hits, raise an error.
    raise SequenceNotInTemplateError(
        "Could not find the template sequence in %s_%s. Template sequence: %s, "
        "chain_to_seqres: %s"
        % (
            pdb_id,
            template_chain_id,
            template_sequence,
            mmcif_object.chain_to_seqres,
        )
    )
338
+
339
+
340
+ def _realign_pdb_template_to_query(
341
+ old_template_sequence: str,
342
+ template_chain_id: str,
343
+ mmcif_object: mmcif_parsing.MmcifObject,
344
+ old_mapping: Mapping[int, int],
345
+ kalign_binary_path: str,
346
+ ) -> Tuple[str, Mapping[int, int]]:
347
+ """Aligns template from the mmcif_object to the query.
348
+
349
+ In case PDB70 contains a different version of the template sequence, we need
350
+ to perform a realignment to the actual sequence that is in the mmCIF file.
351
+ This method performs such realignment, but returns the new sequence and
352
+ mapping only if the sequence in the mmCIF file is 90% identical to the old
353
+ sequence.
354
+
355
+ Note that the old_template_sequence comes from the hit, and contains only that
356
+ part of the chain that matches with the query while the new_template_sequence
357
+ is the full chain.
358
+
359
+ Args:
360
+ old_template_sequence: The template sequence that was returned by the PDB
361
+ template search (typically done using HHSearch).
362
+ template_chain_id: The template chain id was returned by the PDB template
363
+ search (typically done using HHSearch). This is used to find the right
364
+ chain in the mmcif_object chain_to_seqres mapping.
365
+ mmcif_object: A mmcif_object which holds the actual template data.
366
+ old_mapping: A mapping from the query sequence to the template sequence.
367
+ This mapping will be used to compute the new mapping from the query
368
+ sequence to the actual mmcif_object template sequence by aligning the
369
+ old_template_sequence and the actual template sequence.
370
+ kalign_binary_path: The path to a kalign executable.
371
+
372
+ Returns:
373
+ A tuple (new_template_sequence, new_query_to_template_mapping) where:
374
+ * new_template_sequence is the actual template sequence that was found in
375
+ the mmcif_object.
376
+ * new_query_to_template_mapping is the new mapping from the query to the
377
+ actual template found in the mmcif_object.
378
+
379
+ Raises:
380
+ QueryToTemplateAlignError:
381
+ * If there was an error thrown by the alignment tool.
382
+ * Or if the actual template sequence differs by more than 10% from the
383
+ old_template_sequence.
384
+ """
385
+ aligner = kalign.Kalign(binary_path=kalign_binary_path)
386
+ new_template_sequence = mmcif_object.chain_to_seqres.get(
387
+ template_chain_id, ""
388
+ )
389
+
390
+ # Sometimes the template chain id is unknown. But if there is only a single
391
+ # sequence within the mmcif_object, it is safe to assume it is that one.
392
+ if not new_template_sequence:
393
+ if len(mmcif_object.chain_to_seqres) == 1:
394
+ logging.info(
395
+ "Could not find %s in %s, but there is only 1 sequence, so "
396
+ "using that one.",
397
+ template_chain_id,
398
+ mmcif_object.file_id,
399
+ )
400
+ new_template_sequence = list(mmcif_object.chain_to_seqres.values())[
401
+ 0
402
+ ]
403
+ else:
404
+ raise QueryToTemplateAlignError(
405
+ f"Could not find chain {template_chain_id} in {mmcif_object.file_id}. "
406
+ "If there are no mmCIF parsing errors, it is possible it was not a "
407
+ "protein chain."
408
+ )
409
+
410
+ try:
411
+ (old_aligned_template, new_aligned_template), _ = parsers.parse_a3m(
412
+ aligner.align([old_template_sequence, new_template_sequence])
413
+ )
414
+ except Exception as e:
415
+ raise QueryToTemplateAlignError(
416
+ "Could not align old template %s to template %s (%s_%s). Error: %s"
417
+ % (
418
+ old_template_sequence,
419
+ new_template_sequence,
420
+ mmcif_object.file_id,
421
+ template_chain_id,
422
+ str(e),
423
+ )
424
+ )
425
+
426
+ logging.info(
427
+ "Old aligned template: %s\nNew aligned template: %s",
428
+ old_aligned_template,
429
+ new_aligned_template,
430
+ )
431
+
432
+ old_to_new_template_mapping = {}
433
+ old_template_index = -1
434
+ new_template_index = -1
435
+ num_same = 0
436
+ for old_template_aa, new_template_aa in zip(
437
+ old_aligned_template, new_aligned_template
438
+ ):
439
+ if old_template_aa != "-":
440
+ old_template_index += 1
441
+ if new_template_aa != "-":
442
+ new_template_index += 1
443
+ if old_template_aa != "-" and new_template_aa != "-":
444
+ old_to_new_template_mapping[old_template_index] = new_template_index
445
+ if old_template_aa == new_template_aa:
446
+ num_same += 1
447
+
448
+ # Require at least 90 % sequence identity wrt to the shorter of the sequences.
449
+ if (
450
+ float(num_same)
451
+ / min(len(old_template_sequence), len(new_template_sequence))
452
+ < 0.9
453
+ ):
454
+ raise QueryToTemplateAlignError(
455
+ "Insufficient similarity of the sequence in the database: %s to the "
456
+ "actual sequence in the mmCIF file %s_%s: %s. We require at least "
457
+ "90 %% similarity wrt to the shorter of the sequences. This is not a "
458
+ "problem unless you think this is a template that should be included."
459
+ % (
460
+ old_template_sequence,
461
+ mmcif_object.file_id,
462
+ template_chain_id,
463
+ new_template_sequence,
464
+ )
465
+ )
466
+
467
+ new_query_to_template_mapping = {}
468
+ for query_index, old_template_index in old_mapping.items():
469
+ new_query_to_template_mapping[
470
+ query_index
471
+ ] = old_to_new_template_mapping.get(old_template_index, -1)
472
+
473
+ new_template_sequence = new_template_sequence.replace("-", "")
474
+
475
+ return new_template_sequence, new_query_to_template_mapping
476
+
477
+
478
def _check_residue_distances(
    all_positions: np.ndarray,
    all_positions_mask: np.ndarray,
    max_ca_ca_distance: float,
):
    """Checks if the distance between unmasked neighbor residues is ok.

    Walks the chain residue by residue and raises CaDistanceError whenever two
    consecutive residues that both have a resolved C-alpha atom are further
    apart than max_ca_ca_distance.
    """
    ca_idx = residue_constants.atom_order["CA"]
    last_unmasked = False
    last_ca = None  # C-alpha coords of the most recent unmasked residue.
    for res_idx, (coords, mask) in enumerate(
        zip(all_positions, all_positions_mask)
    ):
        unmasked = bool(mask[ca_idx])
        if unmasked:
            ca = coords[ca_idx]
            # Only compare against the previous residue when it was also
            # resolved; gaps in the structure are skipped silently.
            if last_unmasked:
                gap = np.linalg.norm(ca - last_ca)
                if gap > max_ca_ca_distance:
                    raise CaDistanceError(
                        "The distance between residues %d and %d is %f > limit %f."
                        % (res_idx, res_idx + 1, gap, max_ca_ca_distance)
                    )
            last_ca = ca
        last_unmasked = unmasked
501
+
502
def _get_atom_positions(
    mmcif_object: mmcif_parsing.MmcifObject,
    auth_chain_id: str,
    max_ca_ca_distance: float,
    _zero_center_positions: bool = False,
) -> Tuple[np.ndarray, np.ndarray]:
    """Gets atom positions and mask from a list of Biopython Residues.

    Reads per-atom coordinates and their presence mask for one chain, then
    validates that consecutive resolved residues are within
    max_ca_ca_distance of each other (raising CaDistanceError otherwise).
    """
    positions, mask = mmcif_parsing.get_atom_coords(
        mmcif_object=mmcif_object,
        chain_id=auth_chain_id,
        _zero_center_positions=_zero_center_positions,
    )
    # Sanity-check the chain geometry before handing it to the featurizer.
    _check_residue_distances(positions, mask, max_ca_ca_distance)
    return positions, mask
520
+
521
def _extract_template_features(
    mmcif_object: mmcif_parsing.MmcifObject,
    pdb_id: str,
    mapping: Mapping[int, int],
    template_sequence: str,
    query_sequence: str,
    template_chain_id: str,
    kalign_binary_path: str,
    _zero_center_positions: bool = True,
) -> Tuple[Dict[str, Any], Optional[str]]:
    """Parses atom positions in the target structure and aligns with the query.

    Atoms for each residue in the template structure are indexed to coincide
    with their corresponding residue in the query sequence, according to the
    alignment mapping provided.

    Args:
      mmcif_object: mmcif_parsing.MmcifObject representing the template.
      pdb_id: PDB code for the template.
      mapping: Dictionary mapping indices in the query sequence to indices in
        the template sequence.
      template_sequence: String describing the amino acid sequence for the
        template protein.
      query_sequence: String describing the amino acid sequence for the query
        protein.
      template_chain_id: String ID describing which chain in the structure proto
        should be used.
      kalign_binary_path: The path to a kalign executable used for template
        realignment.
      _zero_center_positions: If True, forwarded to the coordinate reader so
        template atom positions are zero-centered.

    Returns:
      A tuple with:
      * A dictionary containing the extra features derived from the template
        protein structure.
      * A warning message if the hit was realigned to the actual mmCIF
        sequence. Otherwise None.

    Raises:
      NoChainsError: If the mmcif object doesn't contain any chains.
      SequenceNotInTemplateError: If the given chain id / sequence can't
        be found in the mmcif object.
      QueryToTemplateAlignError: If the actual template in the mmCIF file
        can't be aligned to the query.
      NoAtomDataInTemplateError: If the mmcif object doesn't contain
        atom positions.
      TemplateAtomMaskAllZerosError: If the mmcif object doesn't have any
        unmasked residues.
    """
    if mmcif_object is None or not mmcif_object.chain_to_seqres:
        raise NoChainsError(
            "No chains in PDB: %s_%s" % (pdb_id, template_chain_id)
        )

    warning = None
    try:
        seqres, chain_id, mapping_offset = _find_template_in_pdb(
            template_chain_id=template_chain_id,
            template_sequence=template_sequence,
            mmcif_object=mmcif_object,
        )
    except SequenceNotInTemplateError:
        # If PDB70 contains a different version of the template, we use the
        # sequence from the mmcif_object instead and realign.
        chain_id = template_chain_id
        warning = (
            f"The exact sequence {template_sequence} was not found in "
            f"{pdb_id}_{chain_id}. Realigning the template to the actual sequence."
        )
        logging.warning(warning)
        # This throws an exception if it fails to realign the hit.
        seqres, mapping = _realign_pdb_template_to_query(
            old_template_sequence=template_sequence,
            template_chain_id=template_chain_id,
            mmcif_object=mmcif_object,
            old_mapping=mapping,
            kalign_binary_path=kalign_binary_path,
        )
        logging.info(
            "Sequence in %s_%s: %s successfully realigned to %s",
            pdb_id,
            chain_id,
            template_sequence,
            seqres,
        )
        # The template sequence changed.
        template_sequence = seqres
        # No mapping offset, the query is aligned to the actual sequence.
        mapping_offset = 0

    try:
        # Essentially set to infinity - we don't want to reject templates unless
        # they're really really bad.
        all_atom_positions, all_atom_mask = _get_atom_positions(
            mmcif_object,
            chain_id,
            max_ca_ca_distance=150.0,
            _zero_center_positions=_zero_center_positions,
        )
    except (CaDistanceError, KeyError) as ex:
        raise NoAtomDataInTemplateError(
            "Could not get atom data (%s_%s): %s" % (pdb_id, chain_id, str(ex))
        ) from ex

    # Split into per-residue arrays so residues can be re-indexed onto the
    # query positions below.
    all_atom_positions = np.split(
        all_atom_positions, all_atom_positions.shape[0]
    )
    all_atom_masks = np.split(all_atom_mask, all_atom_mask.shape[0])

    output_templates_sequence = []
    templates_all_atom_positions = []
    templates_all_atom_masks = []

    for _ in query_sequence:
        # Residues in the query_sequence that are not in the template_sequence:
        # all-zero coordinates/mask and a gap character.
        templates_all_atom_positions.append(
            np.zeros((residue_constants.atom_type_num, 3))
        )
        templates_all_atom_masks.append(
            np.zeros(residue_constants.atom_type_num)
        )
        output_templates_sequence.append("-")

    # Copy template residues into the slots of their aligned query residues.
    for k, v in mapping.items():
        template_index = v + mapping_offset
        templates_all_atom_positions[k] = all_atom_positions[template_index][0]
        templates_all_atom_masks[k] = all_atom_masks[template_index][0]
        output_templates_sequence[k] = template_sequence[v]

    # Alanine (AA with the lowest number of atoms) has 5 atoms (C, CA, CB, N, O).
    if np.sum(templates_all_atom_masks) < 5:
        raise TemplateAtomMaskAllZerosError(
            "Template all atom mask was all zeros: %s_%s. Residue range: %d-%d"
            % (
                pdb_id,
                chain_id,
                min(mapping.values()) + mapping_offset,
                max(mapping.values()) + mapping_offset,
            )
        )

    output_templates_sequence = "".join(output_templates_sequence)

    templates_aatype = residue_constants.sequence_to_onehot(
        output_templates_sequence, residue_constants.HHBLITS_AA_TO_ID
    )

    return (
        {
            "template_all_atom_positions": np.array(
                templates_all_atom_positions
            ),
            "template_all_atom_mask": np.array(templates_all_atom_masks),
            "template_sequence": output_templates_sequence.encode(),
            "template_aatype": np.array(templates_aatype),
            "template_domain_names": f"{pdb_id.lower()}_{chain_id}".encode(),
        },
        warning,
    )
680
+
681
+ def _build_query_to_hit_index_mapping(
682
+ hit_query_sequence: str,
683
+ hit_sequence: str,
684
+ indices_hit: Sequence[int],
685
+ indices_query: Sequence[int],
686
+ original_query_sequence: str,
687
+ ) -> Mapping[int, int]:
688
+ """Gets mapping from indices in original query sequence to indices in the hit.
689
+
690
+ hit_query_sequence and hit_sequence are two aligned sequences containing gap
691
+ characters. hit_query_sequence contains only the part of the original query
692
+ sequence that matched the hit. When interpreting the indices from the .hhr, we
693
+ need to correct for this to recover a mapping from original query sequence to
694
+ the hit sequence.
695
+
696
+ Args:
697
+ hit_query_sequence: The portion of the query sequence that is in the .hhr
698
+ hit
699
+ hit_sequence: The portion of the hit sequence that is in the .hhr
700
+ indices_hit: The indices for each aminoacid relative to the hit sequence
701
+ indices_query: The indices for each aminoacid relative to the original query
702
+ sequence
703
+ original_query_sequence: String describing the original query sequence.
704
+
705
+ Returns:
706
+ Dictionary with indices in the original query sequence as keys and indices
707
+ in the hit sequence as values.
708
+ """
709
+ # If the hit is empty (no aligned residues), return empty mapping
710
+ if not hit_query_sequence:
711
+ return {}
712
+
713
+ # Remove gaps and find the offset of hit.query relative to original query.
714
+ hhsearch_query_sequence = hit_query_sequence.replace("-", "")
715
+ hit_sequence = hit_sequence.replace("-", "")
716
+ hhsearch_query_offset = original_query_sequence.find(
717
+ hhsearch_query_sequence
718
+ )
719
+
720
+ # Index of -1 used for gap characters. Subtract the min index ignoring gaps.
721
+ min_idx = min(x for x in indices_hit if x > -1)
722
+ fixed_indices_hit = [x - min_idx if x > -1 else -1 for x in indices_hit]
723
+
724
+ min_idx = min(x for x in indices_query if x > -1)
725
+ fixed_indices_query = [x - min_idx if x > -1 else -1 for x in indices_query]
726
+
727
+ # Zip the corrected indices, ignore case where both seqs have gap characters.
728
+ mapping = {}
729
+ for q_i, q_t in zip(fixed_indices_query, fixed_indices_hit):
730
+ if q_t != -1 and q_i != -1:
731
+ if q_t >= len(hit_sequence) or q_i + hhsearch_query_offset >= len(
732
+ original_query_sequence
733
+ ):
734
+ continue
735
+ mapping[q_i + hhsearch_query_offset] = q_t
736
+
737
+ return mapping
738
+
739
+
740
@dataclasses.dataclass(frozen=True)
class PrefilterResult:
    """Outcome of running a single template hit through the prefilter."""
    # True if the hit passed the prefilter and may be featurized.
    valid: bool
    # Hard-error message (strict mode only), otherwise None.
    error: Optional[str]
    # Non-fatal diagnostic message, otherwise None.
    warning: Optional[str]
745
+
746
@dataclasses.dataclass(frozen=True)
class SingleHitResult:
    """Outcome of featurizing a single template hit."""
    # Extracted template features, or None when the hit was rejected.
    features: Optional[Mapping[str, Any]]
    # Error message when feature extraction failed hard, otherwise None.
    error: Optional[str]
    # Non-fatal diagnostic (e.g. realignment notice), otherwise None.
    warning: Optional[str]
751
+
752
+
753
def _prefilter_hit(
    query_sequence: str,
    query_pdb_code: Optional[str],
    hit: parsers.TemplateHit,
    max_template_date: datetime.datetime,
    release_dates: Mapping[str, datetime.datetime],
    obsolete_pdbs: Mapping[str, str],
    strict_error_check: bool = False,
):
    """Runs prefilter checks on a hit and reports whether it may be used."""
    # Fail hard if we can't get the PDB ID and chain name from the hit.
    hit_pdb_code, hit_chain_id = _get_pdb_id_and_chain(hit)

    # If the PDB ID has no known release date it may be obsolete; map it onto
    # its replacement when one is known.
    if hit_pdb_code not in release_dates and hit_pdb_code in obsolete_pdbs:
        hit_pdb_code = obsolete_pdbs[hit_pdb_code]

    try:
        # Pass hit_pdb_code since it might have changed due to the pdb being
        # obsolete.
        _assess_hhsearch_hit(
            hit=hit,
            hit_pdb_code=hit_pdb_code,
            query_sequence=query_sequence,
            query_pdb_code=query_pdb_code,
            release_dates=release_dates,
            release_date_cutoff=max_template_date,
        )
    except PrefilterError as e:
        hit_name = f"{hit_pdb_code}_{hit_chain_id}"
        msg = f"hit {hit_name} did not pass prefilter: {str(e)}"
        logging.info("%s: %s", query_pdb_code, msg)
        # In strict mode some prefilter failures are escalated to hard errors.
        hard_error = strict_error_check and isinstance(
            e, (DateError, PdbIdError, DuplicateError)
        )
        return PrefilterResult(
            valid=False,
            error=msg if hard_error else None,
            warning=None,
        )

    return PrefilterResult(valid=True, error=None, warning=None)
793
+
794
+
795
def _process_single_hit(
    query_sequence: str,
    query_pdb_code: Optional[str],
    hit: parsers.TemplateHit,
    mmcif_dir: str,
    max_template_date: datetime.datetime,
    release_dates: Mapping[str, datetime.datetime],
    obsolete_pdbs: Mapping[str, str],
    kalign_binary_path: str,
    strict_error_check: bool = False,
    _zero_center_positions: bool = True,
) -> SingleHitResult:
    """Tries to extract template features from a single HHSearch hit."""
    # Fail hard if we can't get the PDB ID and chain name from the hit.
    hit_pdb_code, hit_chain_id = _get_pdb_id_and_chain(hit)

    # Replace an obsolete PDB ID (no release date known) with its successor.
    if hit_pdb_code not in release_dates:
        if hit_pdb_code in obsolete_pdbs:
            hit_pdb_code = obsolete_pdbs[hit_pdb_code]

    # Map query positions to positions in the (ungapped) hit sequence.
    mapping = _build_query_to_hit_index_mapping(
        hit.query,
        hit.hit_sequence,
        hit.indices_hit,
        hit.indices_query,
        query_sequence,
    )

    # The mapping is from the query to the actual hit sequence, so we need to
    # remove gaps (which regardless have a missing confidence score).
    template_sequence = hit.hit_sequence.replace("-", "")

    cif_path = os.path.join(mmcif_dir, hit_pdb_code + ".cif")
    logging.info(
        "Reading PDB entry from %s. Query: %s, template: %s",
        cif_path,
        query_sequence,
        template_sequence,
    )
    # Fail if we can't find the mmCIF file.
    with open(cif_path, "r") as cif_file:
        cif_string = cif_file.read()

    parsing_result = mmcif_parsing.parse(
        file_id=hit_pdb_code, mmcif_string=cif_string
    )

    # Reject templates released after the cutoff; in strict mode this is an
    # error, otherwise the hit is silently skipped.
    if parsing_result.mmcif_object is not None:
        hit_release_date = datetime.datetime.strptime(
            parsing_result.mmcif_object.header["release_date"], "%Y-%m-%d"
        )
        if hit_release_date > max_template_date:
            error = "Template %s date (%s) > max template date (%s)." % (
                hit_pdb_code,
                hit_release_date,
                max_template_date,
            )
            if strict_error_check:
                return SingleHitResult(features=None, error=error, warning=None)
            else:
                logging.info(error)
                return SingleHitResult(features=None, error=None, warning=None)

    try:
        features, realign_warning = _extract_template_features(
            mmcif_object=parsing_result.mmcif_object,
            pdb_id=hit_pdb_code,
            mapping=mapping,
            template_sequence=template_sequence,
            query_sequence=query_sequence,
            template_chain_id=hit_chain_id,
            kalign_binary_path=kalign_binary_path,
            _zero_center_positions=_zero_center_positions,
        )
        features["template_sum_probs"] = [hit.sum_probs]

        # It is possible there were some errors when parsing the other chains in
        # the mmCIF file, but the template features for the chain we want were
        # still computed. In such case the mmCIF parsing errors are not relevant.
        return SingleHitResult(
            features=features, error=None, warning=realign_warning
        )
    except (
        NoChainsError,
        NoAtomDataInTemplateError,
        TemplateAtomMaskAllZerosError,
    ) as e:
        # These 3 errors indicate missing mmCIF experimental data rather than a
        # problem with the template search, so turn them into warnings.
        warning = (
            "%s_%s (sum_probs: %.2f, rank: %d): feature extracting errors: "
            "%s, mmCIF parsing errors: %s"
            % (
                hit_pdb_code,
                hit_chain_id,
                hit.sum_probs,
                hit.index,
                str(e),
                parsing_result.errors,
            )
        )
        if strict_error_check:
            return SingleHitResult(features=None, error=warning, warning=None)
        else:
            return SingleHitResult(features=None, error=None, warning=warning)
    except Error as e:
        # Any other project error is always reported as a hard error.
        error = (
            "%s_%s (sum_probs: %.2f, rank: %d): feature extracting errors: "
            "%s, mmCIF parsing errors: %s"
            % (
                hit_pdb_code,
                hit_chain_id,
                hit.sum_probs,
                hit.index,
                str(e),
                parsing_result.errors,
            )
        )
        return SingleHitResult(features=None, error=error, warning=None)
914
+
915
+
916
@dataclasses.dataclass(frozen=True)
class TemplateSearchResult:
    """Aggregated template features plus diagnostics for one query."""
    # Stacked per-template feature arrays, keyed by TEMPLATE_FEATURES names.
    features: Mapping[str, Any]
    # Hard errors collected while processing hits.
    errors: Sequence[str]
    # Non-fatal diagnostics collected while processing hits.
    warnings: Sequence[str]
921
+
922
+
923
class TemplateHitFeaturizer:
    """A class for turning hhr hits to template features."""

    def __init__(
        self,
        mmcif_dir: str,
        max_template_date: str,
        max_hits: int,
        kalign_binary_path: str,
        release_dates_path: Optional[str] = None,
        obsolete_pdbs_path: Optional[str] = None,
        strict_error_check: bool = False,
        _shuffle_top_k_prefiltered: Optional[int] = None,
        _zero_center_positions: bool = True,
    ):
        """Initializes the Template Search.

        Args:
          mmcif_dir: Path to a directory with mmCIF structures. Once a template
            ID is found by HHSearch, this directory is used to retrieve the
            template data.
          max_template_date: The maximum date permitted for template structures.
            No template with date higher than this date will be returned. In
            ISO8601 date format, YYYY-MM-DD.
          max_hits: The maximum number of templates that will be returned.
          kalign_binary_path: The path to a kalign executable used for template
            realignment.
          release_dates_path: An optional path to a file with a mapping from
            PDB IDs to their release dates. Thanks to this we don't have to
            redundantly parse mmCIF files to get that information.
          obsolete_pdbs_path: An optional path to a file containing a mapping
            from obsolete PDB IDs to the PDB IDs of their replacements.
          strict_error_check: If True, then the following will be treated as
            errors:
            * If any template date is after the max_template_date.
            * If any template has identical PDB ID to the query.
            * If any template is a duplicate of the query.
            * Any feature computation errors.
          _shuffle_top_k_prefiltered: If set, the top-k prefiltered hits are
            randomly permuted before featurization — presumably for training
            data augmentation; TODO confirm against callers.
          _zero_center_positions: Forwarded to the template coordinate reader
            so atom positions are zero-centered.
        """
        self._mmcif_dir = mmcif_dir
        # Fail fast if the template structure directory is empty or wrong.
        if not glob.glob(os.path.join(self._mmcif_dir, "*.cif")):
            logging.error("Could not find CIFs in %s", self._mmcif_dir)
            raise ValueError(f"Could not find CIFs in {self._mmcif_dir}")

        try:
            self._max_template_date = datetime.datetime.strptime(
                max_template_date, "%Y-%m-%d"
            )
        except ValueError:
            raise ValueError(
                "max_template_date must be set and have format YYYY-MM-DD."
            )
        self.max_hits = max_hits
        self._kalign_binary_path = kalign_binary_path
        self._strict_error_check = strict_error_check

        # Optional precomputed lookup tables; empty dicts mean "parse mmCIFs
        # on demand" / "no obsolete-ID remapping".
        if release_dates_path:
            logging.info(
                "Using precomputed release dates %s.", release_dates_path
            )
            self._release_dates = _parse_release_dates(release_dates_path)
        else:
            self._release_dates = {}

        if obsolete_pdbs_path:
            logging.info(
                "Using precomputed obsolete pdbs %s.", obsolete_pdbs_path
            )
            self._obsolete_pdbs = _parse_obsolete(obsolete_pdbs_path)
        else:
            self._obsolete_pdbs = {}

        self._shuffle_top_k_prefiltered = _shuffle_top_k_prefiltered
        self._zero_center_positions = _zero_center_positions

    def get_templates(
        self,
        query_sequence: str,
        query_pdb_code: Optional[str],
        query_release_date: Optional[datetime.datetime],
        hits: Sequence[parsers.TemplateHit],
    ) -> TemplateSearchResult:
        """Computes the templates for given query sequence (more details above)."""
        logging.info("Searching for template for: %s", query_pdb_code)

        template_features = {}
        for template_feature_name in TEMPLATE_FEATURES:
            template_features[template_feature_name] = []

        # Always use a max_template_date. Set to query_release_date minus 60
        # days if that's earlier (prevents the model training on templates that
        # post-date the query structure).
        template_cutoff_date = self._max_template_date
        if query_release_date:
            delta = datetime.timedelta(days=60)
            if query_release_date - delta < template_cutoff_date:
                template_cutoff_date = query_release_date - delta
            assert template_cutoff_date < query_release_date
            assert template_cutoff_date <= self._max_template_date

        num_hits = 0
        errors = []
        warnings = []

        # Pass 1: cheap prefilter on every hit, collecting diagnostics.
        filtered = []
        for hit in hits:
            prefilter_result = _prefilter_hit(
                query_sequence=query_sequence,
                query_pdb_code=query_pdb_code,
                hit=hit,
                max_template_date=template_cutoff_date,
                release_dates=self._release_dates,
                obsolete_pdbs=self._obsolete_pdbs,
                strict_error_check=self._strict_error_check,
            )

            if prefilter_result.error:
                errors.append(prefilter_result.error)

            if prefilter_result.warning:
                warnings.append(prefilter_result.warning)

            if prefilter_result.valid:
                filtered.append(hit)

        # Best hits first (highest sum of per-column probabilities).
        filtered = list(
            sorted(filtered, key=lambda x: x.sum_probs, reverse=True)
        )

        # Optionally shuffle the top-k surviving hits (see __init__ docstring).
        idx = list(range(len(filtered)))
        if(self._shuffle_top_k_prefiltered):
            stk = self._shuffle_top_k_prefiltered
            idx[:stk] = np.random.permutation(idx[:stk])

        # Pass 2: expensive featurization, stopping once max_hits succeeded.
        for i in idx:
            # We got all the templates we wanted, stop processing hits.
            if num_hits >= self.max_hits:
                break

            hit = filtered[i]

            result = _process_single_hit(
                query_sequence=query_sequence,
                query_pdb_code=query_pdb_code,
                hit=hit,
                mmcif_dir=self._mmcif_dir,
                max_template_date=template_cutoff_date,
                release_dates=self._release_dates,
                obsolete_pdbs=self._obsolete_pdbs,
                strict_error_check=self._strict_error_check,
                kalign_binary_path=self._kalign_binary_path,
                _zero_center_positions=self._zero_center_positions,
            )

            if result.error:
                errors.append(result.error)

            # There could be an error even if there are some results, e.g.
            # thrown by other unparsable chains in the same mmCIF file.
            if result.warning:
                warnings.append(result.warning)

            if result.features is None:
                logging.info(
                    "Skipped invalid hit %s, error: %s, warning: %s",
                    hit.name,
                    result.error,
                    result.warning,
                )
            else:
                # Increment the hit counter, since we got features out of this hit.
                num_hits += 1
                for k in template_features:
                    template_features[k].append(result.features[k])

        # Stack per-hit feature lists into arrays with the declared dtypes.
        for name in template_features:
            if num_hits > 0:
                template_features[name] = np.stack(
                    template_features[name], axis=0
                ).astype(TEMPLATE_FEATURES[name])
            else:
                # Make sure the feature has correct dtype even if empty.
                template_features[name] = np.array(
                    [], dtype=TEMPLATE_FEATURES[name]
                )

        return TemplateSearchResult(
            features=template_features, errors=errors, warnings=warnings
        )
openfold/data/tools/__init__.py ADDED
File without changes
openfold/data/tools/hhblits.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Library to run HHblits from Python."""
17
+ import glob
18
+ import logging
19
+ import os
20
+ import subprocess
21
+ from typing import Any, Mapping, Optional, Sequence
22
+
23
+ from openfold.data.tools import utils
24
+
25
+
26
+ _HHBLITS_DEFAULT_P = 20
27
+ _HHBLITS_DEFAULT_Z = 500
28
+
29
+
30
class HHBlits:
    """Python wrapper of the HHblits binary."""

    def __init__(
        self,
        *,
        binary_path: str,
        databases: Sequence[str],
        n_cpu: int = 4,
        n_iter: int = 3,
        e_value: float = 0.001,
        maxseq: int = 1_000_000,
        realign_max: int = 100_000,
        maxfilt: int = 100_000,
        min_prefilter_hits: int = 1000,
        all_seqs: bool = False,
        alt: Optional[int] = None,
        p: int = _HHBLITS_DEFAULT_P,
        z: int = _HHBLITS_DEFAULT_Z,
    ):
        """Initializes the Python HHblits wrapper.

        Args:
          binary_path: The path to the HHblits executable.
          databases: A sequence of HHblits database paths. This should be the
            common prefix for the database files (i.e. up to but not including
            _hhm.ffindex etc.)
          n_cpu: The number of CPUs to give HHblits.
          n_iter: The number of HHblits iterations.
          e_value: The E-value, see HHblits docs for more details.
          maxseq: The maximum number of rows in an input alignment. Note that
            this parameter is only supported in HHBlits version 3.1 and higher.
          realign_max: Max number of HMM-HMM hits to realign. HHblits
            default: 500.
          maxfilt: Max number of hits allowed to pass the 2nd prefilter.
            HHblits default: 20000.
          min_prefilter_hits: Min number of hits to pass prefilter.
            HHblits default: 100.
          all_seqs: Return all sequences in the MSA / Do not filter the result
            MSA. HHblits default: False.
          alt: Show up to this many alternative alignments.
          p: Minimum Prob for a hit to be included in the output hhr file.
            HHblits default: 20.
          z: Hard cap on number of hits reported in the hhr file.
            HHblits default: 500. NB: The relevant HHblits flag is -Z not -z.

        Raises:
          RuntimeError: If HHblits binary not found within the path.
        """
        self.binary_path = binary_path
        self.databases = databases

        # Each database is a file-name prefix; verify at least one of its
        # component files (e.g. <prefix>_hhm.ffindex) exists.
        for database_path in self.databases:
            if not glob.glob(database_path + "_*"):
                logging.error(
                    "Could not find HHBlits database %s", database_path
                )
                raise ValueError(
                    f"Could not find HHBlits database {database_path}"
                )

        self.n_cpu = n_cpu
        self.n_iter = n_iter
        self.e_value = e_value
        self.maxseq = maxseq
        self.realign_max = realign_max
        self.maxfilt = maxfilt
        self.min_prefilter_hits = min_prefilter_hits
        self.all_seqs = all_seqs
        self.alt = alt
        self.p = p
        self.z = z

    def query(self, input_fasta_path: str) -> Mapping[str, Any]:
        """Queries the database using HHblits.

        Runs the binary in a temporary directory, raises RuntimeError on a
        non-zero exit code, and returns a dict with the resulting a3m text
        plus raw stdout/stderr and the key search parameters.
        """
        with utils.tmpdir_manager(base_dir="/tmp") as query_tmp_dir:
            a3m_path = os.path.join(query_tmp_dir, "output.a3m")

            # One "-d <path>" pair per database; appended after the fixed flags.
            db_cmd = []
            for db_path in self.databases:
                db_cmd.append("-d")
                db_cmd.append(db_path)
            cmd = [
                self.binary_path,
                "-i",
                input_fasta_path,
                "-cpu",
                str(self.n_cpu),
                "-oa3m",
                a3m_path,
                "-o",
                "/dev/null",
                "-n",
                str(self.n_iter),
                "-e",
                str(self.e_value),
                "-maxseq",
                str(self.maxseq),
                "-realign_max",
                str(self.realign_max),
                "-maxfilt",
                str(self.maxfilt),
                "-min_prefilter_hits",
                str(self.min_prefilter_hits),
            ]
            if self.all_seqs:
                cmd += ["-all"]
            if self.alt:
                cmd += ["-alt", str(self.alt)]
            # Only pass -p / -Z when they differ from the HHblits defaults.
            if self.p != _HHBLITS_DEFAULT_P:
                cmd += ["-p", str(self.p)]
            if self.z != _HHBLITS_DEFAULT_Z:
                cmd += ["-Z", str(self.z)]
            cmd += db_cmd

            logging.info('Launching subprocess "%s"', " ".join(cmd))
            process = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
            )

            with utils.timing("HHblits query"):
                stdout, stderr = process.communicate()
                retcode = process.wait()

            if retcode:
                # Logs have a 15k character limit, so log HHblits error line by line.
                logging.error("HHblits failed. HHblits stderr begin:")
                for error_line in stderr.decode("utf-8").splitlines():
                    if error_line.strip():
                        logging.error(error_line.strip())
                logging.error("HHblits stderr end")
                raise RuntimeError(
                    "HHblits failed\nstdout:\n%s\n\nstderr:\n%s\n"
                    % (stdout.decode("utf-8"), stderr[:500_000].decode("utf-8"))
                )

            # Read the resulting MSA before the temporary directory is removed.
            with open(a3m_path) as f:
                a3m = f.read()

            raw_output = dict(
                a3m=a3m,
                output=stdout,
                stderr=stderr,
                n_iter=self.n_iter,
                e_value=self.e_value,
            )
        return raw_output
openfold/data/tools/hhsearch.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Library to run HHsearch from Python."""
17
+ import glob
18
+ import logging
19
+ import os
20
+ import subprocess
21
+ from typing import Sequence
22
+
23
+ from openfold.data.tools import utils
24
+
25
+
26
class HHSearch:
    """Python wrapper of the HHsearch binary."""

    def __init__(
        self,
        *,
        binary_path: str,
        databases: Sequence[str],
        n_cpu: int = 2,
        maxseq: int = 1_000_000,
    ):
        """Initializes the Python HHsearch wrapper.

        Args:
            binary_path: The path to the HHsearch executable.
            databases: A sequence of HHsearch database paths. This should be
                the common prefix for the database files (i.e. up to but not
                including _hhm.ffindex etc.)
            n_cpu: The number of CPUs to use.
            maxseq: The maximum number of rows in an input alignment. Note
                that this parameter is only supported in HHBlits version 3.1
                and higher.

        Raises:
            ValueError: If any database prefix matches no files on disk.
        """
        self.binary_path = binary_path
        self.databases = databases
        self.n_cpu = n_cpu
        self.maxseq = maxseq

        # Fail fast if a database prefix matches nothing on disk.
        for database_path in self.databases:
            if glob.glob(database_path + "_*"):
                continue
            logging.error(
                "Could not find HHsearch database %s", database_path
            )
            raise ValueError(
                f"Could not find HHsearch database {database_path}"
            )

    def query(self, a3m: str) -> str:
        """Queries the database using HHsearch using a given a3m."""
        with utils.tmpdir_manager(base_dir="/tmp") as query_tmp_dir:
            query_path = os.path.join(query_tmp_dir, "query.a3m")
            result_path = os.path.join(query_tmp_dir, "output.hhr")
            with open(query_path, "w") as handle:
                handle.write(a3m)

            cmd = [
                self.binary_path,
                "-i", query_path,
                "-o", result_path,
                "-maxseq", str(self.maxseq),
                "-cpu", str(self.n_cpu),
            ]
            # Each database is passed as its own -d flag.
            for database_path in self.databases:
                cmd.extend(["-d", database_path])

            logging.info('Launching subprocess "%s"', " ".join(cmd))
            process = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
            )
            with utils.timing("HHsearch query"):
                stdout, stderr = process.communicate()
                retcode = process.wait()

            if retcode:
                # Stderr is truncated to prevent proto size errors in Beam.
                raise RuntimeError(
                    "HHSearch failed:\nstdout:\n%s\n\nstderr:\n%s\n"
                    % (stdout.decode("utf-8"), stderr[:100_000].decode("utf-8"))
                )

            with open(result_path) as handle:
                return handle.read()
openfold/data/tools/jackhmmer.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Library to run Jackhmmer from Python."""
17
+
18
+ from concurrent import futures
19
+ import glob
20
+ import logging
21
+ import os
22
+ import subprocess
23
+ from typing import Any, Callable, Mapping, Optional, Sequence
24
+ from urllib import request
25
+
26
+ from openfold.data.tools import utils
27
+
28
+
29
class Jackhmmer:
    """Python wrapper of the Jackhmmer binary."""

    def __init__(
        self,
        *,
        binary_path: str,
        database_path: str,
        n_cpu: int = 8,
        n_iter: int = 1,
        e_value: float = 0.0001,
        z_value: Optional[int] = None,
        get_tblout: bool = False,
        filter_f1: float = 0.0005,
        filter_f2: float = 0.00005,
        filter_f3: float = 0.0000005,
        incdom_e: Optional[float] = None,
        dom_e: Optional[float] = None,
        num_streamed_chunks: Optional[int] = None,
        streaming_callback: Optional[Callable[[int], None]] = None,
    ):
        """Initializes the Python Jackhmmer wrapper.

        Args:
            binary_path: The path to the jackhmmer executable.
            database_path: The path to the jackhmmer database (FASTA format).
            n_cpu: The number of CPUs to give Jackhmmer.
            n_iter: The number of Jackhmmer iterations.
            e_value: The E-value, see Jackhmmer docs for more details.
            z_value: The Z-value, see Jackhmmer docs for more details.
            get_tblout: Whether to save tblout string.
            filter_f1: MSV and biased composition pre-filter, set to >1.0 to turn off.
            filter_f2: Viterbi pre-filter, set to >1.0 to turn off.
            filter_f3: Forward pre-filter, set to >1.0 to turn off.
            incdom_e: Domain e-value criteria for inclusion of domains in MSA/next
                round.
            dom_e: Domain e-value criteria for inclusion in tblout.
            num_streamed_chunks: Number of database chunks to stream over.
            streaming_callback: Callback function run after each chunk iteration with
                the iteration number as argument.

        Raises:
            ValueError: If `database_path` does not exist and chunk streaming
                is not enabled.
        """
        self.binary_path = binary_path
        self.database_path = database_path
        self.num_streamed_chunks = num_streamed_chunks

        # A local database must exist unless chunks are streamed from a
        # remote location at query time.
        if (
            not os.path.exists(self.database_path)
            and num_streamed_chunks is None
        ):
            logging.error("Could not find Jackhmmer database %s", database_path)
            raise ValueError(
                f"Could not find Jackhmmer database {database_path}"
            )

        self.n_cpu = n_cpu
        self.n_iter = n_iter
        self.e_value = e_value
        self.z_value = z_value
        self.filter_f1 = filter_f1
        self.filter_f2 = filter_f2
        self.filter_f3 = filter_f3
        self.incdom_e = incdom_e
        self.dom_e = dom_e
        self.get_tblout = get_tblout
        self.streaming_callback = streaming_callback

    def _query_chunk(
        self, input_fasta_path: str, database_path: str
    ) -> Mapping[str, Any]:
        """Queries the database chunk using Jackhmmer.

        Args:
            input_fasta_path: Path to the query FASTA file.
            database_path: Path to the (chunk of the) target database.

        Returns:
            A dict with keys `sto` (Stockholm alignment text), `tbl` (tblout
            text, empty unless `get_tblout`), `stderr`, `n_iter`, `e_value`.

        Raises:
            RuntimeError: If the jackhmmer subprocess exits non-zero.
        """
        with utils.tmpdir_manager(base_dir="/tmp") as query_tmp_dir:
            sto_path = os.path.join(query_tmp_dir, "output.sto")

            # The F1/F2/F3 are the expected proportion to pass each of the filtering
            # stages (which get progressively more expensive), reducing these
            # speeds up the pipeline at the expensive of sensitivity. They are
            # currently set very low to make querying Mgnify run in a reasonable
            # amount of time.
            cmd_flags = [
                # Don't pollute stdout with Jackhmmer output.
                "-o",
                "/dev/null",
                "-A",
                sto_path,
                "--noali",
                "--F1",
                str(self.filter_f1),
                "--F2",
                str(self.filter_f2),
                "--F3",
                str(self.filter_f3),
                "--incE",
                str(self.e_value),
                # Report only sequences with E-values <= x in per-sequence output.
                "-E",
                str(self.e_value),
                "--cpu",
                str(self.n_cpu),
                "-N",
                str(self.n_iter),
            ]
            if self.get_tblout:
                tblout_path = os.path.join(query_tmp_dir, "tblout.txt")
                cmd_flags.extend(["--tblout", tblout_path])

            if self.z_value:
                cmd_flags.extend(["-Z", str(self.z_value)])

            if self.dom_e is not None:
                cmd_flags.extend(["--domE", str(self.dom_e)])

            if self.incdom_e is not None:
                cmd_flags.extend(["--incdomE", str(self.incdom_e)])

            cmd = (
                [self.binary_path]
                + cmd_flags
                + [input_fasta_path, database_path]
            )

            logging.info('Launching subprocess "%s"', " ".join(cmd))
            process = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
            )
            with utils.timing(
                f"Jackhmmer ({os.path.basename(database_path)}) query"
            ):
                _, stderr = process.communicate()
                retcode = process.wait()

            if retcode:
                raise RuntimeError(
                    "Jackhmmer failed\nstderr:\n%s\n" % stderr.decode("utf-8")
                )

            # Get e-values for each target name
            tbl = ""
            if self.get_tblout:
                with open(tblout_path) as f:
                    tbl = f.read()

            with open(sto_path) as f:
                sto = f.read()

            raw_output = dict(
                sto=sto,
                tbl=tbl,
                stderr=stderr,
                n_iter=self.n_iter,
                e_value=self.e_value,
            )

            return raw_output

    def query(self, input_fasta_path: str) -> Sequence[Mapping[str, Any]]:
        """Queries the database using Jackhmmer.

        When chunk streaming is enabled, each database chunk is downloaded
        to /tmp/ramdisk while Jackhmmer runs on the previous chunk, then
        deleted after use.
        """
        if self.num_streamed_chunks is None:
            return [self._query_chunk(input_fasta_path, self.database_path)]

        db_basename = os.path.basename(self.database_path)
        db_remote_chunk = lambda db_idx: f"{self.database_path}.{db_idx}"
        db_local_chunk = lambda db_idx: f"/tmp/ramdisk/{db_basename}.{db_idx}"

        # Remove existing files to prevent OOM
        for f in glob.glob(db_local_chunk("[0-9]*")):
            try:
                os.remove(f)
            except OSError:
                print(f"OSError while deleting {f}")

        # Download the (i+1)-th chunk while Jackhmmer is running on the i-th chunk
        with futures.ThreadPoolExecutor(max_workers=2) as executor:
            chunked_output = []
            for i in range(1, self.num_streamed_chunks + 1):
                # Copy the chunk locally
                if i == 1:
                    future = executor.submit(
                        request.urlretrieve,
                        db_remote_chunk(i),
                        db_local_chunk(i),
                    )
                if i < self.num_streamed_chunks:
                    next_future = executor.submit(
                        request.urlretrieve,
                        db_remote_chunk(i + 1),
                        db_local_chunk(i + 1),
                    )

                # Run Jackhmmer with the chunk
                future.result()
                chunked_output.append(
                    self._query_chunk(input_fasta_path, db_local_chunk(i))
                )

                # Remove the local copy of the chunk
                os.remove(db_local_chunk(i))
                # Hand over the prefetch future for the next iteration. No
                # prefetch is started on the final chunk (and none at all
                # when num_streamed_chunks == 1), so guard the assignment —
                # an unconditional `future = next_future` raised NameError
                # in the single-chunk case.
                if i < self.num_streamed_chunks:
                    future = next_future
                if self.streaming_callback:
                    self.streaming_callback(i)
        return chunked_output
openfold/data/tools/kalign.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """A Python wrapper for Kalign."""
17
+ import os
18
+ import subprocess
19
+ from typing import Sequence
20
+
21
+ from absl import logging
22
+
23
+ from openfold.data.tools import utils
24
+
25
+
26
+ def _to_a3m(sequences: Sequence[str]) -> str:
27
+ """Converts sequences to an a3m file."""
28
+ names = ["sequence %d" % i for i in range(1, len(sequences) + 1)]
29
+ a3m = []
30
+ for sequence, name in zip(sequences, names):
31
+ a3m.append(u">" + name + u"\n")
32
+ a3m.append(sequence + u"\n")
33
+ return "".join(a3m)
34
+
35
+
36
class Kalign:
    """Python wrapper of the Kalign binary."""

    def __init__(self, *, binary_path: str):
        """Initializes the Python Kalign wrapper.

        Args:
            binary_path: The path to the Kalign binary.

        Raises:
            RuntimeError: If Kalign binary not found within the path.
        """
        self.binary_path = binary_path

    def align(self, sequences: Sequence[str]) -> str:
        """Aligns the sequences and returns the alignment in A3M string.

        Args:
            sequences: A list of query sequence strings. The sequences have to
                be at least 6 residues long (Kalign requires this). Note that
                the order in which you give the sequences might alter the
                output slightly as different alignment tree might get
                constructed.

        Returns:
            A string with the alignment in a3m format.

        Raises:
            RuntimeError: If Kalign fails.
            ValueError: If any of the sequences is less than 6 residues long.
        """
        logging.info("Aligning %d sequences", len(sequences))

        # Kalign refuses very short sequences; validate up front.
        for seq in sequences:
            if len(seq) >= 6:
                continue
            raise ValueError(
                "Kalign requires all sequences to be at least 6 "
                "residues long. Got %s (%d residues)." % (seq, len(seq))
            )

        with utils.tmpdir_manager(base_dir="/tmp") as query_tmp_dir:
            fasta_path = os.path.join(query_tmp_dir, "input.fasta")
            aligned_path = os.path.join(query_tmp_dir, "output.a3m")

            with open(fasta_path, "w") as handle:
                handle.write(_to_a3m(sequences))

            cmd = [
                self.binary_path,
                "-i", fasta_path,
                "-o", aligned_path,
                "-format", "fasta",
            ]

            logging.info('Launching subprocess "%s"', " ".join(cmd))
            process = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
            )

            with utils.timing("Kalign query"):
                stdout, stderr = process.communicate()
                retcode = process.wait()
                logging.info(
                    "Kalign stdout:\n%s\n\nstderr:\n%s\n",
                    stdout.decode("utf-8"),
                    stderr.decode("utf-8"),
                )

            if retcode:
                raise RuntimeError(
                    "Kalign failed\nstdout:\n%s\n\nstderr:\n%s\n"
                    % (stdout.decode("utf-8"), stderr.decode("utf-8"))
                )

            with open(aligned_path) as handle:
                return handle.read()
openfold/data/tools/utils.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Common utilities for data pipeline tools."""
17
+ import contextlib
18
+ import datetime
19
+ import logging
20
+ import shutil
21
+ import tempfile
22
+ import time
23
+ from typing import Optional
24
+
25
+
26
@contextlib.contextmanager
def tmpdir_manager(base_dir: Optional[str] = None):
    """Context manager that deletes a temporary directory on exit.

    Args:
        base_dir: Parent directory for the temporary directory; the system
            default is used when None.

    Yields:
        The path of the freshly created temporary directory.
    """
    path = tempfile.mkdtemp(dir=base_dir)
    try:
        yield path
    finally:
        # Best-effort cleanup; errors (e.g. files already removed) are ignored.
        shutil.rmtree(path, ignore_errors=True)
34
+
35
+
36
@contextlib.contextmanager
def timing(msg: str):
    """Context manager that logs the wall-clock duration of its block.

    Logs "Started <msg>" on entry and "Finished <msg> in N seconds" on a
    normal exit (no log is emitted if the block raises).
    """
    logging.info("Started %s", msg)
    start = time.perf_counter()
    yield
    elapsed = time.perf_counter() - start
    logging.info("Finished %s in %.3f seconds", msg, elapsed)
43
+
44
+
45
def to_date(s: str):
    """Parses the leading "YYYY?MM?DD" characters of `s` into a datetime.

    Only fixed character positions are read (0-3, 5-6, 8-9), so any single
    separator character (e.g. '-' or '/') is accepted.
    """
    year, month, day = int(s[0:4]), int(s[5:7]), int(s[8:10])
    return datetime.datetime(year=year, month=month, day=day)
openfold/np/__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import glob
import importlib as importlib

# Expose every sibling module (excluding this __init__) as an attribute of
# the package, so that `import openfold.np` makes e.g. `openfold.np.protein`
# available without an explicit submodule import.
_files = glob.glob(os.path.join(os.path.dirname(__file__), "*.py"))
__all__ = [
    os.path.basename(f)[:-3]
    for f in _files
    if os.path.isfile(f) and not f.endswith("__init__.py")
]
_modules = [(m, importlib.import_module("." + m, __name__)) for m in __all__]
for _m in _modules:
    globals()[_m[0]] = _m[1]

# Avoid needlessly cluttering the global namespace. `_m` only exists if the
# loop above ran at least once; an unconditional `del _m` raised NameError
# for a package with no submodules.
del _files, _modules
if "_m" in globals():
    del _m
openfold/np/protein.py ADDED
@@ -0,0 +1,441 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Protein data type."""
17
+ import dataclasses
18
+ import io
19
+ from typing import Any, Sequence, Mapping, Optional
20
+ import re
21
+ import string
22
+
23
+ from openfold.np import residue_constants
24
+ from Bio import PDB
25
+ import numpy as np
26
+
27
+
28
+ FeatureDict = Mapping[str, np.ndarray]
29
+ ModelOutput = Mapping[str, Any] # Is a nested dict.
30
+ PICO_TO_ANGSTROM = 0.01
31
+
32
@dataclasses.dataclass(frozen=True)
class Protein:
    """Protein structure representation.

    All per-residue array fields share a leading [num_res] dimension and are
    indexed consistently: row i of every field describes the same residue.
    Instances are frozen (immutable) after construction.
    """

    # Cartesian coordinates of atoms in angstroms. The atom types correspond to
    # residue_constants.atom_types, i.e. the first three are N, CA, CB.
    atom_positions: np.ndarray  # [num_res, num_atom_type, 3]

    # Amino-acid type for each residue represented as an integer between 0 and
    # 20, where 20 is 'X' (unknown residue).
    aatype: np.ndarray  # [num_res]

    # Binary float mask to indicate presence of a particular atom. 1.0 if an atom
    # is present and 0.0 if not. This should be used for loss masking.
    atom_mask: np.ndarray  # [num_res, num_atom_type]

    # Residue index as used in PDB. It is not necessarily continuous or 0-indexed.
    residue_index: np.ndarray  # [num_res]

    # B-factors, or temperature factors, of each residue (in sq. angstroms units),
    # representing the displacement of the residue from its ground truth mean
    # value.
    b_factors: np.ndarray  # [num_res, num_atom_type]

    # Chain indices for multi-chain predictions. None for single-chain
    # proteins; otherwise integer indices mapped to chain letters (A, B, ...)
    # when rendering PDB output.
    chain_index: Optional[np.ndarray] = None

    # Optional remark about the protein. Included as a REMARK comment in
    # output PDB files.
    remark: Optional[str] = None

    # Templates used to generate this protein (prediction-only). Rendered as
    # PARENT records in output PDB files.
    parents: Optional[Sequence[str]] = None

    # Chain index corresponding to each entry of `parents`.
    parents_chain_index: Optional[Sequence[int]] = None
68
+
69
+
70
def from_pdb_string(pdb_str: str, chain_id: Optional[str] = None) -> Protein:
    """Takes a PDB string and constructs a Protein object.

    WARNING: All non-standard residue types will be converted into UNK. All
    non-standard atoms will be ignored.

    Args:
        pdb_str: The contents of the pdb file
        chain_id: If None, then the pdb file must contain a single chain (which
            will be parsed). If chain_id is specified (e.g. A), then only that
            chain is parsed.

    Returns:
        A new `Protein` parsed from the pdb contents.

    Raises:
        ValueError: If the PDB contains more than one model or any insertion
            codes.
    """
    # Bio.PDB's parser expects a file path or a file handle. Wrap the raw PDB
    # text in a StringIO so its *contents* are parsed — passing the string
    # directly would make the parser treat it as a filename.
    pdb_fh = io.StringIO(pdb_str)
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure("none", pdb_fh)
    models = list(structure.get_models())
    if len(models) != 1:
        raise ValueError(
            f"Only single model PDBs are supported. Found {len(models)} models."
        )
    model = models[0]

    atom_positions = []
    aatype = []
    atom_mask = []
    residue_index = []
    chain_ids = []
    b_factors = []

    for chain in model:
        if chain_id is not None and chain.id != chain_id:
            continue
        for res in chain:
            if res.id[2] != " ":
                raise ValueError(
                    f"PDB contains an insertion code at chain {chain.id} and residue "
                    f"index {res.id[1]}. These are not supported."
                )
            res_shortname = residue_constants.restype_3to1.get(res.resname, "X")
            restype_idx = residue_constants.restype_order.get(
                res_shortname, residue_constants.restype_num
            )
            pos = np.zeros((residue_constants.atom_type_num, 3))
            mask = np.zeros((residue_constants.atom_type_num,))
            res_b_factors = np.zeros((residue_constants.atom_type_num,))
            for atom in res:
                # Ignore atoms outside the canonical atom set.
                if atom.name not in residue_constants.atom_types:
                    continue
                pos[residue_constants.atom_order[atom.name]] = atom.coord
                mask[residue_constants.atom_order[atom.name]] = 1.0
                res_b_factors[
                    residue_constants.atom_order[atom.name]
                ] = atom.bfactor
            if np.sum(mask) < 0.5:
                # If no known atom positions are reported for the residue then skip it.
                continue
            aatype.append(restype_idx)
            atom_positions.append(pos)
            atom_mask.append(mask)
            residue_index.append(res.id[1])
            chain_ids.append(chain.id)
            b_factors.append(res_b_factors)

    # Parse PARENT records (template provenance emitted for predictions) into
    # a flat parent list plus a per-parent chain index. A dedicated counter is
    # used instead of re-using (shadowing) the `chain_id` parameter.
    parents = None
    parents_chain_index = None
    if "PARENT" in pdb_str:
        parents = []
        parents_chain_index = []
        cur_chain_idx = 0
        for l in pdb_str.split("\n"):
            if "PARENT" in l:
                if "N/A" not in l:
                    parent_names = l.split()[1:]
                    parents.extend(parent_names)
                    parents_chain_index.extend(
                        [cur_chain_idx for _ in parent_names]
                    )
                cur_chain_idx += 1

    # Map chain letters (A, B, ...) to consecutive integer chain indices.
    chain_id_mapping = {cid: n for n, cid in enumerate(string.ascii_uppercase)}
    chain_index = np.array([chain_id_mapping[cid] for cid in chain_ids])

    return Protein(
        atom_positions=np.array(atom_positions),
        atom_mask=np.array(atom_mask),
        aatype=np.array(aatype),
        residue_index=np.array(residue_index),
        chain_index=chain_index,
        b_factors=np.array(b_factors),
        parents=parents,
        parents_chain_index=parents_chain_index,
    )
166
+
167
+
168
def from_proteinnet_string(proteinnet_str: str) -> Protein:
    """Parses a ProteinNet-format record into a `Protein`.

    Consumes the [PRIMARY] (sequence), [TERTIARY] (flattened N/CA/C
    coordinates) and [MASK] ('+'/'-' per-residue flags) sections; other
    sections are ignored.
    """
    tag_re = r'(\[[A-Z]+\]\n)'
    tags = [
        tag.strip() for tag in re.split(tag_re, proteinnet_str) if len(tag) > 0
    ]
    # Pair each [TAG] with the lines of its following section body.
    groups = zip(tags[0::2], [l.split('\n') for l in tags[1::2]])

    atoms = ['N', 'CA', 'C']
    aatype = None
    atom_positions = None
    atom_mask = None
    for g in groups:
        if "[PRIMARY]" == g[0]:
            # Strings are immutable: build a sanitized list of residue
            # symbols instead of assigning into the string (the previous
            # `seq[i] = 'X'` raised TypeError on any non-standard residue).
            seq = [
                c if c in residue_constants.restypes else 'X'
                for c in g[1][0].strip()
            ]
            aatype = np.array([
                residue_constants.restype_order.get(
                    res_symbol, residue_constants.restype_num
                ) for res_symbol in seq
            ])
        elif "[TERTIARY]" == g[0]:
            tertiary = []
            for axis in range(3):
                tertiary.append(list(map(float, g[1][axis].split())))
            tertiary_np = np.array(tertiary)
            atom_positions = np.zeros(
                (len(tertiary[0]) // 3, residue_constants.atom_type_num, 3)
            ).astype(np.float32)
            for i, atom in enumerate(atoms):
                atom_positions[:, residue_constants.atom_order[atom], :] = (
                    np.transpose(tertiary_np[:, i::3])
                )
            # ProteinNet coordinates are stored in picometers.
            atom_positions *= PICO_TO_ANGSTROM
        elif "[MASK]" == g[0]:
            mask = np.array(list(map({'-': 0, '+': 1}.get, g[1][0].strip())))
            atom_mask = np.zeros(
                (len(mask), residue_constants.atom_type_num,)
            ).astype(np.float32)
            for i, atom in enumerate(atoms):
                atom_mask[:, residue_constants.atom_order[atom]] = 1
            atom_mask *= mask[..., None]

    return Protein(
        atom_positions=atom_positions,
        atom_mask=atom_mask,
        aatype=aatype,
        residue_index=np.arange(len(aatype)),
        b_factors=None,
    )
219
+
220
+
221
def get_pdb_headers(prot: "Protein", chain_id: int = 0) -> Sequence[str]:
    """Builds the REMARK/PARENT header lines for one chain of `prot`.

    A PARENT line is always produced; it lists the parents assigned to
    `chain_id` (or "N/A" when there are none).
    """
    headers = []

    if prot.remark is not None:
        headers.append(f"REMARK {prot.remark}")

    parents = prot.parents
    if prot.parents_chain_index is not None:
        # Restrict to the parents belonging to the requested chain.
        parents = [
            p
            for idx, p in zip(prot.parents_chain_index, parents)
            if idx == chain_id
        ]

    if not parents:
        parents = ["N/A"]

    headers.append(f"PARENT {' '.join(parents)}")

    return headers
241
+
242
+
243
def add_pdb_headers(prot: "Protein", pdb_str: str) -> str:
    """ Add pdb headers to an existing PDB string. Useful during multi-chain
    recycling.

    Args:
        prot: Source of the REMARK/PARENT metadata.
        pdb_str: An already-rendered (possibly multi-chain) PDB string. Any
            existing PARENT/REMARK lines in it are replaced.

    Returns:
        `pdb_str` with REMARK/PARENT header lines inserted before each chain.
    """
    out_pdb_lines = []
    lines = pdb_str.split('\n')

    remark = prot.remark
    if remark is not None:
        out_pdb_lines.append(f"REMARK {remark}")

    # Build one parent list per chain index (dense from 0..max index, with
    # "N/A" filling chains that have no parents).
    parents_per_chain = None
    if prot.parents is not None and len(prot.parents) > 0:
        parents_per_chain = []
        if prot.parents_chain_index is not None:
            parent_dict = {}
            for p, i in zip(prot.parents, prot.parents_chain_index):
                parent_dict.setdefault(str(i), [])
                parent_dict[str(i)].append(p)

            max_idx = max([int(chain_idx) for chain_idx in parent_dict])
            for i in range(max_idx + 1):
                chain_parents = parent_dict.get(str(i), ["N/A"])
                parents_per_chain.append(chain_parents)
        else:
            parents_per_chain.append(prot.parents)
    else:
        parents_per_chain = [["N/A"]]

    make_parent_line = lambda p: f"PARENT {' '.join(p)}"

    out_pdb_lines.append(make_parent_line(parents_per_chain[0]))

    chain_counter = 0
    for i, l in enumerate(lines):
        # Drop any pre-existing PARENT/REMARK lines; keep everything else.
        if "PARENT" not in l and "REMARK" not in l:
            out_pdb_lines.append(l)
        # A TER record ends a chain: emit the next chain's PARENT line unless
        # the structure ends here. The i + 1 bound guards against an input
        # whose final line is "TER" (previously an IndexError).
        if "TER" in l and i + 1 < len(lines) and "END" not in lines[i + 1]:
            chain_counter += 1
            if not chain_counter >= len(parents_per_chain):
                chain_parents = parents_per_chain[chain_counter]
            else:
                chain_parents = ["N/A"]

            out_pdb_lines.append(make_parent_line(chain_parents))

    return '\n'.join(out_pdb_lines)
291
+
292
+
293
def to_pdb(prot: Protein) -> str:
    """Converts a `Protein` instance to a PDB string.

    Args:
        prot: The protein to convert to PDB.

    Returns:
        PDB string.

    Raises:
        ValueError: If `prot.aatype` contains an out-of-range residue type.
    """
    restypes = residue_constants.restypes + ["X"]
    res_1to3 = lambda r: residue_constants.restype_1to3.get(restypes[r], "UNK")
    atom_types = residue_constants.atom_types

    pdb_lines = []

    atom_mask = prot.atom_mask
    aatype = prot.aatype
    atom_positions = prot.atom_positions
    residue_index = prot.residue_index.astype(np.int32)
    b_factors = prot.b_factors
    chain_index = prot.chain_index

    if np.any(aatype > residue_constants.restype_num):
        raise ValueError("Invalid aatypes.")

    headers = get_pdb_headers(prot)
    if len(headers) > 0:
        pdb_lines.extend(headers)

    n = aatype.shape[0]
    atom_index = 1
    prev_chain_index = 0
    chain_tags = string.ascii_uppercase
    # Add all atom sites.
    for i in range(n):
        res_name_3 = res_1to3(aatype[i])
        # Resolve the chain tag once per residue. It is needed by both ATOM
        # and TER records — including residues whose atoms are all masked —
        # so it is computed here instead of being duplicated inside the atom
        # loop (the masked-atom skip branch previously recomputed it).
        chain_tag = "A"
        if chain_index is not None:
            chain_tag = chain_tags[chain_index[i]]

        for atom_name, pos, mask, b_factor in zip(
            atom_types, atom_positions[i], atom_mask[i], b_factors[i]
        ):
            # Skip atoms that are absent from this residue.
            if mask < 0.5:
                continue

            record_type = "ATOM"
            name = atom_name if len(atom_name) == 4 else f" {atom_name}"
            alt_loc = ""
            insertion_code = ""
            occupancy = 1.00
            element = atom_name[
                0
            ]  # Protein supports only C, N, O, S, this works.
            charge = ""

            # PDB is a columnar format, every space matters here!
            atom_line = (
                f"{record_type:<6}{atom_index:>5} {name:<4}{alt_loc:>1}"
                f"{res_name_3:>3} {chain_tag:>1}"
                f"{residue_index[i]:>4}{insertion_code:>1}   "
                f"{pos[0]:>8.3f}{pos[1]:>8.3f}{pos[2]:>8.3f}"
                f"{occupancy:>6.2f}{b_factor:>6.2f}          "
                f"{element:>2}{charge:>2}"
            )
            pdb_lines.append(atom_line)
            atom_index += 1

        should_terminate = (i == n - 1)
        if chain_index is not None:
            if i != n - 1 and chain_index[i + 1] != prev_chain_index:
                should_terminate = True
                prev_chain_index = chain_index[i + 1]

        if should_terminate:
            # Close the chain.
            chain_end = "TER"
            chain_termination_line = (
                f"{chain_end:<6}{atom_index:>5}      "
                f"{res_1to3(aatype[i]):>3} "
                f"{chain_tag:>1}{residue_index[i]:>4}"
            )
            pdb_lines.append(chain_termination_line)
            atom_index += 1

            if i != n - 1:
                # "prev" is a misnomer here. This happens at the beginning of
                # each new chain.
                pdb_lines.extend(get_pdb_headers(prot, prev_chain_index))

    pdb_lines.append("END")
    pdb_lines.append("")
    return "\n".join(pdb_lines)
389
+
390
+
391
def ideal_atom_mask(prot: Protein) -> np.ndarray:
    """Computes an ideal atom mask.

    `Protein.atom_mask` typically is defined according to the atoms that are
    reported in the PDB. This function computes a mask according to heavy atoms
    that should be present in the given sequence of amino acids.

    Args:
        prot: `Protein` whose fields are `numpy.ndarray` objects.

    Returns:
        An ideal atom mask.
    """
    # Index the canonical per-residue-type atom mask table by residue type.
    return residue_constants.STANDARD_ATOM_MASK[prot.aatype]
405
+
406
+
407
def from_prediction(
    features: FeatureDict,
    result: ModelOutput,
    b_factors: Optional[np.ndarray] = None,
    chain_index: Optional[np.ndarray] = None,
    remark: Optional[str] = None,
    parents: Optional[Sequence[str]] = None,
    parents_chain_index: Optional[Sequence[int]] = None
) -> Protein:
    """Assembles a protein from a prediction.

    Args:
        features: Dictionary holding model inputs. Must contain "aatype" and
            "residue_index".
        result: Dictionary holding model outputs. Must contain
            "final_atom_positions" and "final_atom_mask".
        b_factors: (Optional) B-factors to use for the protein. Defaults to
            all zeros with the shape of the predicted atom mask.
        chain_index: (Optional) Chain indices for multi-chain predictions
        remark: (Optional) Remark about the prediction
        parents: (Optional) List of template names
        parents_chain_index: (Optional) Chain index corresponding to each
            entry of `parents`.
    Returns:
        A protein instance.
    """
    if b_factors is None:
        b_factors = np.zeros_like(result["final_atom_mask"])

    return Protein(
        aatype=features["aatype"],
        atom_positions=result["final_atom_positions"],
        atom_mask=result["final_atom_mask"],
        # Shift the model's 0-based residue index to PDB's 1-based numbering.
        residue_index=features["residue_index"] + 1,
        b_factors=b_factors,
        chain_index=chain_index,
        remark=remark,
        parents=parents,
        parents_chain_index=parents_chain_index,
    )
openfold/np/relax/__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Dynamically import every sibling module in this package so that each one is
# reachable as an attribute of the package immediately after import.
import os
import glob
import importlib as importlib

# All *.py files living next to this __init__, minus the __init__ itself.
_files = glob.glob(os.path.join(os.path.dirname(__file__), "*.py"))
__all__ = [
    os.path.basename(f)[:-3]
    for f in _files
    if os.path.isfile(f) and not f.endswith("__init__.py")
]
# Import each discovered module relative to this package ("." + name) and
# bind it into the package namespace under its bare module name.
_modules = [(m, importlib.import_module("." + m, __name__)) for m in __all__]
for _m in _modules:
    globals()[_m[0]] = _m[1]

# Avoid needlessly cluttering the global namespace
del _files, _m, _modules
openfold/np/relax/amber_minimize.py ADDED
@@ -0,0 +1,625 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Restrained Amber Minimization of a structure."""
17
+
18
+ import io
19
+ import time
20
+ from typing import Collection, Optional, Sequence
21
+
22
+ from absl import logging
23
+ from openfold.np import (
24
+ protein,
25
+ residue_constants,
26
+ )
27
+ import openfold.utils.loss as loss
28
+ from openfold.np.relax import cleanup, utils
29
+ import ml_collections
30
+ import numpy as np
31
+ from simtk import openmm
32
+ from simtk import unit
33
+ from simtk.openmm import app as openmm_app
34
+ from simtk.openmm.app.internal.pdbstructure import PdbStructure
35
+
36
+ ENERGY = unit.kilocalories_per_mole
37
+ LENGTH = unit.angstroms
38
+
39
+
40
def will_restrain(atom: openmm_app.Atom, rset: str) -> bool:
    """Returns True if the atom will be restrained by the given restraint set.

    Args:
        atom: The OpenMM atom to test.
        rset: Restraint-set name, one of "non_hydrogen" or "c_alpha".

    Returns:
        True if the atom belongs to the restraint set, False otherwise.

    Raises:
        ValueError: If `rset` is not a recognized restraint set.
    """
    if rset == "non_hydrogen":
        # Restrain every heavy (non-hydrogen) atom.
        return atom.element.name != "hydrogen"
    elif rset == "c_alpha":
        # Restrain only backbone alpha carbons.
        return atom.name == "CA"
    # Previously an unknown restraint set fell through and returned None,
    # which is silently falsy. Fail loudly so misconfiguration is caught.
    raise ValueError(f"Unknown restraint set: {rset}")
47
+
48
+
49
def _add_restraints(
    system: openmm.System,
    reference_pdb: openmm_app.PDBFile,
    stiffness: unit.Unit,
    rset: str,
    exclude_residues: Sequence[int],
):
    """Adds a harmonic potential that restrains the system to a structure.

    Args:
        system: OpenMM system to modify in place (a force is appended).
        reference_pdb: Structure providing the anchor positions.
        stiffness: Spring constant `k` of the restraint (energy / length^2).
        rset: Which atoms to restrain; "non_hydrogen" or "c_alpha".
        exclude_residues: Zero-indexed residues whose atoms are left free.
    """
    assert rset in ["non_hydrogen", "c_alpha"]

    # Isotropic harmonic well centered at each restrained atom's reference
    # position (x0, y0, z0), with shared global stiffness k.
    force = openmm.CustomExternalForce(
        "0.5 * k * ((x-x0)^2 + (y-y0)^2 + (z-z0)^2)"
    )
    force.addGlobalParameter("k", stiffness)
    for p in ["x0", "y0", "z0"]:
        force.addPerParticleParameter(p)

    for i, atom in enumerate(reference_pdb.topology.atoms()):
        if atom.residue.index in exclude_residues:
            continue
        if will_restrain(atom, rset):
            # Anchor particle i at its current reference coordinates.
            force.addParticle(i, reference_pdb.positions[i])
    logging.info(
        "Restraining %d / %d particles.",
        force.getNumParticles(),
        system.getNumParticles(),
    )
    system.addForce(force)
77
+
78
+
79
def _openmm_minimize(
    pdb_str: str,
    max_iterations: int,
    tolerance: unit.Unit,
    stiffness: unit.Unit,
    restraint_set: str,
    exclude_residues: Sequence[int],
    use_gpu: bool,
):
    """Minimize energy via openmm.

    Args:
        pdb_str: PDB-format structure to minimize.
        max_iterations: Maximum L-BFGS iterations; 0 means no limit.
        tolerance: Energy tolerance for convergence.
        stiffness: Restraint spring constant; restraints are only added when
            it is strictly positive.
        restraint_set: Which atoms to restrain ("non_hydrogen" / "c_alpha").
        exclude_residues: Zero-indexed residues excluded from restraints.
        use_gpu: Use the CUDA platform instead of CPU.

    Returns:
        Dict with pre/post energies ("einit"/"efinal"), pre/post positions
        ("posinit"/"pos") and the minimized structure as PDB ("min_pdb").
    """

    pdb_file = io.StringIO(pdb_str)
    pdb = openmm_app.PDBFile(pdb_file)

    force_field = openmm_app.ForceField("amber99sb.xml")
    constraints = openmm_app.HBonds
    system = force_field.createSystem(pdb.topology, constraints=constraints)
    if stiffness > 0 * ENERGY / (LENGTH ** 2):
        _add_restraints(system, pdb, stiffness, restraint_set, exclude_residues)

    # The integrator is required by the Simulation API but no dynamics are
    # run here — only energy minimization.
    integrator = openmm.LangevinIntegrator(0, 0.01, 0.0)
    platform = openmm.Platform.getPlatformByName("CUDA" if use_gpu else "CPU")
    simulation = openmm_app.Simulation(
        pdb.topology, system, integrator, platform
    )
    simulation.context.setPositions(pdb.positions)

    ret = {}
    # Snapshot state before minimization.
    state = simulation.context.getState(getEnergy=True, getPositions=True)
    ret["einit"] = state.getPotentialEnergy().value_in_unit(ENERGY)
    ret["posinit"] = state.getPositions(asNumpy=True).value_in_unit(LENGTH)
    simulation.minimizeEnergy(maxIterations=max_iterations, tolerance=tolerance)
    # Snapshot state after minimization.
    state = simulation.context.getState(getEnergy=True, getPositions=True)
    ret["efinal"] = state.getPotentialEnergy().value_in_unit(ENERGY)
    ret["pos"] = state.getPositions(asNumpy=True).value_in_unit(LENGTH)
    ret["min_pdb"] = _get_pdb_string(simulation.topology, state.getPositions())
    return ret
116
+
117
+
118
def _get_pdb_string(topology: openmm_app.Topology, positions: unit.Quantity):
    """Serialize an OpenMM topology plus positions to a PDB-format string."""
    buffer = io.StringIO()
    try:
        openmm_app.PDBFile.writeFile(topology, positions, buffer)
        return buffer.getvalue()
    finally:
        buffer.close()
123
+
124
+
125
def _check_cleaned_atoms(pdb_cleaned_string: str, pdb_ref_string: str):
    """Checks that no atom positions have been altered by cleaning.

    Args:
        pdb_cleaned_string: PDB string produced by the cleaning step.
        pdb_ref_string: Original PDB string to compare against.

    Raises:
        ValueError: If any atom present in both structures changed position.
    """
    cleaned = openmm_app.PDBFile(io.StringIO(pdb_cleaned_string))
    reference = openmm_app.PDBFile(io.StringIO(pdb_ref_string))

    cl_xyz = np.array(cleaned.getPositions().value_in_unit(LENGTH))
    ref_xyz = np.array(reference.getPositions().value_in_unit(LENGTH))

    # Residues are compared pairwise in order; within a residue pair only
    # atoms sharing a name are compared, since cleaning may add atoms.
    for ref_res, cl_res in zip(
        reference.topology.residues(), cleaned.topology.residues()
    ):
        assert ref_res.name == cl_res.name
        for rat in ref_res.atoms():
            for cat in cl_res.atoms():
                if cat.name == rat.name:
                    if not np.array_equal(
                        cl_xyz[cat.index], ref_xyz[rat.index]
                    ):
                        raise ValueError(
                            f"Coordinates of cleaned atom {cat} do not match "
                            f"coordinates of reference atom {rat}."
                        )
147
+
148
+
149
def _check_residues_are_well_defined(prot: protein.Protein):
    """Raise if any residue in the protein has no resolved atoms at all."""
    # Count masked-in atoms per residue; a residue with zero atoms cannot
    # be handled by the Amber minimization pipeline.
    atoms_per_residue = prot.atom_mask.sum(axis=-1)
    if np.any(atoms_per_residue == 0):
        raise ValueError(
            "Amber minimization can only be performed on proteins with"
            " well-defined residues. This protein contains at least"
            " one residue with no atoms."
        )
157
+
158
+
159
def _check_atom_mask_is_ideal(prot):
    """Sanity-check the atom mask is ideal, up to a possible OXT."""
    # Compare the protein's observed mask against the mask implied by its
    # sequence; the helper tolerates differences at terminal atoms only.
    utils.assert_equal_nonterminal_atom_types(
        prot.atom_mask, protein.ideal_atom_mask(prot)
    )
164
+
165
+
166
def clean_protein(prot: protein.Protein, checks: bool = True):
    """Adds missing atoms to Protein instance.

    Args:
        prot: A `protein.Protein` instance.
        checks: A `bool` specifying whether to add additional checks to the cleaning
            process.

    Returns:
        pdb_string: A string of the cleaned protein.
    """
    _check_atom_mask_is_ideal(prot)

    # Clean pdb: serialize, run pdbfixer-based fixes, then structure-level
    # fixes (clean_structure mutates pdb_structure in place).
    prot_pdb_string = protein.to_pdb(prot)
    pdb_file = io.StringIO(prot_pdb_string)
    alterations_info = {}
    fixed_pdb = cleanup.fix_pdb(pdb_file, alterations_info)
    fixed_pdb_file = io.StringIO(fixed_pdb)
    pdb_structure = PdbStructure(fixed_pdb_file)
    cleanup.clean_structure(pdb_structure, alterations_info)

    logging.info("alterations info: %s", alterations_info)

    # Write pdb file of cleaned structure.
    as_file = openmm_app.PDBFile(pdb_structure)
    pdb_string = _get_pdb_string(as_file.getTopology(), as_file.getPositions())
    if checks:
        # Verify cleaning did not move any atom that existed before.
        _check_cleaned_atoms(pdb_string, prot_pdb_string)

    # Prepend any PDB header lines derived from the protein's metadata,
    # which are not carried through the OpenMM round-trip.
    headers = protein.get_pdb_headers(prot)
    if(len(headers) > 0):
        pdb_string = '\n'.join(['\n'.join(headers), pdb_string])

    return pdb_string
201
+
202
+
203
def make_atom14_positions(prot):
    """Constructs denser atom positions (14 dimensions instead of 37).

    Builds per-residue mappings between the sparse 37-slot atom layout and
    the dense 14-slot layout, gathers ground-truth positions/masks into the
    dense layout, and adds alternative ("renamed") ground truth for residues
    with ambiguous atom naming.

    Args:
        prot: Feature dict with at least "aatype", "all_atom_positions" and
            "all_atom_mask". Mutated in place.

    Returns:
        The same dict, extended with atom14/atom37 mapping, mask, position
        and ambiguity features.
    """
    restype_atom14_to_atom37 = []  # mapping (restype, atom14) --> atom37
    restype_atom37_to_atom14 = []  # mapping (restype, atom37) --> atom14
    restype_atom14_mask = []

    for rt in residue_constants.restypes:
        atom_names = residue_constants.restype_name_to_atom14_names[
            residue_constants.restype_1to3[rt]
        ]

        restype_atom14_to_atom37.append(
            [
                (residue_constants.atom_order[name] if name else 0)
                for name in atom_names
            ]
        )

        atom_name_to_idx14 = {name: i for i, name in enumerate(atom_names)}
        restype_atom37_to_atom14.append(
            [
                (atom_name_to_idx14[name] if name in atom_name_to_idx14 else 0)
                for name in residue_constants.atom_types
            ]
        )

        # Empty atom names are padding slots in the 14-atom layout.
        restype_atom14_mask.append(
            [(1.0 if name else 0.0) for name in atom_names]
        )

    # Add dummy mapping for restype 'UNK'.
    restype_atom14_to_atom37.append([0] * 14)
    restype_atom37_to_atom14.append([0] * 37)
    restype_atom14_mask.append([0.0] * 14)

    restype_atom14_to_atom37 = np.array(
        restype_atom14_to_atom37, dtype=np.int32
    )
    restype_atom37_to_atom14 = np.array(
        restype_atom37_to_atom14, dtype=np.int32
    )
    restype_atom14_mask = np.array(restype_atom14_mask, dtype=np.float32)

    # Create the mapping for (residx, atom14) --> atom37, i.e. an array
    # with shape (num_res, 14) containing the atom37 indices for this protein.
    residx_atom14_to_atom37 = restype_atom14_to_atom37[prot["aatype"]]
    residx_atom14_mask = restype_atom14_mask[prot["aatype"]]

    # Create a mask for known ground truth positions.
    residx_atom14_gt_mask = residx_atom14_mask * np.take_along_axis(
        prot["all_atom_mask"], residx_atom14_to_atom37, axis=1
    ).astype(np.float32)

    # Gather the ground truth positions.
    residx_atom14_gt_positions = residx_atom14_gt_mask[:, :, None] * (
        np.take_along_axis(
            prot["all_atom_positions"],
            residx_atom14_to_atom37[..., None],
            axis=1,
        )
    )

    prot["atom14_atom_exists"] = residx_atom14_mask
    prot["atom14_gt_exists"] = residx_atom14_gt_mask
    prot["atom14_gt_positions"] = residx_atom14_gt_positions

    prot["residx_atom14_to_atom37"] = residx_atom14_to_atom37.astype(np.int64)

    # Create the gather indices for mapping back.
    residx_atom37_to_atom14 = restype_atom37_to_atom14[prot["aatype"]]
    prot["residx_atom37_to_atom14"] = residx_atom37_to_atom14.astype(np.int64)

    # Create the corresponding mask.
    restype_atom37_mask = np.zeros([21, 37], dtype=np.float32)
    for restype, restype_letter in enumerate(residue_constants.restypes):
        restype_name = residue_constants.restype_1to3[restype_letter]
        atom_names = residue_constants.residue_atoms[restype_name]
        for atom_name in atom_names:
            atom_type = residue_constants.atom_order[atom_name]
            restype_atom37_mask[restype, atom_type] = 1

    residx_atom37_mask = restype_atom37_mask[prot["aatype"]]
    prot["atom37_atom_exists"] = residx_atom37_mask

    # As the atom naming is ambiguous for 7 of the 20 amino acids, provide
    # alternative ground truth coordinates where the naming is swapped
    restype_3 = [
        residue_constants.restype_1to3[res]
        for res in residue_constants.restypes
    ]
    restype_3 += ["UNK"]

    # Matrices for renaming ambiguous atoms.
    all_matrices = {res: np.eye(14, dtype=np.float32) for res in restype_3}
    for resname, swap in residue_constants.residue_atom_renaming_swaps.items():
        correspondences = np.arange(14)
        for source_atom_swap, target_atom_swap in swap.items():
            source_index = residue_constants.restype_name_to_atom14_names[
                resname
            ].index(source_atom_swap)
            target_index = residue_constants.restype_name_to_atom14_names[
                resname
            ].index(target_atom_swap)
            correspondences[source_index] = target_index
            correspondences[target_index] = source_index
        # Turn the index correspondence into a 14x14 permutation matrix.
        renaming_matrix = np.zeros((14, 14), dtype=np.float32)
        for index, correspondence in enumerate(correspondences):
            renaming_matrix[index, correspondence] = 1.0
        all_matrices[resname] = renaming_matrix.astype(np.float32)
    renaming_matrices = np.stack(
        [all_matrices[restype] for restype in restype_3]
    )

    # Pick the transformation matrices for the given residue sequence
    # shape (num_res, 14, 14).
    renaming_transform = renaming_matrices[prot["aatype"]]

    # Apply it to the ground truth positions. shape (num_res, 14, 3).
    alternative_gt_positions = np.einsum(
        "rac,rab->rbc", residx_atom14_gt_positions, renaming_transform
    )
    prot["atom14_alt_gt_positions"] = alternative_gt_positions

    # Create the mask for the alternative ground truth (differs from the
    # ground truth mask, if only one of the atoms in an ambiguous pair has a
    # ground truth position).
    alternative_gt_mask = np.einsum(
        "ra,rab->rb", residx_atom14_gt_mask, renaming_transform
    )

    prot["atom14_alt_gt_exists"] = alternative_gt_mask

    # Create an ambiguous atoms mask. shape: (21, 14).
    restype_atom14_is_ambiguous = np.zeros((21, 14), dtype=np.float32)
    for resname, swap in residue_constants.residue_atom_renaming_swaps.items():
        for atom_name1, atom_name2 in swap.items():
            restype = residue_constants.restype_order[
                residue_constants.restype_3to1[resname]
            ]
            atom_idx1 = residue_constants.restype_name_to_atom14_names[
                resname
            ].index(atom_name1)
            atom_idx2 = residue_constants.restype_name_to_atom14_names[
                resname
            ].index(atom_name2)
            restype_atom14_is_ambiguous[restype, atom_idx1] = 1
            restype_atom14_is_ambiguous[restype, atom_idx2] = 1

    # From this create an ambiguous_mask for the given sequence.
    prot["atom14_atom_is_ambiguous"] = restype_atom14_is_ambiguous[
        prot["aatype"]
    ]

    return prot
357
+
358
+
359
def find_violations(prot_np: protein.Protein):
    """Analyzes a protein and returns structural violation information.

    Args:
        prot_np: A protein.

    Returns:
        violations: A `dict` of structure components with structural violations.
        violation_metrics: A `dict` of violation metrics.
    """
    # Build the minimal feature batch the loss functions expect.
    batch = {
        "aatype": prot_np.aatype,
        "all_atom_positions": prot_np.atom_positions.astype(np.float32),
        "all_atom_mask": prot_np.atom_mask.astype(np.float32),
        "residue_index": prot_np.residue_index,
    }

    batch["seq_mask"] = np.ones_like(batch["aatype"], np.float32)
    # Adds atom14 features (incl. atom14_gt_positions used below).
    batch = make_atom14_positions(batch)

    violations = loss.find_structural_violations_np(
        batch=batch,
        atom14_pred_positions=batch["atom14_gt_positions"],
        config=ml_collections.ConfigDict(
            {
                "violation_tolerance_factor": 12,  # Taken from model config.
                "clash_overlap_tolerance": 1.5,  # Taken from model config.
            }
        ),
    )
    violation_metrics = loss.compute_violation_metrics_np(
        batch=batch,
        atom14_pred_positions=batch["atom14_gt_positions"],
        violations=violations,
    )

    return violations, violation_metrics
396
+
397
+
398
def get_violation_metrics(prot: protein.Protein):
    """Computes violation and alignment metrics."""
    violations, metrics = find_violations(prot)
    # Indices of residues that participate in at least one violation.
    offending = np.flatnonzero(
        violations["total_per_residue_violations_mask"]
    )
    metrics["residue_violations"] = offending
    metrics["num_residue_violations"] = len(offending)
    metrics["structural_violations"] = violations
    return metrics
409
+
410
+
411
def _run_one_iteration(
    *,
    pdb_string: str,
    max_iterations: int,
    tolerance: float,
    stiffness: float,
    restraint_set: str,
    max_attempts: int,
    exclude_residues: Optional[Collection[int]] = None,
    use_gpu: bool,
):
    """Runs the minimization pipeline.

    Args:
        pdb_string: A pdb string.
        max_iterations: An `int` specifying the maximum number of L-BFGS iterations.
            A value of 0 specifies no limit.
        tolerance: kcal/mol, the energy tolerance of L-BFGS.
        stiffness: kcal/mol A**2, spring constant of heavy atom restraining
            potential.
        restraint_set: The set of atoms to restrain.
        max_attempts: The maximum number of minimization attempts.
        exclude_residues: An optional list of zero-indexed residues to exclude from
            restraints.
        use_gpu: Whether to run relaxation on GPU

    Returns:
        A `dict` of minimization info.

    Raises:
        ValueError: If minimization fails `max_attempts` times; chained to
            the last underlying OpenMM error for diagnostics.
    """
    exclude_residues = exclude_residues or []

    # Assign physical dimensions.
    tolerance = tolerance * ENERGY
    stiffness = stiffness * ENERGY / (LENGTH ** 2)

    start = time.perf_counter()
    minimized = False
    attempts = 0
    last_exception = None
    while not minimized and attempts < max_attempts:
        attempts += 1
        try:
            logging.info(
                "Minimizing protein, attempt %d of %d.", attempts, max_attempts
            )
            ret = _openmm_minimize(
                pdb_string,
                max_iterations=max_iterations,
                tolerance=tolerance,
                stiffness=stiffness,
                restraint_set=restraint_set,
                exclude_residues=exclude_residues,
                use_gpu=use_gpu,
            )
            minimized = True
        except Exception as e:  # pylint: disable=broad-except
            # Minimization can fail transiently (e.g. numerical issues in
            # OpenMM); remember the error and retry up to max_attempts.
            # (A stray debug print(e) was removed here — logging suffices.)
            last_exception = e
            logging.info(e)
    if not minimized:
        # Chain the last failure so callers see the root cause.
        raise ValueError(
            f"Minimization failed after {max_attempts} attempts."
        ) from last_exception
    ret["opt_time"] = time.perf_counter() - start
    ret["min_attempts"] = attempts
    return ret
472
+
473
+
474
def run_pipeline(
    prot: protein.Protein,
    stiffness: float,
    use_gpu: bool,
    max_outer_iterations: int = 1,
    place_hydrogens_every_iteration: bool = True,
    max_iterations: int = 0,
    tolerance: float = 2.39,
    restraint_set: str = "non_hydrogen",
    max_attempts: int = 100,
    checks: bool = True,
    exclude_residues: Optional[Sequence[int]] = None,
):
    """Run iterative amber relax.

    Successive relax iterations are performed until all violations have been
    resolved. Each iteration involves a restrained Amber minimization, with
    restraint exclusions determined by violation-participating residues.

    Args:
        prot: A protein to be relaxed.
        stiffness: kcal/mol A**2, the restraint stiffness.
        use_gpu: Whether to run on GPU
        max_outer_iterations: The maximum number of iterative minimization.
        place_hydrogens_every_iteration: Whether hydrogens are re-initialized
            prior to every minimization.
        max_iterations: An `int` specifying the maximum number of L-BFGS steps
            per relax iteration. A value of 0 specifies no limit.
        tolerance: kcal/mol, the energy tolerance of L-BFGS.
            The default value is the OpenMM default.
        restraint_set: The set of atoms to restrain.
        max_attempts: The maximum number of minimization attempts per iteration.
        checks: Whether to perform cleaning checks.
        exclude_residues: An optional list of zero-indexed residues to exclude from
            restraints.

    Returns:
        out: A dictionary of output values.
    """

    # `protein.to_pdb` will strip any poorly-defined residues so we need to
    # perform this check before `clean_protein`.
    _check_residues_are_well_defined(prot)
    pdb_string = clean_protein(prot, checks=checks)

    # We keep the input around to restore metadata deleted by the relaxer
    input_prot = prot

    exclude_residues = exclude_residues or []
    exclude_residues = set(exclude_residues)
    # Start with "infinitely many" violations so the loop runs at least once.
    violations = np.inf
    iteration = 0

    while violations > 0 and iteration < max_outer_iterations:
        ret = _run_one_iteration(
            pdb_string=pdb_string,
            exclude_residues=exclude_residues,
            max_iterations=max_iterations,
            tolerance=tolerance,
            stiffness=stiffness,
            restraint_set=restraint_set,
            max_attempts=max_attempts,
            use_gpu=use_gpu,
        )

        # Re-attach header lines derived from the (original) protein, which
        # the minimizer's PDB round-trip does not preserve.
        headers = protein.get_pdb_headers(prot)
        if(len(headers) > 0):
            ret["min_pdb"] = '\n'.join(['\n'.join(headers), ret["min_pdb"]])

        prot = protein.from_pdb_string(ret["min_pdb"])
        if place_hydrogens_every_iteration:
            pdb_string = clean_protein(prot, checks=True)
        else:
            pdb_string = ret["min_pdb"]
        ret.update(get_violation_metrics(prot))
        ret.update(
            {
                "num_exclusions": len(exclude_residues),
                "iteration": iteration,
            }
        )
        violations = ret["violations_per_residue"]
        # Residues that violated this round are excluded from restraints in
        # the next round, letting the minimizer move them freely.
        exclude_residues = exclude_residues.union(ret["residue_violations"])

        logging.info(
            "Iteration completed: Einit %.2f Efinal %.2f Time %.2f s "
            "num residue violations %d num residue exclusions %d ",
            ret["einit"],
            ret["efinal"],
            ret["opt_time"],
            ret["num_residue_violations"],
            ret["num_exclusions"],
        )
        iteration += 1
    return ret
569
+
570
+
571
def get_initial_energies(
    pdb_strs: Sequence[str],
    stiffness: float = 0.0,
    restraint_set: str = "non_hydrogen",
    exclude_residues: Optional[Sequence[int]] = None,
):
    """Returns initial potential energies for a sequence of PDBs.

    Assumes the input PDBs are ready for minimization, and all have the same
    topology.
    Allows time to be saved by not pdbfixing / rebuilding the system.

    Args:
        pdb_strs: List of PDB strings.
        stiffness: kcal/mol A**2, spring constant of heavy atom restraining
            potential.
        restraint_set: Which atom types to restrain.
        exclude_residues: An optional list of zero-indexed residues to exclude from
            restraints.

    Returns:
        A list of initial energies in the same order as pdb_strs.
    """
    exclude_residues = exclude_residues or []

    openmm_pdbs = [
        openmm_app.PDBFile(PdbStructure(io.StringIO(p))) for p in pdb_strs
    ]
    # Build the force field / system / simulation once from the first
    # structure; all inputs are assumed to share its topology.
    force_field = openmm_app.ForceField("amber99sb.xml")
    system = force_field.createSystem(
        openmm_pdbs[0].topology, constraints=openmm_app.HBonds
    )
    stiffness = stiffness * ENERGY / (LENGTH ** 2)
    if stiffness > 0 * ENERGY / (LENGTH ** 2):
        _add_restraints(
            system, openmm_pdbs[0], stiffness, restraint_set, exclude_residues
        )
    simulation = openmm_app.Simulation(
        openmm_pdbs[0].topology,
        system,
        openmm.LangevinIntegrator(0, 0.01, 0.0),
        openmm.Platform.getPlatformByName("CPU"),
    )
    energies = []
    for pdb in openmm_pdbs:
        try:
            # Only positions change between inputs; reuse the same context.
            simulation.context.setPositions(pdb.positions)
            state = simulation.context.getState(getEnergy=True)
            energies.append(state.getPotentialEnergy().value_in_unit(ENERGY))
        except Exception as e:  # pylint: disable=broad-except
            # Best-effort: a failed evaluation contributes a sentinel large
            # value instead of aborting the whole batch.
            logging.error(
                "Error getting initial energy, returning large value %s", e
            )
            energies.append(unit.Quantity(1e20, ENERGY))
    return energies
openfold/np/relax/cleanup.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Cleans up a PDB file using pdbfixer in preparation for OpenMM simulations.
16
+
17
+ fix_pdb uses a third-party tool. We also support fixing some additional edge
18
+ cases like removing chains of length one (see clean_structure).
19
+ """
20
+ import io
21
+
22
+ import pdbfixer
23
+ from simtk.openmm import app
24
+ from simtk.openmm.app import element
25
+
26
+
27
def fix_pdb(pdbfile, alterations_info):
    """Apply pdbfixer to the contents of a PDB file; return a PDB string result.

    1) Replaces nonstandard residues.
    2) Removes heterogens (non protein residues) including water.
    3) Adds missing residues and missing atoms within existing residues.
    4) Adds hydrogens assuming pH=7.0.
    5) KeepIds is currently true, so the fixer must keep the existing chain and
    residue identifiers. This will fail for some files in wider PDB that have
    invalid IDs.

    Args:
        pdbfile: Input PDB file handle.
        alterations_info: A dict that will store details of changes made.

    Returns:
        A PDB string representing the fixed structure.
    """
    fixer = pdbfixer.PDBFixer(pdbfile=pdbfile)
    fixer.findNonstandardResidues()
    alterations_info["nonstandard_residues"] = fixer.nonstandardResidues
    fixer.replaceNonstandardResidues()
    _remove_heterogens(fixer, alterations_info, keep_water=False)
    fixer.findMissingResidues()
    alterations_info["missing_residues"] = fixer.missingResidues
    fixer.findMissingAtoms()
    alterations_info["missing_heavy_atoms"] = fixer.missingAtoms
    alterations_info["missing_terminals"] = fixer.missingTerminals
    # NOTE(review): seed=0 presumably makes placement of added atoms
    # deterministic — confirm against the pdbfixer API docs.
    fixer.addMissingAtoms(seed=0)
    fixer.addMissingHydrogens()
    out_handle = io.StringIO()
    # keepIds=True preserves the original chain/residue identifiers.
    app.PDBFile.writeFile(
        fixer.topology, fixer.positions, out_handle, keepIds=True
    )
    return out_handle.getvalue()
62
+
63
+
64
def clean_structure(pdb_structure, alterations_info):
    """Applies additional fixes to an OpenMM structure, to handle edge cases.

    Args:
        pdb_structure: An OpenMM structure to modify and fix.
        alterations_info: A dict that will store details of changes made.
    """
    # Apply each structure-level fix in order; every fix records what it
    # changed into alterations_info.
    for fix in (_replace_met_se, _remove_chains_of_length_one):
        fix(pdb_structure, alterations_info)
73
+
74
+
75
+ def _remove_heterogens(fixer, alterations_info, keep_water):
76
+ """Removes the residues that Pdbfixer considers to be heterogens.
77
+
78
+ Args:
79
+ fixer: A Pdbfixer instance.
80
+ alterations_info: A dict that will store details of changes made.
81
+ keep_water: If True, water (HOH) is not considered to be a heterogen.
82
+ """
83
+ initial_resnames = set()
84
+ for chain in fixer.topology.chains():
85
+ for residue in chain.residues():
86
+ initial_resnames.add(residue.name)
87
+ fixer.removeHeterogens(keepWater=keep_water)
88
+ final_resnames = set()
89
+ for chain in fixer.topology.chains():
90
+ for residue in chain.residues():
91
+ final_resnames.add(residue.name)
92
+ alterations_info["removed_heterogens"] = initial_resnames.difference(
93
+ final_resnames
94
+ )
95
+
96
+
97
+ def _replace_met_se(pdb_structure, alterations_info):
98
+ """Replace the Se in any MET residues that were not marked as modified."""
99
+ modified_met_residues = []
100
+ for res in pdb_structure.iter_residues():
101
+ name = res.get_name_with_spaces().strip()
102
+ if name == "MET":
103
+ s_atom = res.get_atom("SD")
104
+ if s_atom.element_symbol == "Se":
105
+ s_atom.element_symbol = "S"
106
+ s_atom.element = element.get_by_symbol("S")
107
+ modified_met_residues.append(s_atom.residue_number)
108
+ alterations_info["Se_in_MET"] = modified_met_residues
109
+
110
+
111
+ def _remove_chains_of_length_one(pdb_structure, alterations_info):
112
+ """Removes chains that correspond to a single amino acid.
113
+
114
+ A single amino acid in a chain is both N and C terminus. There is no force
115
+ template for this case.
116
+
117
+ Args:
118
+ pdb_structure: An OpenMM pdb_structure to modify and fix.
119
+ alterations_info: A dict that will store details of changes made.
120
+ """
121
+ removed_chains = {}
122
+ for model in pdb_structure.iter_models():
123
+ valid_chains = [c for c in model.iter_chains() if len(c) > 1]
124
+ invalid_chain_ids = [
125
+ c.chain_id for c in model.iter_chains() if len(c) <= 1
126
+ ]
127
+ model.chains = valid_chains
128
+ for chain_id in invalid_chain_ids:
129
+ model.chains_by_id.pop(chain_id)
130
+ removed_chains[model.number] = invalid_chain_ids
131
+ alterations_info["removed_chains"] = removed_chains
openfold/np/relax/relax.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Amber relaxation."""
17
+ from typing import Any, Dict, Sequence, Tuple
18
+ from openfold.np import protein
19
+ from openfold.np.relax import amber_minimize, utils
20
+ import numpy as np
21
+
22
+
23
class AmberRelaxation(object):
    """Amber relaxation."""
    def __init__(
        self,
        *,
        max_iterations: int,
        tolerance: float,
        stiffness: float,
        exclude_residues: Sequence[int],
        max_outer_iterations: int,
        use_gpu: bool,
    ):
        """Initialize Amber Relaxer.

        Args:
            max_iterations: Maximum number of L-BFGS iterations. 0 means no max.
            tolerance: kcal/mol, the energy tolerance of L-BFGS.
            stiffness: kcal/mol A**2, spring constant of heavy atom restraining
                potential.
            exclude_residues: Residues to exclude from per-atom restraining.
                Zero-indexed.
            max_outer_iterations: Maximum number of violation-informed relax
                iterations. A value of 1 will run the non-iterative procedure used in
                CASP14. Use 20 so that >95% of the bad cases are relaxed. Relax finishes
                as soon as there are no violations, hence in most cases this causes no
                slowdown. In the worst case we do 20 outer iterations.
            use_gpu: Whether to run on GPU
        """

        # Settings are stored verbatim and forwarded to
        # amber_minimize.run_pipeline in process().
        self._max_iterations = max_iterations
        self._tolerance = tolerance
        self._stiffness = stiffness
        self._exclude_residues = exclude_residues
        self._max_outer_iterations = max_outer_iterations
        self._use_gpu = use_gpu

    def process(
        self, *, prot: protein.Protein
    ) -> Tuple[str, Dict[str, Any], np.ndarray]:
        """Runs Amber relax on a prediction, adds hydrogens, returns PDB string.

        Returns:
            Tuple of (relaxed PDB string, debug-info dict, per-residue
            violation mask from the final iteration).
        """
        out = amber_minimize.run_pipeline(
            prot=prot,
            max_iterations=self._max_iterations,
            tolerance=self._tolerance,
            stiffness=self._stiffness,
            exclude_residues=self._exclude_residues,
            max_outer_iterations=self._max_outer_iterations,
            use_gpu=self._use_gpu,
        )
        min_pos = out["pos"]
        start_pos = out["posinit"]
        # RMSD between pre- and post-minimization coordinates.
        rmsd = np.sqrt(np.sum((start_pos - min_pos) ** 2) / start_pos.shape[0])
        debug_data = {
            "initial_energy": out["einit"],
            "final_energy": out["efinal"],
            "attempts": out["min_attempts"],
            "rmsd": rmsd,
        }
        # Rebuild a clean PDB of the input, then splice in the minimized
        # coordinates and restore the original per-residue B-factors.
        pdb_str = amber_minimize.clean_protein(prot)
        min_pdb = utils.overwrite_pdb_coordinates(pdb_str, min_pos)
        min_pdb = utils.overwrite_b_factors(min_pdb, prot.b_factors)
        # Sanity check: relaxation must not add/remove non-terminal atoms.
        utils.assert_equal_nonterminal_atom_types(
            protein.from_pdb_string(min_pdb).atom_mask, prot.atom_mask
        )
        violations = out["structural_violations"][
            "total_per_residue_violations_mask"
        ]

        min_pdb = protein.add_pdb_headers(prot, min_pdb)

        return min_pdb, debug_data, violations
openfold/np/relax/utils.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Utils for minimization."""
17
+ import io
18
+ from openfold.np import residue_constants
19
+ from Bio import PDB
20
+ import numpy as np
21
+ from simtk.openmm import app as openmm_app
22
+ from simtk.openmm.app.internal.pdbstructure import PdbStructure
23
+
24
+
25
def overwrite_pdb_coordinates(pdb_str: str, pos) -> str:
    """Return a new PDB string with atom coordinates replaced by `pos`.

    The topology (atoms, residues, chains) is taken from `pdb_str`; only the
    coordinates written out come from `pos`.
    """
    structure = PdbStructure(io.StringIO(pdb_str))
    topology = openmm_app.PDBFile(structure).getTopology()
    buf = io.StringIO()
    try:
        openmm_app.PDBFile.writeFile(topology, pos, buf)
        return buf.getvalue()
    finally:
        buf.close()
32
+
33
+
34
def overwrite_b_factors(pdb_str: str, bfactors: np.ndarray) -> str:
    """Overwrites the B-factors in pdb_str with contents of bfactors array.

    Args:
        pdb_str: An input PDB string.
        bfactors: A numpy array with shape [n_residues, 37] (one row per
            residue, one column per atom type). We assume the B-factors are per
            residue; i.e. that the nonzero entries are identical in [i, :] —
            only the CA column is actually read.

    Returns:
        A new PDB string with the B-factors replaced.

    Raises:
        ValueError: If the final dimension of `bfactors` is not
            residue_constants.atom_type_num, or if the PDB contains more
            residues than `bfactors` has rows.
    """
    if bfactors.shape[-1] != residue_constants.atom_type_num:
        raise ValueError(
            f"Invalid final dimension size for bfactors: {bfactors.shape[-1]}."
        )

    parser = PDB.PDBParser(QUIET=True)
    handle = io.StringIO(pdb_str)
    structure = parser.get_structure("", handle)

    curr_resid = ("", "", "")
    idx = -1
    for atom in structure.get_atoms():
        atom_resid = atom.parent.get_id()
        if atom_resid != curr_resid:
            idx += 1
            if idx >= bfactors.shape[0]:
                # BUG FIX: this message was a plain string containing literal
                # "{shape}"/"{idx}" placeholders (missing f-prefix); format the
                # actual values instead.
                raise ValueError(
                    "Index into bfactors exceeds number of residues. "
                    f"B-factors shape: {bfactors.shape}, idx: {idx}."
                )
            curr_resid = atom_resid
        # Every atom of a residue gets that residue's CA-column B-factor.
        atom.bfactor = bfactors[idx, residue_constants.atom_order["CA"]]

    new_pdb = io.StringIO()
    pdb_io = PDB.PDBIO()
    pdb_io.set_structure(structure)
    pdb_io.save(new_pdb)
    return new_pdb.getvalue()
74
+
75
+
76
def assert_equal_nonterminal_atom_types(
    atom_mask: np.ndarray, ref_atom_mask: np.ndarray
):
    """Checks that pre- and post-minimized proteins have same atom set.

    Args:
        atom_mask: [..., atom_type_num] atom-presence mask of the minimized
            protein.
        ref_atom_mask: Matching mask of the reference (pre-minimization)
            protein.

    Raises:
        AssertionError: If the masks differ anywhere outside the OXT column.
    """
    # Ignore any terminal OXT atoms which may have been added by minimization.
    oxt = residue_constants.atom_order["OXT"]
    # BUG FIX: np.bool (deprecated in NumPy 1.20, removed in 1.24) replaced
    # with the builtin bool, which is the supported dtype spelling.
    no_oxt_mask = np.ones(shape=atom_mask.shape, dtype=bool)
    no_oxt_mask[..., oxt] = False
    np.testing.assert_almost_equal(
        ref_atom_mask[no_oxt_mask], atom_mask[no_oxt_mask]
    )
openfold/np/residue_constants.py ADDED
@@ -0,0 +1,1310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Constants used in AlphaFold."""
17
+
18
+ import collections
19
+ import functools
20
+ from typing import Mapping, List, Tuple
21
+ from importlib import resources
22
+
23
+ import numpy as np
24
+ import tree
25
+
26
+ # Internal import (35fd).
27
+
28
+
29
+ # Distance from one CA to next CA [trans configuration: omega = 180].
30
+ ca_ca = 3.80209737096
31
+
32
+ # Format: The list for each AA type contains chi1, chi2, chi3, chi4 in
33
+ # this order (or a relevant subset from chi1 onwards). ALA and GLY don't have
34
+ # chi angles so their chi angle lists are empty.
35
+ chi_angles_atoms = {
36
+ "ALA": [],
37
+ # Chi5 in arginine is always 0 +- 5 degrees, so ignore it.
38
+ "ARG": [
39
+ ["N", "CA", "CB", "CG"],
40
+ ["CA", "CB", "CG", "CD"],
41
+ ["CB", "CG", "CD", "NE"],
42
+ ["CG", "CD", "NE", "CZ"],
43
+ ],
44
+ "ASN": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "OD1"]],
45
+ "ASP": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "OD1"]],
46
+ "CYS": [["N", "CA", "CB", "SG"]],
47
+ "GLN": [
48
+ ["N", "CA", "CB", "CG"],
49
+ ["CA", "CB", "CG", "CD"],
50
+ ["CB", "CG", "CD", "OE1"],
51
+ ],
52
+ "GLU": [
53
+ ["N", "CA", "CB", "CG"],
54
+ ["CA", "CB", "CG", "CD"],
55
+ ["CB", "CG", "CD", "OE1"],
56
+ ],
57
+ "GLY": [],
58
+ "HIS": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "ND1"]],
59
+ "ILE": [["N", "CA", "CB", "CG1"], ["CA", "CB", "CG1", "CD1"]],
60
+ "LEU": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD1"]],
61
+ "LYS": [
62
+ ["N", "CA", "CB", "CG"],
63
+ ["CA", "CB", "CG", "CD"],
64
+ ["CB", "CG", "CD", "CE"],
65
+ ["CG", "CD", "CE", "NZ"],
66
+ ],
67
+ "MET": [
68
+ ["N", "CA", "CB", "CG"],
69
+ ["CA", "CB", "CG", "SD"],
70
+ ["CB", "CG", "SD", "CE"],
71
+ ],
72
+ "PHE": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD1"]],
73
+ "PRO": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD"]],
74
+ "SER": [["N", "CA", "CB", "OG"]],
75
+ "THR": [["N", "CA", "CB", "OG1"]],
76
+ "TRP": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD1"]],
77
+ "TYR": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD1"]],
78
+ "VAL": [["N", "CA", "CB", "CG1"]],
79
+ }
80
+
81
+ # If chi angles given in fixed-length array, this matrix determines how to mask
82
+ # them for each AA type. The order is as per restype_order (see below).
83
+ chi_angles_mask = [
84
+ [0.0, 0.0, 0.0, 0.0], # ALA
85
+ [1.0, 1.0, 1.0, 1.0], # ARG
86
+ [1.0, 1.0, 0.0, 0.0], # ASN
87
+ [1.0, 1.0, 0.0, 0.0], # ASP
88
+ [1.0, 0.0, 0.0, 0.0], # CYS
89
+ [1.0, 1.0, 1.0, 0.0], # GLN
90
+ [1.0, 1.0, 1.0, 0.0], # GLU
91
+ [0.0, 0.0, 0.0, 0.0], # GLY
92
+ [1.0, 1.0, 0.0, 0.0], # HIS
93
+ [1.0, 1.0, 0.0, 0.0], # ILE
94
+ [1.0, 1.0, 0.0, 0.0], # LEU
95
+ [1.0, 1.0, 1.0, 1.0], # LYS
96
+ [1.0, 1.0, 1.0, 0.0], # MET
97
+ [1.0, 1.0, 0.0, 0.0], # PHE
98
+ [1.0, 1.0, 0.0, 0.0], # PRO
99
+ [1.0, 0.0, 0.0, 0.0], # SER
100
+ [1.0, 0.0, 0.0, 0.0], # THR
101
+ [1.0, 1.0, 0.0, 0.0], # TRP
102
+ [1.0, 1.0, 0.0, 0.0], # TYR
103
+ [1.0, 0.0, 0.0, 0.0], # VAL
104
+ ]
105
+
106
+ # The following chi angles are pi periodic: they can be rotated by a multiple
107
+ # of pi without affecting the structure.
108
+ chi_pi_periodic = [
109
+ [0.0, 0.0, 0.0, 0.0], # ALA
110
+ [0.0, 0.0, 0.0, 0.0], # ARG
111
+ [0.0, 0.0, 0.0, 0.0], # ASN
112
+ [0.0, 1.0, 0.0, 0.0], # ASP
113
+ [0.0, 0.0, 0.0, 0.0], # CYS
114
+ [0.0, 0.0, 0.0, 0.0], # GLN
115
+ [0.0, 0.0, 1.0, 0.0], # GLU
116
+ [0.0, 0.0, 0.0, 0.0], # GLY
117
+ [0.0, 0.0, 0.0, 0.0], # HIS
118
+ [0.0, 0.0, 0.0, 0.0], # ILE
119
+ [0.0, 0.0, 0.0, 0.0], # LEU
120
+ [0.0, 0.0, 0.0, 0.0], # LYS
121
+ [0.0, 0.0, 0.0, 0.0], # MET
122
+ [0.0, 1.0, 0.0, 0.0], # PHE
123
+ [0.0, 0.0, 0.0, 0.0], # PRO
124
+ [0.0, 0.0, 0.0, 0.0], # SER
125
+ [0.0, 0.0, 0.0, 0.0], # THR
126
+ [0.0, 0.0, 0.0, 0.0], # TRP
127
+ [0.0, 1.0, 0.0, 0.0], # TYR
128
+ [0.0, 0.0, 0.0, 0.0], # VAL
129
+ [0.0, 0.0, 0.0, 0.0], # UNK
130
+ ]
131
+
132
+ # Atoms positions relative to the 8 rigid groups, defined by the pre-omega, phi,
133
+ # psi and chi angles:
134
+ # 0: 'backbone group',
135
+ # 1: 'pre-omega-group', (empty)
136
+ # 2: 'phi-group', (currently empty, because it defines only hydrogens)
137
+ # 3: 'psi-group',
138
+ # 4,5,6,7: 'chi1,2,3,4-group'
139
+ # The atom positions are relative to the axis-end-atom of the corresponding
140
+ # rotation axis. The x-axis is in direction of the rotation axis, and the y-axis
141
+ # is defined such that the dihedral-angle-definiting atom (the last entry in
142
+ # chi_angles_atoms above) is in the xy-plane (with a positive y-coordinate).
143
+ # format: [atomname, group_idx, rel_position]
144
+ rigid_group_atom_positions = {
145
+ "ALA": [
146
+ ["N", 0, (-0.525, 1.363, 0.000)],
147
+ ["CA", 0, (0.000, 0.000, 0.000)],
148
+ ["C", 0, (1.526, -0.000, -0.000)],
149
+ ["CB", 0, (-0.529, -0.774, -1.205)],
150
+ ["O", 3, (0.627, 1.062, 0.000)],
151
+ ],
152
+ "ARG": [
153
+ ["N", 0, (-0.524, 1.362, -0.000)],
154
+ ["CA", 0, (0.000, 0.000, 0.000)],
155
+ ["C", 0, (1.525, -0.000, -0.000)],
156
+ ["CB", 0, (-0.524, -0.778, -1.209)],
157
+ ["O", 3, (0.626, 1.062, 0.000)],
158
+ ["CG", 4, (0.616, 1.390, -0.000)],
159
+ ["CD", 5, (0.564, 1.414, 0.000)],
160
+ ["NE", 6, (0.539, 1.357, -0.000)],
161
+ ["NH1", 7, (0.206, 2.301, 0.000)],
162
+ ["NH2", 7, (2.078, 0.978, -0.000)],
163
+ ["CZ", 7, (0.758, 1.093, -0.000)],
164
+ ],
165
+ "ASN": [
166
+ ["N", 0, (-0.536, 1.357, 0.000)],
167
+ ["CA", 0, (0.000, 0.000, 0.000)],
168
+ ["C", 0, (1.526, -0.000, -0.000)],
169
+ ["CB", 0, (-0.531, -0.787, -1.200)],
170
+ ["O", 3, (0.625, 1.062, 0.000)],
171
+ ["CG", 4, (0.584, 1.399, 0.000)],
172
+ ["ND2", 5, (0.593, -1.188, 0.001)],
173
+ ["OD1", 5, (0.633, 1.059, 0.000)],
174
+ ],
175
+ "ASP": [
176
+ ["N", 0, (-0.525, 1.362, -0.000)],
177
+ ["CA", 0, (0.000, 0.000, 0.000)],
178
+ ["C", 0, (1.527, 0.000, -0.000)],
179
+ ["CB", 0, (-0.526, -0.778, -1.208)],
180
+ ["O", 3, (0.626, 1.062, -0.000)],
181
+ ["CG", 4, (0.593, 1.398, -0.000)],
182
+ ["OD1", 5, (0.610, 1.091, 0.000)],
183
+ ["OD2", 5, (0.592, -1.101, -0.003)],
184
+ ],
185
+ "CYS": [
186
+ ["N", 0, (-0.522, 1.362, -0.000)],
187
+ ["CA", 0, (0.000, 0.000, 0.000)],
188
+ ["C", 0, (1.524, 0.000, 0.000)],
189
+ ["CB", 0, (-0.519, -0.773, -1.212)],
190
+ ["O", 3, (0.625, 1.062, -0.000)],
191
+ ["SG", 4, (0.728, 1.653, 0.000)],
192
+ ],
193
+ "GLN": [
194
+ ["N", 0, (-0.526, 1.361, -0.000)],
195
+ ["CA", 0, (0.000, 0.000, 0.000)],
196
+ ["C", 0, (1.526, 0.000, 0.000)],
197
+ ["CB", 0, (-0.525, -0.779, -1.207)],
198
+ ["O", 3, (0.626, 1.062, -0.000)],
199
+ ["CG", 4, (0.615, 1.393, 0.000)],
200
+ ["CD", 5, (0.587, 1.399, -0.000)],
201
+ ["NE2", 6, (0.593, -1.189, -0.001)],
202
+ ["OE1", 6, (0.634, 1.060, 0.000)],
203
+ ],
204
+ "GLU": [
205
+ ["N", 0, (-0.528, 1.361, 0.000)],
206
+ ["CA", 0, (0.000, 0.000, 0.000)],
207
+ ["C", 0, (1.526, -0.000, -0.000)],
208
+ ["CB", 0, (-0.526, -0.781, -1.207)],
209
+ ["O", 3, (0.626, 1.062, 0.000)],
210
+ ["CG", 4, (0.615, 1.392, 0.000)],
211
+ ["CD", 5, (0.600, 1.397, 0.000)],
212
+ ["OE1", 6, (0.607, 1.095, -0.000)],
213
+ ["OE2", 6, (0.589, -1.104, -0.001)],
214
+ ],
215
+ "GLY": [
216
+ ["N", 0, (-0.572, 1.337, 0.000)],
217
+ ["CA", 0, (0.000, 0.000, 0.000)],
218
+ ["C", 0, (1.517, -0.000, -0.000)],
219
+ ["O", 3, (0.626, 1.062, -0.000)],
220
+ ],
221
+ "HIS": [
222
+ ["N", 0, (-0.527, 1.360, 0.000)],
223
+ ["CA", 0, (0.000, 0.000, 0.000)],
224
+ ["C", 0, (1.525, 0.000, 0.000)],
225
+ ["CB", 0, (-0.525, -0.778, -1.208)],
226
+ ["O", 3, (0.625, 1.063, 0.000)],
227
+ ["CG", 4, (0.600, 1.370, -0.000)],
228
+ ["CD2", 5, (0.889, -1.021, 0.003)],
229
+ ["ND1", 5, (0.744, 1.160, -0.000)],
230
+ ["CE1", 5, (2.030, 0.851, 0.002)],
231
+ ["NE2", 5, (2.145, -0.466, 0.004)],
232
+ ],
233
+ "ILE": [
234
+ ["N", 0, (-0.493, 1.373, -0.000)],
235
+ ["CA", 0, (0.000, 0.000, 0.000)],
236
+ ["C", 0, (1.527, -0.000, -0.000)],
237
+ ["CB", 0, (-0.536, -0.793, -1.213)],
238
+ ["O", 3, (0.627, 1.062, -0.000)],
239
+ ["CG1", 4, (0.534, 1.437, -0.000)],
240
+ ["CG2", 4, (0.540, -0.785, -1.199)],
241
+ ["CD1", 5, (0.619, 1.391, 0.000)],
242
+ ],
243
+ "LEU": [
244
+ ["N", 0, (-0.520, 1.363, 0.000)],
245
+ ["CA", 0, (0.000, 0.000, 0.000)],
246
+ ["C", 0, (1.525, -0.000, -0.000)],
247
+ ["CB", 0, (-0.522, -0.773, -1.214)],
248
+ ["O", 3, (0.625, 1.063, -0.000)],
249
+ ["CG", 4, (0.678, 1.371, 0.000)],
250
+ ["CD1", 5, (0.530, 1.430, -0.000)],
251
+ ["CD2", 5, (0.535, -0.774, 1.200)],
252
+ ],
253
+ "LYS": [
254
+ ["N", 0, (-0.526, 1.362, -0.000)],
255
+ ["CA", 0, (0.000, 0.000, 0.000)],
256
+ ["C", 0, (1.526, 0.000, 0.000)],
257
+ ["CB", 0, (-0.524, -0.778, -1.208)],
258
+ ["O", 3, (0.626, 1.062, -0.000)],
259
+ ["CG", 4, (0.619, 1.390, 0.000)],
260
+ ["CD", 5, (0.559, 1.417, 0.000)],
261
+ ["CE", 6, (0.560, 1.416, 0.000)],
262
+ ["NZ", 7, (0.554, 1.387, 0.000)],
263
+ ],
264
+ "MET": [
265
+ ["N", 0, (-0.521, 1.364, -0.000)],
266
+ ["CA", 0, (0.000, 0.000, 0.000)],
267
+ ["C", 0, (1.525, 0.000, 0.000)],
268
+ ["CB", 0, (-0.523, -0.776, -1.210)],
269
+ ["O", 3, (0.625, 1.062, -0.000)],
270
+ ["CG", 4, (0.613, 1.391, -0.000)],
271
+ ["SD", 5, (0.703, 1.695, 0.000)],
272
+ ["CE", 6, (0.320, 1.786, -0.000)],
273
+ ],
274
+ "PHE": [
275
+ ["N", 0, (-0.518, 1.363, 0.000)],
276
+ ["CA", 0, (0.000, 0.000, 0.000)],
277
+ ["C", 0, (1.524, 0.000, -0.000)],
278
+ ["CB", 0, (-0.525, -0.776, -1.212)],
279
+ ["O", 3, (0.626, 1.062, -0.000)],
280
+ ["CG", 4, (0.607, 1.377, 0.000)],
281
+ ["CD1", 5, (0.709, 1.195, -0.000)],
282
+ ["CD2", 5, (0.706, -1.196, 0.000)],
283
+ ["CE1", 5, (2.102, 1.198, -0.000)],
284
+ ["CE2", 5, (2.098, -1.201, -0.000)],
285
+ ["CZ", 5, (2.794, -0.003, -0.001)],
286
+ ],
287
+ "PRO": [
288
+ ["N", 0, (-0.566, 1.351, -0.000)],
289
+ ["CA", 0, (0.000, 0.000, 0.000)],
290
+ ["C", 0, (1.527, -0.000, 0.000)],
291
+ ["CB", 0, (-0.546, -0.611, -1.293)],
292
+ ["O", 3, (0.621, 1.066, 0.000)],
293
+ ["CG", 4, (0.382, 1.445, 0.0)],
294
+ # ['CD', 5, (0.427, 1.440, 0.0)],
295
+ ["CD", 5, (0.477, 1.424, 0.0)], # manually made angle 2 degrees larger
296
+ ],
297
+ "SER": [
298
+ ["N", 0, (-0.529, 1.360, -0.000)],
299
+ ["CA", 0, (0.000, 0.000, 0.000)],
300
+ ["C", 0, (1.525, -0.000, -0.000)],
301
+ ["CB", 0, (-0.518, -0.777, -1.211)],
302
+ ["O", 3, (0.626, 1.062, -0.000)],
303
+ ["OG", 4, (0.503, 1.325, 0.000)],
304
+ ],
305
+ "THR": [
306
+ ["N", 0, (-0.517, 1.364, 0.000)],
307
+ ["CA", 0, (0.000, 0.000, 0.000)],
308
+ ["C", 0, (1.526, 0.000, -0.000)],
309
+ ["CB", 0, (-0.516, -0.793, -1.215)],
310
+ ["O", 3, (0.626, 1.062, 0.000)],
311
+ ["CG2", 4, (0.550, -0.718, -1.228)],
312
+ ["OG1", 4, (0.472, 1.353, 0.000)],
313
+ ],
314
+ "TRP": [
315
+ ["N", 0, (-0.521, 1.363, 0.000)],
316
+ ["CA", 0, (0.000, 0.000, 0.000)],
317
+ ["C", 0, (1.525, -0.000, 0.000)],
318
+ ["CB", 0, (-0.523, -0.776, -1.212)],
319
+ ["O", 3, (0.627, 1.062, 0.000)],
320
+ ["CG", 4, (0.609, 1.370, -0.000)],
321
+ ["CD1", 5, (0.824, 1.091, 0.000)],
322
+ ["CD2", 5, (0.854, -1.148, -0.005)],
323
+ ["CE2", 5, (2.186, -0.678, -0.007)],
324
+ ["CE3", 5, (0.622, -2.530, -0.007)],
325
+ ["NE1", 5, (2.140, 0.690, -0.004)],
326
+ ["CH2", 5, (3.028, -2.890, -0.013)],
327
+ ["CZ2", 5, (3.283, -1.543, -0.011)],
328
+ ["CZ3", 5, (1.715, -3.389, -0.011)],
329
+ ],
330
+ "TYR": [
331
+ ["N", 0, (-0.522, 1.362, 0.000)],
332
+ ["CA", 0, (0.000, 0.000, 0.000)],
333
+ ["C", 0, (1.524, -0.000, -0.000)],
334
+ ["CB", 0, (-0.522, -0.776, -1.213)],
335
+ ["O", 3, (0.627, 1.062, -0.000)],
336
+ ["CG", 4, (0.607, 1.382, -0.000)],
337
+ ["CD1", 5, (0.716, 1.195, -0.000)],
338
+ ["CD2", 5, (0.713, -1.194, -0.001)],
339
+ ["CE1", 5, (2.107, 1.200, -0.002)],
340
+ ["CE2", 5, (2.104, -1.201, -0.003)],
341
+ ["OH", 5, (4.168, -0.002, -0.005)],
342
+ ["CZ", 5, (2.791, -0.001, -0.003)],
343
+ ],
344
+ "VAL": [
345
+ ["N", 0, (-0.494, 1.373, -0.000)],
346
+ ["CA", 0, (0.000, 0.000, 0.000)],
347
+ ["C", 0, (1.527, -0.000, -0.000)],
348
+ ["CB", 0, (-0.533, -0.795, -1.213)],
349
+ ["O", 3, (0.627, 1.062, -0.000)],
350
+ ["CG1", 4, (0.540, 1.429, -0.000)],
351
+ ["CG2", 4, (0.533, -0.776, 1.203)],
352
+ ],
353
+ }
354
+
355
+ # A list of atoms (excluding hydrogen) for each AA type. PDB naming convention.
356
+ residue_atoms = {
357
+ "ALA": ["C", "CA", "CB", "N", "O"],
358
+ "ARG": ["C", "CA", "CB", "CG", "CD", "CZ", "N", "NE", "O", "NH1", "NH2"],
359
+ "ASP": ["C", "CA", "CB", "CG", "N", "O", "OD1", "OD2"],
360
+ "ASN": ["C", "CA", "CB", "CG", "N", "ND2", "O", "OD1"],
361
+ "CYS": ["C", "CA", "CB", "N", "O", "SG"],
362
+ "GLU": ["C", "CA", "CB", "CG", "CD", "N", "O", "OE1", "OE2"],
363
+ "GLN": ["C", "CA", "CB", "CG", "CD", "N", "NE2", "O", "OE1"],
364
+ "GLY": ["C", "CA", "N", "O"],
365
+ "HIS": ["C", "CA", "CB", "CG", "CD2", "CE1", "N", "ND1", "NE2", "O"],
366
+ "ILE": ["C", "CA", "CB", "CG1", "CG2", "CD1", "N", "O"],
367
+ "LEU": ["C", "CA", "CB", "CG", "CD1", "CD2", "N", "O"],
368
+ "LYS": ["C", "CA", "CB", "CG", "CD", "CE", "N", "NZ", "O"],
369
+ "MET": ["C", "CA", "CB", "CG", "CE", "N", "O", "SD"],
370
+ "PHE": ["C", "CA", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ", "N", "O"],
371
+ "PRO": ["C", "CA", "CB", "CG", "CD", "N", "O"],
372
+ "SER": ["C", "CA", "CB", "N", "O", "OG"],
373
+ "THR": ["C", "CA", "CB", "CG2", "N", "O", "OG1"],
374
+ "TRP": [
375
+ "C",
376
+ "CA",
377
+ "CB",
378
+ "CG",
379
+ "CD1",
380
+ "CD2",
381
+ "CE2",
382
+ "CE3",
383
+ "CZ2",
384
+ "CZ3",
385
+ "CH2",
386
+ "N",
387
+ "NE1",
388
+ "O",
389
+ ],
390
+ "TYR": [
391
+ "C",
392
+ "CA",
393
+ "CB",
394
+ "CG",
395
+ "CD1",
396
+ "CD2",
397
+ "CE1",
398
+ "CE2",
399
+ "CZ",
400
+ "N",
401
+ "O",
402
+ "OH",
403
+ ],
404
+ "VAL": ["C", "CA", "CB", "CG1", "CG2", "N", "O"],
405
+ }
406
+
407
+ # Naming swaps for ambiguous atom names.
408
+ # Due to symmetries in the amino acids the naming of atoms is ambiguous in
409
+ # 4 of the 20 amino acids.
410
+ # (The LDDT paper lists 7 amino acids as ambiguous, but the naming ambiguities
411
+ # in LEU, VAL and ARG can be resolved by using the 3d constellations of
412
+ # the 'ambiguous' atoms and their neighbours)
413
+ # TODO: ^ interpret this
414
+ residue_atom_renaming_swaps = {
415
+ "ASP": {"OD1": "OD2"},
416
+ "GLU": {"OE1": "OE2"},
417
+ "PHE": {"CD1": "CD2", "CE1": "CE2"},
418
+ "TYR": {"CD1": "CD2", "CE1": "CE2"},
419
+ }
420
+
421
+ # Van der Waals radii [Angstroem] of the atoms (from Wikipedia)
422
+ van_der_waals_radius = {
423
+ "C": 1.7,
424
+ "N": 1.55,
425
+ "O": 1.52,
426
+ "S": 1.8,
427
+ }
428
+
429
# A literature bond length between two named atoms, with its standard
# deviation (Angstroem). Also reused for "virtual bonds" derived from angles.
Bond = collections.namedtuple(
    "Bond", ["atom1_name", "atom2_name", "length", "stddev"]
)
# A literature bond angle over three named atoms, in radians.
# NOTE(review): "atom3name" (no underscore) is inconsistent with the other
# field names, but load_stereo_chemical_props accesses it by this exact name,
# so renaming it would be a breaking change.
BondAngle = collections.namedtuple(
    "BondAngle",
    ["atom1_name", "atom2_name", "atom3name", "angle_rad", "stddev"],
)
436
+
437
+
438
@functools.lru_cache(maxsize=None)  # parsed once; result is reused thereafter
def load_stereo_chemical_props() -> Tuple[
    Mapping[str, List[Bond]],
    Mapping[str, List[Bond]],
    Mapping[str, List[BondAngle]],
]:
    """Load stereo_chemical_props.txt into a nice structure.

    Load literature values for bond lengths and bond angles and translate
    bond angles into the length of the opposite edge of the triangle
    ("residue_virtual_bonds").

    Returns:
        residue_bonds: dict that maps resname --> list of Bond tuples
        residue_virtual_bonds: dict that maps resname --> list of Bond tuples
        residue_bond_angles: dict that maps resname --> list of BondAngle tuples
    """
    # TODO: this file should be downloaded in a setup script
    # NOTE(review): importlib.resources.read_text is deprecated since Python
    # 3.11 in favor of resources.files(...).read_text() — consider migrating.
    stereo_chemical_props = resources.read_text("openfold.resources", "stereo_chemical_props.txt")

    # The file has two whitespace-delimited sections, each terminated by a
    # line containing only "-": bond lengths first, then bond angles.
    lines_iter = iter(stereo_chemical_props.splitlines())
    # Load bond lengths.
    residue_bonds = {}
    next(lines_iter)  # Skip header line.
    for line in lines_iter:
        if line.strip() == "-":
            break
        # Each row: "ATOM1-ATOM2 RESNAME LENGTH STDDEV".
        bond, resname, length, stddev = line.split()
        atom1, atom2 = bond.split("-")
        if resname not in residue_bonds:
            residue_bonds[resname] = []
        residue_bonds[resname].append(
            Bond(atom1, atom2, float(length), float(stddev))
        )
    residue_bonds["UNK"] = []

    # Load bond angles.
    residue_bond_angles = {}
    next(lines_iter)  # Skip empty line.
    next(lines_iter)  # Skip header line.
    for line in lines_iter:
        if line.strip() == "-":
            break
        # Each row: "ATOM1-ATOM2-ATOM3 RESNAME ANGLE_DEG STDDEV_DEG".
        bond, resname, angle_degree, stddev_degree = line.split()
        atom1, atom2, atom3 = bond.split("-")
        if resname not in residue_bond_angles:
            residue_bond_angles[resname] = []
        residue_bond_angles[resname].append(
            BondAngle(
                atom1,
                atom2,
                atom3,
                float(angle_degree) / 180.0 * np.pi,
                float(stddev_degree) / 180.0 * np.pi,
            )
        )
    residue_bond_angles["UNK"] = []

    def make_bond_key(atom1_name, atom2_name):
        """Unique key to lookup bonds."""
        # Sorted so that A-B and B-A map to the same key.
        return "-".join(sorted([atom1_name, atom2_name]))

    # Translate bond angles into distances ("virtual bonds").
    residue_virtual_bonds = {}
    for resname, bond_angles in residue_bond_angles.items():
        # Create a fast lookup dict for bond lengths.
        bond_cache = {}
        for b in residue_bonds[resname]:
            bond_cache[make_bond_key(b.atom1_name, b.atom2_name)] = b
        residue_virtual_bonds[resname] = []
        for ba in bond_angles:
            bond1 = bond_cache[make_bond_key(ba.atom1_name, ba.atom2_name)]
            bond2 = bond_cache[make_bond_key(ba.atom2_name, ba.atom3name)]

            # Compute distance between atom1 and atom3 using the law of cosines
            # c^2 = a^2 + b^2 - 2ab*cos(gamma).
            gamma = ba.angle_rad
            length = np.sqrt(
                bond1.length ** 2
                + bond2.length ** 2
                - 2 * bond1.length * bond2.length * np.cos(gamma)
            )

            # Propagation of uncertainty assuming uncorrelated errors:
            # partial derivatives of `length` w.r.t. gamma and the two bond
            # lengths, combined in quadrature with their stddevs.
            dl_outer = 0.5 / length
            dl_dgamma = (
                2 * bond1.length * bond2.length * np.sin(gamma)
            ) * dl_outer
            dl_db1 = (
                2 * bond1.length - 2 * bond2.length * np.cos(gamma)
            ) * dl_outer
            dl_db2 = (
                2 * bond2.length - 2 * bond1.length * np.cos(gamma)
            ) * dl_outer
            stddev = np.sqrt(
                (dl_dgamma * ba.stddev) ** 2
                + (dl_db1 * bond1.stddev) ** 2
                + (dl_db2 * bond2.stddev) ** 2
            )
            residue_virtual_bonds[resname].append(
                Bond(ba.atom1_name, ba.atom3name, length, stddev)
            )

    return (residue_bonds, residue_virtual_bonds, residue_bond_angles)
542
+
543
+
544
+ # Between-residue bond lengths for general bonds (first element) and for Proline
545
+ # (second element).
546
+ between_res_bond_length_c_n = [1.329, 1.341]
547
+ between_res_bond_length_stddev_c_n = [0.014, 0.016]
548
+
549
+ # Between-residue cos_angles.
550
+ between_res_cos_angles_c_n_ca = [-0.5203, 0.0353] # degrees: 121.352 +- 2.315
551
+ between_res_cos_angles_ca_c_n = [-0.4473, 0.0311] # degrees: 116.568 +- 1.995
552
+
553
+ # This mapping is used when we need to store atom data in a format that requires
554
+ # fixed atom data size for every residue (e.g. a numpy array).
555
+ atom_types = [
556
+ "N",
557
+ "CA",
558
+ "C",
559
+ "CB",
560
+ "O",
561
+ "CG",
562
+ "CG1",
563
+ "CG2",
564
+ "OG",
565
+ "OG1",
566
+ "SG",
567
+ "CD",
568
+ "CD1",
569
+ "CD2",
570
+ "ND1",
571
+ "ND2",
572
+ "OD1",
573
+ "OD2",
574
+ "SD",
575
+ "CE",
576
+ "CE1",
577
+ "CE2",
578
+ "CE3",
579
+ "NE",
580
+ "NE1",
581
+ "NE2",
582
+ "OE1",
583
+ "OE2",
584
+ "CH2",
585
+ "NH1",
586
+ "NH2",
587
+ "OH",
588
+ "CZ",
589
+ "CZ2",
590
+ "CZ3",
591
+ "NZ",
592
+ "OXT",
593
+ ]
594
+ atom_order = {atom_type: i for i, atom_type in enumerate(atom_types)}
595
+ atom_type_num = len(atom_types) # := 37.
596
+
597
+ # A compact atom encoding with 14 columns
598
+ # pylint: disable=line-too-long
599
+ # pylint: disable=bad-whitespace
600
+ restype_name_to_atom14_names = {
601
+ "ALA": ["N", "CA", "C", "O", "CB", "", "", "", "", "", "", "", "", ""],
602
+ "ARG": [
603
+ "N",
604
+ "CA",
605
+ "C",
606
+ "O",
607
+ "CB",
608
+ "CG",
609
+ "CD",
610
+ "NE",
611
+ "CZ",
612
+ "NH1",
613
+ "NH2",
614
+ "",
615
+ "",
616
+ "",
617
+ ],
618
+ "ASN": [
619
+ "N",
620
+ "CA",
621
+ "C",
622
+ "O",
623
+ "CB",
624
+ "CG",
625
+ "OD1",
626
+ "ND2",
627
+ "",
628
+ "",
629
+ "",
630
+ "",
631
+ "",
632
+ "",
633
+ ],
634
+ "ASP": [
635
+ "N",
636
+ "CA",
637
+ "C",
638
+ "O",
639
+ "CB",
640
+ "CG",
641
+ "OD1",
642
+ "OD2",
643
+ "",
644
+ "",
645
+ "",
646
+ "",
647
+ "",
648
+ "",
649
+ ],
650
+ "CYS": ["N", "CA", "C", "O", "CB", "SG", "", "", "", "", "", "", "", ""],
651
+ "GLN": [
652
+ "N",
653
+ "CA",
654
+ "C",
655
+ "O",
656
+ "CB",
657
+ "CG",
658
+ "CD",
659
+ "OE1",
660
+ "NE2",
661
+ "",
662
+ "",
663
+ "",
664
+ "",
665
+ "",
666
+ ],
667
+ "GLU": [
668
+ "N",
669
+ "CA",
670
+ "C",
671
+ "O",
672
+ "CB",
673
+ "CG",
674
+ "CD",
675
+ "OE1",
676
+ "OE2",
677
+ "",
678
+ "",
679
+ "",
680
+ "",
681
+ "",
682
+ ],
683
+ "GLY": ["N", "CA", "C", "O", "", "", "", "", "", "", "", "", "", ""],
684
+ "HIS": [
685
+ "N",
686
+ "CA",
687
+ "C",
688
+ "O",
689
+ "CB",
690
+ "CG",
691
+ "ND1",
692
+ "CD2",
693
+ "CE1",
694
+ "NE2",
695
+ "",
696
+ "",
697
+ "",
698
+ "",
699
+ ],
700
+ "ILE": [
701
+ "N",
702
+ "CA",
703
+ "C",
704
+ "O",
705
+ "CB",
706
+ "CG1",
707
+ "CG2",
708
+ "CD1",
709
+ "",
710
+ "",
711
+ "",
712
+ "",
713
+ "",
714
+ "",
715
+ ],
716
+ "LEU": [
717
+ "N",
718
+ "CA",
719
+ "C",
720
+ "O",
721
+ "CB",
722
+ "CG",
723
+ "CD1",
724
+ "CD2",
725
+ "",
726
+ "",
727
+ "",
728
+ "",
729
+ "",
730
+ "",
731
+ ],
732
+ "LYS": [
733
+ "N",
734
+ "CA",
735
+ "C",
736
+ "O",
737
+ "CB",
738
+ "CG",
739
+ "CD",
740
+ "CE",
741
+ "NZ",
742
+ "",
743
+ "",
744
+ "",
745
+ "",
746
+ "",
747
+ ],
748
+ "MET": [
749
+ "N",
750
+ "CA",
751
+ "C",
752
+ "O",
753
+ "CB",
754
+ "CG",
755
+ "SD",
756
+ "CE",
757
+ "",
758
+ "",
759
+ "",
760
+ "",
761
+ "",
762
+ "",
763
+ ],
764
+ "PHE": [
765
+ "N",
766
+ "CA",
767
+ "C",
768
+ "O",
769
+ "CB",
770
+ "CG",
771
+ "CD1",
772
+ "CD2",
773
+ "CE1",
774
+ "CE2",
775
+ "CZ",
776
+ "",
777
+ "",
778
+ "",
779
+ ],
780
+ "PRO": ["N", "CA", "C", "O", "CB", "CG", "CD", "", "", "", "", "", "", ""],
781
+ "SER": ["N", "CA", "C", "O", "CB", "OG", "", "", "", "", "", "", "", ""],
782
+ "THR": [
783
+ "N",
784
+ "CA",
785
+ "C",
786
+ "O",
787
+ "CB",
788
+ "OG1",
789
+ "CG2",
790
+ "",
791
+ "",
792
+ "",
793
+ "",
794
+ "",
795
+ "",
796
+ "",
797
+ ],
798
+ "TRP": [
799
+ "N",
800
+ "CA",
801
+ "C",
802
+ "O",
803
+ "CB",
804
+ "CG",
805
+ "CD1",
806
+ "CD2",
807
+ "CE2",
808
+ "CE3",
809
+ "NE1",
810
+ "CZ2",
811
+ "CZ3",
812
+ "CH2",
813
+ ],
814
+ "TYR": [
815
+ "N",
816
+ "CA",
817
+ "C",
818
+ "O",
819
+ "CB",
820
+ "CG",
821
+ "CD1",
822
+ "CD2",
823
+ "CE1",
824
+ "CE2",
825
+ "CZ",
826
+ "OH",
827
+ "",
828
+ "",
829
+ ],
830
+ "VAL": [
831
+ "N",
832
+ "CA",
833
+ "C",
834
+ "O",
835
+ "CB",
836
+ "CG1",
837
+ "CG2",
838
+ "",
839
+ "",
840
+ "",
841
+ "",
842
+ "",
843
+ "",
844
+ "",
845
+ ],
846
+ "UNK": ["", "", "", "", "", "", "", "", "", "", "", "", "", ""],
847
+ }
848
+ # pylint: enable=line-too-long
849
+ # pylint: enable=bad-whitespace
850
+
851
+
852
+ # This is the standard residue order when coding AA type as a number.
853
+ # Reproduce it by taking 3-letter AA codes and sorting them alphabetically.
854
+ restypes = [
855
+ "A",
856
+ "R",
857
+ "N",
858
+ "D",
859
+ "C",
860
+ "Q",
861
+ "E",
862
+ "G",
863
+ "H",
864
+ "I",
865
+ "L",
866
+ "K",
867
+ "M",
868
+ "F",
869
+ "P",
870
+ "S",
871
+ "T",
872
+ "W",
873
+ "Y",
874
+ "V",
875
+ ]
876
+ restype_order = {restype: i for i, restype in enumerate(restypes)}
877
+ restype_num = len(restypes) # := 20.
878
+ unk_restype_index = restype_num # Catch-all index for unknown restypes.
879
+
880
+ restypes_with_x = restypes + ["X"]
881
+ restype_order_with_x = {restype: i for i, restype in enumerate(restypes_with_x)}
882
+
883
+
884
def sequence_to_onehot(
    sequence: str, mapping: Mapping[str, int], map_unknown_to_x: bool = False
) -> np.ndarray:
    """Maps the given sequence into a one-hot encoded matrix.

    Args:
        sequence: An amino acid sequence.
        mapping: A dictionary mapping amino acids to integers.
        map_unknown_to_x: If True, any uppercase letter absent from the mapping
            is encoded as the unknown amino acid 'X' (which must then be a key
            of the mapping); non-letter/lowercase characters raise. If False,
            any amino acid not in the mapping raises a KeyError.

    Returns:
        A numpy int32 array of shape (seq_len, num_unique_aas) with a one-hot
        encoding of the sequence.

    Raises:
        ValueError: If the mapping's values are not exactly
            0..num_unique_aas - 1, or (with map_unknown_to_x) the sequence
            contains a non-uppercase-letter character.
    """
    num_classes = max(mapping.values()) + 1
    if sorted(set(mapping.values())) != list(range(num_classes)):
        raise ValueError(
            "The mapping must have values from 0 to num_unique_aas-1 "
            "without any gaps. Got: %s" % sorted(mapping.values())
        )

    encoding = np.zeros((len(sequence), num_classes), dtype=np.int32)
    for row, aa_type in enumerate(sequence):
        if not map_unknown_to_x:
            # Strict mode: unmapped residues raise KeyError.
            col = mapping[aa_type]
        elif aa_type.isalpha() and aa_type.isupper():
            # Fall back to the catch-all 'X' class for unmapped residues.
            col = mapping.get(aa_type, mapping["X"])
        else:
            raise ValueError(
                f"Invalid character in the sequence: {aa_type}"
            )
        encoding[row, col] = 1

    return encoding
928
+
929
+
930
# One-letter -> three-letter residue-name table for the 20 canonical amino
# acids, in the same order as `restypes`.
restype_1to3 = dict(
    zip(
        "ARNDCQEGHILKMFPSTWYV",
        (
            "ALA ARG ASN ASP CYS GLN GLU GLY HIS ILE "
            "LEU LYS MET PHE PRO SER THR TRP TYR VAL"
        ).split(),
    )
)
952
+
953
+
954
# NB: restype_3to1 differs from Bio.PDB.protein_letters_3to1 in that it is a
# strict 1-to-1 inverse of restype_1to3. The Biopython table also contains many
# rarer three-letter codes and maps several of them onto the same one-letter
# code (including 'X' and 'U', which are not used here).
restype_3to1 = {three: one for one, three in restype_1to3.items()}

# Residue name used for every unknown residue type.
unk_restype = "UNK"

# Three-letter names in `restypes` order, with UNK appended at index 20.
resnames = [restype_1to3[letter] for letter in restypes] + [unk_restype]
resname_to_idx = {name: idx for idx, name in enumerate(resnames)}
965
+
966
+
967
# hhblits-convention letter -> id mapping: B maps to D, J and O map to X,
# U maps to C, and Z maps to E; the remaining 20 amino acids keep their
# alphabetical order. Two non-amino-acid codes exist — X (any amino acid)
# and "-" (a gap in an alignment) — and are placed at the end (ids 20 and
# 21) so they are easy to ignore when desired.
HHBLITS_AA_TO_ID = {
    "A": 0, "B": 2, "C": 1, "D": 2, "E": 3, "F": 4, "G": 5,
    "H": 6, "I": 7, "J": 20, "K": 8, "L": 9, "M": 10, "N": 11,
    "O": 20, "P": 12, "Q": 13, "R": 14, "S": 15, "T": 16,
    "U": 1, "V": 17, "W": 18, "X": 20, "Y": 19, "Z": 3,
    "-": 21,
}
1003
+
1004
# Partial inversion of HHBLITS_AA_TO_ID. Ambiguous ids resolve to a single
# representative letter: 1 covers U as well as C, 2 covers B as well as D,
# 3 covers Z as well as E, and 20 ('X') also covers J and O.
ID_TO_HHBLITS_AA = {hh_id: aa for hh_id, aa in enumerate("ACDEFGHIKLMNPQRSTVWY")}
ID_TO_HHBLITS_AA[20] = "X"  # Includes J and O.
ID_TO_HHBLITS_AA[21] = "-"
1029
+
1030
# Alphabet extended with the catch-all 'X' and the alignment gap '-'.
restypes_with_x_and_gap = restypes + ["X", "-"]
# For each hhblits id, the index of the corresponding letter in our alphabet.
MAP_HHBLITS_AATYPE_TO_OUR_AATYPE = tuple(
    restypes_with_x_and_gap.index(ID_TO_HHBLITS_AA[hh_id])
    for hh_id in range(len(restypes_with_x_and_gap))
)
1035
+
1036
+
1037
def _make_standard_atom_mask() -> np.ndarray:
    """Build the [num_res_types, num_atom_types] standard-atom mask.

    Entry [r, a] is 1 when residue type r contains atom type a. The extra
    final row (index restype_num) is the all-zero unknown residue.
    """
    mask = np.zeros([restype_num + 1, atom_type_num], dtype=np.int32)
    for res_idx, one_letter in enumerate(restypes):
        for atom_name in residue_atoms[restype_1to3[one_letter]]:
            mask[res_idx, atom_order[atom_name]] = 1
    return mask


STANDARD_ATOM_MASK = _make_standard_atom_mask()
1051
+
1052
+
1053
# A one-hot representation for the first and second atoms defining the axis
# of rotation for each chi-angle in each residue.
def chi_angle_atom(atom_index: int) -> np.ndarray:
    """Define chi-angle rigid groups via one-hot representations.

    Returns an array of shape [restype_num + 1, atom_type_num, 4]; the last
    entry along axis 0 is all zeros for residue `X`. Chi groups a residue
    lacks are padded with index -1 (i.e. the last atom-type row).
    """
    chi_indices_by_resname = {}
    for resname, chi_groups in chi_angles_atoms.items():
        indices = [atom_types.index(group[atom_index]) for group in chi_groups]
        indices.extend([-1] * (4 - len(indices)))
        chi_indices_by_resname[resname] = indices

    one_hots = [
        np.eye(atom_type_num)[chi_indices_by_resname[restype_1to3[letter]]]
        for letter in restypes
    ]
    one_hots.append(np.zeros([4, atom_type_num]))  # Add zeros for residue `X`.

    stacked = np.stack(one_hots, axis=0)
    return np.transpose(stacked, [0, 2, 1])
1075
+
1076
+
1077
chi_atom_1_one_hot = chi_angle_atom(1)
chi_atom_2_one_hot = chi_angle_atom(2)

# Same content as chi_angles_atoms, but with atom indices instead of names
# and every residue padded out to 4 chi groups with [0, 0, 0, 0].
chi_angles_atom_indices = [chi_angles_atoms[restype_1to3[letter]] for letter in restypes]
chi_angles_atom_indices = tree.map_structure(
    lambda atom_name: atom_order[atom_name], chi_angles_atom_indices
)
chi_angles_atom_indices = np.array(
    [
        chis + [[0, 0, 0, 0]] * (4 - len(chis))
        for chis in chi_angles_atom_indices
    ]
)

# Maps each (res_name, atom_name) pair to the list of (chi group index,
# atom index within that group) positions where the atom appears.
chi_groups_for_atom = collections.defaultdict(list)
for resname, chi_groups in chi_angles_atoms.items():
    for group_idx, group in enumerate(chi_groups):
        for pos_in_group, atom_name in enumerate(group):
            chi_groups_for_atom[(resname, atom_name)].append((group_idx, pos_in_group))
chi_groups_for_atom = dict(chi_groups_for_atom)
1100
+
1101
+
1102
+ def _make_rigid_transformation_4x4(ex, ey, translation):
1103
+ """Create a rigid 4x4 transformation matrix from two axes and transl."""
1104
+ # Normalize ex.
1105
+ ex_normalized = ex / np.linalg.norm(ex)
1106
+
1107
+ # make ey perpendicular to ex
1108
+ ey_normalized = ey - np.dot(ey, ex_normalized) * ex_normalized
1109
+ ey_normalized /= np.linalg.norm(ey_normalized)
1110
+
1111
+ # compute ez as cross product
1112
+ eznorm = np.cross(ex_normalized, ey_normalized)
1113
+ m = np.stack(
1114
+ [ex_normalized, ey_normalized, eznorm, translation]
1115
+ ).transpose()
1116
+ m = np.concatenate([m, [[0.0, 0.0, 0.0, 1.0]]], axis=0)
1117
+ return m
1118
+
1119
+
1120
# create an array with (restype, atomtype) --> rigid_group_idx
# and an array with (restype, atomtype, coord) for the atom positions
# and compute affine transformation matrices (4,4) from one rigid group to the
# previous group
#
# NOTE: the original code used `dtype=np.int`, an alias for the builtin `int`
# that was deprecated in NumPy 1.20 and removed in NumPy 1.24 — it raises
# AttributeError on modern NumPy. Using `int` directly is the exact
# behavior-preserving replacement.
restype_atom37_to_rigid_group = np.zeros([21, 37], dtype=int)
restype_atom37_mask = np.zeros([21, 37], dtype=np.float32)
restype_atom37_rigid_group_positions = np.zeros([21, 37, 3], dtype=np.float32)
restype_atom14_to_rigid_group = np.zeros([21, 14], dtype=int)
restype_atom14_mask = np.zeros([21, 14], dtype=np.float32)
restype_atom14_rigid_group_positions = np.zeros([21, 14, 3], dtype=np.float32)
restype_rigid_group_default_frame = np.zeros([21, 8, 4, 4], dtype=np.float32)
1131
+
1132
+
1133
def _make_rigid_group_constants():
    """Fill the module-level rigid-group lookup tables declared above.

    Populates, in place:
      - restype_atom37_to_rigid_group / restype_atom14_to_rigid_group
      - restype_atom37_mask / restype_atom14_mask
      - restype_atom37_rigid_group_positions /
        restype_atom14_rigid_group_positions
      - restype_rigid_group_default_frame

    All data comes from `rigid_group_atom_positions`, `chi_angles_atoms` and
    `chi_angles_mask`; row 20 (UNK) is left as zeros.
    """
    # First pass: per-atom group indices, existence masks, and literature
    # positions in both the atom37 and atom14 layouts.
    for restype, restype_letter in enumerate(restypes):
        resname = restype_1to3[restype_letter]
        for atomname, group_idx, atom_position in rigid_group_atom_positions[
            resname
        ]:
            atomtype = atom_order[atomname]
            restype_atom37_to_rigid_group[restype, atomtype] = group_idx
            restype_atom37_mask[restype, atomtype] = 1
            restype_atom37_rigid_group_positions[
                restype, atomtype, :
            ] = atom_position

            atom14idx = restype_name_to_atom14_names[resname].index(atomname)
            restype_atom14_to_rigid_group[restype, atom14idx] = group_idx
            restype_atom14_mask[restype, atom14idx] = 1
            restype_atom14_rigid_group_positions[
                restype, atom14idx, :
            ] = atom_position

    # Second pass: default 4x4 frames for each of the 8 rigid groups.
    for restype, restype_letter in enumerate(restypes):
        resname = restype_1to3[restype_letter]
        atom_positions = {
            name: np.array(pos)
            for name, _, pos in rigid_group_atom_positions[resname]
        }

        # backbone to backbone is the identity transform
        restype_rigid_group_default_frame[restype, 0, :, :] = np.eye(4)

        # pre-omega-frame to backbone (currently dummy identity matrix)
        restype_rigid_group_default_frame[restype, 1, :, :] = np.eye(4)

        # phi-frame to backbone
        mat = _make_rigid_transformation_4x4(
            ex=atom_positions["N"] - atom_positions["CA"],
            ey=np.array([1.0, 0.0, 0.0]),
            translation=atom_positions["N"],
        )
        restype_rigid_group_default_frame[restype, 2, :, :] = mat

        # psi-frame to backbone
        mat = _make_rigid_transformation_4x4(
            ex=atom_positions["C"] - atom_positions["CA"],
            ey=atom_positions["CA"] - atom_positions["N"],
            translation=atom_positions["C"],
        )
        restype_rigid_group_default_frame[restype, 3, :, :] = mat

        # chi1-frame to backbone (only for residues that have a chi1 angle)
        if chi_angles_mask[restype][0]:
            base_atom_names = chi_angles_atoms[resname][0]
            base_atom_positions = [
                atom_positions[name] for name in base_atom_names
            ]
            mat = _make_rigid_transformation_4x4(
                ex=base_atom_positions[2] - base_atom_positions[1],
                ey=base_atom_positions[0] - base_atom_positions[1],
                translation=base_atom_positions[2],
            )
            restype_rigid_group_default_frame[restype, 4, :, :] = mat

        # chi2-frame to chi1-frame
        # chi3-frame to chi2-frame
        # chi4-frame to chi3-frame
        # luckily all rotation axes for the next frame start at (0,0,0) of the
        # previous frame
        for chi_idx in range(1, 4):
            if chi_angles_mask[restype][chi_idx]:
                # The third atom of the chi group is the far end of the
                # rotation axis, expressed in the previous chi frame.
                axis_end_atom_name = chi_angles_atoms[resname][chi_idx][2]
                axis_end_atom_position = atom_positions[axis_end_atom_name]
                mat = _make_rigid_transformation_4x4(
                    ex=axis_end_atom_position,
                    ey=np.array([-1.0, 0.0, 0.0]),
                    translation=axis_end_atom_position,
                )
                restype_rigid_group_default_frame[
                    restype, 4 + chi_idx, :, :
                ] = mat


_make_rigid_group_constants()
1216
+
1217
+
1218
def make_atom14_dists_bounds(
    overlap_tolerance=1.5, bond_length_tolerance_factor=15
):
    """Compute per-residue lower/upper distance bounds between atom14 atoms.

    Args:
        overlap_tolerance: Allowed van der Waals overlap (Angstrom) before a
            non-bonded pair counts as a clash.
        bond_length_tolerance_factor: Number of standard deviations around the
            literature bond length that is still considered valid.

    Returns:
        Dict with "lower_bound", "upper_bound" and "stddev" arrays, each of
        shape (21, 14, 14).
    """
    lower_bounds = np.zeros([21, 14, 14], np.float32)
    upper_bounds = np.zeros([21, 14, 14], np.float32)
    stddevs = np.zeros([21, 14, 14], np.float32)
    residue_bonds, residue_virtual_bonds, _ = load_stereo_chemical_props()

    for res_idx, one_letter in enumerate(restypes):
        resname = restype_1to3[one_letter]
        atom_list = restype_name_to_atom14_names[resname]

        # Clash bounds: any two atoms must stay at least the sum of their
        # van der Waals radii (minus the tolerance) apart; no upper limit.
        for i, name_i in enumerate(atom_list):
            if not name_i:
                continue
            radius_i = van_der_waals_radius[name_i[0]]
            for j, name_j in enumerate(atom_list):
                if (not name_j) or i == j:
                    continue
                radius_j = van_der_waals_radius[name_j[0]]
                clash_lower = radius_i + radius_j - overlap_tolerance
                lower_bounds[res_idx, i, j] = clash_lower
                lower_bounds[res_idx, j, i] = clash_lower
                upper_bounds[res_idx, i, j] = 1e10
                upper_bounds[res_idx, j, i] = 1e10

        # Bonded (and angle-constrained "virtual bond") pairs overwrite the
        # clash bounds with a tight, symmetric window around the bond length.
        for bond in residue_bonds[resname] + residue_virtual_bonds[resname]:
            i = atom_list.index(bond.atom1_name)
            j = atom_list.index(bond.atom2_name)
            slack = bond_length_tolerance_factor * bond.stddev
            lower_bounds[res_idx, i, j] = bond.length - slack
            lower_bounds[res_idx, j, i] = bond.length - slack
            upper_bounds[res_idx, i, j] = bond.length + slack
            upper_bounds[res_idx, j, i] = bond.length + slack
            stddevs[res_idx, i, j] = bond.stddev
            stddevs[res_idx, j, i] = bond.stddev

    return {
        "lower_bound": lower_bounds,  # shape (21,14,14)
        "upper_bound": upper_bounds,  # shape (21,14,14)
        "stddev": stddevs,  # shape (21,14,14)
    }
1279
+
1280
+
1281
# Per-(restype, atom14 slot) flag for symmetry-ambiguous atoms, and the index
# each slot swaps with; both are filled in by _make_atom14_ambiguity_feats().
# By default no atom is ambiguous and every slot swaps with itself.
restype_atom14_ambiguous_atoms = np.zeros((21, 14), dtype=np.float32)
# NOTE: `np.int` was removed in NumPy 1.24; the builtin `int` is the exact
# alias it referred to, so this is behavior-preserving.
restype_atom14_ambiguous_atoms_swap_idx = np.tile(
    np.arange(14, dtype=int), (21, 1)
)
1285
+
1286
+
1287
def _make_atom14_ambiguity_feats():
    """Fill the ambiguity tables above from `residue_atom_renaming_swaps`.

    For every residue with swappable atom-name pairs, marks both atoms as
    ambiguous in `restype_atom14_ambiguous_atoms` and records each atom's
    swap partner in `restype_atom14_ambiguous_atoms_swap_idx` (both
    directions). Mutates the two module-level arrays in place.
    """
    for res, pairs in residue_atom_renaming_swaps.items():
        res_idx = restype_order[restype_3to1[res]]
        for atom1, atom2 in pairs.items():
            # Positions of the pair within the residue's atom14 layout.
            atom1_idx = restype_name_to_atom14_names[res].index(atom1)
            atom2_idx = restype_name_to_atom14_names[res].index(atom2)
            restype_atom14_ambiguous_atoms[res_idx, atom1_idx] = 1
            restype_atom14_ambiguous_atoms[res_idx, atom2_idx] = 1
            restype_atom14_ambiguous_atoms_swap_idx[
                res_idx, atom1_idx
            ] = atom2_idx
            restype_atom14_ambiguous_atoms_swap_idx[
                res_idx, atom2_idx
            ] = atom1_idx


_make_atom14_ambiguity_feats()
1304
+
1305
+
1306
def aatype_to_str_sequence(aatype):
    """Convert a sequence of residue-type indices into a one-letter string."""
    return ''.join(restypes_with_x[idx] for idx in aatype)
openfold/resources/__init__.py ADDED
File without changes
openfold/utils/feats.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import math
17
+
18
+ import numpy as np
19
+ import torch
20
+ import torch.nn as nn
21
+ from typing import Dict
22
+
23
+ from openfold.np import protein
24
+ import openfold.np.residue_constants as rc
25
+ from openfold.utils.rigid_utils import Rotation, Rigid
26
+ from openfold.utils.tensor_utils import (
27
+ batched_gather,
28
+ one_hot,
29
+ tree_map,
30
+ tensor_tree_map,
31
+ )
32
+
33
+
34
+ def pseudo_beta_fn(aatype, all_atom_positions, all_atom_masks):
35
+ is_gly = aatype == rc.restype_order["G"]
36
+ ca_idx = rc.atom_order["CA"]
37
+ cb_idx = rc.atom_order["CB"]
38
+ pseudo_beta = torch.where(
39
+ is_gly[..., None].expand(*((-1,) * len(is_gly.shape)), 3),
40
+ all_atom_positions[..., ca_idx, :],
41
+ all_atom_positions[..., cb_idx, :],
42
+ )
43
+
44
+ if all_atom_masks is not None:
45
+ pseudo_beta_mask = torch.where(
46
+ is_gly,
47
+ all_atom_masks[..., ca_idx],
48
+ all_atom_masks[..., cb_idx],
49
+ )
50
+ return pseudo_beta, pseudo_beta_mask
51
+ else:
52
+ return pseudo_beta
53
+
54
+
55
+ def atom14_to_atom37(atom14, batch):
56
+ atom37_data = batched_gather(
57
+ atom14,
58
+ batch["residx_atom37_to_atom14"],
59
+ dim=-2,
60
+ no_batch_dims=len(atom14.shape[:-2]),
61
+ )
62
+
63
+ atom37_data = atom37_data * batch["atom37_atom_exists"][..., None]
64
+
65
+ return atom37_data
66
+
67
+
68
+ def build_template_angle_feat(template_feats):
69
+ template_aatype = template_feats["template_aatype"]
70
+ torsion_angles_sin_cos = template_feats["template_torsion_angles_sin_cos"]
71
+ alt_torsion_angles_sin_cos = template_feats[
72
+ "template_alt_torsion_angles_sin_cos"
73
+ ]
74
+ torsion_angles_mask = template_feats["template_torsion_angles_mask"]
75
+ template_angle_feat = torch.cat(
76
+ [
77
+ nn.functional.one_hot(template_aatype, 22),
78
+ torsion_angles_sin_cos.reshape(
79
+ *torsion_angles_sin_cos.shape[:-2], 14
80
+ ),
81
+ alt_torsion_angles_sin_cos.reshape(
82
+ *alt_torsion_angles_sin_cos.shape[:-2], 14
83
+ ),
84
+ torsion_angles_mask,
85
+ ],
86
+ dim=-1,
87
+ )
88
+
89
+ return template_angle_feat
90
+
91
+
92
+ def build_template_pair_feat(
93
+ batch,
94
+ min_bin, max_bin, no_bins,
95
+ use_unit_vector=False,
96
+ eps=1e-20, inf=1e8
97
+ ):
98
+ template_mask = batch["template_pseudo_beta_mask"]
99
+ template_mask_2d = template_mask[..., None] * template_mask[..., None, :]
100
+
101
+ # Compute distogram (this seems to differ slightly from Alg. 5)
102
+ tpb = batch["template_pseudo_beta"]
103
+ dgram = torch.sum(
104
+ (tpb[..., None, :] - tpb[..., None, :, :]) ** 2, dim=-1, keepdim=True
105
+ )
106
+ lower = torch.linspace(min_bin, max_bin, no_bins, device=tpb.device) ** 2
107
+ upper = torch.cat([lower[1:], lower.new_tensor([inf])], dim=-1)
108
+ dgram = ((dgram > lower) * (dgram < upper)).type(dgram.dtype)
109
+
110
+ to_concat = [dgram, template_mask_2d[..., None]]
111
+
112
+ aatype_one_hot = nn.functional.one_hot(
113
+ batch["template_aatype"],
114
+ rc.restype_num + 2,
115
+ )
116
+
117
+ n_res = batch["template_aatype"].shape[-1]
118
+ to_concat.append(
119
+ aatype_one_hot[..., None, :, :].expand(
120
+ *aatype_one_hot.shape[:-2], n_res, -1, -1
121
+ )
122
+ )
123
+ to_concat.append(
124
+ aatype_one_hot[..., None, :].expand(
125
+ *aatype_one_hot.shape[:-2], -1, n_res, -1
126
+ )
127
+ )
128
+
129
+ n, ca, c = [rc.atom_order[a] for a in ["N", "CA", "C"]]
130
+ rigids = Rigid.make_transform_from_reference(
131
+ n_xyz=batch["template_all_atom_positions"][..., n, :],
132
+ ca_xyz=batch["template_all_atom_positions"][..., ca, :],
133
+ c_xyz=batch["template_all_atom_positions"][..., c, :],
134
+ eps=eps,
135
+ )
136
+ points = rigids.get_trans()[..., None, :, :]
137
+ rigid_vec = rigids[..., None].invert_apply(points)
138
+
139
+ inv_distance_scalar = torch.rsqrt(eps + torch.sum(rigid_vec ** 2, dim=-1))
140
+
141
+ t_aa_masks = batch["template_all_atom_mask"]
142
+ template_mask = (
143
+ t_aa_masks[..., n] * t_aa_masks[..., ca] * t_aa_masks[..., c]
144
+ )
145
+ template_mask_2d = template_mask[..., None] * template_mask[..., None, :]
146
+
147
+ inv_distance_scalar = inv_distance_scalar * template_mask_2d
148
+ unit_vector = rigid_vec * inv_distance_scalar[..., None]
149
+
150
+ if(not use_unit_vector):
151
+ unit_vector = unit_vector * 0.
152
+
153
+ to_concat.extend(torch.unbind(unit_vector[..., None, :], dim=-1))
154
+ to_concat.append(template_mask_2d[..., None])
155
+
156
+ act = torch.cat(to_concat, dim=-1)
157
+ act = act * template_mask_2d[..., None]
158
+
159
+ return act
160
+
161
+
162
+ def build_extra_msa_feat(batch):
163
+ msa_1hot = nn.functional.one_hot(batch["extra_msa"], 23)
164
+ msa_feat = [
165
+ msa_1hot,
166
+ batch["extra_has_deletion"].unsqueeze(-1),
167
+ batch["extra_deletion_value"].unsqueeze(-1),
168
+ ]
169
+ return torch.cat(msa_feat, dim=-1)
170
+
171
+
172
+ def torsion_angles_to_frames(
173
+ r: Rigid,
174
+ alpha: torch.Tensor,
175
+ aatype: torch.Tensor,
176
+ rrgdf: torch.Tensor,
177
+ ):
178
+ # [*, N, 8, 4, 4]
179
+ default_4x4 = rrgdf[aatype, ...]
180
+
181
+ # [*, N, 8] transformations, i.e.
182
+ # One [*, N, 8, 3, 3] rotation matrix and
183
+ # One [*, N, 8, 3] translation matrix
184
+ default_r = r.from_tensor_4x4(default_4x4)
185
+
186
+ bb_rot = alpha.new_zeros((*((1,) * len(alpha.shape[:-1])), 2))
187
+ bb_rot[..., 1] = 1
188
+
189
+ # [*, N, 8, 2]
190
+ alpha = torch.cat(
191
+ [bb_rot.expand(*alpha.shape[:-2], -1, -1), alpha], dim=-2
192
+ )
193
+
194
+ # [*, N, 8, 3, 3]
195
+ # Produces rotation matrices of the form:
196
+ # [
197
+ # [1, 0 , 0 ],
198
+ # [0, a_2,-a_1],
199
+ # [0, a_1, a_2]
200
+ # ]
201
+ # This follows the original code rather than the supplement, which uses
202
+ # different indices.
203
+
204
+ all_rots = alpha.new_zeros(default_r.get_rots().get_rot_mats().shape)
205
+ all_rots[..., 0, 0] = 1
206
+ all_rots[..., 1, 1] = alpha[..., 1]
207
+ all_rots[..., 1, 2] = -alpha[..., 0]
208
+ all_rots[..., 2, 1:] = alpha
209
+
210
+ all_rots = Rigid(Rotation(rot_mats=all_rots), None)
211
+
212
+ all_frames = default_r.compose(all_rots)
213
+
214
+ chi2_frame_to_frame = all_frames[..., 5]
215
+ chi3_frame_to_frame = all_frames[..., 6]
216
+ chi4_frame_to_frame = all_frames[..., 7]
217
+
218
+ chi1_frame_to_bb = all_frames[..., 4]
219
+ chi2_frame_to_bb = chi1_frame_to_bb.compose(chi2_frame_to_frame)
220
+ chi3_frame_to_bb = chi2_frame_to_bb.compose(chi3_frame_to_frame)
221
+ chi4_frame_to_bb = chi3_frame_to_bb.compose(chi4_frame_to_frame)
222
+
223
+ all_frames_to_bb = Rigid.cat(
224
+ [
225
+ all_frames[..., :5],
226
+ chi2_frame_to_bb.unsqueeze(-1),
227
+ chi3_frame_to_bb.unsqueeze(-1),
228
+ chi4_frame_to_bb.unsqueeze(-1),
229
+ ],
230
+ dim=-1,
231
+ )
232
+
233
+ all_frames_to_global = r[..., None].compose(all_frames_to_bb)
234
+
235
+ return all_frames_to_global
236
+
237
+
238
+ def frames_and_literature_positions_to_atom14_pos(
239
+ r: Rigid,
240
+ aatype: torch.Tensor,
241
+ default_frames,
242
+ group_idx,
243
+ atom_mask,
244
+ lit_positions,
245
+ ):
246
+ # [*, N, 14, 4, 4]
247
+ default_4x4 = default_frames[aatype, ...]
248
+
249
+ # [*, N, 14]
250
+ group_mask = group_idx[aatype, ...]
251
+
252
+ # [*, N, 14, 8]
253
+ group_mask = nn.functional.one_hot(
254
+ group_mask,
255
+ num_classes=default_frames.shape[-3],
256
+ )
257
+
258
+ # [*, N, 14, 8]
259
+ t_atoms_to_global = r[..., None, :] * group_mask
260
+
261
+ # [*, N, 14]
262
+ t_atoms_to_global = t_atoms_to_global.map_tensor_fn(
263
+ lambda x: torch.sum(x, dim=-1)
264
+ )
265
+
266
+ # [*, N, 14, 1]
267
+ atom_mask = atom_mask[aatype, ...].unsqueeze(-1)
268
+
269
+ # [*, N, 14, 3]
270
+ lit_positions = lit_positions[aatype, ...]
271
+ pred_positions = t_atoms_to_global.apply(lit_positions)
272
+ pred_positions = pred_positions * atom_mask
273
+
274
+ return pred_positions
openfold/utils/loss.py ADDED
@@ -0,0 +1,1614 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from functools import partial
17
+ import logging
18
+ import ml_collections
19
+ import numpy as np
20
+ import torch
21
+ import torch.nn as nn
22
+ from torch.distributions.bernoulli import Bernoulli
23
+ from typing import Dict, Optional, Tuple
24
+
25
+ from openfold.np import residue_constants
26
+ from openfold.utils import feats
27
+ from openfold.utils.rigid_utils import Rotation, Rigid
28
+ from openfold.utils.tensor_utils import (
29
+ tree_map,
30
+ tensor_tree_map,
31
+ masked_mean,
32
+ permute_final_dims,
33
+ batched_gather,
34
+ )
35
+
36
+
37
def softmax_cross_entropy(logits, labels):
    """Cross entropy between soft `labels` and `logits`, summed over the last dim."""
    log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
    return -torch.sum(labels * log_probs, dim=-1)
43
+
44
+
45
def sigmoid_cross_entropy(logits, labels):
    """Elementwise binary cross entropy on raw logits.

    Computed in float64 for numerical stability, then cast back to the input
    dtype. Uses logsigmoid rather than log(sigmoid(x)) to avoid underflow.
    """
    out_dtype = logits.dtype
    logits64 = logits.double()
    labels64 = labels.double()
    log_p = torch.nn.functional.logsigmoid(logits64)
    log_not_p = torch.nn.functional.logsigmoid(-logits64)
    loss = -labels64 * log_p - (1.0 - labels64) * log_not_p
    return loss.to(dtype=out_dtype)
56
+
57
+
58
def torsion_angle_loss(
    a,  # [*, N, 7, 2]
    a_gt,  # [*, N, 7, 2]
    a_alt_gt,  # [*, N, 7, 2]
):
    """Torsion-angle loss: squared chord distance to the closer of the two
    ground truths (handling pi-symmetric angles), plus a small penalty that
    keeps the raw sin/cos predictions near unit norm.
    """
    # [*, N, 7]
    norm = torch.norm(a, dim=-1)

    # Project the raw predictions onto the unit circle before comparing.
    a_unit = a / norm.unsqueeze(-1)

    # [*, N, 7] — take the smaller squared distance of the two labelings.
    sq_diff_gt = torch.norm(a_unit - a_gt, dim=-1) ** 2
    sq_diff_alt_gt = torch.norm(a_unit - a_alt_gt, dim=-1) ** 2
    min_diff = torch.minimum(sq_diff_gt, sq_diff_alt_gt)

    # [*]
    l_torsion = torch.mean(min_diff, dim=(-1, -2))
    l_angle_norm = torch.mean(torch.abs(norm - 1), dim=(-1, -2))

    an_weight = 0.02
    return l_torsion + an_weight * l_angle_norm
80
+
81
+
82
def compute_fape(
    pred_frames: Rigid,
    target_frames: Rigid,
    frames_mask: torch.Tensor,
    pred_positions: torch.Tensor,
    target_positions: torch.Tensor,
    positions_mask: torch.Tensor,
    length_scale: float,
    l1_clamp_distance: Optional[float] = None,
    eps=1e-8,
) -> torch.Tensor:
    """
    Computes FAPE loss.

    Args:
        pred_frames:
            [*, N_frames] Rigid object of predicted frames
        target_frames:
            [*, N_frames] Rigid object of ground truth frames
        frames_mask:
            [*, N_frames] binary mask for the frames
        pred_positions:
            [*, N_pts, 3] predicted atom positions
        target_positions:
            [*, N_pts, 3] ground truth positions
        positions_mask:
            [*, N_pts] positions mask
        length_scale:
            Length scale by which the loss is divided
        l1_clamp_distance:
            Cutoff above which distance errors are disregarded
        eps:
            Small value used to regularize denominators
    Returns:
        [*] loss tensor
    """
    # Express both point sets in every frame's local coordinates.
    # [*, N_frames, N_pts, 3]
    local_pred_pos = pred_frames.invert()[..., None].apply(
        pred_positions[..., None, :, :],
    )
    local_target_pos = target_frames.invert()[..., None].apply(
        target_positions[..., None, :, :],
    )

    error_dist = torch.sqrt(
        torch.sum((local_pred_pos - local_target_pos) ** 2, dim=-1) + eps
    )

    if l1_clamp_distance is not None:
        error_dist = torch.clamp(error_dist, min=0, max=l1_clamp_distance)

    normed_error = error_dist / length_scale
    normed_error = normed_error * frames_mask[..., None]
    normed_error = normed_error * positions_mask[..., None, :]

    # FP16-friendly averaging. Roughly equivalent to:
    #
    # norm_factor = (
    #     torch.sum(frames_mask, dim=-1) *
    #     torch.sum(positions_mask, dim=-1)
    # )
    # normed_error = torch.sum(normed_error, dim=(-1, -2)) / (eps + norm_factor)
    #
    # ("roughly" because eps is necessarily duplicated in the latter)
    normed_error = torch.sum(normed_error, dim=-1)
    normed_error = (
        normed_error / (eps + torch.sum(frames_mask, dim=-1))[..., None]
    )
    normed_error = torch.sum(normed_error, dim=-1)
    normed_error = normed_error / (eps + torch.sum(positions_mask, dim=-1))

    return normed_error
154
+
155
+
156
def backbone_loss(
    backbone_rigid_tensor: torch.Tensor,
    backbone_rigid_mask: torch.Tensor,
    traj: torch.Tensor,
    use_clamped_fape: Optional[torch.Tensor] = None,
    clamp_distance: float = 10.0,
    loss_unit_distance: float = 10.0,
    eps: float = 1e-4,
    **kwargs,
) -> torch.Tensor:
    """Backbone FAPE loss over a trajectory of predicted frames.

    Args:
        backbone_rigid_tensor: Ground-truth backbone frames as 4x4 tensors.
        backbone_rigid_mask: Binary mask over the backbone frames.
        traj: Predicted frame trajectory in tensor_7 format; the leading
            dimension (trajectory steps) broadcasts against the [None]-expanded
            ground truth.
        use_clamped_fape: Optional blend weight between clamped and unclamped
            FAPE (1 = fully clamped).
        clamp_distance: l1 clamp cutoff passed to compute_fape.
        loss_unit_distance: Length scale dividing the error.
        eps: Denominator regularizer.

    Returns:
        Scalar loss averaged over the batch dimension.
    """
    pred_aff = Rigid.from_tensor_7(traj)
    # Rebuild from rotation matrices so downstream ops use the matrix form.
    pred_aff = Rigid(
        Rotation(rot_mats=pred_aff.get_rots().get_rot_mats(), quats=None),
        pred_aff.get_trans(),
    )

    # DISCREPANCY: DeepMind somehow gets a hold of a tensor_7 version of
    # backbone tensor, normalizes it, and then turns it back to a rotation
    # matrix. To avoid a potentially numerically unstable rotation matrix
    # to quaternion conversion, we just use the original rotation matrix
    # outright. This one hasn't been composed a bunch of times, though, so
    # it might be fine.
    gt_aff = Rigid.from_tensor_4x4(backbone_rigid_tensor)

    fape_loss = compute_fape(
        pred_aff,
        gt_aff[None],
        backbone_rigid_mask[None],
        pred_aff.get_trans(),
        gt_aff[None].get_trans(),
        backbone_rigid_mask[None],
        l1_clamp_distance=clamp_distance,
        length_scale=loss_unit_distance,
        eps=eps,
    )
    if use_clamped_fape is not None:
        # Linear blend between the clamped and unclamped losses.
        unclamped_fape_loss = compute_fape(
            pred_aff,
            gt_aff[None],
            backbone_rigid_mask[None],
            pred_aff.get_trans(),
            gt_aff[None].get_trans(),
            backbone_rigid_mask[None],
            l1_clamp_distance=None,
            length_scale=loss_unit_distance,
            eps=eps,
        )

        fape_loss = fape_loss * use_clamped_fape + unclamped_fape_loss * (
            1 - use_clamped_fape
        )

    # Average over the batch dimension
    fape_loss = torch.mean(fape_loss)

    return fape_loss
212
+
213
+
214
def sidechain_loss(
    sidechain_frames: torch.Tensor,
    sidechain_atom_pos: torch.Tensor,
    rigidgroups_gt_frames: torch.Tensor,
    rigidgroups_alt_gt_frames: torch.Tensor,
    rigidgroups_gt_exists: torch.Tensor,
    renamed_atom14_gt_positions: torch.Tensor,
    renamed_atom14_gt_exists: torch.Tensor,
    alt_naming_is_better: torch.Tensor,
    clamp_distance: float = 10.0,
    length_scale: float = 10.0,
    eps: float = 1e-4,
    **kwargs,
) -> torch.Tensor:
    """Sidechain FAPE loss over rigid-group frames and atom14 positions.

    Only the final structure-module iteration (index -1 of the leading
    dimension of `sidechain_frames` / `sidechain_atom_pos`) is supervised.

    Returns:
        Per-example FAPE value (no batch averaging here).
    """
    # Select, per residue, the ground-truth frames under whichever atom
    # naming (original vs. alternative) matches the prediction better.
    alt = alt_naming_is_better[..., None, None, None]
    renamed_gt_frames = (
        (1.0 - alt) * rigidgroups_gt_frames + alt * rigidgroups_alt_gt_frames
    )

    # Steamroll the inputs: collapse residue and rigid-group/atom axes
    # into a single "frames"/"points" axis for compute_fape.
    last_frames = sidechain_frames[-1]
    batch_dims = last_frames.shape[:-4]
    pred_frames = Rigid.from_tensor_4x4(
        last_frames.view(*batch_dims, -1, 4, 4)
    )
    gt_frames = Rigid.from_tensor_4x4(
        renamed_gt_frames.view(*batch_dims, -1, 4, 4)
    )
    frames_mask = rigidgroups_gt_exists.reshape(*batch_dims, -1)
    pred_positions = sidechain_atom_pos[-1].view(*batch_dims, -1, 3)
    gt_positions = renamed_atom14_gt_positions.view(*batch_dims, -1, 3)
    positions_mask = renamed_atom14_gt_exists.view(*batch_dims, -1)

    return compute_fape(
        pred_frames,
        gt_frames,
        frames_mask,
        pred_positions,
        gt_positions,
        positions_mask,
        l1_clamp_distance=clamp_distance,
        length_scale=length_scale,
        eps=eps,
    )
262
+
263
+
264
def fape_loss(
    out: Dict[str, torch.Tensor],
    batch: Dict[str, torch.Tensor],
    config: ml_collections.ConfigDict,
) -> torch.Tensor:
    """Combined FAPE loss: weighted backbone + sidechain terms.

    Args:
        out: model output dict; reads out["sm"]["frames"],
            out["sm"]["sidechain_frames"], out["sm"]["positions"]
        batch: feature dict forwarded to the two sub-losses
        config: config with `backbone` and `sidechain` sections, each
            carrying a `weight` plus the sub-loss hyperparameters
    Returns:
        Scalar loss averaged over the batch dimension
    """
    sm = out["sm"]

    bb_term = backbone_loss(
        traj=sm["frames"],
        **{**batch, **config.backbone},
    )
    sc_term = sidechain_loss(
        sm["sidechain_frames"],
        sm["positions"],
        **{**batch, **config.sidechain},
    )

    total = config.backbone.weight * bb_term + config.sidechain.weight * sc_term

    # Average over the batch dimension
    return torch.mean(total)
286
+
287
+
288
def supervised_chi_loss(
    angles_sin_cos: torch.Tensor,
    unnormalized_angles_sin_cos: torch.Tensor,
    aatype: torch.Tensor,
    seq_mask: torch.Tensor,
    chi_mask: torch.Tensor,
    chi_angles_sin_cos: torch.Tensor,
    chi_weight: float,
    angle_norm_weight: float,
    eps=1e-6,
    **kwargs,
) -> torch.Tensor:
    """
    Implements Algorithm 27 (torsionAngleLoss)

    Args:
        angles_sin_cos:
            [*, N, 7, 2] predicted angles
        unnormalized_angles_sin_cos:
            The same angles, but unnormalized
        aatype:
            [*, N] residue indices
        seq_mask:
            [*, N] sequence mask
        chi_mask:
            [*, N, 7] angle mask
        chi_angles_sin_cos:
            [*, N, 7, 2] ground truth angles
        chi_weight:
            Weight for the angle component of the loss
        angle_norm_weight:
            Weight for the normalization component of the loss
        eps:
            Small constant stabilizing the sqrt in the norm term
    Returns:
        [*] loss tensor
    """
    # The last 4 of the 7 torsions are the chi angles being supervised.
    pred_angles = angles_sin_cos[..., 3:, :]
    residue_type_one_hot = torch.nn.functional.one_hot(
        aatype,
        residue_constants.restype_num + 1,
    )
    # Per-residue lookup of which chi angles are pi-periodic.
    chi_pi_periodic = torch.einsum(
        "...ij,jk->ik",
        residue_type_one_hot.type(angles_sin_cos.dtype),
        angles_sin_cos.new_tensor(residue_constants.chi_pi_periodic),
    )

    true_chi = chi_angles_sin_cos[None]

    # For pi-periodic chis the target shifted by pi (sin/cos negated) is
    # equally valid; score against both and keep the smaller error.
    shifted_mask = (1 - 2 * chi_pi_periodic).unsqueeze(-1)
    true_chi_shifted = shifted_mask * true_chi
    sq_chi_error = torch.sum((true_chi - pred_angles) ** 2, dim=-1)
    sq_chi_error_shifted = torch.sum(
        (true_chi_shifted - pred_angles) ** 2, dim=-1
    )
    sq_chi_error = torch.minimum(sq_chi_error, sq_chi_error_shifted)

    # The ol' switcheroo: move the leading dimension (added by the
    # true_chi[None] broadcast — presumably an iteration/block axis; TODO
    # confirm against the caller) behind the batch dims so masked_mean
    # can reduce over it together with the residue/angle dims.
    sq_chi_error = sq_chi_error.permute(
        *range(len(sq_chi_error.shape))[1:-2], 0, -2, -1
    )

    sq_chi_loss = masked_mean(
        chi_mask[..., None, :, :], sq_chi_error, dim=(-1, -2, -3)
    )

    loss = chi_weight * sq_chi_loss

    # Penalize unnormalized sin/cos vectors that deviate from unit norm.
    angle_norm = torch.sqrt(
        torch.sum(unnormalized_angles_sin_cos ** 2, dim=-1) + eps
    )
    norm_error = torch.abs(angle_norm - 1.0)
    # Same leading-dimension shuffle as above before masked averaging.
    norm_error = norm_error.permute(
        *range(len(norm_error.shape))[1:-2], 0, -2, -1
    )
    angle_norm_loss = masked_mean(
        seq_mask[..., None, :, None], norm_error, dim=(-1, -2, -3)
    )

    loss = loss + angle_norm_weight * angle_norm_loss

    # Average over the batch dimension
    loss = torch.mean(loss)

    return loss
372
+
373
+
374
def compute_plddt(logits: torch.Tensor) -> torch.Tensor:
    """Convert per-residue lDDT-bin logits to pLDDT on a 0-100 scale.

    Args:
        logits: [*, no_bins] unnormalized lDDT-head outputs; bins are
            assumed to partition (0, 1) into equal widths
    Returns:
        [*] expected lDDT, scaled to percent
    """
    no_bins = logits.shape[-1]
    width = 1.0 / no_bins
    # Centers of the equal-width bins on (0, 1).
    centers = torch.arange(
        start=0.5 * width, end=1.0, step=width, device=logits.device
    )
    probs = torch.nn.functional.softmax(logits, dim=-1)
    # Broadcast centers over all leading dims and take the expectation.
    centers = centers.view(*((1,) * len(probs.shape[:-1])), *centers.shape)
    expected_lddt = torch.sum(probs * centers, dim=-1)
    return expected_lddt * 100
386
+
387
+
388
def lddt(
    all_atom_pred_pos: torch.Tensor,
    all_atom_positions: torch.Tensor,
    all_atom_mask: torch.Tensor,
    cutoff: float = 15.0,
    eps: float = 1e-10,
    per_residue: bool = True,
) -> torch.Tensor:
    """lDDT: fraction of preserved pairwise distances within `cutoff`.

    Args:
        all_atom_pred_pos: [*, N, 3] predicted positions
        all_atom_positions: [*, N, 3] ground-truth positions
        all_atom_mask: [*, N, 1] position validity mask
        cutoff: only pairs whose TRUE distance is below this are scored
        eps: numerical stability constant
        per_residue: if True return [*, N] scores, else a [*] average
    """
    def _pairwise_dists(coords):
        deltas = coords[..., None, :] - coords[..., None, :, :]
        return torch.sqrt(eps + torch.sum(deltas ** 2, dim=-1))

    n = all_atom_mask.shape[-2]
    dmat_true = _pairwise_dists(all_atom_positions)
    dmat_pred = _pairwise_dists(all_atom_pred_pos)

    # Score a pair iff: true distance under cutoff, both atoms present,
    # and i != j (the eye term zeroes the diagonal).
    pair_mask = (
        (dmat_true < cutoff)
        * all_atom_mask
        * all_atom_mask.transpose(-2, -1)
        * (1.0 - torch.eye(n, device=all_atom_mask.device))
    )

    dist_l1 = torch.abs(dmat_true - dmat_pred)

    # Fraction of the four lDDT tolerance thresholds each pair satisfies.
    score = 0.25 * (
        (dist_l1 < 0.5).type(dist_l1.dtype)
        + (dist_l1 < 1.0).type(dist_l1.dtype)
        + (dist_l1 < 2.0).type(dist_l1.dtype)
        + (dist_l1 < 4.0).type(dist_l1.dtype)
    )

    reduce_dims = (-1,) if per_residue else (-2, -1)
    norm = 1.0 / (eps + torch.sum(pair_mask, dim=reduce_dims))
    return norm * (eps + torch.sum(pair_mask * score, dim=reduce_dims))
442
+
443
+
444
def lddt_ca(
    all_atom_pred_pos: torch.Tensor,
    all_atom_positions: torch.Tensor,
    all_atom_mask: torch.Tensor,
    cutoff: float = 15.0,
    eps: float = 1e-10,
    per_residue: bool = True,
) -> torch.Tensor:
    """lDDT restricted to the C-alpha atoms of an atom37 representation."""
    ca_idx = residue_constants.atom_order["CA"]
    return lddt(
        all_atom_pred_pos[..., ca_idx, :],
        all_atom_positions[..., ca_idx, :],
        # Slicing (not indexing) keeps the trailing atom dimension.
        all_atom_mask[..., ca_idx : (ca_idx + 1)],
        cutoff=cutoff,
        eps=eps,
        per_residue=per_residue,
    )
465
+
466
+
467
def lddt_loss(
    logits: torch.Tensor,
    all_atom_pred_pos: torch.Tensor,
    all_atom_positions: torch.Tensor,
    all_atom_mask: torch.Tensor,
    resolution: torch.Tensor,
    cutoff: float = 15.0,
    no_bins: int = 50,
    min_resolution: float = 0.1,
    max_resolution: float = 3.0,
    eps: float = 1e-10,
    **kwargs,
) -> torch.Tensor:
    """Cross-entropy loss for the per-residue lDDT prediction head.

    Computes the true C-alpha lDDT of the prediction, discretizes it into
    `no_bins` classes, and scores the head's logits against those labels.
    Examples whose resolution falls outside
    [min_resolution, max_resolution] contribute zero.

    Args:
        logits: [*, N, no_bins] lDDT-head output
        all_atom_pred_pos: [*, N, 37, 3] predicted atom positions
        all_atom_positions: [*, N, 37, 3] ground-truth atom positions
        all_atom_mask: [*, N, 37] atom existence mask
        resolution: per-example resolution used to gate the loss
        cutoff: lDDT inclusion radius
        no_bins: number of lDDT classes
        eps: numerical stability constant
    Returns:
        Scalar loss averaged over the batch dimension
    """
    # (Fixed: removed an unused local `n = all_atom_mask.shape[-2]`.)
    ca_pos = residue_constants.atom_order["CA"]
    all_atom_pred_pos = all_atom_pred_pos[..., ca_pos, :]
    all_atom_positions = all_atom_positions[..., ca_pos, :]
    all_atom_mask = all_atom_mask[..., ca_pos : (ca_pos + 1)]  # keep dim

    score = lddt(
        all_atom_pred_pos,
        all_atom_positions,
        all_atom_mask,
        cutoff=cutoff,
        eps=eps
    )

    # The lDDT target is a label, not a differentiable quantity.
    score = score.detach()

    # Discretize [0, 1] lDDT into equal-width classes; clamp so a perfect
    # score of 1.0 lands in the last bin rather than overflowing.
    bin_index = torch.floor(score * no_bins).long()
    bin_index = torch.clamp(bin_index, max=(no_bins - 1))
    lddt_ca_one_hot = torch.nn.functional.one_hot(
        bin_index, num_classes=no_bins
    )

    errors = softmax_cross_entropy(logits, lddt_ca_one_hot)
    all_atom_mask = all_atom_mask.squeeze(-1)
    loss = torch.sum(errors * all_atom_mask, dim=-1) / (
        eps + torch.sum(all_atom_mask, dim=-1)
    )

    # Zero out examples whose resolution is outside the trusted range.
    loss = loss * (
        (resolution >= min_resolution) & (resolution <= max_resolution)
    )

    # Average over the batch dimension
    loss = torch.mean(loss)

    return loss
517
+
518
+
519
def distogram_loss(
    logits,
    pseudo_beta,
    pseudo_beta_mask,
    min_bin=2.3125,
    max_bin=21.6875,
    no_bins=64,
    eps=1e-6,
    **kwargs,
):
    """Cross-entropy over binned pairwise pseudo-beta distances.

    Args:
        logits: [*, N, N, no_bins] distogram-head output
        pseudo_beta: [*, N, 3] pseudo-beta coordinates
        pseudo_beta_mask: [*, N] validity mask
    Returns:
        Scalar loss averaged over the batch dimensions
    """
    # Bin boundaries are squared so comparisons happen in squared-distance
    # space, avoiding a sqrt.
    boundaries = torch.linspace(
        min_bin,
        max_bin,
        no_bins - 1,
        device=logits.device,
    ) ** 2

    deltas = pseudo_beta[..., None, :] - pseudo_beta[..., None, :, :]
    sq_dists = torch.sum(deltas ** 2, dim=-1, keepdims=True)

    # Count how many boundaries each distance exceeds -> bin index.
    true_bins = torch.sum(sq_dists > boundaries, dim=-1)

    errors = softmax_cross_entropy(
        logits,
        torch.nn.functional.one_hot(true_bins, no_bins),
    )

    square_mask = pseudo_beta_mask[..., None] * pseudo_beta_mask[..., None, :]

    # FP16-friendly two-stage reduction: divide by the full normalizer
    # after the first sum so intermediates stay small. Roughly equal to
    # sum(errors * mask, (-1, -2)) / (eps + sum(mask, (-1, -2))).
    denom = eps + torch.sum(square_mask, dim=(-1, -2))
    mean = torch.sum(errors * square_mask, dim=-1) / denom[..., None]
    mean = torch.sum(mean, dim=-1)

    # Average over the batch dimensions
    return torch.mean(mean)
565
+
566
+
567
+ def _calculate_bin_centers(boundaries: torch.Tensor):
568
+ step = boundaries[1] - boundaries[0]
569
+ bin_centers = boundaries + step / 2
570
+ bin_centers = torch.cat(
571
+ [bin_centers, (bin_centers[-1] + step).unsqueeze(-1)], dim=0
572
+ )
573
+ return bin_centers
574
+
575
+
576
def _calculate_expected_aligned_error(
    alignment_confidence_breaks: torch.Tensor,
    aligned_distance_error_probs: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Expected aligned error per pair, and the largest representable error.

    Returns:
        (expectation over bin centers, last bin center)
    """
    centers = _calculate_bin_centers(alignment_confidence_breaks)
    expected_error = torch.sum(aligned_distance_error_probs * centers, dim=-1)
    return expected_error, centers[-1]
585
+
586
+
587
def compute_predicted_aligned_error(
    logits: torch.Tensor,
    max_bin: int = 31,
    no_bins: int = 64,
    **kwargs,
) -> Dict[str, torch.Tensor]:
    """Computes aligned confidence metrics from logits.

    Args:
        logits: [*, num_res, num_res, num_bins] the logits output from
            PredictedAlignedErrorHead.
        max_bin: Maximum bin value
        no_bins: Number of bins
    Returns:
        aligned_confidence_probs: [*, num_res, num_res, num_bins] the predicted
            aligned error probabilities over bins for each residue pair.
        predicted_aligned_error: [*, num_res, num_res] the expected aligned
            distance error for each pair of residues.
        max_predicted_aligned_error: [*] the maximum predicted error possible.
    """
    breaks = torch.linspace(
        0, max_bin, steps=(no_bins - 1), device=logits.device
    )
    probs = torch.nn.functional.softmax(logits, dim=-1)

    expected_error, max_error = _calculate_expected_aligned_error(
        alignment_confidence_breaks=breaks,
        aligned_distance_error_probs=probs,
    )

    return {
        "aligned_confidence_probs": probs,
        "predicted_aligned_error": expected_error,
        "max_predicted_aligned_error": max_error,
    }
625
+
626
+
627
def compute_tm(
    logits: torch.Tensor,
    residue_weights: Optional[torch.Tensor] = None,
    max_bin: int = 31,
    no_bins: int = 64,
    eps: float = 1e-8,
    **kwargs,
) -> torch.Tensor:
    """Predicted TM-score from pairwise aligned-error logits.

    Args:
        logits: [*, N, N, no_bins] pAE-head output
        residue_weights: [N] optional per-residue weights; defaults to ones
        max_bin: maximum error bin boundary
        no_bins: number of error bins
        eps: numerical stability constant
    Returns:
        Scalar predicted TM-score taken from the highest-weighted alignment
    """
    if residue_weights is None:
        residue_weights = logits.new_ones(logits.shape[-2])

    boundaries = torch.linspace(
        0, max_bin, steps=(no_bins - 1), device=logits.device
    )
    bin_centers = _calculate_bin_centers(boundaries)
    # (Fixed: removed a dead `torch.sum(residue_weights)` whose result
    # was discarded.)

    n = logits.shape[-2]
    # d0 normalization (Zhang & Skolnick, 2004); clamping n keeps the
    # cube-root argument positive for short sequences.
    clipped_n = max(n, 19)
    d0 = 1.24 * (clipped_n - 15) ** (1.0 / 3) - 1.8

    probs = torch.nn.functional.softmax(logits, dim=-1)

    # Expected TM contribution of each residue pair.
    tm_per_bin = 1.0 / (1 + (bin_centers ** 2) / (d0 ** 2))
    predicted_tm_term = torch.sum(probs * tm_per_bin, dim=-1)

    normed_residue_mask = residue_weights / (eps + residue_weights.sum())
    per_alignment = torch.sum(predicted_tm_term * normed_residue_mask, dim=-1)

    # Return the score of the best (weight-adjusted) alignment anchor.
    weighted = per_alignment * residue_weights
    argmax = (weighted == torch.max(weighted)).nonzero()[0]
    return per_alignment[tuple(argmax)]
659
+
660
+
661
def tm_loss(
    logits,
    final_affine_tensor,
    backbone_rigid_tensor,
    backbone_rigid_mask,
    resolution,
    max_bin=31,
    no_bins=64,
    min_resolution: float = 0.1,
    max_resolution: float = 3.0,
    eps=1e-8,
    **kwargs,
):
    """Cross-entropy loss for the predicted-aligned-error (TM) head.

    Bins the squared frame-aligned distance between predicted and
    ground-truth backbone frames and scores the head's logits against
    those binned targets. Examples whose resolution is outside
    [min_resolution, max_resolution] contribute zero.

    Args:
        logits: pAE-head output over residue pairs
        final_affine_tensor: predicted frames in tensor_7 form
        backbone_rigid_tensor: ground-truth frames in 4x4 form
        backbone_rigid_mask: frame validity mask
        resolution: per-example resolution used to gate the loss
    Returns:
        Scalar loss averaged over the batch
    """
    pred_affine = Rigid.from_tensor_7(final_affine_tensor)
    backbone_rigid = Rigid.from_tensor_4x4(backbone_rigid_tensor)

    def _points(affine):
        # Every frame's translation expressed in every frame's local
        # coordinates: pairwise aligned positions.
        pts = affine.get_trans()[..., None, :, :]
        return affine.invert()[..., None].apply(pts)

    sq_diff = torch.sum(
        (_points(pred_affine) - _points(backbone_rigid)) ** 2, dim=-1
    )

    # Targets are labels; no gradient flows through the structure here.
    sq_diff = sq_diff.detach()

    # Squared boundaries: bin assignment happens in squared-distance space.
    boundaries = torch.linspace(
        0, max_bin, steps=(no_bins - 1), device=logits.device
    )
    boundaries = boundaries ** 2
    true_bins = torch.sum(sq_diff[..., None] > boundaries, dim=-1)

    errors = softmax_cross_entropy(
        logits, torch.nn.functional.one_hot(true_bins, no_bins)
    )

    square_mask = (
        backbone_rigid_mask[..., None] * backbone_rigid_mask[..., None, :]
    )

    # Scaled two-stage masked average (scale cancels overall; it only
    # keeps FP16 intermediates in range).
    loss = torch.sum(errors * square_mask, dim=-1)
    scale = 0.5  # hack to help FP16 training along
    denom = eps + torch.sum(scale * square_mask, dim=(-1, -2))
    loss = loss / denom[..., None]
    loss = torch.sum(loss, dim=-1)
    loss = loss * scale

    # Zero out examples outside the trusted resolution range.
    loss = loss * (
        (resolution >= min_resolution) & (resolution <= max_resolution)
    )

    # Average over the loss dimension
    loss = torch.mean(loss)

    return loss
716
+
717
+
718
def between_residue_bond_loss(
    pred_atom_positions: torch.Tensor,  # (*, N, 37/14, 3)
    pred_atom_mask: torch.Tensor,  # (*, N, 37/14)
    residue_index: torch.Tensor,  # (*, N)
    aatype: torch.Tensor,  # (*, N)
    tolerance_factor_soft=12.0,
    tolerance_factor_hard=12.0,
    eps=1e-6,
) -> Dict[str, torch.Tensor]:
    """Flat-bottom loss to penalize structural violations between residues.

    This is a loss penalizing any violation of the geometry around the peptide
    bond between consecutive amino acids. This loss corresponds to
    Jumper et al. (2021) Suppl. Sec. 1.9.11, eq 44, 45.

    Args:
        pred_atom_positions: Atom positions in atom37/14 representation
        pred_atom_mask: Atom mask in atom37/14 representation
        residue_index: Residue index for given amino acid, this is assumed to be
            monotonically increasing.
        aatype: Amino acid type of given residue
        tolerance_factor_soft: soft tolerance factor measured in standard deviations
            of pdb distributions
        tolerance_factor_hard: hard tolerance factor measured in standard deviations
            of pdb distributions

    Returns:
        Dict containing:
            * 'c_n_loss_mean': Loss for peptide bond length violations
            * 'ca_c_n_loss_mean': Loss for violations of bond angle around C spanned
                by CA, C, N
            * 'c_n_ca_loss_mean': Loss for violations of bond angle around N spanned
                by C, N, CA
            * 'per_residue_loss_sum': sum of all losses for each residue
            * 'per_residue_violation_mask': mask denoting all residues with violation
                present.
    """
    # Get the positions of the relevant backbone atoms.
    # Atom indices: 0 = N, 1 = CA, 2 = C (per the variable names below).
    this_ca_pos = pred_atom_positions[..., :-1, 1, :]
    this_ca_mask = pred_atom_mask[..., :-1, 1]
    this_c_pos = pred_atom_positions[..., :-1, 2, :]
    this_c_mask = pred_atom_mask[..., :-1, 2]
    next_n_pos = pred_atom_positions[..., 1:, 0, :]
    next_n_mask = pred_atom_mask[..., 1:, 0]
    next_ca_pos = pred_atom_positions[..., 1:, 1, :]
    next_ca_mask = pred_atom_mask[..., 1:, 1]
    # Only score residue pairs that are actually sequence-adjacent.
    has_no_gap_mask = (residue_index[..., 1:] - residue_index[..., :-1]) == 1.0

    # Compute loss for the C--N bond.
    c_n_bond_length = torch.sqrt(
        eps + torch.sum((this_c_pos - next_n_pos) ** 2, dim=-1)
    )

    # The C-N bond to proline has slightly different length because of the ring.
    next_is_proline = aatype[..., 1:] == residue_constants.resname_to_idx["PRO"]
    gt_length = (
        ~next_is_proline
    ) * residue_constants.between_res_bond_length_c_n[
        0
    ] + next_is_proline * residue_constants.between_res_bond_length_c_n[
        1
    ]
    gt_stddev = (
        ~next_is_proline
    ) * residue_constants.between_res_bond_length_stddev_c_n[
        0
    ] + next_is_proline * residue_constants.between_res_bond_length_stddev_c_n[
        1
    ]
    c_n_bond_length_error = torch.sqrt(eps + (c_n_bond_length - gt_length) ** 2)
    # Flat-bottom: no penalty within tolerance_factor_soft stddevs.
    c_n_loss_per_residue = torch.nn.functional.relu(
        c_n_bond_length_error - tolerance_factor_soft * gt_stddev
    )
    mask = this_c_mask * next_n_mask * has_no_gap_mask
    c_n_loss = torch.sum(mask * c_n_loss_per_residue, dim=-1) / (
        torch.sum(mask, dim=-1) + eps
    )
    c_n_violation_mask = mask * (
        c_n_bond_length_error > (tolerance_factor_hard * gt_stddev)
    )

    # Compute loss for the angles.
    ca_c_bond_length = torch.sqrt(
        eps + torch.sum((this_ca_pos - this_c_pos) ** 2, dim=-1)
    )
    n_ca_bond_length = torch.sqrt(
        eps + torch.sum((next_n_pos - next_ca_pos) ** 2, dim=-1)
    )

    c_ca_unit_vec = (this_ca_pos - this_c_pos) / ca_c_bond_length[..., None]
    c_n_unit_vec = (next_n_pos - this_c_pos) / c_n_bond_length[..., None]
    n_ca_unit_vec = (next_ca_pos - next_n_pos) / n_ca_bond_length[..., None]

    # Angles are compared in cosine space via unit-vector dot products.
    ca_c_n_cos_angle = torch.sum(c_ca_unit_vec * c_n_unit_vec, dim=-1)
    gt_angle = residue_constants.between_res_cos_angles_ca_c_n[0]
    # NOTE(review): this reuses the C--N bond-length stddev as the angle
    # stddev; confirm upstream whether between_res_cos_angles_ca_c_n[1]
    # was intended here.
    gt_stddev = residue_constants.between_res_bond_length_stddev_c_n[0]
    ca_c_n_cos_angle_error = torch.sqrt(
        eps + (ca_c_n_cos_angle - gt_angle) ** 2
    )
    ca_c_n_loss_per_residue = torch.nn.functional.relu(
        ca_c_n_cos_angle_error - tolerance_factor_soft * gt_stddev
    )
    mask = this_ca_mask * this_c_mask * next_n_mask * has_no_gap_mask
    ca_c_n_loss = torch.sum(mask * ca_c_n_loss_per_residue, dim=-1) / (
        torch.sum(mask, dim=-1) + eps
    )
    ca_c_n_violation_mask = mask * (
        ca_c_n_cos_angle_error > (tolerance_factor_hard * gt_stddev)
    )

    c_n_ca_cos_angle = torch.sum((-c_n_unit_vec) * n_ca_unit_vec, dim=-1)
    gt_angle = residue_constants.between_res_cos_angles_c_n_ca[0]
    gt_stddev = residue_constants.between_res_cos_angles_c_n_ca[1]
    c_n_ca_cos_angle_error = torch.sqrt(
        eps + torch.square(c_n_ca_cos_angle - gt_angle)
    )
    c_n_ca_loss_per_residue = torch.nn.functional.relu(
        c_n_ca_cos_angle_error - tolerance_factor_soft * gt_stddev
    )
    mask = this_c_mask * next_n_mask * next_ca_mask * has_no_gap_mask
    c_n_ca_loss = torch.sum(mask * c_n_ca_loss_per_residue, dim=-1) / (
        torch.sum(mask, dim=-1) + eps
    )
    c_n_ca_violation_mask = mask * (
        c_n_ca_cos_angle_error > (tolerance_factor_hard * gt_stddev)
    )

    # Compute a per residue loss (equally distribute the loss to both
    # neighbouring residues).
    per_residue_loss_sum = (
        c_n_loss_per_residue + ca_c_n_loss_per_residue + c_n_ca_loss_per_residue
    )
    # Padding maps each inter-residue term onto both of its residues.
    per_residue_loss_sum = 0.5 * (
        torch.nn.functional.pad(per_residue_loss_sum, (0, 1))
        + torch.nn.functional.pad(per_residue_loss_sum, (1, 0))
    )

    # Compute hard violations.
    violation_mask = torch.max(
        torch.stack(
            [c_n_violation_mask, ca_c_n_violation_mask, c_n_ca_violation_mask],
            dim=-2,
        ),
        dim=-2,
    )[0]
    # A residue is violating if either of its flanking bonds violates.
    violation_mask = torch.maximum(
        torch.nn.functional.pad(violation_mask, (0, 1)),
        torch.nn.functional.pad(violation_mask, (1, 0)),
    )

    return {
        "c_n_loss_mean": c_n_loss,
        "ca_c_n_loss_mean": ca_c_n_loss,
        "c_n_ca_loss_mean": c_n_ca_loss,
        "per_residue_loss_sum": per_residue_loss_sum,
        "per_residue_violation_mask": violation_mask,
    }
875
+
876
+
877
def between_residue_clash_loss(
    atom14_pred_positions: torch.Tensor,
    atom14_atom_exists: torch.Tensor,
    atom14_atom_radius: torch.Tensor,
    residue_index: torch.Tensor,
    overlap_tolerance_soft=1.5,
    overlap_tolerance_hard=1.5,
    eps=1e-10,
) -> Dict[str, torch.Tensor]:
    """Loss to penalize steric clashes between residues.

    This is a loss penalizing any steric clashes due to non bonded atoms in
    different peptides coming too close. This loss corresponds to the part with
    different residues of
    Jumper et al. (2021) Suppl. Sec. 1.9.11, eq 46.

    Args:
        atom14_pred_positions: Predicted positions of atoms in
            global prediction frame
        atom14_atom_exists: Mask denoting whether atom at positions exists for given
            amino acid type
        atom14_atom_radius: Van der Waals radius for each atom.
        residue_index: Residue index for given amino acid.
        overlap_tolerance_soft: Soft tolerance factor.
        overlap_tolerance_hard: Hard tolerance factor.

    Returns:
        Dict containing:
            * 'mean_loss': average clash loss
            * 'per_atom_loss_sum': sum of all clash losses per atom, shape (N, 14)
            * 'per_atom_clash_mask': mask whether atom clashes with any other atom
                shape (N, 14)
    """
    fp_type = atom14_pred_positions.dtype

    # Create the distance matrix.
    # (N, N, 14, 14)
    dists = torch.sqrt(
        eps
        + torch.sum(
            (
                atom14_pred_positions[..., :, None, :, None, :]
                - atom14_pred_positions[..., None, :, None, :, :]
            )
            ** 2,
            dim=-1,
        )
    )

    # Create the mask for valid distances.
    # shape (N, N, 14, 14)
    dists_mask = (
        atom14_atom_exists[..., :, None, :, None]
        * atom14_atom_exists[..., None, :, None, :]
    ).type(fp_type)

    # Mask out all the duplicate entries in the lower triangular matrix.
    # Also mask out the diagonal (atom-pairs from the same residue) -- these atoms
    # are handled separately.
    dists_mask = dists_mask * (
        residue_index[..., :, None, None, None]
        < residue_index[..., None, :, None, None]
    )

    # Backbone C--N bond between subsequent residues is no clash.
    # One-hot selectors for atom14 slots 2 (C) and 0 (N).
    c_one_hot = torch.nn.functional.one_hot(
        residue_index.new_tensor(2), num_classes=14
    )
    c_one_hot = c_one_hot.reshape(
        *((1,) * len(residue_index.shape[:-1])), *c_one_hot.shape
    )
    c_one_hot = c_one_hot.type(fp_type)
    n_one_hot = torch.nn.functional.one_hot(
        residue_index.new_tensor(0), num_classes=14
    )
    n_one_hot = n_one_hot.reshape(
        *((1,) * len(residue_index.shape[:-1])), *n_one_hot.shape
    )
    n_one_hot = n_one_hot.type(fp_type)

    neighbour_mask = (
        residue_index[..., :, None, None, None] + 1
    ) == residue_index[..., None, :, None, None]
    c_n_bonds = (
        neighbour_mask
        * c_one_hot[..., None, None, :, None]
        * n_one_hot[..., None, None, None, :]
    )
    dists_mask = dists_mask * (1.0 - c_n_bonds)

    # Disulfide bridge between two cysteines is no clash.
    cys = residue_constants.restype_name_to_atom14_names["CYS"]
    cys_sg_idx = cys.index("SG")
    cys_sg_idx = residue_index.new_tensor(cys_sg_idx)
    cys_sg_idx = cys_sg_idx.reshape(
        *((1,) * len(residue_index.shape[:-1])), 1
    ).squeeze(-1)
    cys_sg_one_hot = torch.nn.functional.one_hot(cys_sg_idx, num_classes=14)
    disulfide_bonds = (
        cys_sg_one_hot[..., None, None, :, None]
        * cys_sg_one_hot[..., None, None, None, :]
    )
    dists_mask = dists_mask * (1.0 - disulfide_bonds)

    # Compute the lower bound for the allowed distances.
    # shape (N, N, 14, 14)
    dists_lower_bound = dists_mask * (
        atom14_atom_radius[..., :, None, :, None]
        + atom14_atom_radius[..., None, :, None, :]
    )

    # Compute the error.
    # shape (N, N, 14, 14)
    # Flat-bottom: only penalize overlap beyond the soft tolerance.
    dists_to_low_error = dists_mask * torch.nn.functional.relu(
        dists_lower_bound - overlap_tolerance_soft - dists
    )

    # Compute the mean loss.
    # shape ()
    mean_loss = torch.sum(dists_to_low_error) / (1e-6 + torch.sum(dists_mask))

    # Compute the per atom loss sum.
    # shape (N, 14)
    # Each pair contributes to both of its atoms (row + column sums).
    per_atom_loss_sum = torch.sum(dists_to_low_error, dim=(-4, -2)) + torch.sum(
        dists_to_low_error, axis=(-3, -1)
    )

    # Compute the hard clash mask.
    # shape (N, N, 14, 14)
    clash_mask = dists_mask * (
        dists < (dists_lower_bound - overlap_tolerance_hard)
    )

    # Compute the per atom clash.
    # shape (N, 14)
    per_atom_clash_mask = torch.maximum(
        torch.amax(clash_mask, axis=(-4, -2)),
        torch.amax(clash_mask, axis=(-3, -1)),
    )

    return {
        "mean_loss": mean_loss,  # shape ()
        "per_atom_loss_sum": per_atom_loss_sum,  # shape (N, 14)
        "per_atom_clash_mask": per_atom_clash_mask,  # shape (N, 14)
    }
1022
+
1023
+
1024
def within_residue_violations(
    atom14_pred_positions: torch.Tensor,
    atom14_atom_exists: torch.Tensor,
    atom14_dists_lower_bound: torch.Tensor,
    atom14_dists_upper_bound: torch.Tensor,
    tighten_bounds_for_loss=0.0,
    eps=1e-10,
) -> Dict[str, torch.Tensor]:
    """Loss to penalize steric clashes within residues.

    Penalizes intra-residue atom pairs whose distance falls outside the
    provided per-pair bounds (the same-residue part of Jumper et al.
    (2021) Suppl. Sec. 1.9.11, eq 46).

    Args:
        atom14_pred_positions ([*, N, 14, 3]):
            Predicted atom positions in the global frame
        atom14_atom_exists ([*, N, 14]):
            Mask of atoms that exist for the residue's amino acid type
        atom14_dists_lower_bound:
            Lower bound on allowed pairwise distances
        atom14_dists_upper_bound:
            Upper bound on allowed pairwise distances
        tighten_bounds_for_loss ([*, N]):
            Extra factor tightening both bounds for the loss only

    Returns:
        Dict containing:
            * 'per_atom_loss_sum' ([*, N, 14]): summed bound-violation loss per atom
            * 'per_atom_violations' ([*, N, 14]): hard violation mask per atom
    """
    # Pair mask: both atoms exist and the pair is off-diagonal (i != j).
    off_diag = 1.0 - torch.eye(14, device=atom14_atom_exists.device)[None]
    off_diag = off_diag.reshape(
        *((1,) * len(atom14_atom_exists.shape[:-2])), *off_diag.shape
    )
    pair_mask = (
        atom14_atom_exists[..., :, :, None]
        * atom14_atom_exists[..., :, None, :]
        * off_diag
    )

    # Intra-residue pairwise distances.
    deltas = (
        atom14_pred_positions[..., :, :, None, :]
        - atom14_pred_positions[..., :, None, :, :]
    )
    dists = torch.sqrt(eps + torch.sum(deltas ** 2, dim=-1))

    # Flat-bottom penalties below the (tightened) lower bound and above
    # the (tightened) upper bound.
    below_error = torch.nn.functional.relu(
        atom14_dists_lower_bound + tighten_bounds_for_loss - dists
    )
    above_error = torch.nn.functional.relu(
        dists - (atom14_dists_upper_bound - tighten_bounds_for_loss)
    )
    pair_loss = pair_mask * (below_error + above_error)

    # Each pair contributes to both of its atoms (row + column sums).
    per_atom_loss_sum = torch.sum(pair_loss, dim=-2) + torch.sum(
        pair_loss, dim=-1
    )

    # Hard violations use the untightened bounds.
    out_of_bounds = pair_mask * (
        (dists < atom14_dists_lower_bound) | (dists > atom14_dists_upper_bound)
    )
    per_atom_violations = torch.maximum(
        torch.max(out_of_bounds, dim=-2)[0],
        torch.max(out_of_bounds, dim=-1)[0],
    )

    return {
        "per_atom_loss_sum": per_atom_loss_sum,
        "per_atom_violations": per_atom_violations,
    }
1109
+
1110
+
1111
def find_structural_violations(
    batch: Dict[str, torch.Tensor],
    atom14_pred_positions: torch.Tensor,
    violation_tolerance_factor: float,
    clash_overlap_tolerance: float,
    **kwargs,
) -> Dict[str, torch.Tensor]:
    """Computes several checks for structural violations.

    Args:
        batch: Feature dict; reads "atom14_atom_exists", "residue_index",
            "aatype" and "residx_atom14_to_atom37".
        atom14_pred_positions: Predicted atom positions in atom14
            representation.  # assumes shape [*, N, 14, 3] -- TODO confirm
        violation_tolerance_factor: Tolerance factor (in standard
            deviations) for bond length/angle violations; used for both
            the soft and the hard tolerance here.
        clash_overlap_tolerance: Allowed overlap before two atoms count
            as clashing; used for both the soft and the hard tolerance.
        **kwargs: Ignored; lets callers splat a whole config dict.

    Returns:
        Nested dict with "between_residues", "within_residues" and
        "total_per_residue_violations_mask" entries (exact keys and
        shapes annotated on the return statement below).
    """

    # Compute between residue backbone violations of bonds and angles.
    connection_violations = between_residue_bond_loss(
        pred_atom_positions=atom14_pred_positions,
        pred_atom_mask=batch["atom14_atom_exists"],
        residue_index=batch["residue_index"],
        aatype=batch["aatype"],
        tolerance_factor_soft=violation_tolerance_factor,
        tolerance_factor_hard=violation_tolerance_factor,
    )

    # Compute the Van der Waals radius for every atom
    # (the first letter of the atom name is the element type).
    # Shape: (N, 14).
    atomtype_radius = [
        residue_constants.van_der_waals_radius[name[0]]
        for name in residue_constants.atom_types
    ]
    # new_tensor puts the radii on the same device/dtype as the predictions.
    atomtype_radius = atom14_pred_positions.new_tensor(atomtype_radius)
    atom14_atom_radius = (
        batch["atom14_atom_exists"]
        * atomtype_radius[batch["residx_atom14_to_atom37"]]
    )

    # Compute the between residue clash loss.
    between_residue_clashes = between_residue_clash_loss(
        atom14_pred_positions=atom14_pred_positions,
        atom14_atom_exists=batch["atom14_atom_exists"],
        atom14_atom_radius=atom14_atom_radius,
        residue_index=batch["residue_index"],
        overlap_tolerance_soft=clash_overlap_tolerance,
        overlap_tolerance_hard=clash_overlap_tolerance,
    )

    # Compute all within-residue violations (clashes,
    # bond length and angle violations).
    restype_atom14_bounds = residue_constants.make_atom14_dists_bounds(
        overlap_tolerance=clash_overlap_tolerance,
        bond_length_tolerance_factor=violation_tolerance_factor,
    )
    atom14_atom_exists = batch["atom14_atom_exists"]
    # Per-residue-type bounds, gathered per residue via the aatype index.
    atom14_dists_lower_bound = atom14_pred_positions.new_tensor(
        restype_atom14_bounds["lower_bound"]
    )[batch["aatype"]]
    atom14_dists_upper_bound = atom14_pred_positions.new_tensor(
        restype_atom14_bounds["upper_bound"]
    )[batch["aatype"]]
    residue_violations = within_residue_violations(
        atom14_pred_positions=atom14_pred_positions,
        atom14_atom_exists=batch["atom14_atom_exists"],
        atom14_dists_lower_bound=atom14_dists_lower_bound,
        atom14_dists_upper_bound=atom14_dists_upper_bound,
        tighten_bounds_for_loss=0.0,
    )

    # Combine them to a single per-residue violation mask (used later for LDDT).
    per_residue_violations_mask = torch.max(
        torch.stack(
            [
                connection_violations["per_residue_violation_mask"],
                torch.max(
                    between_residue_clashes["per_atom_clash_mask"], dim=-1
                )[0],
                torch.max(residue_violations["per_atom_violations"], dim=-1)[0],
            ],
            dim=-1,
        ),
        dim=-1,
    )[0]

    return {
        "between_residues": {
            "bonds_c_n_loss_mean": connection_violations["c_n_loss_mean"],  # ()
            "angles_ca_c_n_loss_mean": connection_violations[
                "ca_c_n_loss_mean"
            ],  # ()
            "angles_c_n_ca_loss_mean": connection_violations[
                "c_n_ca_loss_mean"
            ],  # ()
            "connections_per_residue_loss_sum": connection_violations[
                "per_residue_loss_sum"
            ],  # (N)
            "connections_per_residue_violation_mask": connection_violations[
                "per_residue_violation_mask"
            ],  # (N)
            "clashes_mean_loss": between_residue_clashes["mean_loss"],  # ()
            "clashes_per_atom_loss_sum": between_residue_clashes[
                "per_atom_loss_sum"
            ],  # (N, 14)
            "clashes_per_atom_clash_mask": between_residue_clashes[
                "per_atom_clash_mask"
            ],  # (N, 14)
        },
        "within_residues": {
            "per_atom_loss_sum": residue_violations[
                "per_atom_loss_sum"
            ],  # (N, 14)
            "per_atom_violations": residue_violations[
                "per_atom_violations"
            ],  # (N, 14),
        },
        "total_per_residue_violations_mask": per_residue_violations_mask,  # (N)
    }
1222
+
1223
+
1224
def find_structural_violations_np(
    batch: Dict[str, np.ndarray],
    atom14_pred_positions: np.ndarray,
    config: ml_collections.ConfigDict,
) -> Dict[str, np.ndarray]:
    """NumPy wrapper around find_structural_violations.

    Converts the numpy inputs to torch tensors, delegates to the torch
    implementation (splatting ``config`` as its keyword arguments), and
    converts the nested output dict back to numpy arrays.
    """
    to_tensor = lambda x: torch.tensor(x)
    batch = tree_map(to_tensor, batch, np.ndarray)
    atom14_pred_positions = to_tensor(atom14_pred_positions)

    out = find_structural_violations(batch, atom14_pred_positions, **config)

    to_np = lambda x: np.array(x)
    np_out = tensor_tree_map(to_np, out)

    return np_out
1239
+
1240
+
1241
def extreme_ca_ca_distance_violations(
    pred_atom_positions: torch.Tensor,  # (N, 37(14), 3)
    pred_atom_mask: torch.Tensor,  # (N, 37(14))
    residue_index: torch.Tensor,  # (N)
    max_angstrom_tolerance=1.5,
    eps=1e-6,
) -> torch.Tensor:
    """Counts residues whose Ca is a large distance from its neighbour.

    Measures the fraction of CA-CA pairs between consecutive amino acids that are
    more than 'max_angstrom_tolerance' apart.

    Args:
      pred_atom_positions: Atom positions in atom37/14 representation
      pred_atom_mask: Atom mask in atom37/14 representation
      residue_index: Residue index for given amino acid, this is assumed to be
        monotonically increasing.
      max_angstrom_tolerance: Maximum distance allowed to not count as violation.
    Returns:
      Fraction of consecutive CA-CA pairs with violation.
    """
    # CA is atom index 1 in both atom14 and atom37 orderings.
    this_ca_pos = pred_atom_positions[..., :-1, 1, :]
    this_ca_mask = pred_atom_mask[..., :-1, 1]
    next_ca_pos = pred_atom_positions[..., 1:, 1, :]
    next_ca_mask = pred_atom_mask[..., 1:, 1]
    # Pairs with a chain break (residue index gap != 1) are excluded.
    has_no_gap_mask = (residue_index[..., 1:] - residue_index[..., :-1]) == 1.0
    # eps keeps sqrt differentiable at zero distance.
    ca_ca_distance = torch.sqrt(
        eps + torch.sum((this_ca_pos - next_ca_pos) ** 2, dim=-1)
    )
    # Violation when the distance exceeds the ideal CA-CA length by more
    # than the tolerance.
    violations = (
        ca_ca_distance - residue_constants.ca_ca
    ) > max_angstrom_tolerance
    mask = this_ca_mask * next_ca_mask * has_no_gap_mask
    mean = masked_mean(mask, violations, -1)
    return mean
1276
+
1277
+
1278
def compute_violation_metrics(
    batch: Dict[str, torch.Tensor],
    atom14_pred_positions: torch.Tensor,  # (N, 14, 3)
    violations: Dict[str, torch.Tensor],
) -> Dict[str, torch.Tensor]:
    """Compute several metrics to assess the structural violations.

    Args:
        batch: Feature dict; reads "atom14_atom_exists", "residue_index"
            and "seq_mask".
        atom14_pred_positions: Predicted positions in atom14 representation.
        violations: Output of find_structural_violations.

    Returns:
        Dict of scalar-per-batch metrics (fractions of residues with each
        kind of violation, masked by "seq_mask").
    """
    ret = {}
    # Fraction of consecutive CA-CA pairs that are unreasonably far apart.
    extreme_ca_ca_violations = extreme_ca_ca_distance_violations(
        pred_atom_positions=atom14_pred_positions,
        pred_atom_mask=batch["atom14_atom_exists"],
        residue_index=batch["residue_index"],
    )
    ret["violations_extreme_ca_ca_distance"] = extreme_ca_ca_violations
    # Fraction of residues with a backbone bond/angle violation.
    ret["violations_between_residue_bond"] = masked_mean(
        batch["seq_mask"],
        violations["between_residues"][
            "connections_per_residue_violation_mask"
        ],
        dim=-1,
    )
    # Fraction of residues with at least one inter-residue clash.
    ret["violations_between_residue_clash"] = masked_mean(
        mask=batch["seq_mask"],
        value=torch.max(
            violations["between_residues"]["clashes_per_atom_clash_mask"],
            dim=-1,
        )[0],
        dim=-1,
    )
    # Fraction of residues with an internal (within-residue) violation.
    ret["violations_within_residue"] = masked_mean(
        mask=batch["seq_mask"],
        value=torch.max(
            violations["within_residues"]["per_atom_violations"], dim=-1
        )[0],
        dim=-1,
    )
    # Fraction of residues with any violation at all.
    ret["violations_per_residue"] = masked_mean(
        mask=batch["seq_mask"],
        value=violations["total_per_residue_violations_mask"],
        dim=-1,
    )
    return ret
1319
+
1320
+
1321
def compute_violation_metrics_np(
    batch: Dict[str, np.ndarray],
    atom14_pred_positions: np.ndarray,
    violations: Dict[str, np.ndarray],
) -> Dict[str, np.ndarray]:
    """NumPy wrapper around compute_violation_metrics.

    Converts all numpy inputs (including the nested ``violations`` dict)
    to torch tensors, delegates, then converts the result back to numpy.
    """
    to_tensor = lambda x: torch.tensor(x)
    batch = tree_map(to_tensor, batch, np.ndarray)
    atom14_pred_positions = to_tensor(atom14_pred_positions)
    violations = tree_map(to_tensor, violations, np.ndarray)

    out = compute_violation_metrics(batch, atom14_pred_positions, violations)

    to_np = lambda x: np.array(x)
    return tree_map(to_np, out, torch.Tensor)
1335
+
1336
+
1337
def violation_loss(
    violations: Dict[str, torch.Tensor],
    atom14_atom_exists: torch.Tensor,
    eps=1e-6,
    **kwargs,
) -> torch.Tensor:
    """Aggregate structural-violation penalty.

    Sums the backbone bond/angle penalties with the clash penalty, the
    latter averaged over all existing atoms.

    Args:
        violations: Output of find_structural_violations.
        atom14_atom_exists: [*, N, 14] mask of atoms present per residue.
        eps: Small constant guarding against division by zero.
        **kwargs: Ignored; lets callers splat a whole feature/config dict.

    Returns:
        Scalar violation loss.
    """
    between = violations["between_residues"]
    within = violations["within_residues"]

    # Normalize the summed clash penalties by the number of real atoms.
    total_atoms = torch.sum(atom14_atom_exists)
    clash_total = torch.sum(
        between["clashes_per_atom_loss_sum"] + within["per_atom_loss_sum"]
    )
    mean_clash = clash_total / (eps + total_atoms)

    return (
        between["bonds_c_n_loss_mean"]
        + between["angles_ca_c_n_loss_mean"]
        + between["angles_c_n_ca_loss_mean"]
        + mean_clash
    )
1357
+
1358
+
1359
def compute_renamed_ground_truth(
    batch: Dict[str, torch.Tensor],
    atom14_pred_positions: torch.Tensor,
    eps=1e-10,
) -> Dict[str, torch.Tensor]:
    """
    Find optimal renaming of ground truth based on the predicted positions.

    Alg. 26 "renameSymmetricGroundTruthAtoms"

    This renamed ground truth is then used for all losses,
    such that each loss moves the atoms in the same direction.

    Args:
      batch: Dictionary containing:
        * atom14_gt_positions: Ground truth positions.
        * atom14_alt_gt_positions: Ground truth positions with renaming swaps.
        * atom14_atom_is_ambiguous: 1.0 for atoms that are affected by
            renaming swaps.
        * atom14_gt_exists: Mask for which atoms exist in ground truth.
        * atom14_alt_gt_exists: Mask for which atoms exist in ground truth
            after renaming.
        * atom14_atom_exists: Mask for whether each atom is part of the given
            amino acid type.
      atom14_pred_positions: Array of atom positions in global frame with shape
    Returns:
      Dictionary containing:
        alt_naming_is_better: Array with 1.0 where alternative swap is better.
        renamed_atom14_gt_positions: Array of optimal ground truth positions
          after renaming swaps are performed.
        renamed_atom14_gt_exists: Mask after renaming swap is performed.
    """

    # All-pairs atom distances within the prediction; eps keeps sqrt
    # differentiable at zero.
    pred_dists = torch.sqrt(
        eps
        + torch.sum(
            (
                atom14_pred_positions[..., None, :, None, :]
                - atom14_pred_positions[..., None, :, None, :, :]
            )
            ** 2,
            dim=-1,
        )
    )

    # Same all-pairs distances for the ground truth ...
    atom14_gt_positions = batch["atom14_gt_positions"]
    gt_dists = torch.sqrt(
        eps
        + torch.sum(
            (
                atom14_gt_positions[..., None, :, None, :]
                - atom14_gt_positions[..., None, :, None, :, :]
            )
            ** 2,
            dim=-1,
        )
    )

    # ... and for the ground truth with the symmetric atoms swapped.
    atom14_alt_gt_positions = batch["atom14_alt_gt_positions"]
    alt_gt_dists = torch.sqrt(
        eps
        + torch.sum(
            (
                atom14_alt_gt_positions[..., None, :, None, :]
                - atom14_alt_gt_positions[..., None, :, None, :, :]
            )
            ** 2,
            dim=-1,
        )
    )

    # NOTE(review): despite the name, these are absolute per-pair distance
    # errors (|pred - gt|), not an lDDT score.
    lddt = torch.sqrt(eps + (pred_dists - gt_dists) ** 2)
    alt_lddt = torch.sqrt(eps + (pred_dists - alt_gt_dists) ** 2)

    # Only compare ambiguous atoms against unambiguous ones that exist in
    # the ground truth.
    atom14_gt_exists = batch["atom14_gt_exists"]
    atom14_atom_is_ambiguous = batch["atom14_atom_is_ambiguous"]
    mask = (
        atom14_gt_exists[..., None, :, None]
        * atom14_atom_is_ambiguous[..., None, :, None]
        * atom14_gt_exists[..., None, :, None, :]
        * (1.0 - atom14_atom_is_ambiguous[..., None, :, None, :])
    )

    per_res_lddt = torch.sum(mask * lddt, dim=(-1, -2, -3))
    alt_per_res_lddt = torch.sum(mask * alt_lddt, dim=(-1, -2, -3))

    # Choose, per residue, whichever naming yields the smaller total error.
    fp_type = atom14_pred_positions.dtype
    alt_naming_is_better = (alt_per_res_lddt < per_res_lddt).type(fp_type)

    # Blend positions/masks with the 0/1 selector (differentiable-friendly
    # alternative to torch.where).
    renamed_atom14_gt_positions = (
        1.0 - alt_naming_is_better[..., None, None]
    ) * atom14_gt_positions + alt_naming_is_better[
        ..., None, None
    ] * atom14_alt_gt_positions

    renamed_atom14_gt_mask = (
        1.0 - alt_naming_is_better[..., None]
    ) * atom14_gt_exists + alt_naming_is_better[..., None] * batch[
        "atom14_alt_gt_exists"
    ]

    return {
        "alt_naming_is_better": alt_naming_is_better,
        "renamed_atom14_gt_positions": renamed_atom14_gt_positions,
        "renamed_atom14_gt_exists": renamed_atom14_gt_mask,
    }
1465
+
1466
+
1467
def experimentally_resolved_loss(
    logits: torch.Tensor,
    atom37_atom_exists: torch.Tensor,
    all_atom_mask: torch.Tensor,
    resolution: torch.Tensor,
    min_resolution: float,
    max_resolution: float,
    eps: float = 1e-8,
    **kwargs,
) -> torch.Tensor:
    """Loss for the "experimentally resolved" per-atom prediction head.

    Sigmoid cross-entropy between predicted resolved-ness logits and the
    ground-truth atom mask, averaged over existing atoms, and zeroed out
    for structures whose resolution lies outside
    [min_resolution, max_resolution].
    """
    errors = sigmoid_cross_entropy(logits, all_atom_mask)
    # Only count atoms that exist for the residue type.
    loss = torch.sum(errors * atom37_atom_exists, dim=-1)
    loss = loss / (eps + torch.sum(atom37_atom_exists, dim=(-1, -2)))
    loss = torch.sum(loss, dim=-1)

    # Gate by resolution: the comparison yields a 0/1 factor per example.
    loss = loss * (
        (resolution >= min_resolution) & (resolution <= max_resolution)
    )

    # Average over the batch dimension(s).
    loss = torch.mean(loss)

    return loss
1489
+
1490
+
1491
def masked_msa_loss(logits, true_msa, bert_mask, eps=1e-8, **kwargs):
    """
    Computes BERT-style masked MSA loss. Implements subsection 1.9.9.

    Args:
        logits: [*, N_seq, N_res, 23] predicted residue distribution
        true_msa: [*, N_seq, N_res] true MSA
        bert_mask: [*, N_seq, N_res] MSA mask
    Returns:
        Masked MSA loss
    """
    errors = softmax_cross_entropy(
        logits, torch.nn.functional.one_hot(true_msa, num_classes=23)
    )

    # FP16-friendly averaging. Equivalent to:
    # loss = (
    #     torch.sum(errors * bert_mask, dim=(-1, -2)) /
    #     (eps + torch.sum(bert_mask, dim=(-1, -2)))
    # )
    # The scale factor keeps intermediate sums small enough for half
    # precision; it cancels out between numerator and denominator.
    loss = errors * bert_mask
    loss = torch.sum(loss, dim=-1)
    scale = 0.5
    denom = eps + torch.sum(scale * bert_mask, dim=(-1, -2))
    loss = loss / denom[..., None]
    loss = torch.sum(loss, dim=-1)
    loss = loss * scale

    # Average over the batch dimension(s).
    loss = torch.mean(loss)

    return loss
1522
+
1523
+
1524
class AlphaFoldLoss(nn.Module):
    """Aggregation of the various losses described in the supplement"""
    def __init__(self, config):
        # config: nested config with one sub-config per loss term, each
        # carrying at least a `weight` entry.
        super(AlphaFoldLoss, self).__init__()
        self.config = config

    def forward(self, out, batch, _return_breakdown=False):
        """Compute the weighted sum of all enabled loss terms.

        Args:
            out: Model output dict (structure-module positions, logits of
                the auxiliary heads, etc.).
            batch: Feature dict; also mutated here to carry the renamed
                ground truth.
            _return_breakdown: If True, also return a dict mapping each
                loss name to its unweighted, detached value.

        Returns:
            The scaled cumulative loss, or (loss, breakdown) when
            _return_breakdown is set.
        """
        # Lazily compute violations from the last structure-module layer
        # if the caller has not already done so.
        if "violation" not in out.keys():
            out["violation"] = find_structural_violations(
                batch,
                out["sm"]["positions"][-1],
                **self.config.violation,
            )

        # Resolve symmetric side-chain naming against the prediction once,
        # so all subsequent losses pull atoms in the same direction.
        if "renamed_atom14_gt_positions" not in out.keys():
            batch.update(
                compute_renamed_ground_truth(
                    batch,
                    out["sm"]["positions"][-1],
                )
            )

        # Each entry is a thunk so a disabled/failed loss costs nothing
        # until it is actually invoked below.
        loss_fns = {
            "distogram": lambda: distogram_loss(
                logits=out["distogram_logits"],
                **{**batch, **self.config.distogram},
            ),
            "experimentally_resolved": lambda: experimentally_resolved_loss(
                logits=out["experimentally_resolved_logits"],
                **{**batch, **self.config.experimentally_resolved},
            ),
            "fape": lambda: fape_loss(
                out,
                batch,
                self.config.fape,
            ),
            "lddt": lambda: lddt_loss(
                logits=out["lddt_logits"],
                all_atom_pred_pos=out["final_atom_positions"],
                **{**batch, **self.config.lddt},
            ),
            "masked_msa": lambda: masked_msa_loss(
                logits=out["masked_msa_logits"],
                **{**batch, **self.config.masked_msa},
            ),
            "supervised_chi": lambda: supervised_chi_loss(
                out["sm"]["angles"],
                out["sm"]["unnormalized_angles"],
                **{**batch, **self.config.supervised_chi},
            ),
            "violation": lambda: violation_loss(
                out["violation"],
                **batch,
            ),
        }

        if(self.config.tm.enabled):
            loss_fns["tm"] = lambda: tm_loss(
                logits=out["tm_logits"],
                **{**batch, **out, **self.config.tm},
            )

        cum_loss = 0.
        losses = {}
        for loss_name, loss_fn in loss_fns.items():
            weight = self.config[loss_name].weight
            loss = loss_fn()
            # A NaN/inf loss is zeroed (with grad) rather than poisoning
            # the whole training step.
            if(torch.isnan(loss) or torch.isinf(loss)):
                #for k,v in batch.items():
                #    if(torch.any(torch.isnan(v)) or torch.any(torch.isinf(v))):
                #        logging.warning(f"{k}: is nan")
                #logging.warning(f"{loss_name}: {loss}")
                logging.warning(f"{loss_name} loss is NaN. Skipping...")
                loss = loss.new_tensor(0., requires_grad=True)
            cum_loss = cum_loss + weight * loss
            losses[loss_name] = loss.detach().clone()

        losses["unscaled_loss"] = cum_loss.detach().clone()

        # Scale the loss by the square root of the minimum of the crop size and
        # the (average) sequence length. See subsection 1.9.
        seq_len = torch.mean(batch["seq_length"].float())
        crop_len = batch["aatype"].shape[-1]
        cum_loss = cum_loss * torch.sqrt(min(seq_len, crop_len))

        losses["loss"] = cum_loss.detach().clone()

        if(not _return_breakdown):
            return cum_loss

        return cum_loss, losses
openfold/utils/rigid_utils.py ADDED
@@ -0,0 +1,1367 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from __future__ import annotations
17
+ from typing import Tuple, Any, Sequence, Callable, Optional
18
+
19
+ import numpy as np
20
+ import torch
21
+
22
+
23
def rot_matmul(
    a: torch.Tensor,
    b: torch.Tensor
) -> torch.Tensor:
    """
    Performs matrix multiplication of two rotation matrix tensors. The
    products are expanded element-by-element (rather than via matmul) to
    avoid AMP downcasting.

    Args:
        a: [*, 3, 3] left multiplicand
        b: [*, 3, 3] right multiplicand
    Returns:
        The product ab
    """
    rows = []
    for row_idx in range(3):
        # Entry (row_idx, col_idx) of the product, written out explicitly.
        entries = [
            a[..., row_idx, 0] * b[..., 0, col_idx]
            + a[..., row_idx, 1] * b[..., 1, col_idx]
            + a[..., row_idx, 2] * b[..., 2, col_idx]
            for col_idx in range(3)
        ]
        rows.append(torch.stack(entries, dim=-1))

    return torch.stack(rows, dim=-2)
61
+
62
+
63
def rot_vec_mul(
    r: torch.Tensor,
    t: torch.Tensor
) -> torch.Tensor:
    """
    Applies a rotation to a vector. The matrix-vector product is written
    out component-by-component to avoid AMP downcasting.

    Args:
        r: [*, 3, 3] rotation matrices
        t: [*, 3] coordinate tensors
    Returns:
        [*, 3] rotated coordinates
    """
    components = torch.unbind(t, dim=-1)
    rotated = [
        r[..., i, 0] * components[0]
        + r[..., i, 1] * components[1]
        + r[..., i, 2] * components[2]
        for i in range(3)
    ]
    return torch.stack(rotated, dim=-1)
86
+
87
+
88
def identity_rot_mats(
    batch_dims: Tuple[int],
    dtype: Optional[torch.dtype] = None,
    device: Optional[torch.device] = None,
    requires_grad: bool = True,
) -> torch.Tensor:
    """Build a batch of 3x3 identity rotation matrices.

    The identity is allocated once and broadcast (via expand, no copy)
    over the requested batch dimensions.
    """
    eye = torch.eye(
        3, dtype=dtype, device=device, requires_grad=requires_grad
    )
    broadcast_shape = (1,) * len(batch_dims) + (3, 3)
    return eye.view(broadcast_shape).expand(*batch_dims, -1, -1)
101
+
102
+
103
def identity_trans(
    batch_dims: Tuple[int],
    dtype: Optional[torch.dtype] = None,
    device: Optional[torch.device] = None,
    requires_grad: bool = True,
) -> torch.Tensor:
    """Build a batch of identity (all-zero) translation vectors of shape
    [*batch_dims, 3]."""
    shape = tuple(batch_dims) + (3,)
    return torch.zeros(
        shape,
        dtype=dtype,
        device=device,
        requires_grad=requires_grad,
    )
116
+
117
+
118
def identity_quats(
    batch_dims: Tuple[int],
    dtype: Optional[torch.dtype] = None,
    device: Optional[torch.device] = None,
    requires_grad: bool = True,
) -> torch.Tensor:
    """Build a batch of identity quaternions (1, 0, 0, 0) of shape
    [*batch_dims, 4]."""
    shape = (*batch_dims, 4)
    quats = torch.zeros(
        shape, dtype=dtype, device=device, requires_grad=requires_grad
    )

    # Set the real component in-place without recording it on the graph.
    with torch.no_grad():
        quats[..., 0] = 1

    return quats
135
+
136
+
137
# Quaternion component labels and a lookup from each of the 16 pairwise
# component products ("aa", "ab", ..., "dd") to its flat index. Used by
# _to_mat below to build the quaternion-to-rotation coefficient tensor.
_quat_elements = ["a", "b", "c", "d"]
_qtr_keys = [l1 + l2 for l1 in _quat_elements for l2 in _quat_elements]
_qtr_ind_dict = {key: ind for ind, key in enumerate(_qtr_keys)}
140
+
141
+
142
def _to_mat(pairs):
    """Build a 4x4 coefficient matrix from (component-pair, value) pairs.

    Each key (e.g. "bc") selects one cell of the matrix via
    _qtr_ind_dict; unspecified cells stay zero.
    """
    mat = np.zeros((4, 4))
    for pair in pairs:
        key, value = pair
        ind = _qtr_ind_dict[key]
        mat[ind // 4][ind % 4] = value

    return mat
150
+
151
+
152
# [4, 4, 3, 3] coefficient tensor for quaternion-to-rotation conversion:
# entry [i, j, r, c] is the coefficient of the quaternion product q_i*q_j
# in rotation-matrix element (r, c). Consumed by quat_to_rot.
_QTR_MAT = np.zeros((4, 4, 3, 3))
_QTR_MAT[..., 0, 0] = _to_mat([("aa", 1), ("bb", 1), ("cc", -1), ("dd", -1)])
_QTR_MAT[..., 0, 1] = _to_mat([("bc", 2), ("ad", -2)])
_QTR_MAT[..., 0, 2] = _to_mat([("bd", 2), ("ac", 2)])
_QTR_MAT[..., 1, 0] = _to_mat([("bc", 2), ("ad", 2)])
_QTR_MAT[..., 1, 1] = _to_mat([("aa", 1), ("bb", -1), ("cc", 1), ("dd", -1)])
_QTR_MAT[..., 1, 2] = _to_mat([("cd", 2), ("ab", -2)])
_QTR_MAT[..., 2, 0] = _to_mat([("bd", 2), ("ac", -2)])
_QTR_MAT[..., 2, 1] = _to_mat([("cd", 2), ("ab", 2)])
_QTR_MAT[..., 2, 2] = _to_mat([("aa", 1), ("bb", -1), ("cc", -1), ("dd", 1)])
162
+
163
+
164
def quat_to_rot(quat: torch.Tensor) -> torch.Tensor:
    """
    Converts a quaternion to a rotation matrix.

    Args:
        quat: [*, 4] quaternions
    Returns:
        [*, 3, 3] rotation matrices
    """
    # Outer product of the quaternion with itself.
    # [*, 4, 4]
    quat = quat[..., None] * quat[..., None, :]

    # Coefficients of each q_i*q_j term per matrix element.
    # [4, 4, 3, 3]
    mat = quat.new_tensor(_QTR_MAT, requires_grad=False)

    # Weight the coefficient tensor by the quaternion products.
    # [*, 4, 4, 3, 3]
    shaped_qtr_mat = mat.view((1,) * len(quat.shape[:-2]) + mat.shape)
    quat = quat[..., None, None] * shaped_qtr_mat

    # Sum over the two quaternion-product dims to get the matrices.
    # [*, 3, 3]
    return torch.sum(quat, dim=(-3, -4))
185
+
186
+
187
def rot_to_quat(
    rot: torch.Tensor,
):
    """Convert [*, 3, 3] rotation matrices to [*, 4] quaternions.

    Builds the symmetric 4x4 matrix K whose dominant eigenvector is the
    quaternion, then extracts it with an eigendecomposition. The sign of
    the returned quaternion is arbitrary (q and -q encode one rotation).
    """
    if(rot.shape[-2:] != (3, 3)):
        raise ValueError("Input rotation is incorrectly shaped")

    xx, xy, xz = rot[..., 0, 0], rot[..., 0, 1], rot[..., 0, 2]
    yx, yy, yz = rot[..., 1, 0], rot[..., 1, 1], rot[..., 1, 2]
    zx, zy, zz = rot[..., 2, 0], rot[..., 2, 1], rot[..., 2, 2]

    rows = [
        [xx + yy + zz, zy - yz, xz - zx, yx - xy],
        [zy - yz, xx - yy - zz, xy + yx, xz + zx],
        [xz - zx, xy + yx, yy - xx - zz, yz + zy],
        [yx - xy, xz + zx, yz + zy, zz - xx - yy],
    ]
    k = torch.stack(
        [torch.stack(row, dim=-1) for row in rows], dim=-2
    ) * (1. / 3.)

    # eigh returns eigenvalues ascending, so the last eigenvector belongs
    # to the largest eigenvalue.
    _, vectors = torch.linalg.eigh(k)
    return vectors[..., -1]
207
+
208
+
209
# [4, 4, 4] structure-constant tensor for Hamilton quaternion products:
# entry [i, j, k] is the coefficient of q1_i * q2_j in output component k.
# Consumed by quat_multiply / quat_multiply_by_vec.
_QUAT_MULTIPLY = np.zeros((4, 4, 4))
_QUAT_MULTIPLY[:, :, 0] = [[ 1, 0, 0, 0],
                           [ 0,-1, 0, 0],
                           [ 0, 0,-1, 0],
                           [ 0, 0, 0,-1]]

_QUAT_MULTIPLY[:, :, 1] = [[ 0, 1, 0, 0],
                           [ 1, 0, 0, 0],
                           [ 0, 0, 0, 1],
                           [ 0, 0,-1, 0]]

_QUAT_MULTIPLY[:, :, 2] = [[ 0, 0, 1, 0],
                           [ 0, 0, 0,-1],
                           [ 1, 0, 0, 0],
                           [ 0, 1, 0, 0]]

_QUAT_MULTIPLY[:, :, 3] = [[ 0, 0, 0, 1],
                           [ 0, 0, 1, 0],
                           [ 0,-1, 0, 0],
                           [ 1, 0, 0, 0]]

# Rows 1: of the second index drop the real part of the second operand,
# i.e. multiplication by a pure-vector quaternion.
_QUAT_MULTIPLY_BY_VEC = _QUAT_MULTIPLY[:, 1:, :]
231
+
232
+
233
def quat_multiply(quat1, quat2):
    """Multiply a quaternion by another quaternion.

    Args:
        quat1: [*, 4] left operand
        quat2: [*, 4] right operand
    Returns:
        [*, 4] Hamilton product quat1 * quat2
    """
    mat = quat1.new_tensor(_QUAT_MULTIPLY)
    # Broadcast the structure constants over the batch dims, then contract
    # with both operands.
    reshaped_mat = mat.view((1,) * len(quat1.shape[:-1]) + mat.shape)
    return torch.sum(
        reshaped_mat *
        quat1[..., :, None, None] *
        quat2[..., None, :, None],
        dim=(-3, -2)
    )
243
+
244
+
245
def quat_multiply_by_vec(quat, vec):
    """Multiply a quaternion by a pure-vector quaternion.

    Args:
        quat: [*, 4] quaternion
        vec: [*, 3] vector part of a pure quaternion (real part 0)
    Returns:
        [*, 4] Hamilton product quat * (0, vec)
    """
    mat = quat.new_tensor(_QUAT_MULTIPLY_BY_VEC)
    # Same contraction as quat_multiply, with the real row of the second
    # operand dropped.
    reshaped_mat = mat.view((1,) * len(quat.shape[:-1]) + mat.shape)
    return torch.sum(
        reshaped_mat *
        quat[..., :, None, None] *
        vec[..., None, :, None],
        dim=(-3, -2)
    )
255
+
256
+
257
def invert_rot_mat(rot_mat: torch.Tensor):
    """Invert a rotation matrix: for orthonormal matrices the inverse is
    the transpose of the last two dimensions."""
    return torch.transpose(rot_mat, -1, -2)
259
+
260
+
261
def invert_quat(quat: torch.Tensor):
    """Invert a quaternion: conjugate divided by the squared norm
    (reduces to the conjugate for unit quaternions)."""
    conjugate = torch.cat([quat[..., :1], -quat[..., 1:]], dim=-1)
    squared_norm = torch.sum(quat ** 2, dim=-1, keepdim=True)
    return conjugate / squared_norm
266
+
267
+
268
+ class Rotation:
269
+ """
270
+ A 3D rotation. Depending on how the object is initialized, the
271
+ rotation is represented by either a rotation matrix or a
272
+ quaternion, though both formats are made available by helper functions.
273
+ To simplify gradient computation, the underlying format of the
274
+ rotation cannot be changed in-place. Like Rigid, the class is designed
275
+ to mimic the behavior of a torch Tensor, almost as if each Rotation
276
+ object were a tensor of rotations, in one format or another.
277
+ """
278
+ def __init__(self,
279
+ rot_mats: Optional[torch.Tensor] = None,
280
+ quats: Optional[torch.Tensor] = None,
281
+ normalize_quats: bool = True,
282
+ ):
283
+ """
284
+ Args:
285
+ rot_mats:
286
+ A [*, 3, 3] rotation matrix tensor. Mutually exclusive with
287
+ quats
288
+ quats:
289
+ A [*, 4] quaternion. Mutually exclusive with rot_mats. If
290
+ normalize_quats is not True, must be a unit quaternion
291
+ normalize_quats:
292
+ If quats is specified, whether to normalize quats
293
+ """
294
+ if((rot_mats is None and quats is None) or
295
+ (rot_mats is not None and quats is not None)):
296
+ raise ValueError("Exactly one input argument must be specified")
297
+
298
+ if((rot_mats is not None and rot_mats.shape[-2:] != (3, 3)) or
299
+ (quats is not None and quats.shape[-1] != 4)):
300
+ raise ValueError(
301
+ "Incorrectly shaped rotation matrix or quaternion"
302
+ )
303
+
304
+ # Force full-precision
305
+ if(quats is not None):
306
+ quats = quats.to(dtype=torch.float32)
307
+ if(rot_mats is not None):
308
+ rot_mats = rot_mats.to(dtype=torch.float32)
309
+
310
+ if(quats is not None and normalize_quats):
311
+ quats = quats / torch.linalg.norm(quats, dim=-1, keepdim=True)
312
+
313
+ self._rot_mats = rot_mats
314
+ self._quats = quats
315
+
316
+ @staticmethod
317
+ def identity(
318
+ shape,
319
+ dtype: Optional[torch.dtype] = None,
320
+ device: Optional[torch.device] = None,
321
+ requires_grad: bool = True,
322
+ fmt: str = "quat",
323
+ ) -> Rotation:
324
+ """
325
+ Returns an identity Rotation.
326
+
327
+ Args:
328
+ shape:
329
+ The "shape" of the resulting Rotation object. See documentation
330
+ for the shape property
331
+ dtype:
332
+ The torch dtype for the rotation
333
+ device:
334
+ The torch device for the new rotation
335
+ requires_grad:
336
+ Whether the underlying tensors in the new rotation object
337
+ should require gradient computation
338
+ fmt:
339
+ One of "quat" or "rot_mat". Determines the underlying format
340
+ of the new object's rotation
341
+ Returns:
342
+ A new identity rotation
343
+ """
344
+ if(fmt == "rot_mat"):
345
+ rot_mats = identity_rot_mats(
346
+ shape, dtype, device, requires_grad,
347
+ )
348
+ return Rotation(rot_mats=rot_mats, quats=None)
349
+ elif(fmt == "quat"):
350
+ quats = identity_quats(shape, dtype, device, requires_grad)
351
+ return Rotation(rot_mats=None, quats=quats, normalize_quats=False)
352
+ else:
353
+ raise ValueError(f"Invalid format: f{fmt}")
354
+
355
+ # Magic methods
356
+
357
+ def __getitem__(self, index: Any) -> Rotation:
358
+ """
359
+ Allows torch-style indexing over the virtual shape of the rotation
360
+ object. See documentation for the shape property.
361
+
362
+ Args:
363
+ index:
364
+ A torch index. E.g. (1, 3, 2), or (slice(None,))
365
+ Returns:
366
+ The indexed rotation
367
+ """
368
+ if type(index) != tuple:
369
+ index = (index,)
370
+
371
+ if(self._rot_mats is not None):
372
+ rot_mats = self._rot_mats[index + (slice(None), slice(None))]
373
+ return Rotation(rot_mats=rot_mats)
374
+ elif(self._quats is not None):
375
+ quats = self._quats[index + (slice(None),)]
376
+ return Rotation(quats=quats, normalize_quats=False)
377
+ else:
378
+ raise ValueError("Both rotations are None")
379
+
380
+ def __mul__(self,
381
+ right: torch.Tensor,
382
+ ) -> Rotation:
383
+ """
384
+ Pointwise left multiplication of the rotation with a tensor. Can be
385
+ used to e.g. mask the Rotation.
386
+
387
+ Args:
388
+ right:
389
+ The tensor multiplicand
390
+ Returns:
391
+ The product
392
+ """
393
+ if not(isinstance(right, torch.Tensor)):
394
+ raise TypeError("The other multiplicand must be a Tensor")
395
+
396
+ if(self._rot_mats is not None):
397
+ rot_mats = self._rot_mats * right[..., None, None]
398
+ return Rotation(rot_mats=rot_mats, quats=None)
399
+ elif(self._quats is not None):
400
+ quats = self._quats * right[..., None]
401
+ return Rotation(rot_mats=None, quats=quats, normalize_quats=False)
402
+ else:
403
+ raise ValueError("Both rotations are None")
404
+
405
+ def __rmul__(self,
406
+ left: torch.Tensor,
407
+ ) -> Rotation:
408
+ """
409
+ Reverse pointwise multiplication of the rotation with a tensor.
410
+
411
+ Args:
412
+ left:
413
+ The left multiplicand
414
+ Returns:
415
+ The product
416
+ """
417
+ return self.__mul__(left)
418
+
419
+ # Properties
420
+
421
+ @property
422
+ def shape(self) -> torch.Size:
423
+ """
424
+ Returns the virtual shape of the rotation object. This shape is
425
+ defined as the batch dimensions of the underlying rotation matrix
426
+ or quaternion. If the Rotation was initialized with a [10, 3, 3]
427
+ rotation matrix tensor, for example, the resulting shape would be
428
+ [10].
429
+
430
+ Returns:
431
+ The virtual shape of the rotation object
432
+ """
433
+ s = None
434
+ if(self._quats is not None):
435
+ s = self._quats.shape[:-1]
436
+ else:
437
+ s = self._rot_mats.shape[:-2]
438
+
439
+ return s
440
+
441
+ @property
442
+ def dtype(self) -> torch.dtype:
443
+ """
444
+ Returns the dtype of the underlying rotation.
445
+
446
+ Returns:
447
+ The dtype of the underlying rotation
448
+ """
449
+ if(self._rot_mats is not None):
450
+ return self._rot_mats.dtype
451
+ elif(self._quats is not None):
452
+ return self._quats.dtype
453
+ else:
454
+ raise ValueError("Both rotations are None")
455
+
456
+ @property
457
+ def device(self) -> torch.device:
458
+ """
459
+ The device of the underlying rotation
460
+
461
+ Returns:
462
+ The device of the underlying rotation
463
+ """
464
+ if(self._rot_mats is not None):
465
+ return self._rot_mats.device
466
+ elif(self._quats is not None):
467
+ return self._quats.device
468
+ else:
469
+ raise ValueError("Both rotations are None")
470
+
471
+ @property
472
+ def requires_grad(self) -> bool:
473
+ """
474
+ Returns the requires_grad property of the underlying rotation
475
+
476
+ Returns:
477
+ The requires_grad property of the underlying tensor
478
+ """
479
+ if(self._rot_mats is not None):
480
+ return self._rot_mats.requires_grad
481
+ elif(self._quats is not None):
482
+ return self._quats.requires_grad
483
+ else:
484
+ raise ValueError("Both rotations are None")
485
+
486
+ def get_rot_mats(self) -> torch.Tensor:
487
+ """
488
+ Returns the underlying rotation as a rotation matrix tensor.
489
+
490
+ Returns:
491
+ The rotation as a rotation matrix tensor
492
+ """
493
+ rot_mats = self._rot_mats
494
+ if(rot_mats is None):
495
+ if(self._quats is None):
496
+ raise ValueError("Both rotations are None")
497
+ else:
498
+ rot_mats = quat_to_rot(self._quats)
499
+
500
+ return rot_mats
501
+
502
+ def get_quats(self) -> torch.Tensor:
503
+ """
504
+ Returns the underlying rotation as a quaternion tensor.
505
+
506
+ Depending on whether the Rotation was initialized with a
507
+ quaternion, this function may call torch.linalg.eigh.
508
+
509
+ Returns:
510
+ The rotation as a quaternion tensor.
511
+ """
512
+ quats = self._quats
513
+ if(quats is None):
514
+ if(self._rot_mats is None):
515
+ raise ValueError("Both rotations are None")
516
+ else:
517
+ quats = rot_to_quat(self._rot_mats)
518
+
519
+ return quats
520
+
521
+ def get_cur_rot(self) -> torch.Tensor:
522
+ """
523
+ Return the underlying rotation in its current form
524
+
525
+ Returns:
526
+ The stored rotation
527
+ """
528
+ if(self._rot_mats is not None):
529
+ return self._rot_mats
530
+ elif(self._quats is not None):
531
+ return self._quats
532
+ else:
533
+ raise ValueError("Both rotations are None")
534
+
535
+ # Rotation functions
536
+
537
+ def compose_q_update_vec(self,
538
+ q_update_vec: torch.Tensor,
539
+ normalize_quats: bool = True
540
+ ) -> Rotation:
541
+ """
542
+ Returns a new quaternion Rotation after updating the current
543
+ object's underlying rotation with a quaternion update, formatted
544
+ as a [*, 3] tensor whose final three columns represent x, y, z such
545
+ that (1, x, y, z) is the desired (not necessarily unit) quaternion
546
+ update.
547
+
548
+ Args:
549
+ q_update_vec:
550
+ A [*, 3] quaternion update tensor
551
+ normalize_quats:
552
+ Whether to normalize the output quaternion
553
+ Returns:
554
+ An updated Rotation
555
+ """
556
+ quats = self.get_quats()
557
+ new_quats = quats + quat_multiply_by_vec(quats, q_update_vec)
558
+ return Rotation(
559
+ rot_mats=None,
560
+ quats=new_quats,
561
+ normalize_quats=normalize_quats,
562
+ )
563
+
564
+ def compose_r(self, r: Rotation) -> Rotation:
565
+ """
566
+ Compose the rotation matrices of the current Rotation object with
567
+ those of another.
568
+
569
+ Args:
570
+ r:
571
+ An update rotation object
572
+ Returns:
573
+ An updated rotation object
574
+ """
575
+ r1 = self.get_rot_mats()
576
+ r2 = r.get_rot_mats()
577
+ new_rot_mats = rot_matmul(r1, r2)
578
+ return Rotation(rot_mats=new_rot_mats, quats=None)
579
+
580
+ def compose_q(self, r: Rotation, normalize_quats: bool = True) -> Rotation:
581
+ """
582
+ Compose the quaternions of the current Rotation object with those
583
+ of another.
584
+
585
+ Depending on whether either Rotation was initialized with
586
+ quaternions, this function may call torch.linalg.eigh.
587
+
588
+ Args:
589
+ r:
590
+ An update rotation object
591
+ Returns:
592
+ An updated rotation object
593
+ """
594
+ q1 = self.get_quats()
595
+ q2 = r.get_quats()
596
+ new_quats = quat_multiply(q1, q2)
597
+ return Rotation(
598
+ rot_mats=None, quats=new_quats, normalize_quats=normalize_quats
599
+ )
600
+
601
+ def apply(self, pts: torch.Tensor) -> torch.Tensor:
602
+ """
603
+ Apply the current Rotation as a rotation matrix to a set of 3D
604
+ coordinates.
605
+
606
+ Args:
607
+ pts:
608
+ A [*, 3] set of points
609
+ Returns:
610
+ [*, 3] rotated points
611
+ """
612
+ rot_mats = self.get_rot_mats()
613
+ return rot_vec_mul(rot_mats, pts)
614
+
615
+ def invert_apply(self, pts: torch.Tensor) -> torch.Tensor:
616
+ """
617
+ The inverse of the apply() method.
618
+
619
+ Args:
620
+ pts:
621
+ A [*, 3] set of points
622
+ Returns:
623
+ [*, 3] inverse-rotated points
624
+ """
625
+ rot_mats = self.get_rot_mats()
626
+ inv_rot_mats = invert_rot_mat(rot_mats)
627
+ return rot_vec_mul(inv_rot_mats, pts)
628
+
629
+ def invert(self) -> Rotation:
630
+ """
631
+ Returns the inverse of the current Rotation.
632
+
633
+ Returns:
634
+ The inverse of the current Rotation
635
+ """
636
+ if(self._rot_mats is not None):
637
+ return Rotation(
638
+ rot_mats=invert_rot_mat(self._rot_mats),
639
+ quats=None
640
+ )
641
+ elif(self._quats is not None):
642
+ return Rotation(
643
+ rot_mats=None,
644
+ quats=invert_quat(self._quats),
645
+ normalize_quats=False,
646
+ )
647
+ else:
648
+ raise ValueError("Both rotations are None")
649
+
650
+ # "Tensor" stuff
651
+
652
+ def unsqueeze(self,
653
+ dim: int,
654
+ ) -> Rigid:
655
+ """
656
+ Analogous to torch.unsqueeze. The dimension is relative to the
657
+ shape of the Rotation object.
658
+
659
+ Args:
660
+ dim: A positive or negative dimension index.
661
+ Returns:
662
+ The unsqueezed Rotation.
663
+ """
664
+ if dim >= len(self.shape):
665
+ raise ValueError("Invalid dimension")
666
+
667
+ if(self._rot_mats is not None):
668
+ rot_mats = self._rot_mats.unsqueeze(dim if dim >= 0 else dim - 2)
669
+ return Rotation(rot_mats=rot_mats, quats=None)
670
+ elif(self._quats is not None):
671
+ quats = self._quats.unsqueeze(dim if dim >= 0 else dim - 1)
672
+ return Rotation(rot_mats=None, quats=quats, normalize_quats=False)
673
+ else:
674
+ raise ValueError("Both rotations are None")
675
+
676
+ @staticmethod
677
+ def cat(
678
+ rs: Sequence[Rotation],
679
+ dim: int,
680
+ ) -> Rigid:
681
+ """
682
+ Concatenates rotations along one of the batch dimensions. Analogous
683
+ to torch.cat().
684
+
685
+ Note that the output of this operation is always a rotation matrix,
686
+ regardless of the format of input rotations.
687
+
688
+ Args:
689
+ rs:
690
+ A list of rotation objects
691
+ dim:
692
+ The dimension along which the rotations should be
693
+ concatenated
694
+ Returns:
695
+ A concatenated Rotation object in rotation matrix format
696
+ """
697
+ rot_mats = [r.get_rot_mats() for r in rs]
698
+ rot_mats = torch.cat(rot_mats, dim=dim if dim >= 0 else dim - 2)
699
+
700
+ return Rotation(rot_mats=rot_mats, quats=None)
701
+
702
+ def map_tensor_fn(self,
703
+ fn: Callable[torch.Tensor, torch.Tensor]
704
+ ) -> Rotation:
705
+ """
706
+ Apply a Tensor -> Tensor function to underlying rotation tensors,
707
+ mapping over the rotation dimension(s). Can be used e.g. to sum out
708
+ a one-hot batch dimension.
709
+
710
+ Args:
711
+ fn:
712
+ A Tensor -> Tensor function to be mapped over the Rotation
713
+ Returns:
714
+ The transformed Rotation object
715
+ """
716
+ if(self._rot_mats is not None):
717
+ rot_mats = self._rot_mats.view(self._rot_mats.shape[:-2] + (9,))
718
+ rot_mats = torch.stack(
719
+ list(map(fn, torch.unbind(rot_mats, dim=-1))), dim=-1
720
+ )
721
+ rot_mats = rot_mats.view(rot_mats.shape[:-1] + (3, 3))
722
+ return Rotation(rot_mats=rot_mats, quats=None)
723
+ elif(self._quats is not None):
724
+ quats = torch.stack(
725
+ list(map(fn, torch.unbind(self._quats, dim=-1))), dim=-1
726
+ )
727
+ return Rotation(rot_mats=None, quats=quats, normalize_quats=False)
728
+ else:
729
+ raise ValueError("Both rotations are None")
730
+
731
+ def cuda(self) -> Rotation:
732
+ """
733
+ Analogous to the cuda() method of torch Tensors
734
+
735
+ Returns:
736
+ A copy of the Rotation in CUDA memory
737
+ """
738
+ if(self._rot_mats is not None):
739
+ return Rotation(rot_mats=self._rot_mats.cuda(), quats=None)
740
+ elif(self._quats is not None):
741
+ return Rotation(
742
+ rot_mats=None,
743
+ quats=self._quats.cuda(),
744
+ normalize_quats=False
745
+ )
746
+ else:
747
+ raise ValueError("Both rotations are None")
748
+
749
+ def to(self,
750
+ device: Optional[torch.device],
751
+ dtype: Optional[torch.dtype]
752
+ ) -> Rotation:
753
+ """
754
+ Analogous to the to() method of torch Tensors
755
+
756
+ Args:
757
+ device:
758
+ A torch device
759
+ dtype:
760
+ A torch dtype
761
+ Returns:
762
+ A copy of the Rotation using the new device and dtype
763
+ """
764
+ if(self._rot_mats is not None):
765
+ return Rotation(
766
+ rot_mats=self._rot_mats.to(device=device, dtype=dtype),
767
+ quats=None,
768
+ )
769
+ elif(self._quats is not None):
770
+ return Rotation(
771
+ rot_mats=None,
772
+ quats=self._quats.to(device=device, dtype=dtype),
773
+ normalize_quats=False,
774
+ )
775
+ else:
776
+ raise ValueError("Both rotations are None")
777
+
778
+ def detach(self) -> Rotation:
779
+ """
780
+ Returns a copy of the Rotation whose underlying Tensor has been
781
+ detached from its torch graph.
782
+
783
+ Returns:
784
+ A copy of the Rotation whose underlying Tensor has been detached
785
+ from its torch graph
786
+ """
787
+ if(self._rot_mats is not None):
788
+ return Rotation(rot_mats=self._rot_mats.detach(), quats=None)
789
+ elif(self._quats is not None):
790
+ return Rotation(
791
+ rot_mats=None,
792
+ quats=self._quats.detach(),
793
+ normalize_quats=False,
794
+ )
795
+ else:
796
+ raise ValueError("Both rotations are None")
797
+
798
+
799
class Rigid:
    """
    A class representing a rigid transformation: a Rotation object paired
    with a [*, 3] translation. Designed to behave approximately like a
    single torch tensor with the shape of the shared batch dimensions of
    its component parts.
    """
    def __init__(self,
        rots: Optional[Rotation],
        trans: Optional[torch.Tensor],
    ):
        """
        Args:
            rots: A Rotation object of batch shape [*] (or None)
            trans: A corresponding [*, 3] translation tensor (or None)
        """
        # Derive batch shape/dtype/device/grad from whichever component is
        # given; the missing one is filled with an identity below.
        if trans is not None:
            batch_dims = trans.shape[:-1]
            dtype = trans.dtype
            device = trans.device
            requires_grad = trans.requires_grad
        elif rots is not None:
            batch_dims = rots.shape
            dtype = rots.dtype
            device = rots.device
            requires_grad = rots.requires_grad
        else:
            raise ValueError("At least one input argument must be specified")

        if rots is None:
            rots = Rotation.identity(
                batch_dims, dtype, device, requires_grad,
            )
        elif trans is None:
            trans = identity_trans(
                batch_dims, dtype, device, requires_grad,
            )

        if rots.shape != trans.shape[:-1] or rots.device != trans.device:
            raise ValueError("Rots and trans incompatible")

        # Force full precision. Happens to the rotations automatically.
        trans = trans.to(dtype=torch.float32)

        self._rots = rots
        self._trans = trans
849
+
850
+ @staticmethod
851
+ def identity(
852
+ shape: Tuple[int],
853
+ dtype: Optional[torch.dtype] = None,
854
+ device: Optional[torch.device] = None,
855
+ requires_grad: bool = True,
856
+ fmt: str = "quat",
857
+ ) -> Rigid:
858
+ """
859
+ Constructs an identity transformation.
860
+
861
+ Args:
862
+ shape:
863
+ The desired shape
864
+ dtype:
865
+ The dtype of both internal tensors
866
+ device:
867
+ The device of both internal tensors
868
+ requires_grad:
869
+ Whether grad should be enabled for the internal tensors
870
+ Returns:
871
+ The identity transformation
872
+ """
873
+ return Rigid(
874
+ Rotation.identity(shape, dtype, device, requires_grad, fmt=fmt),
875
+ identity_trans(shape, dtype, device, requires_grad),
876
+ )
877
+
878
+ def __getitem__(self,
879
+ index: Any,
880
+ ) -> Rigid:
881
+ """
882
+ Indexes the affine transformation with PyTorch-style indices.
883
+ The index is applied to the shared dimensions of both the rotation
884
+ and the translation.
885
+
886
+ E.g.::
887
+
888
+ r = Rotation(rot_mats=torch.rand(10, 10, 3, 3), quats=None)
889
+ t = Rigid(r, torch.rand(10, 10, 3))
890
+ indexed = t[3, 4:6]
891
+ assert(indexed.shape == (2,))
892
+ assert(indexed.get_rots().shape == (2,))
893
+ assert(indexed.get_trans().shape == (2, 3))
894
+
895
+ Args:
896
+ index: A standard torch tensor index. E.g. 8, (10, None, 3),
897
+ or (3, slice(0, 1, None))
898
+ Returns:
899
+ The indexed tensor
900
+ """
901
+ if type(index) != tuple:
902
+ index = (index,)
903
+
904
+ return Rigid(
905
+ self._rots[index],
906
+ self._trans[index + (slice(None),)],
907
+ )
908
+
909
+ def __mul__(self,
910
+ right: torch.Tensor,
911
+ ) -> Rigid:
912
+ """
913
+ Pointwise left multiplication of the transformation with a tensor.
914
+ Can be used to e.g. mask the Rigid.
915
+
916
+ Args:
917
+ right:
918
+ The tensor multiplicand
919
+ Returns:
920
+ The product
921
+ """
922
+ if not(isinstance(right, torch.Tensor)):
923
+ raise TypeError("The other multiplicand must be a Tensor")
924
+
925
+ new_rots = self._rots * right
926
+ new_trans = self._trans * right[..., None]
927
+
928
+ return Rigid(new_rots, new_trans)
929
+
930
+ def __rmul__(self,
931
+ left: torch.Tensor,
932
+ ) -> Rigid:
933
+ """
934
+ Reverse pointwise multiplication of the transformation with a
935
+ tensor.
936
+
937
+ Args:
938
+ left:
939
+ The left multiplicand
940
+ Returns:
941
+ The product
942
+ """
943
+ return self.__mul__(left)
944
+
945
+ @property
946
+ def shape(self) -> torch.Size:
947
+ """
948
+ Returns the shape of the shared dimensions of the rotation and
949
+ the translation.
950
+
951
+ Returns:
952
+ The shape of the transformation
953
+ """
954
+ s = self._trans.shape[:-1]
955
+ return s
956
+
957
+ @property
958
+ def device(self) -> torch.device:
959
+ """
960
+ Returns the device on which the Rigid's tensors are located.
961
+
962
+ Returns:
963
+ The device on which the Rigid's tensors are located
964
+ """
965
+ return self._trans.device
966
+
967
+ def get_rots(self) -> Rotation:
968
+ """
969
+ Getter for the rotation.
970
+
971
+ Returns:
972
+ The rotation object
973
+ """
974
+ return self._rots
975
+
976
+ def get_trans(self) -> torch.Tensor:
977
+ """
978
+ Getter for the translation.
979
+
980
+ Returns:
981
+ The stored translation
982
+ """
983
+ return self._trans
984
+
985
+ def compose_q_update_vec(self,
986
+ q_update_vec: torch.Tensor,
987
+ ) -> Rigid:
988
+ """
989
+ Composes the transformation with a quaternion update vector of
990
+ shape [*, 6], where the final 6 columns represent the x, y, and
991
+ z values of a quaternion of form (1, x, y, z) followed by a 3D
992
+ translation.
993
+
994
+ Args:
995
+ q_vec: The quaternion update vector.
996
+ Returns:
997
+ The composed transformation.
998
+ """
999
+ q_vec, t_vec = q_update_vec[..., :3], q_update_vec[..., 3:]
1000
+ new_rots = self._rots.compose_q_update_vec(q_vec)
1001
+
1002
+ trans_update = self._rots.apply(t_vec)
1003
+ new_translation = self._trans + trans_update
1004
+
1005
+ return Rigid(new_rots, new_translation)
1006
+
1007
+ def compose(self,
1008
+ r: Rigid,
1009
+ ) -> Rigid:
1010
+ """
1011
+ Composes the current rigid object with another.
1012
+
1013
+ Args:
1014
+ r:
1015
+ Another Rigid object
1016
+ Returns:
1017
+ The composition of the two transformations
1018
+ """
1019
+ new_rot = self._rots.compose_r(r._rots)
1020
+ new_trans = self._rots.apply(r._trans) + self._trans
1021
+ return Rigid(new_rot, new_trans)
1022
+
1023
+ def apply(self,
1024
+ pts: torch.Tensor,
1025
+ ) -> torch.Tensor:
1026
+ """
1027
+ Applies the transformation to a coordinate tensor.
1028
+
1029
+ Args:
1030
+ pts: A [*, 3] coordinate tensor.
1031
+ Returns:
1032
+ The transformed points.
1033
+ """
1034
+ rotated = self._rots.apply(pts)
1035
+ return rotated + self._trans
1036
+
1037
+ def invert_apply(self,
1038
+ pts: torch.Tensor
1039
+ ) -> torch.Tensor:
1040
+ """
1041
+ Applies the inverse of the transformation to a coordinate tensor.
1042
+
1043
+ Args:
1044
+ pts: A [*, 3] coordinate tensor
1045
+ Returns:
1046
+ The transformed points.
1047
+ """
1048
+ pts = pts - self._trans
1049
+ return self._rots.invert_apply(pts)
1050
+
1051
+ def invert(self) -> Rigid:
1052
+ """
1053
+ Inverts the transformation.
1054
+
1055
+ Returns:
1056
+ The inverse transformation.
1057
+ """
1058
+ rot_inv = self._rots.invert()
1059
+ trn_inv = rot_inv.apply(self._trans)
1060
+
1061
+ return Rigid(rot_inv, -1 * trn_inv)
1062
+
1063
+ def map_tensor_fn(self,
1064
+ fn: Callable[torch.Tensor, torch.Tensor]
1065
+ ) -> Rigid:
1066
+ """
1067
+ Apply a Tensor -> Tensor function to underlying translation and
1068
+ rotation tensors, mapping over the translation/rotation dimensions
1069
+ respectively.
1070
+
1071
+ Args:
1072
+ fn:
1073
+ A Tensor -> Tensor function to be mapped over the Rigid
1074
+ Returns:
1075
+ The transformed Rigid object
1076
+ """
1077
+ new_rots = self._rots.map_tensor_fn(fn)
1078
+ new_trans = torch.stack(
1079
+ list(map(fn, torch.unbind(self._trans, dim=-1))),
1080
+ dim=-1
1081
+ )
1082
+
1083
+ return Rigid(new_rots, new_trans)
1084
+
1085
+ def to_tensor_4x4(self) -> torch.Tensor:
1086
+ """
1087
+ Converts a transformation to a homogenous transformation tensor.
1088
+
1089
+ Returns:
1090
+ A [*, 4, 4] homogenous transformation tensor
1091
+ """
1092
+ tensor = self._trans.new_zeros((*self.shape, 4, 4))
1093
+ tensor[..., :3, :3] = self._rots.get_rot_mats()
1094
+ tensor[..., :3, 3] = self._trans
1095
+ tensor[..., 3, 3] = 1
1096
+ return tensor
1097
+
1098
+ @staticmethod
1099
+ def from_tensor_4x4(
1100
+ t: torch.Tensor
1101
+ ) -> Rigid:
1102
+ """
1103
+ Constructs a transformation from a homogenous transformation
1104
+ tensor.
1105
+
1106
+ Args:
1107
+ t: [*, 4, 4] homogenous transformation tensor
1108
+ Returns:
1109
+ T object with shape [*]
1110
+ """
1111
+ if(t.shape[-2:] != (4, 4)):
1112
+ raise ValueError("Incorrectly shaped input tensor")
1113
+
1114
+ rots = Rotation(rot_mats=t[..., :3, :3], quats=None)
1115
+ trans = t[..., :3, 3]
1116
+
1117
+ return Rigid(rots, trans)
1118
+
1119
+ def to_tensor_7(self) -> torch.Tensor:
1120
+ """
1121
+ Converts a transformation to a tensor with 7 final columns, four
1122
+ for the quaternion followed by three for the translation.
1123
+
1124
+ Returns:
1125
+ A [*, 7] tensor representation of the transformation
1126
+ """
1127
+ tensor = self._trans.new_zeros((*self.shape, 7))
1128
+ tensor[..., :4] = self._rots.get_quats()
1129
+ tensor[..., 4:] = self._trans
1130
+
1131
+ return tensor
1132
+
1133
+ @staticmethod
1134
+ def from_tensor_7(
1135
+ t: torch.Tensor,
1136
+ normalize_quats: bool = False,
1137
+ ) -> Rigid:
1138
+ if(t.shape[-1] != 7):
1139
+ raise ValueError("Incorrectly shaped input tensor")
1140
+
1141
+ quats, trans = t[..., :4], t[..., 4:]
1142
+
1143
+ rots = Rotation(
1144
+ rot_mats=None,
1145
+ quats=quats,
1146
+ normalize_quats=normalize_quats
1147
+ )
1148
+
1149
+ return Rigid(rots, trans)
1150
+
1151
+ @staticmethod
1152
+ def from_3_points(
1153
+ p_neg_x_axis: torch.Tensor,
1154
+ origin: torch.Tensor,
1155
+ p_xy_plane: torch.Tensor,
1156
+ eps: float = 1e-8
1157
+ ) -> Rigid:
1158
+ """
1159
+ Implements algorithm 21. Constructs transformations from sets of 3
1160
+ points using the Gram-Schmidt algorithm.
1161
+
1162
+ Args:
1163
+ p_neg_x_axis: [*, 3] coordinates
1164
+ origin: [*, 3] coordinates used as frame origins
1165
+ p_xy_plane: [*, 3] coordinates
1166
+ eps: Small epsilon value
1167
+ Returns:
1168
+ A transformation object of shape [*]
1169
+ """
1170
+ p_neg_x_axis = torch.unbind(p_neg_x_axis, dim=-1)
1171
+ origin = torch.unbind(origin, dim=-1)
1172
+ p_xy_plane = torch.unbind(p_xy_plane, dim=-1)
1173
+
1174
+ e0 = [c1 - c2 for c1, c2 in zip(origin, p_neg_x_axis)]
1175
+ e1 = [c1 - c2 for c1, c2 in zip(p_xy_plane, origin)]
1176
+
1177
+ denom = torch.sqrt(sum((c * c for c in e0)) + eps)
1178
+ e0 = [c / denom for c in e0]
1179
+ dot = sum((c1 * c2 for c1, c2 in zip(e0, e1)))
1180
+ e1 = [c2 - c1 * dot for c1, c2 in zip(e0, e1)]
1181
+ denom = torch.sqrt(sum((c * c for c in e1)) + eps)
1182
+ e1 = [c / denom for c in e1]
1183
+ e2 = [
1184
+ e0[1] * e1[2] - e0[2] * e1[1],
1185
+ e0[2] * e1[0] - e0[0] * e1[2],
1186
+ e0[0] * e1[1] - e0[1] * e1[0],
1187
+ ]
1188
+
1189
+ rots = torch.stack([c for tup in zip(e0, e1, e2) for c in tup], dim=-1)
1190
+ rots = rots.reshape(rots.shape[:-1] + (3, 3))
1191
+
1192
+ rot_obj = Rotation(rot_mats=rots, quats=None)
1193
+
1194
+ return Rigid(rot_obj, torch.stack(origin, dim=-1))
1195
+
1196
+ def unsqueeze(self,
1197
+ dim: int,
1198
+ ) -> Rigid:
1199
+ """
1200
+ Analogous to torch.unsqueeze. The dimension is relative to the
1201
+ shared dimensions of the rotation/translation.
1202
+
1203
+ Args:
1204
+ dim: A positive or negative dimension index.
1205
+ Returns:
1206
+ The unsqueezed transformation.
1207
+ """
1208
+ if dim >= len(self.shape):
1209
+ raise ValueError("Invalid dimension")
1210
+ rots = self._rots.unsqueeze(dim)
1211
+ trans = self._trans.unsqueeze(dim if dim >= 0 else dim - 1)
1212
+
1213
+ return Rigid(rots, trans)
1214
+
1215
+ @staticmethod
1216
+ def cat(
1217
+ ts: Sequence[Rigid],
1218
+ dim: int,
1219
+ ) -> Rigid:
1220
+ """
1221
+ Concatenates transformations along a new dimension.
1222
+
1223
+ Args:
1224
+ ts:
1225
+ A list of T objects
1226
+ dim:
1227
+ The dimension along which the transformations should be
1228
+ concatenated
1229
+ Returns:
1230
+ A concatenated transformation object
1231
+ """
1232
+ rots = Rotation.cat([t._rots for t in ts], dim)
1233
+ trans = torch.cat(
1234
+ [t._trans for t in ts], dim=dim if dim >= 0 else dim - 1
1235
+ )
1236
+
1237
+ return Rigid(rots, trans)
1238
+
1239
+ def apply_rot_fn(self, fn: Callable[Rotation, Rotation]) -> Rigid:
1240
+ """
1241
+ Applies a Rotation -> Rotation function to the stored rotation
1242
+ object.
1243
+
1244
+ Args:
1245
+ fn: A function of type Rotation -> Rotation
1246
+ Returns:
1247
+ A transformation object with a transformed rotation.
1248
+ """
1249
+ return Rigid(fn(self._rots), self._trans)
1250
+
1251
+ def apply_trans_fn(self, fn: Callable[torch.Tensor, torch.Tensor]) -> Rigid:
1252
+ """
1253
+ Applies a Tensor -> Tensor function to the stored translation.
1254
+
1255
+ Args:
1256
+ fn:
1257
+ A function of type Tensor -> Tensor to be applied to the
1258
+ translation
1259
+ Returns:
1260
+ A transformation object with a transformed translation.
1261
+ """
1262
+ return Rigid(self._rots, fn(self._trans))
1263
+
1264
+ def scale_translation(self, trans_scale_factor: float) -> Rigid:
1265
+ """
1266
+ Scales the translation by a constant factor.
1267
+
1268
+ Args:
1269
+ trans_scale_factor:
1270
+ The constant factor
1271
+ Returns:
1272
+ A transformation object with a scaled translation.
1273
+ """
1274
+ fn = lambda t: t * trans_scale_factor
1275
+ return self.apply_trans_fn(fn)
1276
+
1277
+ def stop_rot_gradient(self) -> Rigid:
1278
+ """
1279
+ Detaches the underlying rotation object
1280
+
1281
+ Returns:
1282
+ A transformation object with detached rotations
1283
+ """
1284
+ fn = lambda r: r.detach()
1285
+ return self.apply_rot_fn(fn)
1286
+
1287
+ @staticmethod
1288
+ def make_transform_from_reference(n_xyz, ca_xyz, c_xyz, eps=1e-20):
1289
+ """
1290
+ Returns a transformation object from reference coordinates.
1291
+
1292
+ Note that this method does not take care of symmetries. If you
1293
+ provide the atom positions in the non-standard way, the N atom will
1294
+ end up not at [-0.527250, 1.359329, 0.0] but instead at
1295
+ [-0.527250, -1.359329, 0.0]. You need to take care of such cases in
1296
+ your code.
1297
+
1298
+ Args:
1299
+ n_xyz: A [*, 3] tensor of nitrogen xyz coordinates.
1300
+ ca_xyz: A [*, 3] tensor of carbon alpha xyz coordinates.
1301
+ c_xyz: A [*, 3] tensor of carbon xyz coordinates.
1302
+ Returns:
1303
+ A transformation object. After applying the translation and
1304
+ rotation to the reference backbone, the coordinates will
1305
+ approximately equal to the input coordinates.
1306
+ """
1307
+ translation = -1 * ca_xyz
1308
+ n_xyz = n_xyz + translation
1309
+ c_xyz = c_xyz + translation
1310
+
1311
+ c_x, c_y, c_z = [c_xyz[..., i] for i in range(3)]
1312
+ norm = torch.sqrt(eps + c_x ** 2 + c_y ** 2)
1313
+ sin_c1 = -c_y / norm
1314
+ cos_c1 = c_x / norm
1315
+ zeros = sin_c1.new_zeros(sin_c1.shape)
1316
+ ones = sin_c1.new_ones(sin_c1.shape)
1317
+
1318
+ c1_rots = sin_c1.new_zeros((*sin_c1.shape, 3, 3))
1319
+ c1_rots[..., 0, 0] = cos_c1
1320
+ c1_rots[..., 0, 1] = -1 * sin_c1
1321
+ c1_rots[..., 1, 0] = sin_c1
1322
+ c1_rots[..., 1, 1] = cos_c1
1323
+ c1_rots[..., 2, 2] = 1
1324
+
1325
+ norm = torch.sqrt(eps + c_x ** 2 + c_y ** 2 + c_z ** 2)
1326
+ sin_c2 = c_z / norm
1327
+ cos_c2 = torch.sqrt(c_x ** 2 + c_y ** 2) / norm
1328
+
1329
+ c2_rots = sin_c2.new_zeros((*sin_c2.shape, 3, 3))
1330
+ c2_rots[..., 0, 0] = cos_c2
1331
+ c2_rots[..., 0, 2] = sin_c2
1332
+ c2_rots[..., 1, 1] = 1
1333
+ c2_rots[..., 2, 0] = -1 * sin_c2
1334
+ c2_rots[..., 2, 2] = cos_c2
1335
+
1336
+ c_rots = rot_matmul(c2_rots, c1_rots)
1337
+ n_xyz = rot_vec_mul(c_rots, n_xyz)
1338
+
1339
+ _, n_y, n_z = [n_xyz[..., i] for i in range(3)]
1340
+ norm = torch.sqrt(eps + n_y ** 2 + n_z ** 2)
1341
+ sin_n = -n_z / norm
1342
+ cos_n = n_y / norm
1343
+
1344
+ n_rots = sin_c2.new_zeros((*sin_c2.shape, 3, 3))
1345
+ n_rots[..., 0, 0] = 1
1346
+ n_rots[..., 1, 1] = cos_n
1347
+ n_rots[..., 1, 2] = -1 * sin_n
1348
+ n_rots[..., 2, 1] = sin_n
1349
+ n_rots[..., 2, 2] = cos_n
1350
+
1351
+ rots = rot_matmul(n_rots, c_rots)
1352
+
1353
+ rots = rots.transpose(-1, -2)
1354
+ translation = -1 * translation
1355
+
1356
+ rot_obj = Rotation(rot_mats=rots, quats=None)
1357
+
1358
+ return Rigid(rot_obj, translation)
1359
+
1360
+ def cuda(self) -> Rigid:
1361
+ """
1362
+ Moves the transformation object to GPU memory
1363
+
1364
+ Returns:
1365
+ A version of the transformation on GPU
1366
+ """
1367
+ return Rigid(self._rots.cuda(), self._trans.cuda())
openfold/utils/tensor_utils.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 AlQuraishi Laboratory
2
+ # Copyright 2021 DeepMind Technologies Limited
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from functools import partial
17
+ import logging
18
+ from typing import Tuple, List, Callable, Any, Dict, Sequence, Optional
19
+
20
+ import torch
21
+ import torch.nn as nn
22
+
23
+
24
def add(m1, m2, inplace):
    """Add m2 to m1, in place when requested; returns the sum either way."""
    # The first operation in a checkpoint can't be in-place, but it's
    # nice to have in-place addition during inference. Thus...
    if inplace:
        m1 += m2
        return m1
    return m1 + m2
33
+
34
+
35
def permute_final_dims(tensor: torch.Tensor, inds: List[int]):
    """Permute the last len(inds) dimensions of `tensor` according to `inds`,
    leaving the leading (batch) dimensions in place."""
    n_moved = len(inds)
    batch_inds = list(range(tensor.dim() - n_moved))
    # inds are given relative to the moved suffix; shift them to negative axes
    return tensor.permute(batch_inds + [i - n_moved for i in inds])


def flatten_final_dims(t: torch.Tensor, no_dims: int):
    """Flatten the last `no_dims` dimensions of `t` into a single one."""
    return t.reshape(*t.shape[:-no_dims], -1)
43
+
44
+
45
def masked_mean(mask, value, dim, eps=1e-4):
    """Mean of `value` over `dim`, weighted by `mask`; `eps` guards against
    division by zero when the mask is empty."""
    mask = mask.expand(*value.shape)
    numerator = torch.sum(mask * value, dim=dim)
    denominator = eps + torch.sum(mask, dim=dim)
    return numerator / denominator
48
+
49
+
50
def pts_to_distogram(pts, min_bin=2.3125, max_bin=21.6875, no_bins=64):
    """Bucketize pairwise Euclidean distances between points in `pts`
    ([*, N, 3]) into `no_bins` distance bins."""
    boundaries = torch.linspace(
        min_bin, max_bin, no_bins - 1, device=pts.device
    )
    diffs = pts.unsqueeze(-2) - pts.unsqueeze(-3)
    dists = torch.sqrt((diffs ** 2).sum(dim=-1))
    return torch.bucketize(dists, boundaries)
58
+
59
+
60
def dict_multimap(fn, dicts):
    """Apply `fn` to the list of corresponding leaves across a list of
    identically-structured nested dicts."""
    template = dicts[0]
    out = {}
    for k, v in template.items():
        gathered = [d[k] for d in dicts]
        # Recurse into sub-dicts; `type(...) is dict` matches the original
        # exact-type check (subclasses are treated as leaves).
        if type(v) is dict:
            out[k] = dict_multimap(fn, gathered)
        else:
            out[k] = fn(gathered)

    return out
71
+
72
+
73
def one_hot(x, v_bins):
    """One-hot encode each value in `x` by its nearest bin center in
    `v_bins`; returns a float tensor of shape [*x.shape, len(v_bins)]."""
    bins = v_bins.view((1,) * x.dim() + (len(v_bins),))
    nearest = torch.argmin(torch.abs(x[..., None] - bins), dim=-1)
    return nn.functional.one_hot(nearest, num_classes=len(v_bins)).float()
78
+
79
+
80
def batched_gather(data, inds, dim=0, no_batch_dims=0):
    """Gather along `dim` of `data` using `inds`, treating the first
    `no_batch_dims` dimensions as independent batch dimensions."""
    # Build broadcastable arange indices for each batch dimension
    batch_ranges = []
    for i, size in enumerate(data.shape[:no_batch_dims]):
        shape = (1,) * i + (-1,) + (1,) * (len(inds.shape) - i - 1)
        batch_ranges.append(torch.arange(size).view(*shape))

    trailing = [slice(None)] * (len(data.shape) - no_batch_dims)
    trailing[dim - no_batch_dims if dim >= 0 else dim] = inds
    return data[batch_ranges + trailing]
93
+
94
+
95
# With tree_map, a poor man's JAX tree_map
def dict_map(fn, dic, leaf_type):
    """Recursively apply `fn` to every `leaf_type` leaf of a nested dict."""
    return {
        k: dict_map(fn, v, leaf_type) if type(v) is dict
        else tree_map(fn, v, leaf_type)
        for k, v in dic.items()
    }


def tree_map(fn, tree, leaf_type):
    """Apply `fn` to every `leaf_type` leaf of a nested dict/list/tuple
    tree, preserving the container structure."""
    if isinstance(tree, dict):
        return dict_map(fn, tree, leaf_type)
    if isinstance(tree, list):
        return [tree_map(fn, x, leaf_type) for x in tree]
    if isinstance(tree, tuple):
        return tuple(tree_map(fn, x, leaf_type) for x in tree)
    if isinstance(tree, leaf_type):
        return fn(tree)
    print(type(tree))
    raise ValueError("Not supported")


tensor_tree_map = partial(tree_map, leaf_type=torch.Tensor)
requirements.txt ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ biopython==1.79
2
+ filelock==3.13.1
3
+ fsspec==2024.3.1
4
+ Jinja2==3.1.3
5
+ MarkupSafe==2.1.5
6
+ mpmath==1.3.0
7
+ networkx==3.2.1
8
+ numpy==1.23.5
9
+ nvidia-cublas-cu12==12.1.3.1
10
+ nvidia-cuda-cupti-cu12==12.1.105
11
+ nvidia-cuda-nvrtc-cu12==12.1.105
12
+ nvidia-cuda-runtime-cu12==12.1.105
13
+ nvidia-cudnn-cu12==8.9.2.26
14
+ nvidia-cufft-cu12==11.0.2.54
15
+ nvidia-curand-cu12==10.3.2.106
16
+ nvidia-cusolver-cu12==11.4.5.107
17
+ nvidia-cusparse-cu12==12.1.0.106
18
+ nvidia-nccl-cu12==2.19.3
19
+ nvidia-nvjitlink-cu12==12.4.99
20
+ nvidia-nvtx-cu12==12.1.105
21
+ ProDy==2.4.1
22
+ pyparsing==3.1.1
23
+ scipy==1.12.0
24
+ sympy==1.12
25
+ torch==2.2.1
26
+ triton==2.2.0
27
+ typing_extensions==4.10.0
28
+ ml-collections==0.1.1
29
+ dm-tree==0.1.8
run.py ADDED
@@ -0,0 +1,990 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import copy
3
+ import json
4
+ import os.path
5
+ import random
6
+ import sys
7
+
8
+ import numpy as np
9
+ import torch
10
+ from data_utils import (
11
+ alphabet,
12
+ element_dict_rev,
13
+ featurize,
14
+ get_score,
15
+ get_seq_rec,
16
+ parse_PDB,
17
+ restype_1to3,
18
+ restype_int_to_str,
19
+ restype_str_to_int,
20
+ write_full_PDB,
21
+ )
22
+ from model_utils import ProteinMPNN
23
+ from prody import writePDB
24
+ from sc_utils import Packer, pack_side_chains
25
+
26
+
27
def main(args) -> None:
    """
    Inference function: run (Ligand/Protein)MPNN sequence design.

    Workflow:
      1. Seed torch/numpy/random RNGs, create the output folder layout
         (seqs/, backbones/, packed/, optionally stats/).
      2. Load the checkpoint selected by ``args.model_type`` and, if
         ``args.pack_side_chains``, the side-chain Packer checkpoint.
      3. For each input PDB: parse it, build per-residue fixed/redesigned,
         bias and omit masks, sample ``batch_size * number_of_batches``
         sequences, and write FASTA + backbone PDB (and optionally packed
         full-atom PDB / stats) outputs.

    Args:
        args: argparse.Namespace produced by the parser defined at the
            bottom of this file; see those definitions for each option.
    """
    # --- RNG seeding (seed 0 means "draw a random seed") ---
    if args.seed:
        seed = args.seed
    else:
        seed = int(np.random.randint(0, high=99999, size=1, dtype=int)[0])
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)
    device = torch.device("cuda" if (torch.cuda.is_available()) else "cpu")
    # --- Output directory layout ---
    folder_for_outputs = args.out_folder
    base_folder = folder_for_outputs
    if base_folder[-1] != "/":
        base_folder = base_folder + "/"
    if not os.path.exists(base_folder):
        os.makedirs(base_folder, exist_ok=True)
    if not os.path.exists(base_folder + "seqs"):
        os.makedirs(base_folder + "seqs", exist_ok=True)
    if not os.path.exists(base_folder + "backbones"):
        os.makedirs(base_folder + "backbones", exist_ok=True)
    if not os.path.exists(base_folder + "packed"):
        os.makedirs(base_folder + "packed", exist_ok=True)
    if args.save_stats:
        if not os.path.exists(base_folder + "stats"):
            os.makedirs(base_folder + "stats", exist_ok=True)
    # --- Select checkpoint path based on the requested model flavour ---
    if args.model_type == "protein_mpnn":
        checkpoint_path = args.checkpoint_protein_mpnn
    elif args.model_type == "ligand_mpnn":
        checkpoint_path = args.checkpoint_ligand_mpnn
    elif args.model_type == "per_residue_label_membrane_mpnn":
        checkpoint_path = args.checkpoint_per_residue_label_membrane_mpnn
    elif args.model_type == "global_label_membrane_mpnn":
        checkpoint_path = args.checkpoint_global_label_membrane_mpnn
    elif args.model_type == "soluble_mpnn":
        checkpoint_path = args.checkpoint_soluble_mpnn
    else:
        print("Choose one of the available models")
        sys.exit()
    checkpoint = torch.load(checkpoint_path, map_location=device)
    if args.model_type == "ligand_mpnn":
        # ligand model: ligand-atom context size comes from the checkpoint
        atom_context_num = checkpoint["atom_context_num"]
        ligand_mpnn_use_side_chain_context = args.ligand_mpnn_use_side_chain_context
        k_neighbors = checkpoint["num_edges"]
    else:
        # non-ligand models use a dummy single-atom context
        atom_context_num = 1
        ligand_mpnn_use_side_chain_context = 0
        k_neighbors = checkpoint["num_edges"]

    model = ProteinMPNN(
        node_features=128,
        edge_features=128,
        hidden_dim=128,
        num_encoder_layers=3,
        num_decoder_layers=3,
        k_neighbors=k_neighbors,
        device=device,
        atom_context_num=atom_context_num,
        model_type=args.model_type,
        ligand_mpnn_use_side_chain_context=ligand_mpnn_use_side_chain_context,
    )

    model.load_state_dict(checkpoint["model_state_dict"])
    model.to(device)
    model.eval()

    # --- Optional side-chain packing model ---
    if args.pack_side_chains:
        model_sc = Packer(
            node_features=128,
            edge_features=128,
            num_positional_embeddings=16,
            num_chain_embeddings=16,
            num_rbf=16,
            hidden_dim=128,
            num_encoder_layers=3,
            num_decoder_layers=3,
            atom_context_num=16,
            lower_bound=0.0,
            upper_bound=20.0,
            top_k=32,
            dropout=0.0,
            augment_eps=0.0,
            atom37_order=False,
            device=device,
            num_mix=3,
        )

        checkpoint_sc = torch.load(args.checkpoint_path_sc, map_location=device)
        model_sc.load_state_dict(checkpoint_sc["model_state_dict"])
        model_sc.to(device)
        model_sc.eval()

    # --- Collect input PDB paths (single file or JSON list) ---
    if args.pdb_path_multi:
        with open(args.pdb_path_multi, "r") as fh:
            pdb_paths = list(json.load(fh))
    else:
        pdb_paths = [args.pdb_path]

    # --- Per-PDB fixed / redesigned residue selections ---
    if args.fixed_residues_multi:
        with open(args.fixed_residues_multi, "r") as fh:
            fixed_residues_multi = json.load(fh)
        fixed_residues_multi = {key:value.split() for key,value in fixed_residues_multi.items()}
    else:
        fixed_residues = [item for item in args.fixed_residues.split()]
        fixed_residues_multi = {}
        for pdb in pdb_paths:
            fixed_residues_multi[pdb] = fixed_residues

    if args.redesigned_residues_multi:
        with open(args.redesigned_residues_multi, "r") as fh:
            redesigned_residues_multi = json.load(fh)
        redesigned_residues_multi = {key:value.split() for key,value in redesigned_residues_multi.items()}
    else:
        redesigned_residues = [item for item in args.redesigned_residues.split()]
        redesigned_residues_multi = {}
        for pdb in pdb_paths:
            redesigned_residues_multi[pdb] = redesigned_residues

    # Global amino-acid bias vector (21 entries — presumably 20 AAs + X;
    # TODO confirm against `alphabet` in data_utils).
    bias_AA = torch.zeros([21], device=device, dtype=torch.float32)
    if args.bias_AA:
        tmp = [item.split(":") for item in args.bias_AA.split(",")]
        a1 = [b[0] for b in tmp]
        a2 = [float(b[1]) for b in tmp]
        for i, AA in enumerate(a1):
            bias_AA[restype_str_to_int[AA]] = a2[i]

    if args.bias_AA_per_residue_multi:
        with open(args.bias_AA_per_residue_multi, "r") as fh:
            bias_AA_per_residue_multi = json.load(
                fh
            )  # {"pdb_path" : {"A12": {"G": 1.1}}}
    else:
        if args.bias_AA_per_residue:
            with open(args.bias_AA_per_residue, "r") as fh:
                bias_AA_per_residue = json.load(fh)  # {"A12": {"G": 1.1}}
            bias_AA_per_residue_multi = {}
            for pdb in pdb_paths:
                bias_AA_per_residue_multi[pdb] = bias_AA_per_residue

    if args.omit_AA_per_residue_multi:
        with open(args.omit_AA_per_residue_multi, "r") as fh:
            omit_AA_per_residue_multi = json.load(
                fh
            )  # {"pdb_path" : {"A12": "PQR", "A13": "QS"}}
    else:
        if args.omit_AA_per_residue:
            with open(args.omit_AA_per_residue, "r") as fh:
                omit_AA_per_residue = json.load(fh)  # {"A12": "PG"}
            omit_AA_per_residue_multi = {}
            for pdb in pdb_paths:
                omit_AA_per_residue_multi[pdb] = omit_AA_per_residue
    # 0/1 mask over the alphabet: 1.0 marks globally forbidden amino acids
    omit_AA_list = args.omit_AA
    omit_AA = torch.tensor(
        np.array([AA in omit_AA_list for AA in alphabet]).astype(np.float32),
        device=device,
    )

    if len(args.parse_these_chains_only) != 0:
        parse_these_chains_only_list = args.parse_these_chains_only.split(",")
    else:
        parse_these_chains_only_list = []


    # loop over PDB paths
    for pdb in pdb_paths:
        if args.verbose:
            print("Designing protein from this path:", pdb)
        fixed_residues = fixed_residues_multi[pdb]
        redesigned_residues = redesigned_residues_multi[pdb]
        # side chains are needed either as model context or to keep fixed
        # residues' rotamers during packing
        parse_all_atoms_flag = args.ligand_mpnn_use_side_chain_context or (
            args.pack_side_chains and not args.repack_everything
        )
        protein_dict, backbone, other_atoms, icodes, _ = parse_PDB(
            pdb,
            device=device,
            chains=parse_these_chains_only_list,
            parse_all_atoms=parse_all_atoms_flag,
            parse_atoms_with_zero_occupancy=args.parse_atoms_with_zero_occupancy,
        )
        # make chain_letter + residue_idx + insertion_code mapping to integers
        R_idx_list = list(protein_dict["R_idx"].cpu().numpy())  # residue indices
        chain_letters_list = list(protein_dict["chain_letters"])  # chain letters
        encoded_residues = []
        for i, R_idx_item in enumerate(R_idx_list):
            tmp = str(chain_letters_list[i]) + str(R_idx_item) + icodes[i]
            encoded_residues.append(tmp)
        encoded_residue_dict = dict(zip(encoded_residues, range(len(encoded_residues))))
        encoded_residue_dict_rev = dict(
            zip(list(range(len(encoded_residues))), encoded_residues)
        )

        # Per-residue bias matrix [num_residues, 21]; entries not mentioned
        # in the JSON stay 0.0 (unknown residue names are silently skipped).
        bias_AA_per_residue = torch.zeros(
            [len(encoded_residues), 21], device=device, dtype=torch.float32
        )
        if args.bias_AA_per_residue_multi or args.bias_AA_per_residue:
            bias_dict = bias_AA_per_residue_multi[pdb]
            for residue_name, v1 in bias_dict.items():
                if residue_name in encoded_residues:
                    i1 = encoded_residue_dict[residue_name]
                    for amino_acid, v2 in v1.items():
                        if amino_acid in alphabet:
                            j1 = restype_str_to_int[amino_acid]
                            bias_AA_per_residue[i1, j1] = v2

        # Per-residue omit matrix [num_residues, 21]; 1.0 marks forbidden AAs
        omit_AA_per_residue = torch.zeros(
            [len(encoded_residues), 21], device=device, dtype=torch.float32
        )
        if args.omit_AA_per_residue_multi or args.omit_AA_per_residue:
            omit_dict = omit_AA_per_residue_multi[pdb]
            for residue_name, v1 in omit_dict.items():
                if residue_name in encoded_residues:
                    i1 = encoded_residue_dict[residue_name]
                    for amino_acid in v1:
                        if amino_acid in alphabet:
                            j1 = restype_str_to_int[amino_acid]
                            omit_AA_per_residue[i1, j1] = 1.0

        # NOTE: these masks are inverted — 0 where the residue IS in the
        # user's list, 1 elsewhere (see chain_mask combination below)
        fixed_positions = torch.tensor(
            [int(item not in fixed_residues) for item in encoded_residues],
            device=device,
        )
        redesigned_positions = torch.tensor(
            [int(item not in redesigned_residues) for item in encoded_residues],
            device=device,
        )

        # specify which residues are buried for checkpoint_per_residue_label_membrane_mpnn model
        if args.transmembrane_buried:
            buried_residues = [item for item in args.transmembrane_buried.split()]
            buried_positions = torch.tensor(
                [int(item in buried_residues) for item in encoded_residues],
                device=device,
            )
        else:
            buried_positions = torch.zeros_like(fixed_positions)

        if args.transmembrane_interface:
            interface_residues = [item for item in args.transmembrane_interface.split()]
            interface_positions = torch.tensor(
                [int(item in interface_residues) for item in encoded_residues],
                device=device,
            )
        else:
            interface_positions = torch.zeros_like(fixed_positions)
        # label encoding: 2 = buried-only, 1 = interface-only, 0 = neither
        # (residues flagged as both end up 0)
        protein_dict["membrane_per_residue_labels"] = 2 * buried_positions * (
            1 - interface_positions
        ) + 1 * interface_positions * (1 - buried_positions)

        if args.model_type == "global_label_membrane_mpnn":
            # broadcast the single global label to every residue
            protein_dict["membrane_per_residue_labels"] = (
                args.global_transmembrane_label + 0 * fixed_positions
            )
        if len(args.chains_to_design) != 0:
            chains_to_design_list = args.chains_to_design.split(",")
        else:
            chains_to_design_list = protein_dict["chain_letters"]

        chain_mask = torch.tensor(
            np.array(
                [
                    item in chains_to_design_list
                    for item in protein_dict["chain_letters"]
                ],
                dtype=np.int32,
            ),
            device=device,
        )

        # create chain_mask to notify which residues are fixed (0) and which need to be designed (1)
        # redesigned_residues takes precedence over fixed_residues
        if redesigned_residues:
            protein_dict["chain_mask"] = chain_mask * (1 - redesigned_positions)
        elif fixed_residues:
            protein_dict["chain_mask"] = chain_mask * fixed_positions
        else:
            protein_dict["chain_mask"] = chain_mask

        if args.verbose:
            PDB_residues_to_be_redesigned = [
                encoded_residue_dict_rev[item]
                for item in range(protein_dict["chain_mask"].shape[0])
                if protein_dict["chain_mask"][item] == 1
            ]
            PDB_residues_to_be_fixed = [
                encoded_residue_dict_rev[item]
                for item in range(protein_dict["chain_mask"].shape[0])
                if protein_dict["chain_mask"][item] == 0
            ]
            print("These residues will be redesigned: ", PDB_residues_to_be_redesigned)
            print("These residues will be fixed: ", PDB_residues_to_be_fixed)

        # specify which residues are linked
        if args.symmetry_residues:
            symmetry_residues_list_of_lists = [
                x.split(",") for x in args.symmetry_residues.split("|")
            ]
            remapped_symmetry_residues = []
            for t_list in symmetry_residues_list_of_lists:
                tmp_list = []
                for t in t_list:
                    tmp_list.append(encoded_residue_dict[t])
                remapped_symmetry_residues.append(tmp_list)
        else:
            remapped_symmetry_residues = [[]]

        # specify linking weights
        if args.symmetry_weights:
            symmetry_weights = [
                [float(item) for item in x.split(",")]
                for x in args.symmetry_weights.split("|")
            ]
        else:
            symmetry_weights = [[]]

        if args.homo_oligomer:
            # auto-build symmetry groups: tie residue i of every chain
            # together with equal weights 1/num_chains
            if args.verbose:
                print("Designing HOMO-OLIGOMER")
            chain_letters_set = list(set(chain_letters_list))
            reference_chain = chain_letters_set[0]
            lc = len(reference_chain)
            residue_indices = [
                item[lc:] for item in encoded_residues if item[:lc] == reference_chain
            ]
            remapped_symmetry_residues = []
            symmetry_weights = []
            for res in residue_indices:
                tmp_list = []
                tmp_w_list = []
                for chain in chain_letters_set:
                    name = chain + res
                    tmp_list.append(encoded_residue_dict[name])
                    tmp_w_list.append(1 / len(chain_letters_set))
                remapped_symmetry_residues.append(tmp_list)
                symmetry_weights.append(tmp_w_list)

        # set other atom bfactors to 0.0
        if other_atoms:
            other_bfactors = other_atoms.getBetas()
            other_atoms.setBetas(other_bfactors * 0.0)

        # adjust input PDB name by dropping .pdb if it does exist
        name = pdb[pdb.rfind("/") + 1 :]
        if name[-4:] == ".pdb":
            name = name[:-4]

        with torch.no_grad():
            # run featurize to remap R_idx and add batch dimension
            if args.verbose:
                if "Y" in list(protein_dict):
                    atom_coords = protein_dict["Y"].cpu().numpy()
                    atom_types = list(protein_dict["Y_t"].cpu().numpy())
                    atom_mask = list(protein_dict["Y_m"].cpu().numpy())
                    number_of_atoms_parsed = np.sum(atom_mask)
                else:
                    print("No ligand atoms parsed")
                    number_of_atoms_parsed = 0
                    atom_types = ""
                    atom_coords = []
                if number_of_atoms_parsed == 0:
                    print("No ligand atoms parsed")
                elif args.model_type == "ligand_mpnn":
                    print(
                        f"The number of ligand atoms parsed is equal to: {number_of_atoms_parsed}"
                    )
                    for i, atom_type in enumerate(atom_types):
                        print(
                            f"Type: {element_dict_rev[atom_type]}, Coords {atom_coords[i]}, Mask {atom_mask[i]}"
                        )
            feature_dict = featurize(
                protein_dict,
                cutoff_for_score=args.ligand_mpnn_cutoff_for_score,
                use_atom_context=args.ligand_mpnn_use_atom_context,
                number_of_ligand_atoms=atom_context_num,
                model_type=args.model_type,
            )
            feature_dict["batch_size"] = args.batch_size
            B, L, _, _ = feature_dict["X"].shape  # batch size should be 1 for now.
            # add additional keys to the feature dictionary
            feature_dict["temperature"] = args.temperature
            # -1e8 acts as -inf in the softmax: omitted AAs become unsampleable
            feature_dict["bias"] = (
                (-1e8 * omit_AA[None, None, :] + bias_AA).repeat([1, L, 1])
                + bias_AA_per_residue[None]
                - 1e8 * omit_AA_per_residue[None]
            )
            feature_dict["symmetry_residues"] = remapped_symmetry_residues
            feature_dict["symmetry_weights"] = symmetry_weights

            # --- Sample sequences batch by batch and accumulate outputs ---
            sampling_probs_list = []
            log_probs_list = []
            decoding_order_list = []
            S_list = []
            loss_list = []
            loss_per_residue_list = []
            loss_XY_list = []
            for _ in range(args.number_of_batches):
                feature_dict["randn"] = torch.randn(
                    [feature_dict["batch_size"], feature_dict["mask"].shape[1]],
                    device=device,
                )
                output_dict = model.sample(feature_dict)

                # compute confidence scores
                loss, loss_per_residue = get_score(
                    output_dict["S"],
                    output_dict["log_probs"],
                    feature_dict["mask"] * feature_dict["chain_mask"],
                )
                if args.model_type == "ligand_mpnn":
                    combined_mask = (
                        feature_dict["mask"]
                        * feature_dict["mask_XY"]
                        * feature_dict["chain_mask"]
                    )
                else:
                    combined_mask = feature_dict["mask"] * feature_dict["chain_mask"]
                loss_XY, _ = get_score(
                    output_dict["S"], output_dict["log_probs"], combined_mask
                )
                # -----
                S_list.append(output_dict["S"])
                log_probs_list.append(output_dict["log_probs"])
                sampling_probs_list.append(output_dict["sampling_probs"])
                decoding_order_list.append(output_dict["decoding_order"])
                loss_list.append(loss)
                loss_per_residue_list.append(loss_per_residue)
                loss_XY_list.append(loss_XY)
            S_stack = torch.cat(S_list, 0)
            log_probs_stack = torch.cat(log_probs_list, 0)
            sampling_probs_stack = torch.cat(sampling_probs_list, 0)
            decoding_order_stack = torch.cat(decoding_order_list, 0)
            loss_stack = torch.cat(loss_list, 0)
            loss_per_residue_stack = torch.cat(loss_per_residue_list, 0)
            loss_XY_stack = torch.cat(loss_XY_list, 0)
            rec_mask = feature_dict["mask"][:1] * feature_dict["chain_mask"][:1]
            rec_stack = get_seq_rec(feature_dict["S"][:1], S_stack, rec_mask)

            native_seq = "".join(
                [restype_int_to_str[AA] for AA in feature_dict["S"][0].cpu().numpy()]
            )
            seq_np = np.array(list(native_seq))
            seq_out_str = []
            for mask in protein_dict["mask_c"]:
                seq_out_str += list(seq_np[mask.cpu().numpy()])
                seq_out_str += [args.fasta_seq_separation]
            seq_out_str = "".join(seq_out_str)[:-1]  # drop trailing separator

            # NOTE: base_folder already ends with "/", so these produce "//"
            # in the path — harmless on POSIX filesystems
            output_fasta = base_folder + "/seqs/" + name + args.file_ending + ".fa"
            output_backbones = base_folder + "/backbones/"
            output_packed = base_folder + "/packed/"
            output_stats_path = base_folder + "stats/" + name + args.file_ending + ".pt"

            out_dict = {}
            out_dict["generated_sequences"] = S_stack.cpu()
            out_dict["sampling_probs"] = sampling_probs_stack.cpu()
            out_dict["log_probs"] = log_probs_stack.cpu()
            out_dict["decoding_order"] = decoding_order_stack.cpu()
            out_dict["native_sequence"] = feature_dict["S"][0].cpu()
            out_dict["mask"] = feature_dict["mask"][0].cpu()
            out_dict["chain_mask"] = feature_dict["chain_mask"][0].cpu()
            out_dict["seed"] = seed
            out_dict["temperature"] = args.temperature
            if args.save_stats:
                torch.save(out_dict, output_stats_path)

            if args.pack_side_chains:
                if args.verbose:
                    print("Packing side chains...")
                # re-featurize with packer-specific settings (always
                # ligand_mpnn features, fixed 16-atom context)
                feature_dict_ = featurize(
                    protein_dict,
                    cutoff_for_score=8.0,
                    use_atom_context=args.pack_with_ligand_context,
                    number_of_ligand_atoms=16,
                    model_type="ligand_mpnn",
                )
                sc_feature_dict = copy.deepcopy(feature_dict_)
                B = args.batch_size
                # tile every tensor feature along the batch dim; "S" is
                # replaced per-batch below; non-tensor values are skipped
                # via the except
                for k, v in sc_feature_dict.items():
                    if k != "S":
                        try:
                            num_dim = len(v.shape)
                            if num_dim == 2:
                                sc_feature_dict[k] = v.repeat(B, 1)
                            elif num_dim == 3:
                                sc_feature_dict[k] = v.repeat(B, 1, 1)
                            elif num_dim == 4:
                                sc_feature_dict[k] = v.repeat(B, 1, 1, 1)
                            elif num_dim == 5:
                                sc_feature_dict[k] = v.repeat(B, 1, 1, 1, 1)
                        except:
                            pass
                X_stack_list = []
                X_m_stack_list = []
                b_factor_stack_list = []
                for _ in range(args.number_of_packs_per_design):
                    X_list = []
                    X_m_list = []
                    b_factor_list = []
                    for c in range(args.number_of_batches):
                        sc_feature_dict["S"] = S_list[c]
                        sc_dict = pack_side_chains(
                            sc_feature_dict,
                            model_sc,
                            args.sc_num_denoising_steps,
                            args.sc_num_samples,
                            args.repack_everything,
                        )
                        X_list.append(sc_dict["X"])
                        X_m_list.append(sc_dict["X_m"])
                        b_factor_list.append(sc_dict["b_factors"])

                    X_stack = torch.cat(X_list, 0)
                    X_m_stack = torch.cat(X_m_list, 0)
                    b_factor_stack = torch.cat(b_factor_list, 0)

                    X_stack_list.append(X_stack)
                    X_m_stack_list.append(X_m_stack)
                    b_factor_stack_list.append(b_factor_stack)

            with open(output_fasta, "w") as f:
                # header line; combined_mask holds the value from the last
                # sampling batch (loop-carried variable)
                f.write(
                    ">{}, T={}, seed={}, num_res={}, num_ligand_res={}, use_ligand_context={}, ligand_cutoff_distance={}, batch_size={}, number_of_batches={}, model_path={}\n{}\n".format(
                        name,
                        args.temperature,
                        seed,
                        torch.sum(rec_mask).cpu().numpy(),
                        torch.sum(combined_mask[:1]).cpu().numpy(),
                        bool(args.ligand_mpnn_use_atom_context),
                        float(args.ligand_mpnn_cutoff_for_score),
                        args.batch_size,
                        args.number_of_batches,
                        checkpoint_path,
                        seq_out_str,
                    )
                )
                for ix in range(S_stack.shape[0]):
                    ix_suffix = ix
                    if not args.zero_indexed:
                        ix_suffix += 1
                    seq_rec_print = np.format_float_positional(
                        rec_stack[ix].cpu().numpy(), unique=False, precision=4
                    )
                    # exp(-loss) converts mean NLL into a 0..1 confidence
                    loss_np = np.format_float_positional(
                        np.exp(-loss_stack[ix].cpu().numpy()), unique=False, precision=4
                    )
                    loss_XY_np = np.format_float_positional(
                        np.exp(-loss_XY_stack[ix].cpu().numpy()),
                        unique=False,
                        precision=4,
                    )
                    seq = "".join(
                        [restype_int_to_str[AA] for AA in S_stack[ix].cpu().numpy()]
                    )

                    # write new sequences into PDB with backbone coordinates
                    # repeat(4, 1): one entry per backbone atom (N, CA, C, O
                    # presumably — TODO confirm against parse_PDB)
                    seq_prody = np.array([restype_1to3[AA] for AA in list(seq)])[
                        None,
                    ].repeat(4, 1)
                    bfactor_prody = (
                        loss_per_residue_stack[ix].cpu().numpy()[None, :].repeat(4, 1)
                    )
                    backbone.setResnames(seq_prody)
                    backbone.setBetas(
                        np.exp(-bfactor_prody)
                        * (bfactor_prody > 0.01).astype(np.float32)
                    )
                    if other_atoms:
                        writePDB(
                            output_backbones
                            + name
                            + "_"
                            + str(ix_suffix)
                            + args.file_ending
                            + ".pdb",
                            backbone + other_atoms,
                        )
                    else:
                        writePDB(
                            output_backbones
                            + name
                            + "_"
                            + str(ix_suffix)
                            + args.file_ending
                            + ".pdb",
                            backbone,
                        )

                    # write full PDB files
                    if args.pack_side_chains:
                        for c_pack in range(args.number_of_packs_per_design):
                            X_stack = X_stack_list[c_pack]
                            X_m_stack = X_m_stack_list[c_pack]
                            b_factor_stack = b_factor_stack_list[c_pack]
                            write_full_PDB(
                                output_packed
                                + name
                                + args.packed_suffix
                                + "_"
                                + str(ix_suffix)
                                + "_"
                                + str(c_pack + 1)
                                + args.file_ending
                                + ".pdb",
                                X_stack[ix].cpu().numpy(),
                                X_m_stack[ix].cpu().numpy(),
                                b_factor_stack[ix].cpu().numpy(),
                                feature_dict["R_idx_original"][0].cpu().numpy(),
                                protein_dict["chain_letters"],
                                S_stack[ix].cpu().numpy(),
                                other_atoms=other_atoms,
                                icodes=icodes,
                                force_hetatm=args.force_hetatm,
                            )
                    # -----

                    # write fasta lines
                    seq_np = np.array(list(seq))
                    seq_out_str = []
                    for mask in protein_dict["mask_c"]:
                        seq_out_str += list(seq_np[mask.cpu().numpy()])
                        seq_out_str += [args.fasta_seq_separation]
                    seq_out_str = "".join(seq_out_str)[:-1]
                    if ix == S_stack.shape[0] - 1:
                        # final 2 lines
                        f.write(
                            ">{}, id={}, T={}, seed={}, overall_confidence={}, ligand_confidence={}, seq_rec={}\n{}".format(
                                name,
                                ix_suffix,
                                args.temperature,
                                seed,
                                loss_np,
                                loss_XY_np,
                                seq_rec_print,
                                seq_out_str,
                            )
                        )
                    else:
                        f.write(
                            ">{}, id={}, T={}, seed={}, overall_confidence={}, ligand_confidence={}, seq_rec={}\n{}\n".format(
                                name,
                                ix_suffix,
                                args.temperature,
                                seed,
                                loss_np,
                                loss_XY_np,
                                seq_rec_print,
                                seq_out_str,
                            )
                        )
677
if __name__ == "__main__":
    # Command-line interface for MPNN inference; parsed namespace is handed
    # straight to main().
    argparser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    argparser.add_argument(
        "--model_type",
        type=str,
        default="protein_mpnn",
        help="Choose your model: protein_mpnn, ligand_mpnn, per_residue_label_membrane_mpnn, global_label_membrane_mpnn, soluble_mpnn",
    )
    # protein_mpnn - original ProteinMPNN trained on the whole PDB exluding non-protein atoms
    # ligand_mpnn - atomic context aware model trained with small molecules, nucleotides, metals etc on the whole PDB
    # per_residue_label_membrane_mpnn - ProteinMPNN model trained with addition label per residue specifying if that residue is buried or exposed
    # global_label_membrane_mpnn - ProteinMPNN model trained with global label per PDB id to specify if protein is transmembrane
    # soluble_mpnn - ProteinMPNN trained only on soluble PDB ids
    argparser.add_argument(
        "--checkpoint_protein_mpnn",
        type=str,
        default="./model_params/proteinmpnn_v_48_020.pt",
        help="Path to model weights.",
    )
    argparser.add_argument(
        "--checkpoint_ligand_mpnn",
        type=str,
        default="./model_params/ligandmpnn_v_32_010_25.pt",
        help="Path to model weights.",
    )
    argparser.add_argument(
        "--checkpoint_per_residue_label_membrane_mpnn",
        type=str,
        default="./model_params/per_residue_label_membrane_mpnn_v_48_020.pt",
        help="Path to model weights.",
    )
    argparser.add_argument(
        "--checkpoint_global_label_membrane_mpnn",
        type=str,
        default="./model_params/global_label_membrane_mpnn_v_48_020.pt",
        help="Path to model weights.",
    )
    argparser.add_argument(
        "--checkpoint_soluble_mpnn",
        type=str,
        default="./model_params/solublempnn_v_48_020.pt",
        help="Path to model weights.",
    )

    argparser.add_argument(
        "--fasta_seq_separation",
        type=str,
        default=":",
        help="Symbol to use between sequences from different chains",
    )
    argparser.add_argument("--verbose", type=int, default=1, help="Print stuff")

    argparser.add_argument(
        "--pdb_path", type=str, default="", help="Path to the input PDB."
    )
    argparser.add_argument(
        "--pdb_path_multi",
        type=str,
        default="",
        help="Path to json listing PDB paths. {'/path/to/pdb': ''} - only keys will be used.",
    )

    argparser.add_argument(
        "--fixed_residues",
        type=str,
        default="",
        help="Provide fixed residues, A12 A13 A14 B2 B25",
    )
    argparser.add_argument(
        "--fixed_residues_multi",
        type=str,
        default="",
        help="Path to json mapping of fixed residues for each pdb i.e., {'/path/to/pdb': 'A12 A13 A14 B2 B25'}",
    )

    argparser.add_argument(
        "--redesigned_residues",
        type=str,
        default="",
        help="Provide to be redesigned residues, everything else will be fixed, A12 A13 A14 B2 B25",
    )
    argparser.add_argument(
        "--redesigned_residues_multi",
        type=str,
        default="",
        help="Path to json mapping of redesigned residues for each pdb i.e., {'/path/to/pdb': 'A12 A13 A14 B2 B25'}",
    )

    argparser.add_argument(
        "--bias_AA",
        type=str,
        default="",
        help="Bias generation of amino acids, e.g. 'A:-1.024,P:2.34,C:-12.34'",
    )
    argparser.add_argument(
        "--bias_AA_per_residue",
        type=str,
        default="",
        help="Path to json mapping of bias {'A12': {'G': -0.3, 'C': -2.0, 'H': 0.8}, 'A13': {'G': -1.3}}",
    )
    argparser.add_argument(
        "--bias_AA_per_residue_multi",
        type=str,
        default="",
        help="Path to json mapping of bias {'pdb_path': {'A12': {'G': -0.3, 'C': -2.0, 'H': 0.8}, 'A13': {'G': -1.3}}}",
    )

    argparser.add_argument(
        "--omit_AA",
        type=str,
        default="",
        help="Bias generation of amino acids, e.g. 'ACG'",
    )
    argparser.add_argument(
        "--omit_AA_per_residue",
        type=str,
        default="",
        help="Path to json mapping of bias {'A12': 'APQ', 'A13': 'QST'}",
    )
    argparser.add_argument(
        "--omit_AA_per_residue_multi",
        type=str,
        default="",
        help="Path to json mapping of bias {'pdb_path': {'A12': 'QSPC', 'A13': 'AGE'}}",
    )

    argparser.add_argument(
        "--symmetry_residues",
        type=str,
        default="",
        help="Add list of lists for which residues need to be symmetric, e.g. 'A12,A13,A14|C2,C3|A5,B6'",
    )
    argparser.add_argument(
        "--symmetry_weights",
        type=str,
        default="",
        help="Add weights that match symmetry_residues, e.g. '1.01,1.0,1.0|-1.0,2.0|2.0,2.3'",
    )
    argparser.add_argument(
        "--homo_oligomer",
        type=int,
        default=0,
        help="Setting this to 1 will automatically set --symmetry_residues and --symmetry_weights to do homooligomer design with equal weighting.",
    )

    argparser.add_argument(
        "--out_folder",
        type=str,
        help="Path to a folder to output sequences, e.g. /home/out/",
    )
    argparser.add_argument(
        "--file_ending", type=str, default="", help="adding_string_to_the_end"
    )
    # BUGFIX: was type=str, which made any explicit CLI value truthy —
    # "--zero_indexed 0" parsed to the string "0" and behaved like 1 in
    # main()'s "if not args.zero_indexed" check. type=int restores the
    # intended 0/1 semantics while keeping the same accepted inputs.
    argparser.add_argument(
        "--zero_indexed",
        type=int,
        default=0,
        help="1 - to start output PDB numbering with 0",
    )
    argparser.add_argument(
        "--seed",
        type=int,
        default=0,
        help="Set seed for torch, numpy, and python random.",
    )
    argparser.add_argument(
        "--batch_size",
        type=int,
        default=1,
        help="Number of sequence to generate per one pass.",
    )
    argparser.add_argument(
        "--number_of_batches",
        type=int,
        default=1,
        help="Number of times to design sequence using a chosen batch size.",
    )
    argparser.add_argument(
        "--temperature",
        type=float,
        default=0.1,
        help="Temperature to sample sequences.",
    )
    argparser.add_argument(
        "--save_stats", type=int, default=0, help="Save output statistics"
    )

    argparser.add_argument(
        "--ligand_mpnn_use_atom_context",
        type=int,
        default=1,
        help="1 - use atom context, 0 - do not use atom context.",
    )
    argparser.add_argument(
        "--ligand_mpnn_cutoff_for_score",
        type=float,
        default=8.0,
        help="Cutoff in angstroms between protein and context atoms to select residues for reporting score.",
    )
    argparser.add_argument(
        "--ligand_mpnn_use_side_chain_context",
        type=int,
        default=0,
        help="Flag to use side chain atoms as ligand context for the fixed residues",
    )
    argparser.add_argument(
        "--chains_to_design",
        type=str,
        default="",
        help="Specify which chains to redesign, all others will be kept fixed, 'A,B,C,F'",
    )

    argparser.add_argument(
        "--parse_these_chains_only",
        type=str,
        default="",
        help="Provide chains letters for parsing backbones, 'A,B,C,F'",
    )

    argparser.add_argument(
        "--transmembrane_buried",
        type=str,
        default="",
        help="Provide buried residues when using checkpoint_per_residue_label_membrane_mpnn model, A12 A13 A14 B2 B25",
    )
    argparser.add_argument(
        "--transmembrane_interface",
        type=str,
        default="",
        help="Provide interface residues when using checkpoint_per_residue_label_membrane_mpnn model, A12 A13 A14 B2 B25",
    )

    argparser.add_argument(
        "--global_transmembrane_label",
        type=int,
        default=0,
        help="Provide global label for global_label_membrane_mpnn model. 1 - transmembrane, 0 - soluble",
    )

    argparser.add_argument(
        "--parse_atoms_with_zero_occupancy",
        type=int,
        default=0,
        help="To parse atoms with zero occupancy in the PDB input files. 0 - do not parse, 1 - parse atoms with zero occupancy",
    )

    argparser.add_argument(
        "--pack_side_chains",
        type=int,
        default=0,
        help="1 - to run side chain packer, 0 - do not run it",
    )

    argparser.add_argument(
        "--checkpoint_path_sc",
        type=str,
        default="./model_params/ligandmpnn_sc_v_32_002_16.pt",
        help="Path to model weights.",
    )

    argparser.add_argument(
        "--number_of_packs_per_design",
        type=int,
        default=4,
        help="Number of independent side chain packing samples to return per design",
    )

    argparser.add_argument(
        "--sc_num_denoising_steps",
        type=int,
        default=3,
        help="Number of denoising/recycling steps to make for side chain packing",
    )

    argparser.add_argument(
        "--sc_num_samples",
        type=int,
        default=16,
        help="Number of samples to draw from a mixture distribution and then take a sample with the highest likelihood.",
    )

    argparser.add_argument(
        "--repack_everything",
        type=int,
        default=0,
        help="1 - repacks side chains of all residues including the fixed ones; 0 - keeps the side chains fixed for fixed residues",
    )

    argparser.add_argument(
        "--force_hetatm",
        type=int,
        default=0,
        help="To force ligand atoms to be written as HETATM to PDB file after packing.",
    )

    argparser.add_argument(
        "--packed_suffix",
        type=str,
        default="_packed",
        help="Suffix for packed PDB paths",
    )

    argparser.add_argument(
        "--pack_with_ligand_context",
        type=int,
        default=1,
        help="1-pack side chains using ligand context, 0 - do not use it.",
    )

    args = argparser.parse_args()
    main(args)
run_examples.sh ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Example invocations of run.py covering the main command line options.

#1 default: ProteinMPNN with a fixed seed
python run.py \
        --seed 111 \
        --pdb_path "./inputs/1BC8.pdb" \
        --out_folder "./outputs/default"
#2 lower sampling temperature -> less sequence diversity
python run.py \
        --seed 111 \
        --pdb_path "./inputs/1BC8.pdb" \
        --temperature 0.05 \
        --out_folder "./outputs/temperature"

#3 omit --seed to get a random seed
python run.py \
        --pdb_path "./inputs/1BC8.pdb" \
        --out_folder "./outputs/random_seed"

#4 suppress progress printing
python run.py \
        --seed 111 \
        --verbose 0 \
        --pdb_path "./inputs/1BC8.pdb" \
        --out_folder "./outputs/verbose"

#5 also save per-sequence statistics
python run.py \
        --seed 111 \
        --pdb_path "./inputs/1BC8.pdb" \
        --out_folder "./outputs/save_stats" \
        --save_stats 1

#6 keep the listed residues fixed; bias sampling towards alanine
python run.py \
        --seed 111 \
        --pdb_path "./inputs/1BC8.pdb" \
        --out_folder "./outputs/fix_residues" \
        --fixed_residues "C1 C2 C3 C4 C5 C6 C7 C8 C9 C10" \
        --bias_AA "A:10.0"

#7 redesign only the listed residues; bias towards alanine
python run.py \
        --seed 111 \
        --pdb_path "./inputs/1BC8.pdb" \
        --out_folder "./outputs/redesign_residues" \
        --redesigned_residues "C1 C2 C3 C4 C5 C6 C7 C8 C9 C10" \
        --bias_AA "A:10.0"

#8 sample 3 sequences per batch, 5 batches
python run.py \
        --seed 111 \
        --pdb_path "./inputs/1BC8.pdb" \
        --out_folder "./outputs/batch_size" \
        --batch_size 3 \
        --number_of_batches 5

#9 global amino-acid composition bias
python run.py \
        --seed 111 \
        --pdb_path "./inputs/1BC8.pdb" \
        --bias_AA "W:3.0,P:3.0,C:3.0,A:-3.0" \
        --out_folder "./outputs/global_bias"

#10 per-residue bias loaded from JSON
python run.py \
        --seed 111 \
        --pdb_path "./inputs/1BC8.pdb" \
        --bias_AA_per_residue "./inputs/bias_AA_per_residue.json" \
        --out_folder "./outputs/per_residue_bias"

#11 globally forbid the listed amino acids
python run.py \
        --seed 111 \
        --pdb_path "./inputs/1BC8.pdb" \
        --omit_AA "CDFGHILMNPQRSTVWY" \
        --out_folder "./outputs/global_omit"

#12 per-residue omit list loaded from JSON
python run.py \
        --seed 111 \
        --pdb_path "./inputs/1BC8.pdb" \
        --omit_AA_per_residue "./inputs/omit_AA_per_residue.json" \
        --out_folder "./outputs/per_residue_omit"

#13 tie residue groups together (symmetry) with averaging weights
python run.py \
        --seed 111 \
        --pdb_path "./inputs/1BC8.pdb" \
        --out_folder "./outputs/symmetry" \
        --symmetry_residues "C1,C2,C3|C4,C5|C6,C7" \
        --symmetry_weights "0.33,0.33,0.33|0.5,0.5|0.5,0.5"

#14 design a homo-oligomer with tied chains
python run.py \
        --model_type "ligand_mpnn" \
        --seed 111 \
        --pdb_path "./inputs/4GYT.pdb" \
        --out_folder "./outputs/homooligomer" \
        --homo_oligomer 1 \
        --number_of_batches 2

#15 custom suffix for output file names
python run.py \
        --seed 111 \
        --pdb_path "./inputs/1BC8.pdb" \
        --out_folder "./outputs/file_ending" \
        --file_ending "_xyz"

#16 zero-indexed output naming
python run.py \
        --seed 111 \
        --pdb_path "./inputs/1BC8.pdb" \
        --out_folder "./outputs/zero_indexed" \
        --zero_indexed 1 \
        --number_of_batches 2

#17 design only chains A and B
python run.py \
        --model_type "ligand_mpnn" \
        --seed 111 \
        --pdb_path "./inputs/4GYT.pdb" \
        --out_folder "./outputs/chains_to_design" \
        --chains_to_design "A,B"

#18 parse only chains A and B from the input PDB
python run.py \
        --model_type "ligand_mpnn" \
        --seed 111 \
        --pdb_path "./inputs/4GYT.pdb" \
        --out_folder "./outputs/parse_these_chains_only" \
        --parse_these_chains_only "A,B"

#19 LigandMPNN with default weights
python run.py \
        --model_type "ligand_mpnn" \
        --seed 111 \
        --pdb_path "./inputs/1BC8.pdb" \
        --out_folder "./outputs/ligandmpnn_default"

#20 LigandMPNN with an explicitly chosen checkpoint
python run.py \
        --checkpoint_ligand_mpnn "./model_params/ligandmpnn_v_32_005_25.pt" \
        --model_type "ligand_mpnn" \
        --seed 111 \
        --pdb_path "./inputs/1BC8.pdb" \
        --out_folder "./outputs/ligandmpnn_v_32_005_25"

#21 LigandMPNN without ligand atom context
python run.py \
        --model_type "ligand_mpnn" \
        --seed 111 \
        --pdb_path "./inputs/1BC8.pdb" \
        --out_folder "./outputs/ligandmpnn_no_context" \
        --ligand_mpnn_use_atom_context 0

#22 use side-chain atoms of the fixed residues as extra context
python run.py \
        --model_type "ligand_mpnn" \
        --seed 111 \
        --pdb_path "./inputs/1BC8.pdb" \
        --out_folder "./outputs/ligandmpnn_use_side_chain_atoms" \
        --ligand_mpnn_use_side_chain_context 1 \
        --fixed_residues "C1 C2 C3 C4 C5 C6 C7 C8 C9 C10"

#23 SolubleMPNN model
python run.py \
        --model_type "soluble_mpnn" \
        --seed 111 \
        --pdb_path "./inputs/1BC8.pdb" \
        --out_folder "./outputs/soluble_mpnn_default"

#24 membrane model with a single global transmembrane label
python run.py \
        --model_type "global_label_membrane_mpnn" \
        --seed 111 \
        --pdb_path "./inputs/1BC8.pdb" \
        --out_folder "./outputs/global_label_membrane_mpnn_0" \
        --global_transmembrane_label 0

#25 membrane model with per-residue buried/interface labels
python run.py \
        --model_type "per_residue_label_membrane_mpnn" \
        --seed 111 \
        --pdb_path "./inputs/1BC8.pdb" \
        --out_folder "./outputs/per_residue_label_membrane_mpnn_default" \
        --transmembrane_buried "C1 C2 C3 C11" \
        --transmembrane_interface "C4 C5 C6 C22"

#26 custom separator between chains in the FASTA output
python run.py \
        --pdb_path "./inputs/1BC8.pdb" \
        --out_folder "./outputs/fasta_seq_separation" \
        --fasta_seq_separation ":"

#27 run on multiple PDBs listed in a JSON file
python run.py \
        --pdb_path_multi "./inputs/pdb_ids.json" \
        --out_folder "./outputs/pdb_path_multi" \
        --seed 111

#28 per-PDB fixed residues from JSON
python run.py \
        --pdb_path_multi "./inputs/pdb_ids.json" \
        --fixed_residues_multi "./inputs/fix_residues_multi.json" \
        --out_folder "./outputs/fixed_residues_multi" \
        --seed 111

#29 per-PDB redesigned residues from JSON
python run.py \
        --pdb_path_multi "./inputs/pdb_ids.json" \
        --redesigned_residues_multi "./inputs/redesigned_residues_multi.json" \
        --out_folder "./outputs/redesigned_residues_multi" \
        --seed 111

#30 per-PDB per-residue omit lists from JSON
python run.py \
        --pdb_path_multi "./inputs/pdb_ids.json" \
        --omit_AA_per_residue_multi "./inputs/omit_AA_per_residue_multi.json" \
        --out_folder "./outputs/omit_AA_per_residue_multi" \
        --seed 111

#31 per-PDB per-residue biases from JSON
python run.py \
        --pdb_path_multi "./inputs/pdb_ids.json" \
        --bias_AA_per_residue_multi "./inputs/bias_AA_per_residue_multi.json" \
        --out_folder "./outputs/bias_AA_per_residue_multi" \
        --seed 111

#32 distance cutoff for ligand-proximal score reporting
python run.py \
        --model_type "ligand_mpnn" \
        --seed 111 \
        --pdb_path "./inputs/1BC8.pdb" \
        --ligand_mpnn_cutoff_for_score "6.0" \
        --out_folder "./outputs/ligand_mpnn_cutoff_for_score"

#33 residues with insertion codes (e.g. B82A) are supported
python run.py \
        --seed 111 \
        --pdb_path "./inputs/2GFB.pdb" \
        --out_folder "./outputs/insertion_code" \
        --redesigned_residues "B82 B82A B82B B82C" \
        --parse_these_chains_only "B"
sc_examples.sh ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Example invocations of run.py demonstrating the side chain packing options.

#1 design a new sequence and pack side chains (return 1 side chain packing sample - fast)
python run.py \
        --model_type "ligand_mpnn" \
        --seed 111 \
        --pdb_path "./inputs/1BC8.pdb" \
        --out_folder "./outputs/sc_default_fast" \
        --pack_side_chains 1 \
        --number_of_packs_per_design 0 \
        --pack_with_ligand_context 1

#2 design a new sequence and pack side chains (return 4 side chain packing samples)
python run.py \
        --model_type "ligand_mpnn" \
        --seed 111 \
        --pdb_path "./inputs/1BC8.pdb" \
        --out_folder "./outputs/sc_default" \
        --pack_side_chains 1 \
        --number_of_packs_per_design 4 \
        --pack_with_ligand_context 1


#3 fix specific residues for design and packing
python run.py \
        --model_type "ligand_mpnn" \
        --seed 111 \
        --pdb_path "./inputs/1BC8.pdb" \
        --out_folder "./outputs/sc_fixed_residues" \
        --pack_side_chains 1 \
        --number_of_packs_per_design 4 \
        --pack_with_ligand_context 1 \
        --fixed_residues "C6 C7 C8 C9 C10 C11 C12 C13 C14 C15" \
        --repack_everything 0

#4 fix specific residues for sequence design but repack everything
python run.py \
        --model_type "ligand_mpnn" \
        --seed 111 \
        --pdb_path "./inputs/1BC8.pdb" \
        --out_folder "./outputs/sc_fixed_residues_full_repack" \
        --pack_side_chains 1 \
        --number_of_packs_per_design 4 \
        --pack_with_ligand_context 1 \
        --fixed_residues "C6 C7 C8 C9 C10 C11 C12 C13 C14 C15" \
        --repack_everything 1


#5 design a new sequence using LigandMPNN but pack side chains without considering ligand/DNA etc atoms
python run.py \
        --model_type "ligand_mpnn" \
        --seed 111 \
        --pdb_path "./inputs/1BC8.pdb" \
        --out_folder "./outputs/sc_no_context" \
        --pack_side_chains 1 \
        --number_of_packs_per_design 4 \
        --pack_with_ligand_context 0
sc_utils.py ADDED
@@ -0,0 +1,1158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+
3
+ import numpy as np
4
+ import torch
5
+ import torch.distributions as D
6
+ import torch.nn as nn
7
+ from model_utils import (
8
+ DecLayer,
9
+ DecLayerJ,
10
+ EncLayer,
11
+ PositionalEncodings,
12
+ cat_neighbors_nodes,
13
+ gather_edges,
14
+ gather_nodes,
15
+ )
16
+
17
+ from openfold.data.data_transforms import atom37_to_torsion_angles, make_atom14_masks
18
+ from openfold.np.residue_constants import (
19
+ restype_atom14_mask,
20
+ restype_atom14_rigid_group_positions,
21
+ restype_atom14_to_rigid_group,
22
+ restype_rigid_group_default_frame,
23
+ )
24
+ from openfold.utils import feats
25
+ from openfold.utils.rigid_utils import Rigid
26
+
27
# Pi as a CPU tensor; used below to scale uniform random torsion angles.
torch_pi = torch.tensor(np.pi, device="cpu")


# 21x21 one-hot permutation matrix.  Multiplying a one-hot encoded
# ProteinMPNN sequence by this matrix re-orders the residue-type axis into
# the AlphaFold2/openfold ordering; taking argmax afterwards yields AF2
# "aatype" indices (see make_torsion_features below).
# NOTE(review): the exact alphabet orderings are assumed from usage here —
# confirm against openfold.np.residue_constants.restypes.
map_mpnn_to_af2_seq = torch.tensor(
    [
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
    ],
    device="cpu",
)
56
+
57
+
58
def pack_side_chains(
    feature_dict,
    model_sc,
    num_denoising_steps,
    num_samples=10,
    repack_everything=True,
    num_context_atoms=16,
):
    """Sample side-chain conformations with the Packer model via iterative denoising.

    Parameters
    ----------
    feature_dict : dict
        Tensor features; must contain at least "X" (backbone coordinates),
        "S" (ProteinMPNN sequence indices), "mask" and "chain_mask".
        Ligand context tensors "Y"/"Y_t"/"Y_m" are created as zeros if absent.
    model_sc : Packer
        Side-chain packing model providing ``encode``/``decode``.
    num_denoising_steps : int
        Number of denoising/recycling iterations; must be >= 1.
    num_samples : int
        Mixture samples drawn per step; the highest-likelihood one is kept.
    repack_everything : bool
        If False, torsions of residues fixed via "chain_mask" keep their
        input values instead of being re-sampled.
    num_context_atoms : int
        Context-atom count used for the empty "Y"/"Y_t"/"Y_m" placeholders.

    Returns
    -------
    dict
        ``feature_dict`` updated in place: packed atom14 coordinates in "X",
        per-atom "b_factors" (log-likelihood based uncertainty), and the raw
        mixture parameters / samples of the final step.
    """
    # Without at least one step, no sample/distribution exists for the
    # post-loop statistics below (would raise UnboundLocalError).
    if num_denoising_steps < 1:
        raise ValueError("num_denoising_steps must be >= 1")
    device = feature_dict["X"].device
    torsion_dict = make_torsion_features(feature_dict, repack_everything)
    feature_dict["X"] = torsion_dict["xyz14_noised"]
    feature_dict["X_m"] = torsion_dict["xyz14_m"]
    # No ligand context supplied: provide all-zero, fully masked context.
    if "Y" not in list(feature_dict):
        feature_dict["Y"] = torch.zeros(
            [
                feature_dict["X"].shape[0],
                feature_dict["X"].shape[1],
                num_context_atoms,
                3,
            ],
            device=device,
        )
        feature_dict["Y_t"] = torch.zeros(
            [feature_dict["X"].shape[0], feature_dict["X"].shape[1], num_context_atoms],
            device=device,
        )
        feature_dict["Y_m"] = torch.zeros(
            [feature_dict["X"].shape[0], feature_dict["X"].shape[1], num_context_atoms],
            device=device,
        )
    # Encode once; decoding/recycling reuses the cached node/edge embeddings.
    h_V, h_E, E_idx = model_sc.encode(feature_dict)
    feature_dict["h_V"] = h_V
    feature_dict["h_E"] = h_E
    feature_dict["E_idx"] = E_idx
    for step in range(num_denoising_steps):
        mean, concentration, mix_logits = model_sc.decode(feature_dict)
        # Mixture of von Mises distributions over the four chi angles.
        mix = D.Categorical(logits=mix_logits)
        comp = D.VonMises(mean, concentration)
        pred_dist = D.MixtureSameFamily(mix, comp)
        predicted_samples = pred_dist.sample([num_samples])
        log_probs_of_samples = pred_dist.log_prob(predicted_samples)
        # Keep, per angle, the most likely of the drawn samples.
        sample = torch.gather(
            predicted_samples, dim=0, index=torch.argmax(log_probs_of_samples, 0)[None,]
        )[0,]
        torsions_pred_unit = torch.cat(
            [torch.sin(sample[:, :, :, None]), torch.cos(sample[:, :, :, None])], -1
        )
        # Residues flagged by mask_fix_sc take predicted torsions; the rest
        # (fixed residues) keep their true input torsions.
        torsion_dict["torsions_noised"][:, :, 3:] = torsions_pred_unit * torsion_dict[
            "mask_fix_sc"
        ] + torsion_dict["torsions_true"] * (1 - torsion_dict["mask_fix_sc"])
        pred_frames = feats.torsion_angles_to_frames(
            torsion_dict["rigids"],
            torsion_dict["torsions_noised"],
            torsion_dict["aatype"],
            torch.tensor(restype_rigid_group_default_frame, device=device),
        )
        xyz14_noised = feats.frames_and_literature_positions_to_atom14_pos(
            pred_frames,
            torsion_dict["aatype"],
            torch.tensor(restype_rigid_group_default_frame, device=device),
            torch.tensor(restype_atom14_to_rigid_group, device=device),
            torch.tensor(restype_atom14_mask, device=device),
            torch.tensor(restype_atom14_rigid_group_positions, device=device),
        )
        xyz14_noised = xyz14_noised * feature_dict["X_m"][:, :, :, None]
        # Feed the rebuilt coordinates back in for the next recycling step.
        # (Original code redundantly assigned feature_dict["X"] a second time
        # after the loop; that duplicate was removed.)
        feature_dict["X"] = xyz14_noised
    S_af2 = torsion_dict["S_af2"]

    # Fixed residues get a constant pseudo log-probability of 2.0 so their
    # B-factors are not dominated by the model's (unused) prediction.
    log_prob = pred_dist.log_prob(sample) * torsion_dict["mask_fix_sc"][
        ..., 0
    ] + 2.0 * (1 - torsion_dict["mask_fix_sc"][..., 0])

    # Map each atom14 slot to the chi angle (0-3) that places it, to spread
    # per-angle uncertainty onto per-atom pseudo B-factors. Backbone rigid
    # groups (< 4) are clamped to chi1.
    tmp_types = torch.tensor(restype_atom14_to_rigid_group, device=device)[S_af2]
    tmp_types[tmp_types < 4] = 4
    tmp_types -= 4
    atom_types_for_b_factor = torch.nn.functional.one_hot(tmp_types, 4)  # [B, L, 14, 4]

    uncertainty = log_prob[:, :, None, :] * atom_types_for_b_factor  # [B,L,14,4]
    b_factor_pred = uncertainty.sum(-1)  # [B, L, 14]
    feature_dict["b_factors"] = b_factor_pred
    feature_dict["mean"] = mean
    feature_dict["concentration"] = concentration
    feature_dict["mix_logits"] = mix_logits
    feature_dict["log_prob"] = log_prob
    feature_dict["sample"] = sample
    feature_dict["true_torsion_sin_cos"] = torsion_dict["torsions_true"]
    return feature_dict
147
+
148
+
149
def make_torsion_features(feature_dict, repack_everything=True):
    """Build torsion-angle features and randomly initialized atom14 coordinates.

    Converts the ProteinMPNN sequence to AF2 aatype indices, computes backbone
    rigid frames, replaces the four chi angles with random values (except for
    residues kept fixed via "chain_mask" when ``repack_everything`` is False),
    and rebuilds atom14 coordinates from those torsions.

    Returns the openfold torsion dict augmented with: "xyz14_m" (atom14 mask),
    "xyz14_noised" (coordinates with randomized chis), "mask_for_loss",
    "rigids", "torsions_noised", "mask_fix_sc" (1 = repack this residue),
    "torsions_true" and "S_af2".
    """
    device = feature_dict["mask"].device

    mask = feature_dict["mask"]
    B, L = mask.shape

    # Scatter the 4 backbone atoms (N, CA, C, O) into atom37 layout;
    # O lives at atom37 index 4 but at index 3 in the input "X".
    xyz37 = torch.zeros([B, L, 37, 3], device=device, dtype=torch.float32)
    xyz37[:, :, :3] = feature_dict["X"][:, :, :3]
    xyz37[:, :, 4] = feature_dict["X"][:, :, 3]

    # Re-order one-hot MPNN sequence into AF2 aatype indices.
    S_af2 = torch.argmax(
        torch.nn.functional.one_hot(feature_dict["S"], 21).float()
        @ map_mpnn_to_af2_seq.to(device).float(),
        -1,
    )
    masks14_37 = make_atom14_masks({"aatype": S_af2})
    temp_dict = {
        "aatype": S_af2,
        "all_atom_positions": xyz37,
        "all_atom_mask": masks14_37["atom37_atom_exists"],
    }
    torsion_dict = atom37_to_torsion_angles("")(temp_dict)

    # Backbone rigid frames from N/CA/C positions.
    rigids = Rigid.make_transform_from_reference(
        n_xyz=xyz37[:, :, 0, :],
        ca_xyz=xyz37[:, :, 1, :],
        c_xyz=xyz37[:, :, 2, :],
        eps=1e-9,
    )

    if not repack_everything:
        # Keep the true chi torsions of fixed residues (chain_mask == 0).
        xyz37_true = feature_dict["xyz_37"]
        temp_dict_true = {
            "aatype": S_af2,
            "all_atom_positions": xyz37_true,
            "all_atom_mask": masks14_37["atom37_atom_exists"],
        }
        torsion_dict_true = atom37_to_torsion_angles("")(temp_dict_true)
        torsions_true = torch.clone(torsion_dict_true["torsion_angles_sin_cos"])[
            :, :, 3:
        ]
        mask_fix_sc = feature_dict["chain_mask"][:, :, None, None]
    else:
        # Everything gets repacked; true torsions are irrelevant placeholders.
        torsions_true = torch.zeros([B, L, 4, 2], device=device)
        mask_fix_sc = torch.ones([B, L, 1, 1], device=device)

    # Randomize the chi angles (indices 3: of the 7 torsions) for residues
    # that will be repacked.
    random_angle = (
        2 * torch_pi * torch.rand([S_af2.shape[0], S_af2.shape[1], 4], device=device)
    )
    random_sin_cos = torch.cat(
        [torch.sin(random_angle)[..., None], torch.cos(random_angle)[..., None]], -1
    )
    torsions_noised = torch.clone(torsion_dict["torsion_angles_sin_cos"])
    torsions_noised[:, :, 3:] = random_sin_cos * mask_fix_sc + torsions_true * (
        1 - mask_fix_sc
    )
    pred_frames = feats.torsion_angles_to_frames(
        rigids,
        torsions_noised,
        S_af2,
        torch.tensor(restype_rigid_group_default_frame, device=device),
    )

    # Rebuild atom14 coordinates from the (partly randomized) torsions.
    xyz14_noised = feats.frames_and_literature_positions_to_atom14_pos(
        pred_frames,
        S_af2,
        torch.tensor(restype_rigid_group_default_frame, device=device),
        torch.tensor(restype_atom14_to_rigid_group, device=device).long(),
        torch.tensor(restype_atom14_mask, device=device),
        torch.tensor(restype_atom14_rigid_group_positions, device=device),
    )

    xyz14_m = masks14_37["atom14_atom_exists"] * mask[:, :, None]
    xyz14_noised = xyz14_noised * xyz14_m[:, :, :, None]
    torsion_dict["xyz14_m"] = xyz14_m
    torsion_dict["xyz14_noised"] = xyz14_noised
    torsion_dict["mask_for_loss"] = mask
    torsion_dict["rigids"] = rigids
    torsion_dict["torsions_noised"] = torsions_noised
    torsion_dict["mask_fix_sc"] = mask_fix_sc
    torsion_dict["torsions_true"] = torsions_true
    torsion_dict["S_af2"] = S_af2
    return torsion_dict
232
+
233
+
234
class Packer(nn.Module):
    """Graph neural network that predicts side-chain chi angles.

    Encodes the protein (plus optional ligand context) into node/edge
    embeddings, then decodes them into the parameters of a per-angle
    von Mises mixture (mean, concentration, mixture logits) that
    pack_side_chains() samples from.
    """

    def __init__(
        self,
        edge_features=128,
        node_features=128,
        num_positional_embeddings=16,
        num_chain_embeddings=16,
        num_rbf=16,
        top_k=30,
        augment_eps=0.0,
        atom37_order=False,
        device=None,
        atom_context_num=16,
        lower_bound=0.0,
        upper_bound=20.0,
        hidden_dim=128,
        num_encoder_layers=3,
        num_decoder_layers=3,
        dropout=0.1,
        num_mix=3,
    ):
        super(Packer, self).__init__()
        self.edge_features = edge_features
        self.node_features = node_features
        self.num_positional_embeddings = num_positional_embeddings
        self.num_chain_embeddings = num_chain_embeddings
        self.num_rbf = num_rbf
        self.top_k = top_k
        self.augment_eps = augment_eps
        self.atom37_order = atom37_order
        self.device = device
        self.atom_context_num = atom_context_num
        self.lower_bound = lower_bound
        self.upper_bound = upper_bound

        self.hidden_dim = hidden_dim
        self.num_encoder_layers = num_encoder_layers
        self.num_decoder_layers = num_decoder_layers
        # NOTE(review): self.dropout is first set to the float rate here and
        # overwritten with an nn.Dropout module further below.
        self.dropout = dropout
        self.softplus = nn.Softplus(beta=1, threshold=20)

        # Raw geometric/chemical feature extractor (defined below in this file).
        self.features = ProteinFeatures(
            edge_features=edge_features,
            node_features=node_features,
            num_positional_embeddings=num_positional_embeddings,
            num_chain_embeddings=num_chain_embeddings,
            num_rbf=num_rbf,
            top_k=top_k,
            augment_eps=augment_eps,
            atom37_order=atom37_order,
            device=device,
            atom_context_num=atom_context_num,
            lower_bound=lower_bound,
            upper_bound=upper_bound,
        )

        self.W_e = nn.Linear(edge_features, hidden_dim, bias=True)
        self.W_v = nn.Linear(node_features, hidden_dim, bias=True)
        self.W_f = nn.Linear(edge_features, hidden_dim, bias=True)
        self.W_v_sc = nn.Linear(node_features, hidden_dim, bias=True)
        self.linear_down = nn.Linear(2 * hidden_dim, hidden_dim, bias=True)
        # Output head: 4 chi angles x num_mix components x (mean, conc, logit).
        self.W_torsions = nn.Linear(hidden_dim, 4 * 3 * num_mix, bias=True)
        self.num_mix = num_mix

        self.dropout = nn.Dropout(dropout)

        # Encoder layers
        self.encoder_layers = nn.ModuleList(
            [
                EncLayer(hidden_dim, hidden_dim * 2, dropout=dropout)
                for _ in range(num_encoder_layers)
            ]
        )

        self.W_c = nn.Linear(hidden_dim, hidden_dim, bias=True)
        self.W_e_context = nn.Linear(hidden_dim, hidden_dim, bias=True)

        self.W_nodes_y = nn.Linear(hidden_dim, hidden_dim, bias=True)
        self.W_edges_y = nn.Linear(hidden_dim, hidden_dim, bias=True)

        # Layers mixing protein nodes with ligand-context ("Y") nodes.
        self.context_encoder_layers = nn.ModuleList(
            [DecLayer(hidden_dim, hidden_dim * 2, dropout=dropout) for _ in range(2)]
        )

        self.V_C = nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.V_C_norm = nn.LayerNorm(hidden_dim)
        self.y_context_encoder_layers = nn.ModuleList(
            [DecLayerJ(hidden_dim, hidden_dim, dropout=dropout) for _ in range(2)]
        )

        self.h_V_C_dropout = nn.Dropout(dropout)

        # Decoder layers
        self.decoder_layers = nn.ModuleList(
            [
                DecLayer(hidden_dim, hidden_dim * 3, dropout=dropout)
                for _ in range(num_decoder_layers)
            ]
        )

        # Xavier initialization for all weight matrices.
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def encode(self, feature_dict):
        """Encode backbone + ligand context into node/edge embeddings.

        Returns (h_V, h_E, E_idx): node embeddings, edge embeddings and
        k-nearest-neighbor indices, cached by the caller for decoding.
        """
        mask = feature_dict["mask"]
        V, E, E_idx, Y_nodes, Y_edges, E_context, Y_m = self.features.features_encode(
            feature_dict
        )

        h_E_context = self.W_e_context(E_context)
        h_V = self.W_v(V)
        h_E = self.W_e(E)
        # Pairwise validity mask over the kNN graph.
        mask_attend = gather_nodes(mask.unsqueeze(-1), E_idx).squeeze(-1)
        mask_attend = mask.unsqueeze(-1) * mask_attend
        for layer in self.encoder_layers:
            h_V, h_E = layer(h_V, h_E, E_idx, mask, mask_attend)

        # Mix in ligand-context atoms: Y-node message passing followed by
        # protein-node updates conditioned on the context.
        h_V_C = self.W_c(h_V)
        Y_m_edges = Y_m[:, :, :, None] * Y_m[:, :, None, :]
        Y_nodes = self.W_nodes_y(Y_nodes)
        Y_edges = self.W_edges_y(Y_edges)
        for i in range(len(self.context_encoder_layers)):
            Y_nodes = self.y_context_encoder_layers[i](Y_nodes, Y_edges, Y_m, Y_m_edges)
            h_E_context_cat = torch.cat([h_E_context, Y_nodes], -1)
            h_V_C = self.context_encoder_layers[i](h_V_C, h_E_context_cat, mask, Y_m)

        # Residual merge of the context-aware node states.
        h_V_C = self.V_C(h_V_C)
        h_V = h_V + self.V_C_norm(self.h_V_C_dropout(h_V_C))

        return h_V, h_E, E_idx

    def decode(self, feature_dict):
        """Decode cached embeddings into von Mises mixture parameters.

        Returns (mean, concentration, mix_logits), each shaped
        [B, L, 4, num_mix] — one mixture per chi angle.
        """
        h_V = feature_dict["h_V"]
        h_E = feature_dict["h_E"]
        E_idx = feature_dict["E_idx"]
        mask = feature_dict["mask"]
        device = h_V.device
        # Side-chain-dependent features from the current coordinates in "X".
        V, F = self.features.features_decode(feature_dict)

        h_F = self.W_f(F)
        h_EF = torch.cat([h_E, h_F], -1)

        h_V_sc = self.W_v_sc(V)
        h_V_combined = torch.cat([h_V, h_V_sc], -1)
        h_V = self.linear_down(h_V_combined)

        for layer in self.decoder_layers:
            h_EV = cat_neighbors_nodes(h_V, h_EF, E_idx)
            h_V = layer(h_V, h_EV, mask)

        torsions = self.W_torsions(h_V)
        torsions = torsions.reshape(h_V.shape[0], h_V.shape[1], 4, self.num_mix, 3)
        mean = torsions[:, :, :, :, 0].float()
        # Softplus keeps concentration positive; +0.1 avoids degenerate values.
        concentration = 0.1 + self.softplus(torsions[:, :, :, :, 1]).float()
        mix_logits = torsions[:, :, :, :, 2].float()
        return mean, concentration, mix_logits
391
+
392
+
393
+ class ProteinFeatures(nn.Module):
394
+ def __init__(
395
+ self,
396
+ edge_features=128,
397
+ node_features=128,
398
+ num_positional_embeddings=16,
399
+ num_chain_embeddings=16,
400
+ num_rbf=16,
401
+ top_k=30,
402
+ augment_eps=0.0,
403
+ atom37_order=False,
404
+ device=None,
405
+ atom_context_num=16,
406
+ lower_bound=0.0,
407
+ upper_bound=20.0,
408
+ ):
409
+ """Extract protein features"""
410
+ super(ProteinFeatures, self).__init__()
411
+ self.edge_features = edge_features
412
+ self.node_features = node_features
413
+ self.num_positional_embeddings = num_positional_embeddings
414
+ self.num_chain_embeddings = num_chain_embeddings
415
+ self.num_rbf = num_rbf
416
+ self.top_k = top_k
417
+ self.augment_eps = augment_eps
418
+ self.atom37_order = atom37_order
419
+ self.device = device
420
+ self.atom_context_num = atom_context_num
421
+ self.lower_bound = lower_bound
422
+ self.upper_bound = upper_bound
423
+
424
+ # deal with oxygen index
425
+ # ------
426
+ self.N_idx = 0
427
+ self.CA_idx = 1
428
+ self.C_idx = 2
429
+
430
+ if atom37_order:
431
+ self.O_idx = 4
432
+ else:
433
+ self.O_idx = 3
434
+ # -------
435
+ self.positional_embeddings = PositionalEncodings(num_positional_embeddings)
436
+
437
+ # Features for the encoder
438
+ enc_node_in = 21 # alphabet for the sequence
439
+ enc_edge_in = (
440
+ num_positional_embeddings + num_rbf * 25
441
+ ) # positional + distance features
442
+
443
+ self.enc_node_in = enc_node_in
444
+ self.enc_edge_in = enc_edge_in
445
+
446
+ self.enc_edge_embedding = nn.Linear(enc_edge_in, edge_features, bias=False)
447
+ self.enc_norm_edges = nn.LayerNorm(edge_features)
448
+ self.enc_node_embedding = nn.Linear(enc_node_in, node_features, bias=False)
449
+ self.enc_norm_nodes = nn.LayerNorm(node_features)
450
+
451
+ # Features for the decoder
452
+ dec_node_in = 14 * atom_context_num * num_rbf
453
+ dec_edge_in = num_rbf * 14 * 14 + 42
454
+
455
+ self.dec_node_in = dec_node_in
456
+ self.dec_edge_in = dec_edge_in
457
+
458
+ self.W_XY_project_down1 = nn.Linear(num_rbf + 120, num_rbf, bias=True)
459
+ self.dec_edge_embedding1 = nn.Linear(dec_edge_in, edge_features, bias=False)
460
+ self.dec_norm_edges1 = nn.LayerNorm(edge_features)
461
+ self.dec_node_embedding1 = nn.Linear(dec_node_in, node_features, bias=False)
462
+ self.dec_norm_nodes1 = nn.LayerNorm(node_features)
463
+
464
+ self.node_project_down = nn.Linear(
465
+ 5 * num_rbf + 64 + 4, node_features, bias=True
466
+ )
467
+ self.norm_nodes = nn.LayerNorm(node_features)
468
+
469
+ self.type_linear = nn.Linear(147, 64)
470
+
471
+ self.y_nodes = nn.Linear(147, node_features, bias=False)
472
+ self.y_edges = nn.Linear(num_rbf, node_features, bias=False)
473
+
474
+ self.norm_y_edges = nn.LayerNorm(node_features)
475
+ self.norm_y_nodes = nn.LayerNorm(node_features)
476
+
477
+ self.periodic_table_features = torch.tensor(
478
+ [
479
+ [
480
+ 0,
481
+ 1,
482
+ 2,
483
+ 3,
484
+ 4,
485
+ 5,
486
+ 6,
487
+ 7,
488
+ 8,
489
+ 9,
490
+ 10,
491
+ 11,
492
+ 12,
493
+ 13,
494
+ 14,
495
+ 15,
496
+ 16,
497
+ 17,
498
+ 18,
499
+ 19,
500
+ 20,
501
+ 21,
502
+ 22,
503
+ 23,
504
+ 24,
505
+ 25,
506
+ 26,
507
+ 27,
508
+ 28,
509
+ 29,
510
+ 30,
511
+ 31,
512
+ 32,
513
+ 33,
514
+ 34,
515
+ 35,
516
+ 36,
517
+ 37,
518
+ 38,
519
+ 39,
520
+ 40,
521
+ 41,
522
+ 42,
523
+ 43,
524
+ 44,
525
+ 45,
526
+ 46,
527
+ 47,
528
+ 48,
529
+ 49,
530
+ 50,
531
+ 51,
532
+ 52,
533
+ 53,
534
+ 54,
535
+ 55,
536
+ 56,
537
+ 57,
538
+ 58,
539
+ 59,
540
+ 60,
541
+ 61,
542
+ 62,
543
+ 63,
544
+ 64,
545
+ 65,
546
+ 66,
547
+ 67,
548
+ 68,
549
+ 69,
550
+ 70,
551
+ 71,
552
+ 72,
553
+ 73,
554
+ 74,
555
+ 75,
556
+ 76,
557
+ 77,
558
+ 78,
559
+ 79,
560
+ 80,
561
+ 81,
562
+ 82,
563
+ 83,
564
+ 84,
565
+ 85,
566
+ 86,
567
+ 87,
568
+ 88,
569
+ 89,
570
+ 90,
571
+ 91,
572
+ 92,
573
+ 93,
574
+ 94,
575
+ 95,
576
+ 96,
577
+ 97,
578
+ 98,
579
+ 99,
580
+ 100,
581
+ 101,
582
+ 102,
583
+ 103,
584
+ 104,
585
+ 105,
586
+ 106,
587
+ 107,
588
+ 108,
589
+ 109,
590
+ 110,
591
+ 111,
592
+ 112,
593
+ 113,
594
+ 114,
595
+ 115,
596
+ 116,
597
+ 117,
598
+ 118,
599
+ ],
600
+ [
601
+ 0,
602
+ 1,
603
+ 18,
604
+ 1,
605
+ 2,
606
+ 13,
607
+ 14,
608
+ 15,
609
+ 16,
610
+ 17,
611
+ 18,
612
+ 1,
613
+ 2,
614
+ 13,
615
+ 14,
616
+ 15,
617
+ 16,
618
+ 17,
619
+ 18,
620
+ 1,
621
+ 2,
622
+ 3,
623
+ 4,
624
+ 5,
625
+ 6,
626
+ 7,
627
+ 8,
628
+ 9,
629
+ 10,
630
+ 11,
631
+ 12,
632
+ 13,
633
+ 14,
634
+ 15,
635
+ 16,
636
+ 17,
637
+ 18,
638
+ 1,
639
+ 2,
640
+ 3,
641
+ 4,
642
+ 5,
643
+ 6,
644
+ 7,
645
+ 8,
646
+ 9,
647
+ 10,
648
+ 11,
649
+ 12,
650
+ 13,
651
+ 14,
652
+ 15,
653
+ 16,
654
+ 17,
655
+ 18,
656
+ 1,
657
+ 2,
658
+ 3,
659
+ 3,
660
+ 3,
661
+ 3,
662
+ 3,
663
+ 3,
664
+ 3,
665
+ 3,
666
+ 3,
667
+ 3,
668
+ 3,
669
+ 3,
670
+ 3,
671
+ 3,
672
+ 3,
673
+ 4,
674
+ 5,
675
+ 6,
676
+ 7,
677
+ 8,
678
+ 9,
679
+ 10,
680
+ 11,
681
+ 12,
682
+ 13,
683
+ 14,
684
+ 15,
685
+ 16,
686
+ 17,
687
+ 18,
688
+ 1,
689
+ 2,
690
+ 3,
691
+ 3,
692
+ 3,
693
+ 3,
694
+ 3,
695
+ 3,
696
+ 3,
697
+ 3,
698
+ 3,
699
+ 3,
700
+ 3,
701
+ 3,
702
+ 3,
703
+ 3,
704
+ 3,
705
+ 4,
706
+ 5,
707
+ 6,
708
+ 7,
709
+ 8,
710
+ 9,
711
+ 10,
712
+ 11,
713
+ 12,
714
+ 13,
715
+ 14,
716
+ 15,
717
+ 16,
718
+ 17,
719
+ 18,
720
+ ],
721
+ [
722
+ 0,
723
+ 1,
724
+ 1,
725
+ 2,
726
+ 2,
727
+ 2,
728
+ 2,
729
+ 2,
730
+ 2,
731
+ 2,
732
+ 2,
733
+ 3,
734
+ 3,
735
+ 3,
736
+ 3,
737
+ 3,
738
+ 3,
739
+ 3,
740
+ 3,
741
+ 4,
742
+ 4,
743
+ 4,
744
+ 4,
745
+ 4,
746
+ 4,
747
+ 4,
748
+ 4,
749
+ 4,
750
+ 4,
751
+ 4,
752
+ 4,
753
+ 4,
754
+ 4,
755
+ 4,
756
+ 4,
757
+ 4,
758
+ 4,
759
+ 5,
760
+ 5,
761
+ 5,
762
+ 5,
763
+ 5,
764
+ 5,
765
+ 5,
766
+ 5,
767
+ 5,
768
+ 5,
769
+ 5,
770
+ 5,
771
+ 5,
772
+ 5,
773
+ 5,
774
+ 5,
775
+ 5,
776
+ 5,
777
+ 6,
778
+ 6,
779
+ 6,
780
+ 6,
781
+ 6,
782
+ 6,
783
+ 6,
784
+ 6,
785
+ 6,
786
+ 6,
787
+ 6,
788
+ 6,
789
+ 6,
790
+ 6,
791
+ 6,
792
+ 6,
793
+ 6,
794
+ 6,
795
+ 6,
796
+ 6,
797
+ 6,
798
+ 6,
799
+ 6,
800
+ 6,
801
+ 6,
802
+ 6,
803
+ 6,
804
+ 6,
805
+ 6,
806
+ 6,
807
+ 6,
808
+ 6,
809
+ 7,
810
+ 7,
811
+ 7,
812
+ 7,
813
+ 7,
814
+ 7,
815
+ 7,
816
+ 7,
817
+ 7,
818
+ 7,
819
+ 7,
820
+ 7,
821
+ 7,
822
+ 7,
823
+ 7,
824
+ 7,
825
+ 7,
826
+ 7,
827
+ 7,
828
+ 7,
829
+ 7,
830
+ 7,
831
+ 7,
832
+ 7,
833
+ 7,
834
+ 7,
835
+ 7,
836
+ 7,
837
+ 7,
838
+ 7,
839
+ 7,
840
+ 7,
841
+ ],
842
+ ],
843
+ dtype=torch.long,
844
+ device=device,
845
+ )
846
+
847
+ def _dist(self, X, mask, eps=1e-6):
848
+ mask_2D = torch.unsqueeze(mask, 1) * torch.unsqueeze(mask, 2)
849
+ dX = torch.unsqueeze(X, 1) - torch.unsqueeze(X, 2)
850
+ D = mask_2D * torch.sqrt(torch.sum(dX**2, 3) + eps)
851
+ D_max, _ = torch.max(D, -1, keepdim=True)
852
+ D_adjust = D + (1.0 - mask_2D) * D_max
853
+ sampled_top_k = self.top_k
854
+ D_neighbors, E_idx = torch.topk(
855
+ D_adjust, np.minimum(self.top_k, X.shape[1]), dim=-1, largest=False
856
+ )
857
+ return D_neighbors, E_idx
858
+
859
+ def _make_angle_features(self, A, B, C, Y):
860
+ v1 = A - B
861
+ v2 = C - B
862
+ e1 = torch.nn.functional.normalize(v1, dim=-1)
863
+ e1_v2_dot = torch.einsum("bli, bli -> bl", e1, v2)[..., None]
864
+ u2 = v2 - e1 * e1_v2_dot
865
+ e2 = torch.nn.functional.normalize(u2, dim=-1)
866
+ e3 = torch.cross(e1, e2, dim=-1)
867
+ R_residue = torch.cat(
868
+ (e1[:, :, :, None], e2[:, :, :, None], e3[:, :, :, None]), dim=-1
869
+ )
870
+
871
+ local_vectors = torch.einsum(
872
+ "blqp, blyq -> blyp", R_residue, Y - B[:, :, None, :]
873
+ )
874
+
875
+ rxy = torch.sqrt(local_vectors[..., 0] ** 2 + local_vectors[..., 1] ** 2 + 1e-8)
876
+ f1 = local_vectors[..., 0] / rxy
877
+ f2 = local_vectors[..., 1] / rxy
878
+ rxyz = torch.norm(local_vectors, dim=-1) + 1e-8
879
+ f3 = rxy / rxyz
880
+ f4 = local_vectors[..., 2] / rxyz
881
+
882
+ f = torch.cat([f1[..., None], f2[..., None], f3[..., None], f4[..., None]], -1)
883
+ return f
884
+
885
+ def _rbf(
886
+ self,
887
+ D,
888
+ D_mu_shape=[1, 1, 1, -1],
889
+ lower_bound=0.0,
890
+ upper_bound=20.0,
891
+ num_bins=16,
892
+ ):
893
+ device = D.device
894
+ D_min, D_max, D_count = lower_bound, upper_bound, num_bins
895
+ D_mu = torch.linspace(D_min, D_max, D_count, device=device)
896
+ D_mu = D_mu.view(D_mu_shape)
897
+ D_sigma = (D_max - D_min) / D_count
898
+ D_expand = torch.unsqueeze(D, -1)
899
+ RBF = torch.exp(-(((D_expand - D_mu) / D_sigma) ** 2))
900
+ return RBF
901
+
902
def _get_rbf(
    self,
    A,
    B,
    E_idx,
    D_mu_shape=[1, 1, 1, -1],
    lower_bound=2.0,
    upper_bound=22.0,
    num_bins=16,
):
    """RBF-encode the distances between atom sets A and B along graph edges.

    Args:
        A, B: [B, L, 3] atom coordinates (one atom per residue each).
        E_idx: [B, L, K] neighbor indices selecting which pairs to keep.
        D_mu_shape / lower_bound / upper_bound / num_bins: forwarded to
            ``self._rbf``.

    Returns:
        [B, L, K, num_bins] RBF features of the A->B neighbor distances.
    """
    # All-pairs A->B distances; the epsilon keeps sqrt stable at zero.
    pair_dist = torch.sqrt(
        torch.sum((A[:, :, None, :] - B[:, None, :, :]) ** 2, -1) + 1e-6
    )  # [B, L, L]
    # Keep only the K neighbor distances selected by E_idx.
    edge_dist = gather_edges(pair_dist[:, :, :, None], E_idx)[
        :, :, :, 0
    ]  # [B, L, K]
    return self._rbf(
        edge_dist,
        D_mu_shape=D_mu_shape,
        lower_bound=lower_bound,
        upper_bound=upper_bound,
        num_bins=num_bins,
    )
926
+
927
def features_encode(self, features):
    """Build the residue k-NN graph and encode backbone + ligand-context features.

    Args:
        features: dict read for the keys below:
            S: integer residue-type labels (one-hot encoded over self.enc_node_in).
            X: backbone coordinates, atoms selected via self.N_idx/CA_idx/C_idx/O_idx
               along dim 2 — assumes [B, L, n_atoms, 3]; TODO confirm.
            Y, Y_m, Y_t: context-atom coordinates, mask, and element numbers
               (Y_t indexes self.periodic_table_features and is one-hot over 120).
            mask: per-residue validity mask.
            R_idx: residue indices used for relative positional offsets.
            chain_labels: integer chain id per residue.

    Returns:
        Tuple (V, E, E_idx, Y_nodes, Y_edges, E_context, Y_m):
            V: encoded node features; E: encoded edge features for neighbors
            E_idx; Y_nodes / Y_edges: context-atom node and pair encodings;
            E_context: residue-to-context-atom features; Y_m: context mask,
            passed through unchanged.
    """
    S = features["S"]
    X = features["X"]
    Y = features["Y"]
    Y_m = features["Y_m"]
    Y_t = features["Y_t"]
    mask = features["mask"]
    R_idx = features["R_idx"]
    chain_labels = features["chain_labels"]

    # Gaussian coordinate noise for augmentation — training mode only.
    if self.training and self.augment_eps > 0:
        X = X + self.augment_eps * torch.randn_like(X)

    # Slice out the four backbone atoms.
    Ca = X[:, :, self.CA_idx, :]
    N = X[:, :, self.N_idx, :]
    C = X[:, :, self.C_idx, :]
    O = X[:, :, self.O_idx, :]

    # Virtual C-beta placed from the backbone frame with fixed empirical weights.
    b = Ca - N
    c = C - Ca
    a = torch.cross(b, c, dim=-1)
    Cb = -0.58273431 * a + 0.56802827 * b - 0.54067466 * c + Ca  # shift from CA

    # k-nearest-neighbor graph over C-alpha positions.
    _, E_idx = self._dist(Ca, mask)

    backbone_coords_list = [N, Ca, C, O, Cb]

    # RBF-encode all 5x5 backbone atom-pair distances along graph edges.
    RBF_all = []
    for atom_1 in backbone_coords_list:
        for atom_2 in backbone_coords_list:
            RBF_all.append(
                self._get_rbf(
                    atom_1,
                    atom_2,
                    E_idx,
                    D_mu_shape=[1, 1, 1, -1],
                    lower_bound=self.lower_bound,
                    upper_bound=self.upper_bound,
                    num_bins=self.num_rbf,
                )
            )
    RBF_all = torch.cat(tuple(RBF_all), dim=-1)

    # Relative sequence separation for each edge.
    offset = R_idx[:, :, None] - R_idx[:, None, :]
    offset = gather_edges(offset[:, :, :, None], E_idx)[:, :, :, 0]  # [B, L, K]

    # 1 where both endpoints are on the same chain, 0 across chains.
    d_chains = (
        (chain_labels[:, :, None] - chain_labels[:, None, :]) == 0
    ).long()  # find self vs non-self interaction
    E_chains = gather_edges(d_chains[:, :, :, None], E_idx)[:, :, :, 0]
    E_positional = self.positional_embeddings(offset.long(), E_chains)
    # Edge embedding = positional encoding + backbone RBFs, then normalize.
    E = torch.cat((E_positional, RBF_all), -1)
    E = self.enc_edge_embedding(E)
    E = self.enc_norm_edges(E)

    # Node embedding from one-hot residue identity.
    V = torch.nn.functional.one_hot(S, self.enc_node_in).float()
    V = self.enc_node_embedding(V)
    V = self.enc_norm_nodes(V)

    # Element-type features: raw element id plus periodic-table group/period.
    Y_t = Y_t.long()
    Y_t_g = self.periodic_table_features[1][Y_t]  # group; 19 categories including 0
    Y_t_p = self.periodic_table_features[2][Y_t]  # period; 8 categories including 0

    Y_t_g_1hot_ = torch.nn.functional.one_hot(Y_t_g, 19)  # [B, L, M, 19]
    Y_t_p_1hot_ = torch.nn.functional.one_hot(Y_t_p, 8)  # [B, L, M, 8]
    Y_t_1hot_ = torch.nn.functional.one_hot(Y_t, 120)  # [B, L, M, 120]

    Y_t_1hot_ = torch.cat(
        [Y_t_1hot_, Y_t_g_1hot_, Y_t_p_1hot_], -1
    )  # [B, L, M, 147]
    Y_t_1hot = self.type_linear(Y_t_1hot_.float())

    # RBF distances from each backbone atom (and virtual Cb) to context atoms.
    D_N_Y = torch.sqrt(
        torch.sum((N[:, :, None, :] - Y) ** 2, -1) + 1e-6
    )  # [B, L, M, num_bins]
    D_N_Y = self._rbf(
        D_N_Y,
        D_mu_shape=[1, 1, 1, -1],
        lower_bound=self.lower_bound,
        upper_bound=self.upper_bound,
        num_bins=self.num_rbf,
    )

    D_Ca_Y = torch.sqrt(
        torch.sum((Ca[:, :, None, :] - Y) ** 2, -1) + 1e-6
    )  # [B, L, M, num_bins]
    D_Ca_Y = self._rbf(
        D_Ca_Y,
        D_mu_shape=[1, 1, 1, -1],
        lower_bound=self.lower_bound,
        upper_bound=self.upper_bound,
        num_bins=self.num_rbf,
    )

    D_C_Y = torch.sqrt(
        torch.sum((C[:, :, None, :] - Y) ** 2, -1) + 1e-6
    )  # [B, L, M, num_bins]
    D_C_Y = self._rbf(
        D_C_Y,
        D_mu_shape=[1, 1, 1, -1],
        lower_bound=self.lower_bound,
        upper_bound=self.upper_bound,
        num_bins=self.num_rbf,
    )

    D_O_Y = torch.sqrt(
        torch.sum((O[:, :, None, :] - Y) ** 2, -1) + 1e-6
    )  # [B, L, M, num_bins]
    D_O_Y = self._rbf(
        D_O_Y,
        D_mu_shape=[1, 1, 1, -1],
        lower_bound=self.lower_bound,
        upper_bound=self.upper_bound,
        num_bins=self.num_rbf,
    )

    D_Cb_Y = torch.sqrt(
        torch.sum((Cb[:, :, None, :] - Y) ** 2, -1) + 1e-6
    )  # [B, L, M, num_bins]
    D_Cb_Y = self._rbf(
        D_Cb_Y,
        D_mu_shape=[1, 1, 1, -1],
        lower_bound=self.lower_bound,
        upper_bound=self.upper_bound,
        num_bins=self.num_rbf,
    )

    # Angular position of each context atom in the residue-local frame.
    f_angles = self._make_angle_features(N, Ca, C, Y)

    D_all = torch.cat(
        (D_N_Y, D_Ca_Y, D_C_Y, D_O_Y, D_Cb_Y, Y_t_1hot, f_angles), dim=-1
    )  # [B,L,M,5*num_bins+5]
    E_context = self.node_project_down(D_all)  # [B, L, M, node_features]
    E_context = self.norm_nodes(E_context)

    # Context-atom pair distances, RBF-encoded with _rbf's default bounds.
    Y_edges = self._rbf(
        torch.sqrt(
            torch.sum((Y[:, :, :, None, :] - Y[:, :, None, :, :]) ** 2, -1) + 1e-6
        )
    )  # [B, L, M, M, num_bins]

    Y_edges = self.y_edges(Y_edges)
    Y_nodes = self.y_nodes(Y_t_1hot_.float())

    Y_edges = self.norm_y_edges(Y_edges)
    Y_nodes = self.norm_y_nodes(Y_nodes)

    return V, E, E_idx, Y_nodes, Y_edges, E_context, Y_m
1078
+
1079
def features_decode(self, features):
    """Make features for decoding: explicit side-chain and context-atom distances.

    Fix: removed the unused local ``device = S.device``.

    Args:
        features: dict read for the keys S, X, X_m, mask, E_idx, Y, Y_m, Y_t.
            X carries 14 atoms per residue (indexed 0..13 below); Y/Y_m/Y_t
            are context-atom coordinates/mask/element numbers, truncated to
            self.atom_context_num atoms per residue.

    Returns:
        Tuple (V, F): decoder node features V and decoder edge features F.
    """
    S = features["S"]
    X = features["X"]
    X_m = features["X_m"]
    mask = features["mask"]
    E_idx = features["E_idx"]

    # Keep only the first atom_context_num context atoms per residue.
    Y = features["Y"][:, :, : self.atom_context_num]
    Y_m = features["Y_m"][:, :, : self.atom_context_num]
    Y_t = features["Y_t"][:, :, : self.atom_context_num]

    # Zero out atom masks of padded/invalid residues.
    X_m = X_m * mask[:, :, None]

    B, L, _, _ = X.shape

    RBF_sidechain = []
    X_m_gathered = gather_nodes(X_m, E_idx)  # [B, L, K, 14]

    # RBF features for all 14x14 atom pairs between a residue and each of its
    # K neighbors, masked so that absent atoms contribute zeros.
    for i in range(14):
        for j in range(14):
            rbf_features = self._get_rbf(
                X[:, :, i, :],
                X[:, :, j, :],
                E_idx,
                D_mu_shape=[1, 1, 1, -1],
                lower_bound=self.lower_bound,
                upper_bound=self.upper_bound,
                num_bins=self.num_rbf,
            )
            rbf_features = (
                rbf_features
                * X_m[:, :, i, None, None]
                * X_m_gathered[:, :, :, j, None]
            )
            RBF_sidechain.append(rbf_features)

    # Distances from every protein atom to every context atom.
    D_XY = torch.sqrt(
        torch.sum((X[:, :, :, None, :] - Y[:, :, None, :, :]) ** 2, -1) + 1e-6
    )  # [B, L, 14, atom_context_num]
    XY_features = self._rbf(
        D_XY,
        D_mu_shape=[1, 1, 1, 1, -1],
        lower_bound=self.lower_bound,
        upper_bound=self.upper_bound,
        num_bins=self.num_rbf,
    )  # [B, L, 14, atom_context_num, num_rbf]
    XY_features = XY_features * X_m[:, :, :, None, None] * Y_m[:, :, None, :, None]

    # Append one-hot element types of the context atoms, then project down.
    Y_t_1hot = torch.nn.functional.one_hot(
        Y_t.long(), 120
    ).float()  # [B, L, atom_context_num, 120]
    XY_Y_t = torch.cat(
        [XY_features, Y_t_1hot[:, :, None, :, :].repeat(1, 1, 14, 1, 1)], -1
    )  # [B, L, 14, atom_context_num, num_rbf+120]
    XY_Y_t = self.W_XY_project_down1(
        XY_Y_t
    )  # [B, L, 14, atom_context_num, num_rbf]
    XY_features = XY_Y_t.view([B, L, -1])

    V = self.dec_node_embedding1(XY_features)
    V = self.dec_norm_nodes1(V)

    # One-hot residue identity for the residue itself and for each neighbor.
    S_1h = torch.nn.functional.one_hot(S, self.enc_node_in).float()
    S_1h_gathered = gather_nodes(S_1h, E_idx)  # [B, L, K, 21]
    S_features = torch.cat(
        [S_1h[:, :, None, :].repeat(1, 1, E_idx.shape[2], 1), S_1h_gathered], -1
    )  # [B, L, K, 42]

    # Concatenate all pairwise RBFs with the sequence features and embed.
    F = torch.cat(
        tuple(RBF_sidechain), dim=-1
    )  # [B, L, K, 14*14*num_rbf]
    F = torch.cat([F, S_features], -1)
    F = self.dec_edge_embedding1(F)
    F = self.dec_norm_edges1(F)
    return V, F
score.py ADDED
@@ -0,0 +1,549 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os.path
4
+ import random
5
+ import sys
6
+
7
+ import numpy as np
8
+ import torch
9
+
10
+ from data_utils import (
11
+ element_dict_rev,
12
+ alphabet,
13
+ restype_int_to_str,
14
+ featurize,
15
+ parse_PDB,
16
+ )
17
+ from model_utils import ProteinMPNN
18
+
19
+
20
def main(args) -> None:
    """Score sequences with the selected MPNN model and save statistics.

    Loads the checkpoint chosen by --model_type, parses each input PDB,
    builds fixed/redesigned/membrane masks, runs either the autoregressive
    or the single-amino-acid scoring function --number_of_batches times, and
    writes per-residue probability statistics to
    ``<out_folder>/<pdb_name><file_ending>.pt`` via ``torch.save``.
    """
    # NOTE(review): a falsy --seed (0, which is the default) falls through
    # to a randomly drawn seed — seed 0 itself cannot be requested.
    if args.seed:
        seed = args.seed
    else:
        seed = int(np.random.randint(0, high=99999, size=1, dtype=int)[0])
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)
    device = torch.device("cuda" if (torch.cuda.is_available()) else "cpu")
    # Normalize the output folder to end with "/" and create it if missing.
    folder_for_outputs = args.out_folder
    base_folder = folder_for_outputs
    if base_folder[-1] != "/":
        base_folder = base_folder + "/"
    if not os.path.exists(base_folder):
        os.makedirs(base_folder, exist_ok=True)
    # Map the model type to its checkpoint path.
    if args.model_type == "protein_mpnn":
        checkpoint_path = args.checkpoint_protein_mpnn
    elif args.model_type == "ligand_mpnn":
        checkpoint_path = args.checkpoint_ligand_mpnn
    elif args.model_type == "per_residue_label_membrane_mpnn":
        checkpoint_path = args.checkpoint_per_residue_label_membrane_mpnn
    elif args.model_type == "global_label_membrane_mpnn":
        checkpoint_path = args.checkpoint_global_label_membrane_mpnn
    elif args.model_type == "soluble_mpnn":
        checkpoint_path = args.checkpoint_soluble_mpnn
    else:
        print("Choose one of the available models")
        sys.exit()
    checkpoint = torch.load(checkpoint_path, map_location=device)
    # Only ligand_mpnn checkpoints carry atom context / side-chain settings.
    if args.model_type == "ligand_mpnn":
        atom_context_num = checkpoint["atom_context_num"]
        ligand_mpnn_use_side_chain_context = args.ligand_mpnn_use_side_chain_context
        k_neighbors = checkpoint["num_edges"]
    else:
        atom_context_num = 1
        ligand_mpnn_use_side_chain_context = 0
        k_neighbors = checkpoint["num_edges"]

    model = ProteinMPNN(
        node_features=128,
        edge_features=128,
        hidden_dim=128,
        num_encoder_layers=3,
        num_decoder_layers=3,
        k_neighbors=k_neighbors,
        device=device,
        atom_context_num=atom_context_num,
        model_type=args.model_type,
        ligand_mpnn_use_side_chain_context=ligand_mpnn_use_side_chain_context,
    )

    model.load_state_dict(checkpoint["model_state_dict"])
    model.to(device)
    model.eval()

    # Input PDBs: either a JSON file listing paths (keys used) or one path.
    if args.pdb_path_multi:
        with open(args.pdb_path_multi, "r") as fh:
            pdb_paths = list(json.load(fh))
    else:
        pdb_paths = [args.pdb_path]

    # Fixed residues: per-PDB JSON mapping, or one list applied to all PDBs.
    if args.fixed_residues_multi:
        with open(args.fixed_residues_multi, "r") as fh:
            fixed_residues_multi = json.load(fh)
    else:
        fixed_residues = [item for item in args.fixed_residues.split()]
        fixed_residues_multi = {}
        for pdb in pdb_paths:
            fixed_residues_multi[pdb] = fixed_residues

    # Redesigned residues: same scheme as fixed residues.
    if args.redesigned_residues_multi:
        with open(args.redesigned_residues_multi, "r") as fh:
            redesigned_residues_multi = json.load(fh)
    else:
        redesigned_residues = [item for item in args.redesigned_residues.split()]
        redesigned_residues_multi = {}
        for pdb in pdb_paths:
            redesigned_residues_multi[pdb] = redesigned_residues

    # loop over PDB paths
    for pdb in pdb_paths:
        if args.verbose:
            print("Designing protein from this path:", pdb)
        fixed_residues = fixed_residues_multi[pdb]
        redesigned_residues = redesigned_residues_multi[pdb]
        protein_dict, backbone, other_atoms, icodes, _ = parse_PDB(
            pdb,
            device=device,
            chains=args.parse_these_chains_only,
            parse_all_atoms=args.ligand_mpnn_use_side_chain_context,
            parse_atoms_with_zero_occupancy=args.parse_atoms_with_zero_occupancy
        )
        # make chain_letter + residue_idx + insertion_code mapping to integers
        R_idx_list = list(protein_dict["R_idx"].cpu().numpy())  # residue indices
        chain_letters_list = list(protein_dict["chain_letters"])  # chain letters
        encoded_residues = []
        for i, R_idx_item in enumerate(R_idx_list):
            # e.g. chain "A" + residue 12 + insertion code -> "A12"
            tmp = str(chain_letters_list[i]) + str(R_idx_item) + icodes[i]
            encoded_residues.append(tmp)
        encoded_residue_dict = dict(zip(encoded_residues, range(len(encoded_residues))))
        encoded_residue_dict_rev = dict(
            zip(list(range(len(encoded_residues))), encoded_residues)
        )

        # 0 where the residue is listed as fixed/redesigned, 1 elsewhere.
        fixed_positions = torch.tensor(
            [int(item not in fixed_residues) for item in encoded_residues],
            device=device,
        )
        redesigned_positions = torch.tensor(
            [int(item not in redesigned_residues) for item in encoded_residues],
            device=device,
        )

        # specify which residues are buried for checkpoint_per_residue_label_membrane_mpnn model
        if args.transmembrane_buried:
            buried_residues = [item for item in args.transmembrane_buried.split()]
            buried_positions = torch.tensor(
                [int(item in buried_residues) for item in encoded_residues],
                device=device,
            )
        else:
            buried_positions = torch.zeros_like(fixed_positions)

        if args.transmembrane_interface:
            interface_residues = [item for item in args.transmembrane_interface.split()]
            interface_positions = torch.tensor(
                [int(item in interface_residues) for item in encoded_residues],
                device=device,
            )
        else:
            interface_positions = torch.zeros_like(fixed_positions)
        # Per-residue label: 2 = buried only, 1 = interface only, 0 = neither.
        protein_dict["membrane_per_residue_labels"] = 2 * buried_positions * (
            1 - interface_positions
        ) + 1 * interface_positions * (1 - buried_positions)

        # For the global-label model every residue carries the same label.
        if args.model_type == "global_label_membrane_mpnn":
            protein_dict["membrane_per_residue_labels"] = (
                args.global_transmembrane_label + 0 * fixed_positions
            )
        if type(args.chains_to_design) == str:
            chains_to_design_list = args.chains_to_design.split(",")
        else:
            # Default (None): treat every parsed chain as designable.
            chains_to_design_list = protein_dict["chain_letters"]
        chain_mask = torch.tensor(
            np.array(
                [
                    item in chains_to_design_list
                    for item in protein_dict["chain_letters"]
                ],
                dtype=np.int32,
            ),
            device=device,
        )

        # create chain_mask to notify which residues are fixed (0) and which need to be designed (1)
        if redesigned_residues:
            protein_dict["chain_mask"] = chain_mask * (1 - redesigned_positions)
        elif fixed_residues:
            protein_dict["chain_mask"] = chain_mask * fixed_positions
        else:
            protein_dict["chain_mask"] = chain_mask

        if args.verbose:
            PDB_residues_to_be_redesigned = [
                encoded_residue_dict_rev[item]
                for item in range(protein_dict["chain_mask"].shape[0])
                if protein_dict["chain_mask"][item] == 1
            ]
            PDB_residues_to_be_fixed = [
                encoded_residue_dict_rev[item]
                for item in range(protein_dict["chain_mask"].shape[0])
                if protein_dict["chain_mask"][item] == 0
            ]
            print("These residues will be redesigned: ", PDB_residues_to_be_redesigned)
            print("These residues will be fixed: ", PDB_residues_to_be_fixed)

        # specify which residues are linked
        if args.symmetry_residues:
            # "A12,A13|B2,B3" -> [[idx(A12), idx(A13)], [idx(B2), idx(B3)]]
            symmetry_residues_list_of_lists = [
                x.split(",") for x in args.symmetry_residues.split("|")
            ]
            remapped_symmetry_residues = []
            for t_list in symmetry_residues_list_of_lists:
                tmp_list = []
                for t in t_list:
                    tmp_list.append(encoded_residue_dict[t])
                remapped_symmetry_residues.append(tmp_list)
        else:
            remapped_symmetry_residues = [[]]

        # Homo-oligomer mode: tie residue i across all chains together.
        if args.homo_oligomer:
            if args.verbose:
                print("Designing HOMO-OLIGOMER")
            chain_letters_set = list(set(chain_letters_list))
            reference_chain = chain_letters_set[0]
            lc = len(reference_chain)
            residue_indices = [
                item[lc:] for item in encoded_residues if item[:lc] == reference_chain
            ]
            remapped_symmetry_residues = []
            for res in residue_indices:
                tmp_list = []
                # NOTE(review): tmp_w_list (equal per-chain weights) is built
                # but never used in this script — confirm whether the scoring
                # path was meant to consume symmetry weights.
                tmp_w_list = []
                for chain in chain_letters_set:
                    name = chain + res
                    tmp_list.append(encoded_residue_dict[name])
                    tmp_w_list.append(1 / len(chain_letters_set))
                remapped_symmetry_residues.append(tmp_list)

        # set other atom bfactors to 0.0
        if other_atoms:
            other_bfactors = other_atoms.getBetas()
            other_atoms.setBetas(other_bfactors * 0.0)

        # adjust input PDB name by dropping .pdb if it does exist
        name = pdb[pdb.rfind("/") + 1 :]
        if name[-4:] == ".pdb":
            name = name[:-4]

        with torch.no_grad():
            # run featurize to remap R_idx and add batch dimension
            if args.verbose:
                if "Y" in list(protein_dict):
                    atom_coords = protein_dict["Y"].cpu().numpy()
                    atom_types = list(protein_dict["Y_t"].cpu().numpy())
                    atom_mask = list(protein_dict["Y_m"].cpu().numpy())
                    number_of_atoms_parsed = np.sum(atom_mask)
                else:
                    print("No ligand atoms parsed")
                    number_of_atoms_parsed = 0
                    atom_types = ""
                    atom_coords = []
                if number_of_atoms_parsed == 0:
                    print("No ligand atoms parsed")
                elif args.model_type == "ligand_mpnn":
                    print(
                        f"The number of ligand atoms parsed is equal to: {number_of_atoms_parsed}"
                    )
                    for i, atom_type in enumerate(atom_types):
                        print(
                            f"Type: {element_dict_rev[atom_type]}, Coords {atom_coords[i]}, Mask {atom_mask[i]}"
                        )
            feature_dict = featurize(
                protein_dict,
                cutoff_for_score=args.ligand_mpnn_cutoff_for_score,
                use_atom_context=args.ligand_mpnn_use_atom_context,
                number_of_ligand_atoms=atom_context_num,
                model_type=args.model_type,
            )
            feature_dict["batch_size"] = args.batch_size
            B, L, _, _ = feature_dict["X"].shape  # batch size should be 1 for now.
            # add additional keys to the feature dictionary
            feature_dict["symmetry_residues"] = remapped_symmetry_residues

            # Accumulate scores over number_of_batches independent passes
            # (only the random decoding order differs between passes).
            logits_list = []
            probs_list = []
            log_probs_list = []
            decoding_order_list = []
            for _ in range(args.number_of_batches):
                feature_dict["randn"] = torch.randn(
                    [feature_dict["batch_size"], feature_dict["mask"].shape[1]],
                    device=device,
                )
                if args.autoregressive_score:
                    score_dict = model.score(feature_dict, use_sequence=args.use_sequence)
                elif args.single_aa_score:
                    score_dict = model.single_aa_score(feature_dict, use_sequence=args.use_sequence)
                else:
                    print("Set either autoregressive_score or single_aa_score to True")
                    sys.exit()
                logits_list.append(score_dict["logits"])
                log_probs_list.append(score_dict["log_probs"])
                probs_list.append(torch.exp(score_dict["log_probs"]))
                decoding_order_list.append(score_dict["decoding_order"])
            log_probs_stack = torch.cat(log_probs_list, 0)
            logits_stack = torch.cat(logits_list, 0)
            probs_stack = torch.cat(probs_list, 0)
            decoding_order_stack = torch.cat(decoding_order_list, 0)

            # Collect everything into one dict and torch.save it.
            output_stats_path = base_folder + name + args.file_ending + ".pt"
            out_dict = {}
            out_dict["logits"] = logits_stack.cpu().numpy()
            out_dict["probs"] = probs_stack.cpu().numpy()
            out_dict["log_probs"] = log_probs_stack.cpu().numpy()
            out_dict["decoding_order"] = decoding_order_stack.cpu().numpy()
            out_dict["native_sequence"] = feature_dict["S"][0].cpu().numpy()
            out_dict["mask"] = feature_dict["mask"][0].cpu().numpy()
            out_dict["chain_mask"] = feature_dict["chain_mask"][0].cpu().numpy()  # this affects decoding order
            out_dict["seed"] = seed
            out_dict["alphabet"] = alphabet
            out_dict["residue_names"] = encoded_residue_dict_rev

            # Per-residue mean/std of probabilities across the batch passes,
            # keyed by the human-readable residue names.
            mean_probs = np.mean(out_dict["probs"], 0)
            std_probs = np.std(out_dict["probs"], 0)
            sequence = [restype_int_to_str[AA] for AA in out_dict["native_sequence"]]
            mean_dict = {}
            std_dict = {}
            for residue in range(L):
                mean_dict_ = dict(zip(alphabet, mean_probs[residue]))
                mean_dict[encoded_residue_dict_rev[residue]] = mean_dict_
                std_dict_ = dict(zip(alphabet, std_probs[residue]))
                std_dict[encoded_residue_dict_rev[residue]] = std_dict_

            out_dict["sequence"] = sequence
            out_dict["mean_of_probs"] = mean_dict
            out_dict["std_of_probs"] = std_dict
            torch.save(out_dict, output_stats_path)
330
+
331
+
332
+
333
if __name__ == "__main__":
    # CLI entry point: every flag below maps 1:1 onto an attribute that
    # main(args) reads.
    argparser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # --- model selection and checkpoints ---------------------------------
    argparser.add_argument(
        "--model_type",
        type=str,
        default="protein_mpnn",
        help="Choose your model: protein_mpnn, ligand_mpnn, per_residue_label_membrane_mpnn, global_label_membrane_mpnn, soluble_mpnn",
    )
    # protein_mpnn - original ProteinMPNN trained on the whole PDB excluding non-protein atoms
    # ligand_mpnn - atomic context aware model trained with small molecules, nucleotides, metals etc on the whole PDB
    # per_residue_label_membrane_mpnn - ProteinMPNN model trained with addition label per residue specifying if that residue is buried or exposed
    # global_label_membrane_mpnn - ProteinMPNN model trained with global label per PDB id to specify if protein is transmembrane
    # soluble_mpnn - ProteinMPNN trained only on soluble PDB ids
    argparser.add_argument(
        "--checkpoint_protein_mpnn",
        type=str,
        default="./model_params/proteinmpnn_v_48_020.pt",
        help="Path to model weights.",
    )
    argparser.add_argument(
        "--checkpoint_ligand_mpnn",
        type=str,
        default="./model_params/ligandmpnn_v_32_010_25.pt",
        help="Path to model weights.",
    )
    argparser.add_argument(
        "--checkpoint_per_residue_label_membrane_mpnn",
        type=str,
        default="./model_params/per_residue_label_membrane_mpnn_v_48_020.pt",
        help="Path to model weights.",
    )
    argparser.add_argument(
        "--checkpoint_global_label_membrane_mpnn",
        type=str,
        default="./model_params/global_label_membrane_mpnn_v_48_020.pt",
        help="Path to model weights.",
    )
    argparser.add_argument(
        "--checkpoint_soluble_mpnn",
        type=str,
        default="./model_params/solublempnn_v_48_020.pt",
        help="Path to model weights.",
    )

    argparser.add_argument("--verbose", type=int, default=1, help="Print stuff")

    # --- inputs ----------------------------------------------------------
    argparser.add_argument(
        "--pdb_path", type=str, default="", help="Path to the input PDB."
    )
    argparser.add_argument(
        "--pdb_path_multi",
        type=str,
        default="",
        help="Path to json listing PDB paths. {'/path/to/pdb': ''} - only keys will be used.",
    )

    # --- residue selection -----------------------------------------------
    argparser.add_argument(
        "--fixed_residues",
        type=str,
        default="",
        help="Provide fixed residues, A12 A13 A14 B2 B25",
    )
    argparser.add_argument(
        "--fixed_residues_multi",
        type=str,
        default="",
        help="Path to json mapping of fixed residues for each pdb i.e., {'/path/to/pdb': 'A12 A13 A14 B2 B25'}",
    )

    argparser.add_argument(
        "--redesigned_residues",
        type=str,
        default="",
        help="Provide to be redesigned residues, everything else will be fixed, A12 A13 A14 B2 B25",
    )
    argparser.add_argument(
        "--redesigned_residues_multi",
        type=str,
        default="",
        help="Path to json mapping of redesigned residues for each pdb i.e., {'/path/to/pdb': 'A12 A13 A14 B2 B25'}",
    )

    argparser.add_argument(
        "--symmetry_residues",
        type=str,
        default="",
        help="Add list of lists for which residues need to be symmetric, e.g. 'A12,A13,A14|C2,C3|A5,B6'",
    )

    argparser.add_argument(
        "--homo_oligomer",
        type=int,
        default=0,
        help="Setting this to 1 will automatically set --symmetry_residues and --symmetry_weights to do homooligomer design with equal weighting.",
    )

    # --- output ----------------------------------------------------------
    argparser.add_argument(
        "--out_folder",
        type=str,
        help="Path to a folder to output scores, e.g. /home/out/",
    )
    argparser.add_argument(
        "--file_ending", type=str, default="", help="adding_string_to_the_end"
    )
    # NOTE(review): declared type=str with an int default, and not referenced
    # in main() above — likely meant type=int; confirm before relying on it.
    argparser.add_argument(
        "--zero_indexed",
        type=str,
        default=0,
        help="1 - to start output PDB numbering with 0",
    )
    argparser.add_argument(
        "--seed",
        type=int,
        default=0,
        help="Set seed for torch, numpy, and python random.",
    )
    argparser.add_argument(
        "--batch_size",
        type=int,
        default=1,
        help="Number of sequence to generate per one pass.",
    )
    argparser.add_argument(
        "--number_of_batches",
        type=int,
        default=1,
        help="Number of times to design sequence using a chosen batch size.",
    )

    # --- ligand_mpnn specific options ------------------------------------
    argparser.add_argument(
        "--ligand_mpnn_use_atom_context",
        type=int,
        default=1,
        help="1 - use atom context, 0 - do not use atom context.",
    )

    argparser.add_argument(
        "--ligand_mpnn_use_side_chain_context",
        type=int,
        default=0,
        help="Flag to use side chain atoms as ligand context for the fixed residues",
    )

    argparser.add_argument(
        "--ligand_mpnn_cutoff_for_score",
        type=float,
        default=8.0,
        help="Cutoff in angstroms between protein and context atoms to select residues for reporting score.",
    )

    # --- chain parsing / design ------------------------------------------
    argparser.add_argument(
        "--chains_to_design",
        type=str,
        default=None,
        help="Specify which chains to redesign, all others will be kept fixed.",
    )

    argparser.add_argument(
        "--parse_these_chains_only",
        type=str,
        default="",
        help="Provide chains letters for parsing backbones, 'ABCF'",
    )

    # --- membrane model options ------------------------------------------
    argparser.add_argument(
        "--transmembrane_buried",
        type=str,
        default="",
        help="Provide buried residues when using checkpoint_per_residue_label_membrane_mpnn model, A12 A13 A14 B2 B25",
    )
    argparser.add_argument(
        "--transmembrane_interface",
        type=str,
        default="",
        help="Provide interface residues when using checkpoint_per_residue_label_membrane_mpnn model, A12 A13 A14 B2 B25",
    )

    argparser.add_argument(
        "--global_transmembrane_label",
        type=int,
        default=0,
        help="Provide global label for global_label_membrane_mpnn model. 1 - transmembrane, 0 - soluble",
    )

    argparser.add_argument(
        "--parse_atoms_with_zero_occupancy",
        type=int,
        default=0,
        help="To parse atoms with zero occupancy in the PDB input files. 0 - do not parse, 1 - parse atoms with zero occupancy",
    )

    # --- scoring mode ----------------------------------------------------
    argparser.add_argument(
        "--use_sequence",
        type=int,
        default=1,
        help="1 - get scores using amino acid sequence info; 0 - get scores using backbone info only",
    )

    argparser.add_argument(
        "--autoregressive_score",
        type=int,
        default=0,
        help="1 - run autoregressive scoring function; p(AA_1|backbone); p(AA_2|backbone, AA_1) etc, 0 - False",
    )

    argparser.add_argument(
        "--single_aa_score",
        type=int,
        default=1,
        help="1 - run single amino acid scoring function; p(AA_i|backbone, AA_{all except ith one}), 0 - False",
    )

    args = argparser.parse_args()
    main(args)
space_utils/download_weights.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
import os
import subprocess
2
+
3
def download_ligandmpnn_weights():
    """Download the LigandMPNN checkpoint into ./model_params/.

    Fixes:
    - Create the target directory first: ``wget -O`` fails if it is missing.
    - Invoke wget with an argument list (shell=False) instead of an
      interpolated shell string, avoiding quoting/injection pitfalls.

    Returns:
        0 on success; a failed download raises subprocess.CalledProcessError
        (check=True).
    """
    url = "https://files.ipd.uw.edu/pub/ligandmpnn/ligandmpnn_v_32_030_25.pt"
    dest = "./model_params/ligandmpnn_v_32_030_25.pt"
    os.makedirs(os.path.dirname(dest), exist_ok=True)
    subprocess.run(["wget", url, "-O", dest], check=True)
    return 0