"""Constants for the ESMFold2 input pipeline. Includes molecule types, residue types, vocabularies, atom lists, and element data. """ # ============================================================================= # Molecule types # ============================================================================= MOL_TYPE_PROTEIN = 0 MOL_TYPE_DNA = 1 MOL_TYPE_RNA = 2 MOL_TYPE_NONPOLYMER = 3 # ============================================================================= # Residue type indices # ============================================================================= # Standard amino acids (indices 2-21), MSE mapped to MET PROTEIN_RESIDUE_TO_RES_TYPE = { "ALA": 2, "ARG": 3, "ASN": 4, "ASP": 5, "CYS": 6, "GLN": 7, "GLU": 8, "GLY": 9, "HIS": 10, "ILE": 11, "LEU": 12, "LYS": 13, "MET": 14, "PHE": 15, "PRO": 16, "SER": 17, "THR": 18, "TRP": 19, "TYR": 20, "VAL": 21, "MSE": 14, # Selenomethionine -> MET } PROTEIN_UNK_RES_TYPE = 22 # RNA nucleotides (indices 23-26, unknown=27) RNA_RESIDUE_TO_RES_TYPE = {"A": 23, "G": 24, "C": 25, "U": 26} RNA_UNK_RES_TYPE = 27 # DNA nucleotides (indices 28-31, unknown=32) DNA_RESIDUE_TO_RES_TYPE = {"DA": 28, "DG": 29, "DC": 30, "DT": 31} DNA_UNK_RES_TYPE = 32 GAP_RES_TYPE = 32 # ============================================================================= # Vocabularies # ============================================================================= # 3-letter to 1-letter codes for proteins PROTEIN_3TO1 = { "ALA": "A", "ARG": "R", "ASN": "N", "ASP": "D", "CYS": "C", "GLN": "Q", "GLU": "E", "GLY": "G", "HIS": "H", "ILE": "I", "LEU": "L", "LYS": "K", "MET": "M", "PHE": "F", "PRO": "P", "SER": "S", "THR": "T", "TRP": "W", "TYR": "Y", "VAL": "V", "MSE": "M", } # 1-letter to 3-letter codes PROTEIN_1TO3 = {v: k for k, v in PROTEIN_3TO1.items() if k != "MSE"} PROTEIN_1TO3["X"] = "UNK" # DNA 1-letter to CCD code DNA_1TO3 = {"A": "DA", "T": "DT", "C": "DC", "G": "DG"} # RNA 1-letter to CCD code RNA_1TO3 = {"A": "A", "U": "U", "C": "C", "G": "G"} # ESM-2 input_ids vocabulary for proteins ESM_PROTEIN_VOCAB = { "L": 4, "A": 5, "G": 6, "V": 7, "S": 8, "E": 9, "R": 10, "T": 11, "I": 12, "D": 13, "P": 14, "K": 15, "Q": 16, "N": 17, "F": 18, "Y": 19, "M": 20, "H": 21, "W": 22, "C": 23, "X": 3, # Unknown } # For DNA/RNA/ligands DNA_RNA_LIGAND_INPUT_ID = 24 # MSA tokens MSA_PAD_TOKEN_ID = 0 MSA_GAP_TOKEN_ID = 1 # Gap/insertion token for MSA # res_type int -> CCD component ID (for conformer lookup) RES_TYPE_TO_CCD = { # Proteins (2-22) 2: "ALA", 3: "ARG", 4: "ASN", 5: "ASP", 6: "CYS", 7: "GLN", 8: "GLU", 9: "GLY", 10: "HIS", 11: "ILE", 12: "LEU", 13: "LYS", 14: "MET", 15: "PHE", 16: "PRO", 17: "SER", 18: "THR", 19: "TRP", 20: "TYR", 21: "VAL", 22: "UNK", # RNA (23-27) 23: "A", 24: "G", 25: "C", 26: "U", 27: "N", # DNA (28-32) 28: "DA", 29: "DG", 30: "DC", 31: "DT", 32: "DN", } # ============================================================================= # Charged atoms at physiological pH # ============================================================================= CHARGED_ATOMS: dict[tuple[str, str], int] = { ("LYS", "NZ"): 1, ("ARG", "NH2"): 1, ("HIS", "ND1"): 1, ("PO4", "O2"): -1, ("PO4", "O3"): -1, ("PO4", "O4"): -1, ("SO4", "O3"): -1, ("SO4", "O4"): -1, ("MG", "MG"): 2, ("ZN", "ZN"): 2, ("CA", "CA"): 2, ("FE2", "FE"): 2, ("MN", "MN"): 2, ("CO", "CO"): 2, ("NCO", "CO"): 3, ("CU", "CU"): 2, ("NI", "NI"): 2, ("K", "K"): 1, ("NA", "NA"): 1, ("CD", "CD"): 2, ("CL", "CL"): -1, ("ACT", "OXT"): -1, ("NAD", "O2N"): -1, ("NAD", "N1N"): 1, ("NAP", "O2N"): -1, ("NAP", "N1N"): 1, ("IMD", "N3"): 1, ("SAM", "SD"): 1, ("FE", "FE"): 3, ("A1BH3", "N3"): 1, } # ============================================================================= # Element atomic numbers (Z=1 to 92) # ============================================================================= ELEMENT_TO_ATOMIC_NUM = { "H": 1, "LI": 3, "BE": 4, "B": 5, "C": 6, "N": 7, "O": 8, "F": 9, "NE": 10, "NA": 11, "MG": 12, "AL": 13, "SI": 14, "P": 15, "S": 16, "CL": 17, "AR": 18, "K": 19, "CA": 20, "SC": 21, "TI": 22, "V": 23, "CR": 24, "MN": 25, "FE": 26, "CO": 27, "NI": 28, "CU": 29, "ZN": 30, "GA": 31, "GE": 32, "AS": 33, "SE": 34, "BR": 35, "KR": 36, "RB": 37, "SR": 38, "Y": 39, "ZR": 40, "NB": 41, "MO": 42, "TC": 43, "RU": 44, "RH": 45, "PD": 46, "AG": 47, "CD": 48, "IN": 49, "SN": 50, "SB": 51, "TE": 52, "I": 53, "XE": 54, "CS": 55, "BA": 56, "LA": 57, "CE": 58, "PR": 59, "ND": 60, "PM": 61, "SM": 62, "EU": 63, "GD": 64, "TB": 65, "DY": 66, "HO": 67, "ER": 68, "TM": 69, "YB": 70, "LU": 71, "HF": 72, "TA": 73, "W": 74, "RE": 75, "OS": 76, "IR": 77, "PT": 78, "AU": 79, "HG": 80, "TL": 81, "PB": 82, "BI": 83, "PO": 84, "AT": 85, "RN": 86, "FR": 87, "RA": 88, "AC": 89, "TH": 90, "PA": 91, "U": 92, } # Inverse mapping: atomic number → element symbol ELEMENT_NUMBER_TO_SYMBOL = {v: k for k, v in ELEMENT_TO_ATOMIC_NUM.items()} # ============================================================================= # Standard heavy atoms per residue type # ============================================================================= PROTEIN_HEAVY_ATOMS = { "ALA": ["N", "CA", "C", "O", "CB"], "ARG": ["N", "CA", "C", "O", "CB", "CG", "CD", "NE", "CZ", "NH1", "NH2"], "ASN": ["N", "CA", "C", "O", "CB", "CG", "OD1", "ND2"], "ASP": ["N", "CA", "C", "O", "CB", "CG", "OD1", "OD2"], "CYS": ["N", "CA", "C", "O", "CB", "SG"], "GLN": ["N", "CA", "C", "O", "CB", "CG", "CD", "OE1", "NE2"], "GLU": ["N", "CA", "C", "O", "CB", "CG", "CD", "OE1", "OE2"], "GLY": ["N", "CA", "C", "O"], "HIS": ["N", "CA", "C", "O", "CB", "CG", "ND1", "CD2", "CE1", "NE2"], "ILE": ["N", "CA", "C", "O", "CB", "CG1", "CG2", "CD1"], "LEU": ["N", "CA", "C", "O", "CB", "CG", "CD1", "CD2"], "LYS": ["N", "CA", "C", "O", "CB", "CG", "CD", "CE", "NZ"], "MET": ["N", "CA", "C", "O", "CB", "CG", "SD", "CE"], "PHE": ["N", "CA", "C", "O", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ"], "PRO": ["N", "CA", "C", "O", "CB", "CG", "CD"], "SER": ["N", "CA", "C", "O", "CB", "OG"], "THR": ["N", "CA", "C", "O", "CB", "OG1", "CG2"], "TRP": [ "N", "CA", "C", "O", "CB", "CG", "CD1", "CD2", "NE1", "CE2", "CE3", "CZ2", "CZ3", "CH2", ], "TYR": ["N", "CA", "C", "O", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ", "OH"], "VAL": ["N", "CA", "C", "O", "CB", "CG1", "CG2"], "MSE": ["N", "CA", "C", "O", "CB", "CG", "SD", "CE"], "UNK": ["N", "CA", "C", "O"], } DNA_HEAVY_ATOMS = { "DA": [ "P", "OP1", "OP2", "O5'", "C5'", "C4'", "O4'", "C3'", "O3'", "C2'", "C1'", "N9", "C8", "N7", "C5", "C6", "N6", "N1", "C2", "N3", "C4", ], "DG": [ "P", "OP1", "OP2", "O5'", "C5'", "C4'", "O4'", "C3'", "O3'", "C2'", "C1'", "N9", "C8", "N7", "C5", "C6", "O6", "N1", "C2", "N2", "N3", "C4", ], "DC": [ "P", "OP1", "OP2", "O5'", "C5'", "C4'", "O4'", "C3'", "O3'", "C2'", "C1'", "N1", "C2", "O2", "N3", "C4", "N4", "C5", "C6", ], "DT": [ "P", "OP1", "OP2", "O5'", "C5'", "C4'", "O4'", "C3'", "O3'", "C2'", "C1'", "N1", "C2", "O2", "N3", "C4", "O4", "C5", "C7", "C6", ], } RNA_HEAVY_ATOMS = { "A": [ "P", "OP1", "OP2", "O5'", "C5'", "C4'", "O4'", "C3'", "O3'", "C2'", "O2'", "C1'", "N9", "C8", "N7", "C5", "C6", "N6", "N1", "C2", "N3", "C4", ], "G": [ "P", "OP1", "OP2", "O5'", "C5'", "C4'", "O4'", "C3'", "O3'", "C2'", "O2'", "C1'", "N9", "C8", "N7", "C5", "C6", "O6", "N1", "C2", "N2", "N3", "C4", ], "C": [ "P", "OP1", "OP2", "O5'", "C5'", "C4'", "O4'", "C3'", "O3'", "C2'", "O2'", "C1'", "N1", "C2", "O2", "N3", "C4", "N4", "C5", "C6", ], "U": [ "P", "OP1", "OP2", "O5'", "C5'", "C4'", "O4'", "C3'", "O3'", "C2'", "O2'", "C1'", "N1", "C2", "O2", "N3", "C4", "O4", "C5", "C6", ], } # Unknown nucleotide backbone atoms DNA_BACKBONE_ATOMS = [ "P", "OP1", "OP2", "O5'", "C5'", "C4'", "O4'", "C3'", "O3'", "C2'", "C1'", ] RNA_BACKBONE_ATOMS = [ "P", "OP1", "OP2", "O5'", "C5'", "C4'", "O4'", "C3'", "O3'", "C2'", "O2'", "C1'", ]