| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | """Standard residue names for polymers of different types (e.g., L- or D-amino acid proteins, |
| | mixed-chirality proteins, DNA/RNA, etc.) |
| | """ |
| |
|
| | from enum import Enum |
| |
|
| |
|
class polymerType(Enum):
    """Kinds of polymer for which residue tables are kept in this module.

    Each member's integer value is used as the position of that polymer
    type's entry in the module-level lookup tables.
    """

    # Protein made of L-amino acids.
    LPROT = 0
    # Protein made of D-amino acids.
    DPROT = 1
    # Protein of mixed chirality (both L- and D-amino acids).
    LDPROT = 2
    # Deoxyribonucleic acid.
    DNA = 3
    # Ribonucleic acid.
    RNA = 4
| |
|
| |
|
def polymer_type_name(ptype: polymerType):
    """Return the descriptive label string for a polymer type.

    Args:
        ptype: The polymer type to describe.

    Returns:
        The label associated with ``ptype``, e.g. ``"polypeptide(L)"`` for
        ``polymerType.LPROT``.

    Raises:
        Exception: If ``ptype`` equals none of the known polymer types.
    """
    labels = (
        (polymerType.LPROT, "polypeptide(L)"),
        (polymerType.DPROT, "polypeptide(D)"),
        (polymerType.LDPROT, "polypeptide(L,D)"),
        (polymerType.DNA, "polydeoxyribonucleotide"),
        (polymerType.RNA, "polyribonucleotide"),
    )
    # Compare with ``==`` in declaration order rather than hashing ``ptype``,
    # so that arbitrary (even unhashable) arguments reach the error below.
    for member, label in labels:
        if ptype == member:
            return label
    raise Exception(f"unknown polymer type {ptype}")
| |
|
| |
|
# Lookup tables, one entry per polymer type. The outer list of every table is
# indexed by ``polymerType.<member>.value``; all tables start out empty and
# are filled by ``_add_residue``.

# Three-letter residue names, in registration order (position == token index).
_res3 = [[] for _ in polymerType]

# One-letter residue codes, parallel to ``_res3``.
_res1 = [[] for _ in polymerType]

# Residue name (three-letter or one-letter) -> token index.
_res_to_idx = [{} for _ in polymerType]

# Token indices registered under the three-letter name "UNK".
_unk_idx = [set() for _ in polymerType]

# Token indices registered under the three-letter name "---".
_gap_idx = [set() for _ in polymerType]

# Token indices registered under the three-letter name "STP".
_stp_idx = [set() for _ in polymerType]
| |
|
| |
|
def _add_residue(ptype: polymerType, res3, res1):
    """Register a residue token in the lookup tables.

    Args:
        ptype: A single ``polymerType``, or a list of them. When a list is
            given, ``res3`` and ``res1`` must be parallel lists and entry
            ``i`` of each is registered together.
        res3: Three-letter residue name (or list of names).
        res1: One-letter residue code (or list of codes).

    The new token is appended at the end of the tables for ``ptype``. The
    three-letter name always maps to the new index (re-registering a name
    re-points it to the newest token); the one-letter code is mapped only if
    it has not been seen before, so it keeps pointing at the first token that
    used it. The names ``"---"``, ``"UNK"`` and ``"STP"`` additionally record
    the token as gap, unknown or stop respectively.
    """
    if isinstance(ptype, list):
        for single_type, name3, name1 in zip(ptype, res3, res1):
            _add_residue(single_type, name3, name1)
        return

    slot = ptype.value
    lookup = _res_to_idx[slot]
    new_idx = len(_res3[slot])

    lookup[res3] = new_idx
    if res1 not in lookup:
        lookup[res1] = new_idx

    _res3[slot].append(res3)
    _res1[slot].append(res1)

    # Special three-letter names tag the token in the matching index set.
    special_sets = {"---": _gap_idx, "UNK": _unk_idx, "STP": _stp_idx}
    tagged = special_sets.get(res3)
    if tagged is not None:
        tagged[slot].add(new_idx)
| |
|
| |
|
def num_tokens(ptype=polymerType.LPROT):
    """Return the total number of tokens registered for ``ptype``.

    The count includes every registered token (unknown, stop and gap tokens
    as well as ordinary residues).
    """
    registered = _res3[ptype.value]
    return len(registered)
| |
|
| |
|
def num_known_molecular_tokens(ptype=polymerType.LPROT):
    """Count the tokens of ``ptype`` that are neither punctuation nor unknown.

    Args:
        ptype: Polymer type whose token table is inspected.

    Returns:
        The number of token indices for which both ``is_punctuation_index``
        and ``is_unknown_index`` are false.
    """
    # Classify by *index* and forward ``ptype`` explicitly: the predicates
    # default to LPROT, and ``is_unknown`` expects a residue name, so calling
    # it with an integer index would report every token as unknown.
    return sum(
        1
        for idx in range(len(_res3[ptype.value]))
        if not is_punctuation_index(idx, ptype)
        and not is_unknown_index(idx, ptype)
    )
| |
|
| |
|
def res_to_index(res: str, ptype=polymerType.LPROT):
    """Map a residue name to its token index.

    Args:
        res: Three-letter name or one-letter code of the residue.
        ptype: Polymer type whose table is consulted.

    Returns:
        The token index of ``res``; if ``res`` is not registered for
        ``ptype``, the index of an unknown ("UNK") token of that type.

    Raises:
        StopIteration: If ``res`` is not registered and ``ptype`` has no
            unknown token to fall back on.
    """
    table = _res_to_idx[ptype.value]
    if res in table:
        return table[res]
    # Resolve the fallback only when it is needed, so that registered names
    # still resolve for polymer types that have no "UNK" token.
    return next(iter(_unk_idx[ptype.value]))
| |
|
| |
|
def index_to_single(idx: int, ptype=polymerType.LPROT):
    """Return the one-letter code of token ``idx`` in the table of ``ptype``.

    Raises:
        IndexError: If ``idx`` is outside the table.
    """
    one_letter_codes = _res1[ptype.value]
    return one_letter_codes[idx]
| |
|
| |
|
def index_to_triple(idx: int, ptype=polymerType.LPROT):
    """Return the three-letter name of token ``idx`` in the table of ``ptype``.

    Raises:
        IndexError: If ``idx`` is outside the table.
    """
    three_letter_names = _res3[ptype.value]
    return three_letter_names[idx]
| |
|
| |
|
def to_single(res: str, ptype=polymerType.LPROT):
    """Convert a residue name to its one-letter code.

    Args:
        res: Three-letter name or one-letter code of the residue.
        ptype: Polymer type whose tables are used.

    Returns:
        The one-letter code of ``res``; names not registered for ``ptype``
        yield the code of that type's unknown token.
    """
    # The index is only meaningful in the table of ``ptype``, so the same
    # ``ptype`` must be used for the reverse lookup (the helper's default is
    # LPROT).
    return index_to_single(res_to_index(res, ptype), ptype)
| |
|
| |
|
def to_triple(res: str, ptype=polymerType.LPROT):
    """Convert a residue name to its three-letter name.

    Args:
        res: Three-letter name or one-letter code of the residue.
        ptype: Polymer type whose tables are used.

    Returns:
        The three-letter name of ``res``; names not registered for ``ptype``
        yield the name of that type's unknown token.
    """
    # The index is only meaningful in the table of ``ptype``, so the same
    # ``ptype`` must be used for the reverse lookup (the helper's default is
    # LPROT).
    return index_to_triple(res_to_index(res, ptype), ptype)
| |
|
| |
|
def is_gap_index(idx: int, ptype=polymerType.LPROT):
    """Return True if token ``idx`` of ``ptype`` was registered as a gap ("---")."""
    gap_indices = _gap_idx[ptype.value]
    return idx in gap_indices
| |
|
| |
|
def is_stop_index(idx: int, ptype=polymerType.LPROT):
    """Return True if token ``idx`` of ``ptype`` was registered as a stop ("STP")."""
    stop_indices = _stp_idx[ptype.value]
    return idx in stop_indices
| |
|
| |
|
def is_unknown(res: str, ptype=polymerType.LPROT):
    """Return True if the residue *name* ``res`` resolves to an unknown token.

    Because ``res_to_index`` maps unregistered names to the unknown token,
    this is true both for "UNK" itself and for any name not registered for
    ``ptype``.
    """
    resolved = res_to_index(res, ptype)
    return is_unknown_index(resolved, ptype)
| |
|
| |
|
def is_unknown_index(idx: int, ptype=polymerType.LPROT):
    """Return True if token ``idx`` of ``ptype`` was registered as unknown ("UNK")."""
    unknown_indices = _unk_idx[ptype.value]
    return idx in unknown_indices
| |
|
| |
|
def is_polymer_residue(res: str, ptype: polymerType):
    """Return True if ``res`` is a registered residue name.

    Args:
        res: Three-letter name or one-letter code to look up.
        ptype: Polymer type to check, or ``None`` to accept a match in the
            table of any polymer type.
    """
    if ptype is not None:
        return res in _res_to_idx[ptype.value]
    return any(res in _res_to_idx[member.value] for member in polymerType)
| |
|
| |
|
def is_punctuation_index(idx: int, ptype=polymerType.LPROT):
    """Return True if token ``idx`` of ``ptype`` is a gap or a stop token."""
    if is_gap_index(idx, ptype):
        return True
    return is_stop_index(idx, ptype)
| |
|
| |
|
def is_canonical(res: str, ptype=polymerType.LPROT):
    """Return True if ``res`` is one of the canonical amino acids of ``ptype``.

    Args:
        res: Three-letter name or one-letter code of the residue.
        ptype: ``LPROT``, ``DPROT`` or ``LDPROT``.

    Returns:
        For ``LPROT``/``DPROT``: whether ``res`` resolves to one of the first
        20 tokens of that table (the residues registered first, ALA..TYR and
        their D counterparts). For ``LDPROT``: whether ``res`` is canonical
        as either an L- or a D-residue.

    Raises:
        Exception: For any other polymer type.
    """
    if ptype == polymerType.LPROT or ptype == polymerType.DPROT:
        idx = res_to_index(res, ptype)
        return 0 <= idx < 20
    if ptype == polymerType.LDPROT:
        # Test the name directly against both single-chirality tables.
        # Mirroring a D name first would turn it into an L name, which is
        # never canonical in the DPROT table.
        return is_canonical(res, polymerType.LPROT) or is_canonical(
            res, polymerType.DPROT
        )
    raise Exception(f"do not known how to deal with polymer type {ptype}")
| |
|
| |
|
def canonical_amino_acids(ptype=polymerType.LPROT):
    """List the three-letter names registered for ``ptype`` that are canonical.

    Names are returned in registration order; membership is decided by
    ``is_canonical``.
    """
    return [name for name in _res3[ptype.value] if is_canonical(name, ptype)]
| |
|
| |
|
# Residue registrations for L- and D-proteins. Every row registers one token
# in both tables at the same index: (L name, D name, L code, D code).
# Order matters: token indices follow row order, and a one-letter code keeps
# pointing at the first row that used it.
for _l3, _d3, _l1, _d1 in (
    # The first 20 rows are the tokens treated as canonical by is_canonical.
    ("ALA", "DAL", "A", "a"),
    ("CYS", "DCY", "C", "c"),
    ("ASP", "DAS", "D", "d"),
    ("GLU", "DGL", "E", "e"),
    ("PHE", "DPN", "F", "f"),
    ("GLY", "GLY", "G", "G"),
    ("HIS", "DHI", "H", "h"),
    ("ILE", "DIL", "I", "i"),
    ("LYS", "DLY", "K", "k"),
    ("LEU", "DLE", "L", "l"),
    ("MET", "MED", "M", "m"),
    ("ASN", "DSG", "N", "n"),
    ("PRO", "DPR", "P", "p"),
    ("GLN", "DGN", "Q", "q"),
    ("ARG", "DAR", "R", "r"),
    ("SER", "DSN", "S", "s"),
    ("THR", "DTH", "T", "t"),
    ("VAL", "DVA", "V", "v"),
    ("TRP", "DTR", "W", "w"),
    ("TYR", "DTY", "Y", "y"),
    # Additional residue names that share a one-letter code with a row above.
    ("HSD", "DSD", "H", "h"),
    ("HSE", "DSE", "H", "h"),
    ("HSC", "DSC", "H", "h"),
    ("HSP", "DSP", "H", "h"),
    ("MSE", "DMS", "M", "m"),
    ("CSO", "DCS", "C", "c"),
    ("SEC", "DEC", "C", "c"),
    ("CSX", "DCX", "C", "c"),
    ("HIP", "DHP", "H", "h"),
    ("SEP", "DEP", "S", "s"),
    ("TPO", "DTP", "T", "t"),
    ("PTR", "DPT", "Y", "y"),
    # Unknown, stop and gap tokens; the gap is registered twice so that both
    # "-" and "." are accepted as one-letter gap codes.
    ("UNK", "UNK", "X", "X"),
    ("STP", "STP", "*", "*"),
    ("---", "---", "-", "-"),
    ("---", "---", ".", "."),
):
    _add_residue(
        [polymerType.LPROT, polymerType.DPROT], [_l3, _d3], [_l1, _d1]
    )
| |
|
# Build the mixed-chirality (LDPROT) table from the L and D tables in three
# passes, so that its layout is: ordinary residues (L first, then D), then
# unknown tokens, then punctuation (stop/gap) tokens. A three-letter name
# already present in the LDPROT table is never added a second time.
for grp in [1, 2, 3]:
    for tp in [polymerType.LPROT, polymerType.DPROT]:
        for idx in range(num_tokens(tp)):
            if (
                # Pass 1: neither punctuation nor unknown.
                (
                    grp == 1
                    and not (
                        is_punctuation_index(idx, tp)
                        or is_unknown_index(idx, tp)
                    )
                )
                # Pass 2: unknown tokens.
                or (grp == 2 and is_unknown_index(idx, tp))
                # Pass 3: punctuation tokens.
                or (grp == 3 and is_punctuation_index(idx, tp))
            ) and _res3[tp.value][idx] not in _res3[polymerType.LDPROT.value]:
                _add_residue(
                    polymerType.LDPROT,
                    _res3[tp.value][idx],
                    _res1[tp.value][idx],
                )
| |
|
| |
|
def mirror_amino_acid(res: str):
    """Return the mirrored counterpart of a residue name.

    Args:
        res: Three-letter name or one-letter code, interpreted in the
            mixed-chirality (LDPROT) table.

    Returns:
        The name of the token selected by ``mirror_amino_acid_index``, as a
        one-letter code if ``res`` has length 1 and as a three-letter name
        otherwise.
    """
    ldprot = polymerType.LDPROT
    idx = mirror_amino_acid_index(res_to_index(res, ldprot))
    # ``idx`` is an LDPROT index, so it must be converted back through the
    # LDPROT table; the lookup helpers default to LPROT.
    if len(res) == 1:
        return index_to_single(idx, ldprot)
    return index_to_triple(idx, ldprot)
| |
|
| |
|
def mirror_amino_acid_index(idx: int):
    """Return the LDPROT index of the opposite-chirality partner of ``idx``.

    Args:
        idx: Token index in the mixed-chirality (LDPROT) table.

    Returns:
        The LDPROT index of the residue registered alongside token ``idx``
        in the other single-chirality table (L <-> D). Tokens whose partner
        carries the same name (e.g. "GLY", "UNK", "STP", "---") and indices
        outside the table are returned unchanged.
    """
    ld_names = _res3[polymerType.LDPROT.value]
    if not 0 <= idx < len(ld_names):
        return idx

    # The LPROT and DPROT tables are filled in lockstep, so the partner of a
    # residue sits at the same position in the other table. Pairing by name
    # is used instead of a fixed index offset because names shared by both
    # tables (such as "GLY") are stored only once in the LDPROT table, which
    # leaves its L and D sections with different lengths.
    l_names = _res3[polymerType.LPROT.value]
    d_names = _res3[polymerType.DPROT.value]
    name = ld_names[idx]
    if name in l_names:
        partner = d_names[l_names.index(name)]
    elif name in d_names:
        partner = l_names[d_names.index(name)]
    else:
        return idx
    return _res_to_idx[polymerType.LDPROT.value].get(partner, idx)
| |
|