| import contextlib |
| from dataclasses import dataclass, replace |
| from typing import Optional |
|
|
| import gemmi |
| import numpy as np |
| from rdkit import rdBase |
| from rdkit.Chem import AllChem |
| from rdkit.Chem.rdchem import Conformer, Mol |
| from sklearn.neighbors import KDTree |
|
|
| from boltz.data import const |
| from boltz.data.types import ( |
| Atom, |
| Bond, |
| Chain, |
| Connection, |
| Interface, |
| Residue, |
| Structure, |
| StructureInfo, |
| ) |
|
|
| |
| |
| |
|
|
|
|
| @dataclass(frozen=True, slots=True) |
| class ParsedAtom: |
| """A parsed atom object.""" |
|
|
| name: str |
| element: int |
| charge: int |
| coords: tuple[float, float, float] |
| conformer: tuple[float, float, float] |
| is_present: bool |
| chirality: int |
|
|
|
|
| @dataclass(frozen=True, slots=True) |
| class ParsedBond: |
| """A parsed bond object.""" |
|
|
| atom_1: int |
| atom_2: int |
| type: int |
|
|
|
|
| @dataclass(frozen=True, slots=True) |
| class ParsedResidue: |
| """A parsed residue object.""" |
|
|
| name: str |
| type: int |
| idx: int |
| atoms: list[ParsedAtom] |
| bonds: list[ParsedBond] |
| orig_idx: Optional[int] |
| atom_center: int |
| atom_disto: int |
| is_standard: bool |
| is_present: bool |
|
|
|
|
| @dataclass(frozen=True, slots=True) |
| class ParsedChain: |
| """A parsed chain object.""" |
|
|
| name: str |
| entity: str |
| type: str |
| residues: list[ParsedResidue] |
| sequence: list[str] |
|
|
|
|
| @dataclass(frozen=True, slots=True) |
| class ParsedConnection: |
| """A parsed connection object.""" |
|
|
| chain_1: str |
| chain_2: str |
| residue_index_1: int |
| residue_index_2: int |
| atom_index_1: str |
| atom_index_2: str |
|
|
|
|
| @dataclass(frozen=True, slots=True) |
| class ParsedStructure: |
| """A parsed structure object.""" |
|
|
| data: Structure |
| info: StructureInfo |
| covalents: list[int] |
|
|
|
|
| |
| |
| |
|
|
|
|
def get_dates(block: gemmi.cif.Block) -> tuple[str, str, str]:
    """Get the deposited, released, and last revision dates.

    Every lookup is best-effort: a date that cannot be read is returned
    as an empty string. The deposition date and the revision history are
    read independently, so a missing deposition date does not discard the
    release and revision dates.

    Parameters
    ----------
    block : gemmi.cif.Block
        The block to process.

    Returns
    -------
    str
        The deposited date.
    str
        The released date (the first entry of the revision history).
    str
        The last revision date (the last entry of the revision history).

    """
    deposited = "_pdbx_database_status.recvd_initial_deposition_date"
    revision = "_pdbx_audit_revision_history.revision_date"
    deposit_date = release_date = revision_date = ""

    with contextlib.suppress(Exception):
        deposit_date = block.find([deposited])[0][0]

    # Separate suppress block: a failure above must not skip these lookups.
    with contextlib.suppress(Exception):
        revisions = block.find([revision])
        release_date = revisions[0][0]
        revision_date = revisions[-1][0]

    return deposit_date, release_date, revision_date
|
|
|
|
def get_resolution(block: gemmi.cif.Block) -> float:
    """Read the resolution from an mmCIF block.

    The candidate tags are tried in order and the first one whose value
    can be converted to a float wins.

    Parameters
    ----------
    block : gemmi.cif.Block
        The block to process.

    Returns
    -------
    float
        The resolution, or 0.0 when no candidate tag yields a number.

    """
    candidate_keys = (
        "_refine.ls_d_res_high",
        "_em_3d_reconstruction.resolution",
        "_reflns.d_resolution_high",
    )
    for key in candidate_keys:
        try:
            return float(block.find([key])[0].str(0))
        except Exception:  # noqa: BLE001, PERF203
            # Missing or non-numeric value: fall through to the next tag.
            continue
    return 0.0
|
|
|
|
def get_method(block: gemmi.cif.Block) -> str:
    """Read the experimental method(s) from an mmCIF block.

    Parameters
    ----------
    block : gemmi.cif.Block
        The block to process.

    Returns
    -------
    str
        The lower-cased methods joined by commas, or an empty string
        when they cannot be read.

    """
    try:
        rows = block.find(["_exptl.method"])
        return ",".join([row.str(0).lower() for row in rows])
    except Exception:  # noqa: BLE001
        # Best effort: any failure yields the empty default.
        return ""
|
|
|
|
def convert_atom_name(name: str) -> tuple[int, int, int, int]:
    """Encode an atom name as a tuple of small integers.

    Surrounding whitespace is stripped, each character is mapped to
    ``ord(char) - 32`` and the result is right-padded with zeros up to
    four entries. Names longer than four characters are not truncated,
    so they produce a longer tuple.

    Parameters
    ----------
    name : str
        The atom name.

    Returns
    -------
    tuple[int, int, int, int]
        The encoded atom name.

    """
    codes = tuple(ord(char) - 32 for char in name.strip())
    # A negative repeat count yields an empty tuple, so long names pass
    # through unpadded.
    padding = (0,) * (4 - len(codes))
    return codes + padding
|
|
|
|
def get_unk_token(dtype: gemmi.PolymerType) -> str:
    """Look up the unknown-residue token for a polymer type.

    Parameters
    ----------
    dtype : gemmi.PolymerType
        The polymer type.

    Returns
    -------
    str
        The unknown token.

    Raises
    ------
    ValueError
        If the polymer type is not PeptideL, Dna or Rna.

    """
    token_keys = (
        (gemmi.PolymerType.PeptideL, "PROTEIN"),
        (gemmi.PolymerType.Dna, "DNA"),
        (gemmi.PolymerType.Rna, "RNA"),
    )
    for polymer_type, key in token_keys:
        if dtype == polymer_type:
            return const.unk_token[key]

    msg = f"Unknown polymer type: {dtype}"
    raise ValueError(msg)
|
|
|
|
def get_conformer(mol: Mol) -> Conformer:
    """Pick the preferred conformer of a molecule.

    A conformer whose ``name`` property is ``"Computed"`` is preferred;
    if there is none, the first one named ``"Ideal"`` is returned.
    Conformers without a ``name`` property are ignored.

    Inspired by `pdbeccdutils.core.component.Component`.

    Parameters
    ----------
    mol: Mol
        The molecule to process.

    Returns
    -------
    Conformer
        The desired conformer.

    Raises
    ------
    ValueError
        If there are no conformers of the given type.

    """
    for wanted in ("Computed", "Ideal"):
        for conformer in mol.GetConformers():
            try:
                label = conformer.GetProp("name")
            except KeyError:
                # Unnamed conformer: not a candidate.
                continue
            if label == wanted:
                return conformer

    msg = "Conformer does not exist."
    raise ValueError(msg)
|
|
|
|
def compute_covalent_ligands(
    connections: list[gemmi.Connection],
    subchain_map: dict[tuple[str, int], str],
    entities: dict[str, gemmi.Entity],
) -> set[str]:
    """Collect the ligand subchains that take part in a covalent connection.

    Parameters
    ----------
    connections: List[gemmi.Connection]
        The connections to process. Only those whose type is named
        ``"Covale"`` are considered.
    subchain_map: dict
        The mapping from (chain name, residue id) to subchain name. The
        residue id is looked up as the string made of the sequence number
        followed by the stripped insertion code.
    entities: dict[str, gemmi.Entity]
        The entities in the structure, keyed by subchain name.

    Returns
    -------
    set
        The subchains of covalently connected partners whose entity type
        is ``NonPolymer`` or ``Branched``.

    """
    ligand_types = {"NonPolymer", "Branched"}
    covalent_chain_ids = set()

    for connection in connections:
        if connection.type.name != "Covale":
            continue

        # Resolve both partners to their subchain before touching entities.
        subchains = []
        for partner in (connection.partner1, connection.partner2):
            seqid = partner.res_id.seqid
            res_id = str(seqid.num) + str(seqid.icode).strip()
            subchains.append(subchain_map[(partner.chain_name, res_id)])

        for subchain in subchains:
            if entities[subchain].entity_type.name in ligand_types:
                covalent_chain_ids.add(subchain)

    return covalent_chain_ids
|
|
|
|
def compute_interfaces(atom_data: np.ndarray, chain_data: np.ndarray) -> np.ndarray:
    """Find the pairs of chains that have atoms close to each other.

    Two chains form an interface when at least one present atom of one
    chain lies within ``const.atom_interface_cutoff`` of a present atom
    of the other.

    Parameters
    ----------
    atom_data : np.ndarray
        The atom data; the ``coords`` and ``is_present`` fields are read.
    chain_data : np.ndarray
        The chain data; the ``atom_num`` field of each chain is read.

    Returns
    -------
    np.ndarray
        The interfaces, as an array of dtype ``Interface`` whose entries
        are (lower chain index, higher chain index) pairs.

    """
    # One chain index per atom, in atom order.
    per_atom_chain = np.array(
        [
            chain_idx
            for chain_idx, chain in enumerate(chain_data)
            for _ in range(chain["atom_num"])
        ]
    )

    # Only atoms with coordinates take part in the neighbour search.
    present = atom_data["is_present"]
    coords = atom_data["coords"][present]
    per_atom_chain = per_atom_chain[present]

    tree = KDTree(coords, metric="euclidean")
    neighbours = tree.query_radius(coords, const.atom_interface_cutoff)

    # Directed (own chain, other chain) pairs for every atom.
    directed = set()
    for own_chain, hits in zip(per_atom_chain, neighbours):
        nearby = np.unique(per_atom_chain[hits])
        directed.update((own_chain, other) for other in nearby[nearby != own_chain])

    # Order each pair, then de-duplicate as plain Python ints.
    ordered = [(min(a, b), max(a, b)) for a, b in directed]
    unique_pairs = list({(int(a), int(b)) for a, b in ordered})
    return np.array(unique_pairs, dtype=Interface)
|
|
|
|
| |
| |
| |
|
|
|
|
def parse_ccd_residue(
    name: str,
    components: dict[str, Mol],
    res_idx: int,
    gemmi_mol: Optional[gemmi.Residue] = None,
    is_covalent: bool = False,
) -> Optional[ParsedResidue]:
    """Parse a CCD component (ligand or non-standard residue).

    The reference molecule is looked up in ``components`` and stripped
    of hydrogens. Atoms, reference conformer coordinates and bonds come
    from that reference molecule; observed coordinates are matched by
    atom name against ``gemmi_mol`` when it is given.

    Parameters
    ----------
    name: str
        The name of the molecule to parse.
    components : dict
        The preprocessed PDB components dictionary.
    res_idx : int
        The residue index.
    gemmi_mol : Optional[gemmi.Residue]
        The PDB molecule, as a gemmi Residue object, if any.
    is_covalent : bool
        When True, atoms flagged as leaving atoms that are absent from
        ``gemmi_mol`` are dropped, together with their bonds.

    Returns
    -------
    ParsedResidue, optional
        The output ParsedResidue.

    """
    unk_chirality = const.chirality_type_ids[const.unk_chirality_type]

    # The residue is present only when observed coordinates were supplied.
    is_present = gemmi_mol is not None

    # Original identifier: sequence number plus stripped insertion code.
    orig_idx = None
    if is_present:
        seqid = gemmi_mol.seqid
        orig_idx = str(seqid.num) + str(seqid.icode).strip()

    ref_mol = AllChem.RemoveHs(components[name], sanitize=False)

    # Single-atom components need no conformer and have no bonds.
    if ref_mol.GetNumAtoms() == 1:
        if is_present:
            first = gemmi_mol[0]
            pos = (first.pos.x, first.pos.y, first.pos.z)
        else:
            pos = (0, 0, 0)

        ref_atom = ref_mol.GetAtoms()[0]
        single_atom = ParsedAtom(
            name=ref_atom.GetProp("name"),
            element=ref_atom.GetAtomicNum(),
            charge=ref_atom.GetFormalCharge(),
            coords=pos,
            conformer=(0, 0, 0),
            is_present=is_present,
            chirality=const.chirality_type_ids.get(
                str(ref_atom.GetChiralTag()), unk_chirality
            ),
        )
        return ParsedResidue(
            name=name,
            type=const.unk_token_ids["PROTEIN"],
            idx=res_idx,
            atoms=[single_atom],
            bonds=[],
            orig_idx=orig_idx,
            atom_center=0,
            atom_disto=0,
            is_standard=False,
            is_present=is_present,
        )

    # Observed coordinates, keyed by atom name.
    pdb_pos = {}
    if is_present:
        pdb_pos = {a.name: (a.pos.x, a.pos.y, a.pos.z) for a in gemmi_mol}

    conformer = get_conformer(ref_mol)

    atoms = []
    # Reference atom position -> index in `atoms` (skipped atoms are absent).
    idx_map = {}
    for ref_pos, ref_atom in enumerate(ref_mol.GetAtoms()):
        atom_name = ref_atom.GetProp("name")
        charge = ref_atom.GetFormalCharge()
        element = ref_atom.GetAtomicNum()
        ref_point = conformer.GetAtomPosition(ref_atom.GetIdx())
        ref_coords = (ref_point.x, ref_point.y, ref_point.z)
        chirality_type = const.chirality_type_ids.get(
            str(ref_atom.GetChiralTag()), unk_chirality
        )

        # Unobserved leaving atoms of covalently bound components are dropped.
        is_leaving = int(ref_atom.GetProp("leaving_atom")) == 1
        if is_leaving and is_covalent and atom_name not in pdb_pos:
            continue

        observed = pdb_pos.get(atom_name)
        idx_map[ref_pos] = len(atoms)
        atoms.append(
            ParsedAtom(
                name=atom_name,
                element=element,
                charge=charge,
                coords=(0, 0, 0) if observed is None else observed,
                conformer=ref_coords,
                is_present=observed is not None,
                chirality=chirality_type,
            )
        )

    bonds = []
    unk_bond = const.bond_type_ids[const.unk_bond_type]
    for ref_bond in ref_mol.GetBonds():
        begin = idx_map.get(ref_bond.GetBeginAtomIdx())
        end = idx_map.get(ref_bond.GetEndAtomIdx())

        # A bond to a dropped atom is dropped as well.
        if begin is None or end is None:
            continue

        bonds.append(
            ParsedBond(
                atom_1=min(begin, end),
                atom_2=max(begin, end),
                type=const.bond_type_ids.get(ref_bond.GetBondType().name, unk_bond),
            )
        )

    return ParsedResidue(
        name=name,
        type=const.unk_token_ids["PROTEIN"],
        idx=res_idx,
        atoms=atoms,
        bonds=bonds,
        orig_idx=orig_idx,
        atom_center=0,
        atom_disto=0,
        is_standard=False,
        is_present=is_present,
    )
|
|
|
|
def parse_polymer(
    polymer: gemmi.ResidueSpan,
    polymer_type: gemmi.PolymerType,
    sequence: list[str],
    chain_id: str,
    entity: str,
    components: dict[str, Mol],
) -> Optional[ParsedChain]:
    """Process a gemmi Polymer into a chain object.

    Performs alignment of the full sequence to the polymer
    residues. Loads coordinates and masks for the atoms in
    the polymer, following the ordering in const.ref_atoms.
    Residues that are not standard tokens are delegated to
    `parse_ccd_residue`; MSE is parsed as MET with SE mapped to SD.

    Parameters
    ----------
    polymer : gemmi.ResidueSpan
        The polymer to process.
    polymer_type : gemmi.PolymerType
        The polymer type.
    sequence : list[str]
        The full sequence of the polymer, one entry per residue.
    chain_id : str
        The chain identifier.
    entity : str
        The entity name.
    components : dict[str, Mol]
        The preprocessed PDB components dictionary.

    Returns
    -------
    ParsedChain, optional
        The output chain.

    Raises
    ------
    ValueError
        If the polymer type is not PeptideL, Dna or Rna, or if the
        alignment fails.

    """
    # Resolve the chain type first: an unsupported polymer type fails here
    # with a clear error instead of leaving `chain_type` unbound at the end.
    if polymer_type == gemmi.PolymerType.PeptideL:
        chain_type = const.chain_type_ids["PROTEIN"]
    elif polymer_type == gemmi.PolymerType.Dna:
        chain_type = const.chain_type_ids["DNA"]
    elif polymer_type == gemmi.PolymerType.Rna:
        chain_type = const.chain_type_ids["RNA"]
    else:
        msg = f"Unknown polymer type: {polymer_type}"
        raise ValueError(msg)

    unk_chirality = const.chirality_type_ids[const.unk_chirality_type]

    # Keep a single monomer name per sequence position.
    sequence = [gemmi.Entity.first_mon(item) for item in sequence]

    # Align the full sequence to the residues that were actually observed.
    result = gemmi.align_sequence_to_polymer(
        sequence,
        polymer,
        polymer_type,
        gemmi.AlignmentScoring(),
    )

    # `i` walks the observed residues, `j` walks the full sequence.
    i = 0
    ref_res = set(const.tokens)
    parsed = []
    for j, match in enumerate(result.match_string):
        res_name = sequence[j]

        # Stay empty when the residue is missing from the structure.
        res = None
        name_to_atom = {}

        if match == "|":
            res = polymer[i]
            name_to_atom = {a.name.upper(): a for a in res}

            # Double check the match.
            if res.name != res_name:
                msg = "Alignment mismatch!"
                raise ValueError(msg)

            i += 1

        if res_name == "MSE":
            # Treat MSE as MET, reusing the SE atom as SD.
            res_name = "MET"
            if "SE" in name_to_atom:
                name_to_atom["SD"] = name_to_atom["SE"]

        elif res_name not in ref_res:
            # Non-standard residue: parse it as a CCD component.
            residue = parse_ccd_residue(
                name=res_name,
                components=components,
                res_idx=j,
                gemmi_mol=res,
                is_covalent=True,
            )
            parsed.append(residue)
            continue

        # Reference molecule and conformer for the standard residue.
        ref_mol = components[res_name]
        ref_mol = AllChem.RemoveHs(ref_mol, sanitize=False)
        ref_conformer = get_conformer(ref_mol)

        # Reference atoms, in the canonical order for this residue.
        ref_name_to_atom = {a.GetProp("name"): a for a in ref_mol.GetAtoms()}
        ref_atoms = [ref_name_to_atom[a] for a in const.ref_atoms[res_name]]

        atoms: list[ParsedAtom] = []

        for ref_atom in ref_atoms:
            atom_name = ref_atom.GetProp("name")
            idx = ref_atom.GetIdx()

            ref_coords = ref_conformer.GetAtomPosition(idx)
            ref_coords = (ref_coords.x, ref_coords.y, ref_coords.z)

            # Use the observed coordinates when the atom is in the structure.
            if atom_name in name_to_atom:
                atom = name_to_atom[atom_name]
                atom_is_present = True
                coords = (atom.pos.x, atom.pos.y, atom.pos.z)
            else:
                atom_is_present = False
                coords = (0, 0, 0)

            atoms.append(
                ParsedAtom(
                    name=atom_name,
                    element=ref_atom.GetAtomicNum(),
                    charge=ref_atom.GetFormalCharge(),
                    coords=coords,
                    conformer=ref_coords,
                    is_present=atom_is_present,
                    chirality=const.chirality_type_ids.get(
                        str(ref_atom.GetChiralTag()), unk_chirality
                    ),
                )
            )

        # For arginine, swap the NH1/NH2 coordinates when NH1 is farther
        # from CD than NH2 is.
        if (res is not None) and (res_name == "ARG"):
            arg_names: list[str] = const.ref_atoms["ARG"]
            nh1_idx = arg_names.index("NH1")
            nh2_idx = arg_names.index("NH2")
            cd = atoms[arg_names.index("CD")]
            nh1 = atoms[nh1_idx]
            nh2 = atoms[nh2_idx]

            cd_coords = np.array(cd.coords)
            nh1_coords = np.array(nh1.coords)
            nh2_coords = np.array(nh2.coords)

            if all(atom.is_present for atom in (cd, nh1, nh2)) and (
                np.linalg.norm(nh1_coords - cd_coords)
                > np.linalg.norm(nh2_coords - cd_coords)
            ):
                atoms[nh1_idx] = replace(nh1, coords=nh2.coords)
                atoms[nh2_idx] = replace(nh2, coords=nh1.coords)

        # Original identifier: sequence number plus stripped insertion code.
        if res is not None:
            orig_idx = res.seqid
            orig_idx = str(orig_idx.num) + str(orig_idx.icode).strip()
        else:
            orig_idx = None

        atom_center = const.res_to_center_atom_id[res_name]
        atom_disto = const.res_to_disto_atom_id[res_name]
        parsed.append(
            ParsedResidue(
                name=res_name,
                type=const.token_ids[res_name],
                atoms=atoms,
                bonds=[],
                idx=j,
                atom_center=atom_center,
                atom_disto=atom_disto,
                is_standard=True,
                is_present=res is not None,
                orig_idx=orig_idx,
            )
        )

    return ParsedChain(
        name=chain_id,
        entity=entity,
        residues=parsed,
        type=chain_type,
        sequence=gemmi.one_letter_code(sequence),
    )
|
|
|
|
def parse_connection(
    connection: gemmi.Connection,
    chains: list[ParsedChain],
    subchain_map: dict[tuple[str, int], str],
) -> ParsedConnection:
    """Parse a (covalent) connection from a gemmi Connection.

    Both partners are resolved to their subchain, then to the position
    of the residue within the parsed chain and of the atom within the
    parsed residue.

    Parameters
    ----------
    connection : gemmi.Connection
        The connection to parse.
    chains : List[ParsedChain]
        The parsed chains.
    subchain_map : dict
        The mapping from (chain name, residue id) to subchain name. The
        residue id is looked up as the string made of the sequence number
        followed by the stripped insertion code.

    Returns
    -------
    ParsedConnection
        The parsed connection.

    Raises
    ------
    KeyError
        If a partner residue is missing from ``subchain_map``.
    StopIteration
        If a partner's chain, residue or atom is not among the parsed
        chains.

    """
    partners = (connection.partner1, connection.partner2)

    # Residue ids, in the same string form used for `orig_idx`.
    res_ids = []
    for partner in partners:
        seqid = partner.res_id.seqid
        res_ids.append(str(seqid.num) + str(seqid.icode).strip())

    subchains = []
    for partner, res_id in zip(partners, res_ids):
        subchains.append(subchain_map[(partner.chain_name, res_id)])

    # Parsed chain of each partner.
    partner_chains = []
    for subchain in subchains:
        partner_chains.append(next(c for c in chains if c.name == subchain))

    # Position and record of each partner residue within its chain.
    residue_hits = []
    for chain, res_id in zip(partner_chains, res_ids):
        residue_hits.append(
            next(
                (pos, res)
                for pos, res in enumerate(chain.residues)
                if res.orig_idx == res_id
            )
        )

    # Position of each partner atom within its residue.
    atom_hits = []
    for partner, (_, res) in zip(partners, residue_hits):
        atom_hits.append(
            next(
                pos
                for pos, atom in enumerate(res.atoms)
                if atom.name == partner.atom_name
            )
        )

    return ParsedConnection(
        chain_1=subchains[0],
        chain_2=subchains[1],
        residue_index_1=residue_hits[0][0],
        residue_index_2=residue_hits[1][0],
        atom_index_1=atom_hits[0],
        atom_index_2=atom_hits[1],
    )
|
|
|
|
def parse_mmcif(
    path: str,
    components: dict[str, Mol],
    use_assembly: bool = True,
) -> ParsedStructure:
    """Parse a structure in MMCIF format.

    The file is read with gemmi, cleaned (waters, hydrogens, alternative
    conformations and empty chains removed), optionally expanded to its
    first assembly, and then flattened into atom, bond, residue, chain,
    connection and interface tables.

    Parameters
    ----------
    path : PathLike
        Path to the MMCIF file.
    components: dict[str, Mol]
        The preprocessed PDB components dictionary.
    use_assembly: bool
        Whether to use the first assembly.

    Returns
    -------
    ParsedStructure
        The parsed structure.

    Raises
    ------
    ValueError
        If no chain could be parsed.

    """
    # Bound to a local that is never read. NOTE(review): presumably this
    # silences RDKit logging while the object is alive -- confirm.
    blocker = rdBase.BlockLogs()  # noqa: F841

    # Only the first block of the file is used.
    block = gemmi.cif.read(str(path))[0]

    # Metadata.
    deposit_date, release_date, revision_date = get_dates(block)
    resolution = get_resolution(block)
    method = get_method(block)

    structure = gemmi.make_structure_from_block(block)

    # Clean up the structure.
    structure.merge_chain_parts()
    structure.remove_waters()
    structure.remove_hydrogens()
    structure.remove_alternative_conformations()
    structure.remove_empty_chains()

    # Expand to the first assembly when requested and available.
    if use_assembly and structure.assemblies:
        structure.transform_to_assembly(
            structure.assemblies[0].name,
            how=gemmi.HowToNameCopiedChain.AddNumber,
        )

    # Subchain name -> entity, and subchain name -> entity position.
    entities: dict[str, gemmi.Entity] = {}
    entity_ids: dict[str, int] = {}
    for entity_pos, gemmi_entity in enumerate(structure.entities):
        if gemmi_entity.entity_type.name != "Water":
            for subchain_name in gemmi_entity.subchains:
                entities[subchain_name] = gemmi_entity
                entity_ids[subchain_name] = entity_pos

    # (chain name, residue id) -> subchain name, for the first model.
    subchain_map = {
        (
            chain.name,
            str(residue.seqid.num) + str(residue.seqid.icode).strip(),
        ): residue.subchain
        for chain in structure[0]
        for residue in chain
    }

    # Ligand subchains that are covalently connected.
    covalent_chain_ids = compute_covalent_ligands(
        connections=structure.connections,
        subchain_map=subchain_map,
        entities=entities,
    )

    # Parse the chains.
    parsed_chains: list[ParsedChain] = []
    for raw_chain in structure[0].subchains():
        subchain_id = raw_chain.subchain_id()
        entity: gemmi.Entity = entities[subchain_id]
        entity_type = entity.entity_type.name

        if entity_type == "Polymer":
            # Only L-peptides, DNA and RNA are supported.
            if entity.polymer_type.name not in {"PeptideL", "Dna", "Rna"}:
                continue

            parsed_polymer = parse_polymer(
                polymer=raw_chain,
                polymer_type=entity.polymer_type,
                sequence=entity.full_sequence,
                chain_id=subchain_id,
                entity=entity.name,
                components=components,
            )
            if parsed_polymer is not None:
                parsed_chains.append(parsed_polymer)

        elif entity_type in {"NonPolymer", "Branched"}:
            # Skip the whole chain if any of its components is unknown.
            if any(components.get(lig.name) is None for lig in raw_chain):
                continue

            # Branched entities are always treated as covalently bound.
            is_covalent = (
                entity_type == "Branched" or subchain_id in covalent_chain_ids
            )
            ligand_residues = [
                parse_ccd_residue(
                    name=ligand.name,
                    components=components,
                    res_idx=lig_idx,
                    gemmi_mol=ligand,
                    is_covalent=is_covalent,
                )
                for lig_idx, ligand in enumerate(raw_chain)
            ]

            if ligand_residues:
                parsed_chains.append(
                    ParsedChain(
                        name=subchain_id,
                        entity=entity.name,
                        residues=ligand_residues,
                        type=const.chain_type_ids["NONPOLYMER"],
                        sequence=None,
                    )
                )

    if not parsed_chains:
        msg = "No chains parsed!"
        raise ValueError(msg)

    # Parse the covalent connections.
    parsed_connections: list[ParsedConnection] = [
        parse_connection(
            connection=connection,
            chains=parsed_chains,
            subchain_map=subchain_map,
        )
        for connection in structure.connections
        if connection.type.name == "Covale"
    ]

    # Flatten everything into table rows.
    atom_rows = []
    bond_rows = []
    residue_rows = []
    chain_rows = []
    connection_rows = []

    chain_to_idx = {}
    res_to_idx = {}
    copies_seen = {}
    atom_cursor = 0
    res_cursor = 0

    for asym_id, chain in enumerate(parsed_chains):
        # Copies of the same entity are numbered in order of appearance.
        entity_id = entity_ids[chain.name]
        sym_id = copies_seen.get(entity_id, 0)
        copies_seen[entity_id] = sym_id + 1
        chain_to_idx[chain.name] = asym_id

        chain_rows.append(
            (
                chain.name,
                chain.type,
                entity_id,
                sym_id,
                asym_id,
                atom_cursor,
                sum(len(res.atoms) for res in chain.residues),
                res_cursor,
                len(chain.residues),
            )
        )

        for local_idx, res in enumerate(chain.residues):
            # `atom_cursor` is the global index of this residue's first atom.
            res_to_idx[(chain.name, local_idx)] = (res_cursor, atom_cursor)
            residue_rows.append(
                (
                    res.name,
                    res.type,
                    res.idx,
                    atom_cursor,
                    len(res.atoms),
                    atom_cursor + res.atom_center,
                    atom_cursor + res.atom_disto,
                    res.is_standard,
                    res.is_present,
                )
            )

            # Residue-local bond indices become global atom indices.
            bond_rows.extend(
                [
                    (atom_cursor + bond.atom_1, atom_cursor + bond.atom_2, bond.type)
                    for bond in res.bonds
                ]
            )

            atom_rows.extend(
                [
                    (
                        convert_atom_name(atom.name),
                        atom.element,
                        atom.charge,
                        atom.coords,
                        atom.conformer,
                        atom.is_present,
                        atom.chirality,
                    )
                    for atom in res.atoms
                ]
            )
            atom_cursor += len(res.atoms)
            res_cursor += 1

    # Convert connections to global chain, residue and atom indices.
    for conn in parsed_connections:
        res_1_idx, atom_1_offset = res_to_idx[(conn.chain_1, conn.residue_index_1)]
        res_2_idx, atom_2_offset = res_to_idx[(conn.chain_2, conn.residue_index_2)]
        connection_rows.append(
            (
                chain_to_idx[conn.chain_1],
                chain_to_idx[conn.chain_2],
                res_1_idx,
                res_2_idx,
                atom_1_offset + conn.atom_index_1,
                atom_2_offset + conn.atom_index_2,
            )
        )

    # Build the typed tables.
    atom_table = np.array(atom_rows, dtype=Atom)
    bond_table = np.array(bond_rows, dtype=Bond)
    residue_table = np.array(residue_rows, dtype=Residue)
    chain_table = np.array(chain_rows, dtype=Chain)
    connection_table = np.array(connection_rows, dtype=Connection)
    mask = np.ones(len(chain_rows), dtype=bool)

    interfaces = compute_interfaces(atom_table, chain_table)

    info = StructureInfo(
        deposited=deposit_date,
        revised=revision_date,
        released=release_date,
        resolution=resolution,
        method=method,
        num_chains=len(chain_table),
        num_interfaces=len(interfaces),
    )

    data = Structure(
        atoms=atom_table,
        bonds=bond_table,
        residues=residue_table,
        chains=chain_table,
        connections=connection_table,
        interfaces=interfaces,
        mask=mask,
    )

    return ParsedStructure(data=data, info=info, covalents=[])
|
|