| |
|
|
| |
|
|
| |
|
|
| |
| |
| |
| |
| |
| |
|
|
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
| import os |
| import json |
| import numpy as np |
| import sys |
|
|
| sys.path.append(".") |
|
|
|
|
| |
def get_struc_seq(
    foldseek,
    path,
    chains: list = None,
    process_id: int = 0,
    plddt_path: str = None,
    plddt_threshold: float = 70.0,
) -> dict:
    """Run foldseek on a structure file and collect per-chain sequences.

    The function calls ``foldseek structureto3didescriptor`` on ``path``,
    parses the tab-separated output and, optionally, replaces the structural
    tokens of low-confidence positions with ``"#"``.

    Args:
        foldseek: Binary executable file of foldseek.
        path: Path to pdb file.
        chains: Chains to be extracted from pdb file. If None, all chains
            will be extracted.
        process_id: Process ID used to name the temporary output file
            (``get_struc_seq_{process_id}.tsv`` in the current working
            directory), so parallel workers do not overwrite each other.
        plddt_path: Path to a JSON file holding a ``"confidenceScore"``
            list. If None, no masking is applied.
        plddt_threshold: Positions whose pLDDT is lower than this value get
            their structural token replaced by ``"#"``.

    Returns:
        seq_dict: A dict of structural seqs. The keys are chain IDs. The
            values are tuples of (seq, struc_seq, combined_seq), where
            combined_seq interleaves each residue with its lower-cased
            structural token. Only the first record of each chain is kept.

    Raises:
        AssertionError: If ``foldseek``, ``path`` or ``plddt_path`` does not
            exist.
        FileNotFoundError: If foldseek did not produce its output file.
        IndexError: If a low-confidence index from the pLDDT file lies
            beyond the end of a selected chain's structural sequence.
    """
    # Imported locally so this function does not depend on edits to the
    # module-level import block.
    import subprocess

    assert os.path.exists(foldseek), f"Foldseek not found: {foldseek}"
    assert os.path.exists(path), f"Pdb file not found: {path}"
    assert plddt_path is None or os.path.exists(
        plddt_path
    ), f"Plddt file not found: {plddt_path}"

    # The pLDDT file does not change between chains: read it once instead of
    # re-opening and re-parsing it for every line of foldseek output.
    low_conf_indices = None
    if plddt_path is not None:
        with open(plddt_path, "r") as plddt_file:
            plddts = np.array(json.load(plddt_file)["confidenceScore"])
        low_conf_indices = np.where(plddts < plddt_threshold)[0]

    tmp_save_path = f"get_struc_seq_{process_id}.tsv"
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact and prevents command injection.
    cmd = [
        foldseek,
        "structureto3didescriptor",
        "-v",
        "0",
        "--threads",
        "1",
        "--chain-name-mode",
        "1",
        path,
        tmp_save_path,
    ]

    seq_dict = {}
    name = os.path.basename(path)
    try:
        completed = subprocess.run(cmd, check=False)
        if not os.path.exists(tmp_save_path):
            raise FileNotFoundError(
                f"Foldseek did not produce {tmp_save_path} "
                f"(exit code {completed.returncode})"
            )

        with open(tmp_save_path, "r") as tsv:
            for line in tsv:
                desc, seq, struc_seq = line.split("\t")[:3]

                # The description starts with "<file name>_<chain>".
                name_chain = desc.split(" ")[0]
                chain = name_chain.replace(name, "").split("_")[-1]

                # Filter before masking so chains the caller did not ask for
                # are never touched (and cannot raise on a length mismatch).
                if chains is not None and chain not in chains:
                    continue
                if chain in seq_dict:
                    continue

                if low_conf_indices is not None:
                    # Mask structural tokens of low-confidence positions.
                    np_seq = np.array(list(struc_seq))
                    np_seq[low_conf_indices] = "#"
                    struc_seq = "".join(np_seq)

                combined_seq = "".join(
                    a + b.lower() for a, b in zip(seq, struc_seq)
                )
                seq_dict[chain] = (seq, struc_seq, combined_seq)
    finally:
        # Always clean up, even when parsing fails; foldseek may not have
        # written both files.
        for leftover in (tmp_save_path, tmp_save_path + ".dbtype"):
            if os.path.exists(leftover):
                os.remove(leftover)

    return seq_dict
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: run foldseek on one local structure file, mask
    # positions whose pLDDT is below 70, and print the lower-cased structural
    # sequence of chain "A".
    foldseek = "/sujin/bin/foldseek"

    pdb_file = "/sujin/Datasets/FLIP/meltome/af2_structures/A0A061ACX4.pdb"
    plddt_file = "/sujin/Datasets/FLIP/meltome/af2_plddts/A0A061ACX4.json"

    result = get_struc_seq(
        foldseek,
        pdb_file,
        plddt_path=plddt_file,
        plddt_threshold=70.0,
    )
    # Each value is (seq, struc_seq, combined_seq); index 1 is struc_seq.
    chain_a_struc_seq = result["A"][1]
    print(chain_a_struc_seq.lower())
|
|