|
|
import os
|
|
|
import sys
|
|
|
import numpy as np
|
|
|
import pandas as pd
|
|
|
from Bio.PDB import PDBParser, PPBuilder, Superimposer
|
|
|
from Bio.PDB.Polypeptide import is_aa
|
|
|
from Bio import BiopythonWarning
|
|
|
import warnings
|
|
|
from tmtools import tm_align
|
|
|
|
|
|
|
|
|
# Ignore every warning of category BiopythonWarning for the rest of the process.
warnings.simplefilter('ignore', BiopythonWarning)
|
|
|
|
|
|
|
|
|
def extract_sequence(pdb_path):
    """
    Extract an amino-acid sequence from a PDB file.

    The file is parsed with ``PDBParser`` and split into polypeptides with
    ``PPBuilder``. Only the first polypeptide returned by the builder is
    used; any later polypeptides are ignored.

    Args:
        pdb_path (str): Path to the PDB file.

    Returns:
        str: One-letter amino-acid sequence of the first polypeptide, or
        None if the file cannot be parsed or the sequence is empty.
    """
    parser = PDBParser(QUIET=True)
    try:
        structure = parser.get_structure('structure', pdb_path)
    except Exception as err:
        # Parsing problems are reported and turned into a None result.
        print(f"无法解析 PDB 文件 {pdb_path}: {err}")
        return None

    # Take just the first polypeptide the builder produces (if any).
    first_peptide = next(iter(PPBuilder().build_peptides(structure)), None)
    if first_peptide is None:
        return None

    sequence = str(first_peptide.get_sequence())
    return sequence or None
|
|
|
|
|
|
|
|
|
def get_sequences(directory):
    """
    Collect the sequence of every ``.pdb`` file directly inside a directory.

    Args:
        directory (str): Path of the folder that holds the PDB files.

    Returns:
        dict: Maps each PDB file name to its amino-acid sequence. Files for
        which ``extract_sequence`` yields nothing are reported with a
        warning and left out.
    """
    sequences = {}
    pdb_names = (name for name in os.listdir(directory) if name.endswith('.pdb'))
    for name in pdb_names:
        sequence = extract_sequence(os.path.join(directory, name))
        if not sequence:
            print(f"警告: 无法提取序列 {name}")
            continue
        sequences[name] = sequence
    return sequences
|
|
|
|
|
|
|
|
|
def match_pdbs(wetlab_seqs, pdb_files_seqs):
    """
    Pair PDB files from two collections whose sequences are identical.

    Args:
        wetlab_seqs (dict): File name -> sequence for the wetlab_pdb folder.
        pdb_files_seqs (dict): File name -> sequence for the pdb_files folder.

    Returns:
        list of tuples: ``(wetlab_pdb, pdb_files_pdb)`` for every exact
        sequence match. A wetlab file matching several pdb_files entries
        produces one tuple per entry; a wetlab file with no match only
        triggers a warning.
    """
    # Invert the second mapping: sequence -> all file names carrying it.
    names_by_sequence = {}
    for name, sequence in pdb_files_seqs.items():
        if sequence not in names_by_sequence:
            names_by_sequence[sequence] = []
        names_by_sequence[sequence].append(name)

    pairs = []
    for wetlab_pdb, wetlab_seq in wetlab_seqs.items():
        candidates = names_by_sequence.get(wetlab_seq)
        if not candidates:
            print(f"警告: 在 pdb_files 中未找到匹配的序列 for {wetlab_pdb} {wetlab_seq}")
            continue
        pairs.extend((wetlab_pdb, candidate) for candidate in candidates)
    return pairs
|
|
|
|
|
|
|
|
|
def get_ca_atoms(pdb_path):
    """
    Collect the C-alpha atoms of the first chain of the first model.

    Only residues accepted by ``is_aa(residue, standard=True)`` that contain
    a ``'CA'`` atom contribute to the result.

    Args:
        pdb_path (str): Path to the PDB file.

    Returns:
        list of Bio.PDB.Atom.Atom: The C-alpha atoms in chain order, or None
        if parsing fails, the structure has no model or chain, or no
        C-alpha atom is found.
    """
    parser = PDBParser(QUIET=True)
    try:
        structure = parser.get_structure('structure', pdb_path)
    except Exception as err:
        print(f"无法解析 PDB 文件 {pdb_path}: {err}")
        return None

    # Compare against None explicitly: only "nothing yielded" counts as missing.
    model = next(iter(structure.get_models()), None)
    if model is None:
        print(f"警告: PDB 文件 {pdb_path} 没有模型。")
        return None

    chain = next(iter(model.get_chains()), None)
    if chain is None:
        print(f"警告: PDB 文件 {pdb_path} 没有链。")
        return None

    alpha_carbons = [
        residue['CA']
        for residue in chain
        if is_aa(residue, standard=True) and 'CA' in residue
    ]
    return alpha_carbons or None
|
|
|
|
|
|
|
|
|
def calculate_rmsd(pdb_path1, pdb_path2):
    """
    Compute the C-alpha RMSD between two PDB files after superposition.

    Atoms are obtained with ``get_ca_atoms`` and paired by position, so both
    files must yield the same number of C-alpha atoms.

    Args:
        pdb_path1 (str): Path to the first PDB file (fixed atoms).
        pdb_path2 (str): Path to the second PDB file (moving atoms).

    Returns:
        float: RMSD reported by ``Superimposer``, or None if atoms could not
        be loaded for either file or the atom counts differ.
    """
    fixed_atoms = get_ca_atoms(pdb_path1)
    moving_atoms = get_ca_atoms(pdb_path2)

    if fixed_atoms is None or moving_atoms is None:
        print(f"警告: 无法获取 C-alpha 原子 for {pdb_path1} and/or {pdb_path2}")
        return None

    n_fixed, n_moving = len(fixed_atoms), len(moving_atoms)
    if n_fixed != n_moving:
        name1 = os.path.basename(pdb_path1)
        name2 = os.path.basename(pdb_path2)
        print(f"警告: {name1} 和 {name2} 的 C-alpha 原子数量不同 ({n_fixed} vs {n_moving}).")
        return None

    superimposer = Superimposer()
    superimposer.set_atoms(fixed_atoms, moving_atoms)
    # Apply the fitted transform to the second atom list, as the original did.
    superimposer.apply(moving_atoms)
    return superimposer.rms
|
|
|
|
|
|
|
|
|
def calculate_tm_score_tmtools(coords1, coords2, seq1, seq2):
    """
    Compute a TM-score for two coordinate arrays with tmtools.

    Args:
        coords1 (np.ndarray): First coordinate array, shape N1 x 3.
        coords2 (np.ndarray): Second coordinate array, shape N2 x 3.
        seq1 (str): Sequence belonging to ``coords1``.
        seq2 (str): Sequence belonging to ``coords2``.

    Returns:
        float: The ``tm_norm_chain2`` attribute of the ``tm_align`` result
        (presumably the TM-score normalised by the second structure; confirm
        against the tmtools documentation), or None if any exception occurs.
    """
    try:
        alignment = tm_align(coords1, coords2, seq1, seq2)
        score = alignment.tm_norm_chain2
    except Exception as err:
        # Best-effort: report the failure and let the caller see None.
        print(f"Error calculating TM-score with tmtools: {err}")
        return None
    return score
|
|
|
|
|
|
|
|
|
def process_pair_rmsd_tm(pdb1_path, pdb2_path, seq1, seq2):
    """
    Compute the TM-score and C-alpha RMSD for one pair of PDB files.

    Each file is parsed once. The C-alpha atoms loaded here are used both
    for the coordinate arrays passed to the TM-score calculation and for
    the superposition that yields the RMSD.

    Args:
        pdb1_path (str): Path to the first PDB file.
        pdb2_path (str): Path to the second PDB file.
        seq1 (str): Sequence of the first structure.
        seq2 (str): Sequence of the second structure.

    Returns:
        tuple: ``(PDB_ID, TM_score, RMSD)`` where ``PDB_ID`` is
        ``"<file1> vs <file2>"``. ``TM_score`` and ``RMSD`` are both None if
        the atoms cannot be loaded or the C-alpha counts differ;
        ``TM_score`` alone is None if the TM-score calculation fails.
    """
    pdb1 = os.path.basename(pdb1_path)
    pdb2 = os.path.basename(pdb2_path)
    pair_id = f"{pdb1} vs {pdb2}"

    atoms1 = get_ca_atoms(pdb1_path)
    atoms2 = get_ca_atoms(pdb2_path)

    if atoms1 is None or atoms2 is None:
        print(f"警告: 无法获取 C-alpha 原子 for {pdb1} and/or {pdb2}")
        return (pair_id, None, None)

    if len(atoms1) != len(atoms2):
        print(f"警告: {pdb1} 和 {pdb2} 的 C-alpha 原子数量不同 ({len(atoms1)} vs {len(atoms2)}).")
        return (pair_id, None, None)

    # Copy the coordinates into arrays for the TM-score calculation.
    coords1 = np.array([atom.get_coord() for atom in atoms1])
    coords2 = np.array([atom.get_coord() for atom in atoms2])

    # Superimpose the atoms already loaded and validated above instead of
    # calling calculate_rmsd(), which would parse both files a second time.
    # set_atoms() fits the transform and sets .rms; the atoms are not moved.
    sup = Superimposer()
    sup.set_atoms(atoms1, atoms2)
    rmsd = sup.rms

    tm_score = calculate_tm_score_tmtools(coords1, coords2, seq1, seq2)

    return (pair_id, tm_score, rmsd)
|
|
|
|
|
|
|
|
|
def main(
    wetlab_dir="./wetlab_pdb",
    pdb_files_dir="/home/ubuntu/alphafold3/output_pdb/",
    output_csv="alphafold3_tm_rmsd_results.csv",
):
    """
    Match PDB files by sequence, score each pair, and write a CSV report.

    The sequences of both folders are extracted, files with identical
    sequences are paired, and TM-score and RMSD are computed per pair. The
    CSV holds one row per pair. When at least one pair was scored, two
    summary rows ("Average" and "Std Dev") are placed before the pair rows.

    Args:
        wetlab_dir (str): Folder with the first set of PDB files.
        pdb_files_dir (str): Folder with the second set of PDB files.
        output_csv (str): Path of the CSV file to write.

    The defaults reproduce the previously hard-coded locations, so calling
    ``main()`` with no arguments behaves as before.

    Exits with status 1 if either folder is missing, and with status 0 if
    no pair of files shares a sequence.
    """
    # Validate the input folders in the original order (wetlab first).
    for required_dir in (wetlab_dir, pdb_files_dir):
        if not os.path.isdir(required_dir):
            print(f"错误: 文件夹 '{required_dir}' 不存在。")
            sys.exit(1)

    print("提取 wetlab_pdb 文件夹中的序列...")
    wetlab_seqs = get_sequences(wetlab_dir)
    print(f"提取到 {len(wetlab_seqs)} 个序列。\n")

    print("提取 pdb_files 文件夹中的序列...")
    pdb_files_seqs = get_sequences(pdb_files_dir)
    print(f"提取到 {len(pdb_files_seqs)} 个序列。\n")

    print("匹配 PDB 文件 based on sequences...")
    matches = match_pdbs(wetlab_seqs, pdb_files_seqs)
    print(f"找到 {len(matches)} 对匹配的 PDB 文件。\n")

    if not matches:
        print("没有找到任何匹配的 PDB 文件。")
        sys.exit(0)

    results = []

    print("开始计算 TM-score 和 RMSD...\n")
    for wetlab_pdb, ref_pdb in matches:
        print(f"处理: {wetlab_pdb} vs {ref_pdb}")
        wetlab_path = os.path.join(wetlab_dir, wetlab_pdb)
        ref_path = os.path.join(pdb_files_dir, ref_pdb)
        try:
            seq1 = wetlab_seqs[wetlab_pdb]
            seq2 = pdb_files_seqs[ref_pdb]

            pdb_id, tm_score, rmsd = process_pair_rmsd_tm(wetlab_path, ref_path, seq1, seq2)
            if tm_score is not None and rmsd is not None:
                print(f"TM-score: {tm_score:.4f}, RMSD: {rmsd:.4f} Å\n")
            else:
                print("计算失败。\n")
        except Exception as e:
            # One failing pair must not abort the batch; record it as empty.
            print(f"错误: 处理 {wetlab_pdb} vs {ref_pdb} 时出错: {e}\n")
            pdb_id, tm_score, rmsd = (f"{wetlab_pdb} vs {ref_pdb}", None, None)
        # Every pair gets a row, whether or not it was scored.
        results.append((pdb_id, tm_score, rmsd))

    df = pd.DataFrame(results, columns=['PDB_ID', 'TM_score', 'RMSD'])

    # Statistics use only the pairs where both metrics were computed.
    valid_df = df.dropna(subset=['TM_score', 'RMSD'])
    has_valid = not valid_df.empty

    if has_valid:
        avg_tm = valid_df['TM_score'].mean()
        std_tm = valid_df['TM_score'].std()
        avg_rmsd = valid_df['RMSD'].mean()
        std_rmsd = valid_df['RMSD'].std()

        summary = pd.DataFrame({
            'PDB_ID': ['Average', 'Std Dev'],
            'TM_score': [avg_tm, std_tm],
            'RMSD': [avg_rmsd, std_rmsd],
        })

        # Summary rows go first, followed by every per-pair row.
        final_df = pd.concat([summary, df], ignore_index=True)
    else:
        final_df = df.copy()

    final_df.to_csv(output_csv, index=False)
    print(f"结果已保存到 '{output_csv}'。")

    if has_valid:
        print("\n### 计算摘要 ###")
        print(f"平均 TM-score: {avg_tm:.4f}")
        print(f"TM-score 标准差: {std_tm:.4f}")
        print(f"平均 RMSD: {avg_rmsd:.4f} Å")
        print(f"RMSD 标准差: {std_rmsd:.4f} Å")
    else:
        print("没有成功计算任何 PDB 对的 TM-score 和 RMSD。")
|
|
|
|
|
|
|
|
|
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()