MotifAlign / rmsd_utils.py
jiehou's picture
Upload 3 files
a6c9f2a verified
"""
RMSD Calculation Utilities for RNA Structure Comparison
Fixed version with explicit purine-pyrimidine atom mapping
"""
import numpy as np
def parse_residue_atoms(fname):
    """
    Parse a PDB file and group its atoms by residue.

    A residue is identified by its chain ID, residue number and insertion
    code, so residues that merely share a residue number (for example the
    same number in two chains, or 27 and 27A) are kept apart instead of
    being merged into one entry.

    If an atom name occurs more than once inside the same residue, the last
    occurrence in the file wins.

    Args:
        fname: Path to PDB file
    Returns:
        List of residues ordered by residue number (residues with equal
        numbers keep their order of first appearance in the file), where
        each residue is a dict with:
        - 'resnum': residue number
        - 'resname': residue name (A, C, G, U)
        - 'atoms': dict of {atom_name: [x, y, z]}
    Raises:
        ValueError: If a coordinate record has a residue-number or
            coordinate field that cannot be parsed as a number.
    """
    coordinate_records = ('ATOM', 'HETATM', 'HETAT')
    residues = {}
    with open(fname) as f:
        for line in f:
            if line[0:6].strip() not in coordinate_records:
                continue
            atomname = line[12:16].strip()
            resname = line[17:20].strip()  # residue name (A, C, G, U)
            chain = line[21:22]            # chain identifier column
            resnum = int(line[22:26])      # residue number
            icode = line[26:27]            # insertion code column
            xyz = [float(line[30:38]), float(line[38:46]), float(line[46:54])]

            # The residue number alone is not unique across chains or
            # insertion codes, so use the full identifier as the key.
            key = (chain, resnum, icode)
            entry = residues.get(key)
            if entry is None:
                entry = {'resnum': resnum, 'resname': resname, 'atoms': {}}
                residues[key] = entry
            entry['atoms'][atomname] = xyz

    # sorted() is stable and the dict preserves insertion order, so residues
    # with the same number stay in file order.
    return sorted(residues.values(), key=lambda entry: entry['resnum'])
def get_backbone_sugar_coords_from_residue(residue):
    """
    Extract backbone and sugar atom coordinates from a residue dict.

    Atoms are looked up under their current PDB names first. If a name is
    absent, the legacy spelling is tried as well: "O1P"/"O2P" for
    "OP1"/"OP2", and "*" in place of "'" for the sugar atoms (e.g. "C5*").
    Atoms found under neither name are skipped, so the returned list can be
    shorter than the full 12-atom set.

    Args:
        residue: Dict with 'atoms' key containing atom coordinates
    Returns:
        List of [x, y, z] coordinates in consistent order
    """
    # Define the order of backbone and sugar atoms
    backbone_sugar_atoms = ["P", "OP1", "OP2", "O5'", "C5'", "C4'", "O4'", "C3'", "O3'", "C2'", "O2'", "C1'"]
    # Phosphate oxygens were renamed outright; sugar atoms only swapped
    # the prime character.
    legacy_phosphate_names = {"OP1": "O1P", "OP2": "O2P"}

    atoms = residue['atoms']
    coords = []
    for atom_name in backbone_sugar_atoms:
        if atom_name in atoms:
            coords.append(atoms[atom_name])
            continue
        legacy_name = legacy_phosphate_names.get(atom_name, atom_name.replace("'", "*"))
        if legacy_name != atom_name and legacy_name in atoms:
            coords.append(atoms[legacy_name])
    return coords
def get_base_coords_from_residue(residue):
    """
    Pick the three reference base atoms of a residue.

    The atoms are returned in an order that lines purines up with
    pyrimidines position by position:
    - Purines (A, G): N9, C8, C4
    - Pyrimidines (C, U): N1, C2, C6
    i.e. N9 <-> N1, C8 <-> C2, C4 <-> C6.

    Atoms missing from the residue are skipped, and a residue whose name is
    not one of A, G, C, U yields an empty list.

    Args:
        residue: Dict with 'resname' and 'atoms' keys
    Returns:
        List of [x, y, z] coordinates
    """
    kind = residue['resname']
    atom_table = residue['atoms']

    base_atom_sets = (
        (('A', 'G'), ('N9', 'C8', 'C4')),  # Purines
        (('C', 'U'), ('N1', 'C2', 'C6')),  # Pyrimidines
    )
    for members, wanted in base_atom_sets:
        if kind in members:
            return [atom_table[name] for name in wanted if name in atom_table]

    # Unknown residue type
    return []
def get_backbone_sugar_and_selectbase_coords_fixed(fname):
    """
    Collect the superposition atoms of every residue in a PDB file.

    Residues are read with parse_residue_atoms() and, for each one in turn,
    the following coordinates are appended:
    1. The backbone and sugar atoms, in their fixed order
    2. The three selected base atoms:
       - Purines (A, G): N9, C8, C4
       - Pyrimidines (C, U): N1, C2, C6
    Keeping this order lets structures with different sequences be compared
    atom by atom (N9<->N1, C8<->C2, C4<->C6).

    Args:
        fname: Path to PDB file
    Returns:
        Numpy array of coordinates
    """
    collected = []
    for entry in parse_residue_atoms(fname):
        # Backbone/sugar atoms first, then the base atoms of the same residue
        collected += get_backbone_sugar_coords_from_residue(entry)
        collected += get_base_coords_from_residue(entry)
    return np.asarray(collected)
def calculate_COM(coords):
    """
    Return the geometric center of a set of points.

    Every point carries equal weight, so the "center of mass" here is the
    plain average of the rows.

    Args:
        coords: Numpy array of shape (N, 3)
    Returns:
        Numpy array of shape (3,) representing the center of mass
    """
    n_points = coords.shape[0]
    column_totals = np.sum(coords, axis=0)
    return column_totals / float(n_points)
def calculate_rotation_rmsd(coords1, coords2, COM1, COM2):
    """
    Calculate rotation matrix and RMSD using Kabsch algorithm.

    Both coordinate sets are centered on their given centers of mass, and
    the proper rotation (determinant +1) that best superimposes structure 2
    onto structure 1 is determined. The rotation is meant to be applied on
    the right: np.dot(coords2 - COM2, U) approximates coords1 - COM1.

    Args:
        coords1: Coordinates of structure 1 (N, 3)
        coords2: Coordinates of structure 2 (N, 3)
        COM1: Center of mass of structure 1 (3,)
        COM2: Center of mass of structure 2 (3,)
    Returns:
        U: Rotation matrix (3, 3)
        RMSD: Root mean square deviation (float)
        Both are None when the two structures have different numbers of
        points.
    Raises:
        AssertionError: If the structures contain no points.
    """
    sel1 = coords1 - COM1
    sel2 = coords2 - COM2

    # Check for consistency
    if len(sel1) != len(sel2):
        return None, None
    L = len(sel1)
    assert L > 0, "cannot superimpose empty coordinate sets"

    # Initial residual, see Kabsch.
    R0 = np.sum(sel1 * sel1) + np.sum(sel2 * sel2)

    # Calculate the components of the rotation matrix (V, W)
    # S is used to calculate the error (RMSD)
    V, S, W = np.linalg.svd(np.dot(sel2.T, sel1))

    # det(V) * det(W) is +1 or -1 only up to rounding error, so test its
    # sign rather than comparing against -1.0 exactly. A negative product
    # means V.W would be a reflection; flip the axis belonging to the
    # smallest singular value to get a proper rotation.
    if np.linalg.det(V) * np.linalg.det(W) < 0.0:
        S[-1] = -S[-1]
        V[:, -1] = -V[:, -1]
    U = np.dot(V, W)

    # Calculate the RMSD using sigma from the SVD calculation above;
    # abs() guards against a tiny negative residual caused by rounding.
    RMSD = np.sqrt(abs((R0 - 2.0 * np.sum(S)) / L))
    return U, RMSD
def translate_rotate_coords(coords, COM, U=None):
    """
    Shift coordinates by a center of mass and, if requested, rotate them.

    The center of mass is subtracted first; when a rotation matrix is
    supplied, the shifted coordinates are then multiplied by it on the
    right.

    Args:
        coords: Coordinates to transform (N, 3)
        COM: Center of mass to translate by (3,)
        U: Rotation matrix (3, 3), optional
    Returns:
        Transformed coordinates (N, 3)
    """
    shifted = coords - COM
    return shifted if U is None else np.dot(shifted, U)
def get_all_atom_coords(fname):
    """
    Read the coordinates of every atom record in a PDB file.

    Only coordinate records are used; every other line is ignored. The
    x, y and z values are taken from the fixed-width columns 31-38, 39-46
    and 47-54 of each record.

    Args:
        fname: Path to PDB file
    Returns:
        Numpy array of coordinates (N, 3)
    """
    with open(fname) as handle:
        pdb_lines = handle.readlines()

    coordinate_tags = ('ATOM', 'HETATM', 'HETAT')
    xyz_rows = [
        [
            float(text[30:38].strip()),
            float(text[38:46].strip()),
            float(text[46:54].strip()),
        ]
        for text in pdb_lines
        if text[0:6].strip() in coordinate_tags
    ]
    return np.asarray(xyz_rows)
def apply_transformation_to_pdb(fname, U, COM, output_fname):
    """
    Write a copy of a PDB file with its atoms translated and rotated.

    The input is read completely before the output is opened. For every
    coordinate record the center of mass is subtracted from the atom
    position and the result is multiplied by the rotation matrix on the
    right; the new x, y, z values are written back into the same columns
    with three decimals. All other lines, and all other columns of the
    coordinate records, are copied unchanged.

    Args:
        fname: Input PDB file path
        U: Rotation matrix (3, 3)
        COM: Center of mass to translate from (3,)
        output_fname: Output PDB file path
    """
    with open(fname) as source:
        pdb_lines = source.readlines()

    coordinate_tags = ('ATOM', 'HETATM', 'HETAT')
    with open(output_fname, 'w') as target:
        for entry in pdb_lines:
            if entry[0:6].strip() not in coordinate_tags:
                # Not a coordinate record: pass through untouched
                target.write(entry)
                continue

            # Extract coordinates from the three 8-character columns
            position = np.array([float(entry[start:start + 8].strip()) for start in (30, 38, 46)])

            # Transform
            moved = np.dot((position - COM), U)

            # Write transformed line, keeping the text around the coordinates
            coordinate_text = f"{moved[0]:8.3f}" + f"{moved[1]:8.3f}" + f"{moved[2]:8.3f}"
            target.write(entry[:30] + coordinate_text + entry[54:])