""" RMSD Calculation Utilities for RNA Structure Comparison Fixed version with explicit purine-pyrimidine atom mapping """ import numpy as np def parse_residue_atoms(fname): """ Parse PDB file and organize atoms by residue. Args: fname: Path to PDB file Returns: List of residues, where each residue is a dict with: - 'resnum': residue number - 'resname': residue name (A, C, G, U) - 'atoms': dict of {atom_name: [x, y, z]} """ with open(fname) as f: content = f.readlines() residues = {} for line in content: record = line[0:6].strip() if record == 'ATOM' or record == 'HETATM' or record == 'HETAT': atomname = line[12:16].strip() resname = line[17:20].strip() # residue name (A, C, G, U) resnum = int(line[22:26].strip()) # residue number x = float(line[30:38].strip()) y = float(line[38:46].strip()) z = float(line[46:54].strip()) # Initialize residue if not seen before if resnum not in residues: residues[resnum] = { 'resnum': resnum, 'resname': resname, 'atoms': {} } residues[resnum]['atoms'][atomname] = [x, y, z] # Convert to sorted list by residue number sorted_residues = [residues[k] for k in sorted(residues.keys())] return sorted_residues def get_backbone_sugar_coords_from_residue(residue): """ Extract backbone and sugar atom coordinates from a residue dict. Args: residue: Dict with 'atoms' key containing atom coordinates Returns: List of [x, y, z] coordinates in consistent order """ # Define the order of backbone and sugar atoms backbone_sugar_atoms = ["P", "OP1", "OP2", "O5'", "C5'", "C4'", "O4'", "C3'", "O3'", "C2'", "O2'", "C1'"] coords = [] atoms = residue['atoms'] for atom_name in backbone_sugar_atoms: if atom_name in atoms: coords.append(atoms[atom_name]) return coords def get_base_coords_from_residue(residue): """ Extract the three key base atom coordinates from a residue. Returns list of [x, y, z] coordinates in the correct order: - For purines (A, G): N9, C8, C4 - For pyrimidines (C, U): N1, C2, C6 These are ordered to enable proper purine-pyrimidine mapping: N9 <-> N1, C8 <-> C2, C4 <-> C6 Args: residue: Dict with 'resname' and 'atoms' keys Returns: List of [x, y, z] coordinates """ resname = residue['resname'] atoms = residue['atoms'] coords = [] if resname in ['A', 'G']: # Purines base_atoms = ['N9', 'C8', 'C4'] elif resname in ['C', 'U']: # Pyrimidines base_atoms = ['N1', 'C2', 'C6'] else: # Unknown residue type return coords for atom_name in base_atoms: if atom_name in atoms: coords.append(atoms[atom_name]) return coords def get_backbone_sugar_and_selectbase_coords_fixed(fname): """ Extract backbone, sugar, and select base atom coordinates. Ensures proper ordering for purine-pyrimidine mapping. For each residue, extracts: 1. All backbone and sugar atoms (in consistent order) 2. Three base atoms: - Purines (A, G): N9, C8, C4 - Pyrimidines (C, U): N1, C2, C6 This ordering ensures that when comparing structures with different sequences, the atoms are correctly mapped (N9<->N1, C8<->C2, C4<->C6). Args: fname: Path to PDB file Returns: Numpy array of coordinates """ residues = parse_residue_atoms(fname) all_coords = [] for residue in residues: # Get backbone and sugar coordinates backbone_coords = get_backbone_sugar_coords_from_residue(residue) all_coords.extend(backbone_coords) # Get base coordinates base_coords = get_base_coords_from_residue(residue) all_coords.extend(base_coords) return np.asarray(all_coords) def calculate_COM(coords): """ Calculate center of mass (geometric center) of coordinates. Args: coords: Numpy array of shape (N, 3) Returns: Numpy array of shape (3,) representing the center of mass """ L = coords.shape[0] COM = np.sum(coords, axis=0) / float(L) return COM def calculate_rotation_rmsd(coords1, coords2, COM1, COM2): """ Calculate rotation matrix and RMSD using Kabsch algorithm. Args: coords1: Coordinates of structure 1 (N, 3) coords2: Coordinates of structure 2 (N, 3) COM1: Center of mass of structure 1 (3,) COM2: Center of mass of structure 2 (3,) Returns: U: Rotation matrix (3, 3) RMSD: Root mean square deviation (float) """ sel1 = coords1 - COM1 sel2 = coords2 - COM2 # Check for consistency if len(sel1) != len(sel2): return None, None L = len(sel1) assert L > 0 # Initial residual, see Kabsch. R0 = np.sum(np.sum(sel1 * sel1, axis=0), axis=0) + np.sum(np.sum(sel2 * sel2, axis=0), axis=0) # Calculate the components of the rotation matrix (V,W) # S is used to calculate the error (RMSD) V, S, W = np.linalg.svd(np.dot(sel2.T, sel1)) # Calculate if the product of the determinants is + or - # if negative reflect the rotation matrix components prior # determining the rotation matrix (U) reflect = float(str(float(np.linalg.det(V) * np.linalg.det(W)))) if reflect == -1.0: S[-1] = -S[-1] V[:, -1] = -V[:, -1] U = np.dot(V, W) # Calculate the RMSD using sigma from the SVD calculation above RMSD = R0 - (2.0 * sum(S)) RMSD = np.sqrt(abs(RMSD / L)) return U, RMSD def translate_rotate_coords(coords, COM, U=None): """ Translate and optionally rotate coordinates. Args: coords: Coordinates to transform (N, 3) COM: Center of mass to translate by (3,) U: Rotation matrix (3, 3), optional Returns: Transformed coordinates (N, 3) """ # Translate only if U is None: return coords - COM # Translate and rotate return np.dot((coords - COM), U) def get_all_atom_coords(fname): """ Get all atom coordinates from a PDB file. Args: fname: Path to PDB file Returns: Numpy array of coordinates (N, 3) """ with open(fname) as f: content = f.readlines() coords = [] for line in content: record = line[0:6].strip() if record == 'ATOM' or record == 'HETATM' or record == 'HETAT': x = float(line[30:38].strip()) y = float(line[38:46].strip()) z = float(line[46:54].strip()) coords.append([x, y, z]) return np.asarray(coords) def apply_transformation_to_pdb(fname, U, COM, output_fname): """ Apply rotation and translation to a PDB file and save result. Args: fname: Input PDB file path U: Rotation matrix (3, 3) COM: Center of mass to translate from (3,) output_fname: Output PDB file path """ with open(fname) as f: lines = f.readlines() with open(output_fname, 'w') as f: for line in lines: record = line[0:6].strip() if record == 'ATOM' or record == 'HETATM' or record == 'HETAT': # Extract coordinates x = float(line[30:38].strip()) y = float(line[38:46].strip()) z = float(line[46:54].strip()) # Transform coord = np.array([x, y, z]) new_coord = np.dot((coord - COM), U) # Write transformed line new_line = ( line[:30] + f"{new_coord[0]:8.3f}" + f"{new_coord[1]:8.3f}" + f"{new_coord[2]:8.3f}" + line[54:] ) f.write(new_line) else: f.write(line)