"""Download a structure from the PDB and write a filtered copy of it.

Usable as a module (``download_and_clean_pdb``) or as a command-line script.
"""

import argparse
import os
import warnings

from Bio.PDB import PDBList, PDBParser, PDBIO, Select  # type: ignore
from Bio.PDB.PDBExceptions import PDBConstructionWarning  # type: ignore

# Suppress PDB construction warnings for the whole process.
warnings.simplefilter('ignore', PDBConstructionWarning)

# Residue names that are dropped when the structure is written out:
# water (HOH, WAT) and two sugars (NAG, MAN).
_EXCLUDED_RESNAMES = frozenset({'HOH', 'WAT', 'NAG', 'MAN'})


class ProteinSelect(Select):
    """PDBIO selection filter that drops water and NAG/MAN sugar residues.

    Only the residue names in ``_EXCLUDED_RESNAMES`` are removed; every other
    residue (including any other hetero residue) is written unchanged.
    """

    def accept_residue(self, residue):
        """Return True when *residue* should be kept in the output file."""
        return residue.get_resname() not in _EXCLUDED_RESNAMES


def download_and_clean_pdb(pdb_id, output_file=None, data_dir='data'):
    """
    Download and clean a PDB file.

    Args:
        pdb_id: PDB ID to download (e.g., '3KAS'); it is lower-cased before use
        output_file: Output filename (default: {data_dir}/{pdb_id.lower()}.pdb)
        data_dir: Directory to store downloaded files (default: 'data')

    Returns:
        str: Path to the cleaned PDB file. If a non-empty file already exists
        at that path it is returned as-is and nothing is downloaded.

    Raises:
        FileNotFoundError: If the PDB file cannot be downloaded or found
        Exception: Errors raised while parsing or writing are not caught here
    """
    pdb_id = pdb_id.lower()  # 3KAS -> 3kas
    if output_file is None:
        output_file = os.path.join(data_dir, f'{pdb_id}.pdb')

    # Create the download directory, and the output directory when the caller
    # asked for an output path that lives somewhere else.
    os.makedirs(data_dir, exist_ok=True)
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Check if final output file exists - if yes, return immediately
    if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
        print(f"📁 METHOD: Using existing output file - {output_file}")
        return output_file

    # Download .ent file using PDBList
    print(f"📡 METHOD: Downloading from PDB - {pdb_id}")
    pdbl = PDBList()
    ent_file = pdbl.retrieve_pdb_file(pdb_id, pdir=data_dir, file_format='pdb')

    if not ent_file or not os.path.exists(ent_file):
        # Fall back to the conventional file name before giving up.
        expected_ent = os.path.join(data_dir, f'pdb{pdb_id}.ent')
        if not os.path.exists(expected_ent):
            raise FileNotFoundError(
                f"❌ Failed to download PDB file for {pdb_id}. "
                "The PDB ID may not exist."
            )
        ent_file = expected_ent

    # Write to a temporary sibling first and move it into place afterwards, so
    # a failed save never leaves a partial file that the "existing output"
    # shortcut above would later mistake for a finished result.
    partial_file = f'{output_file}.part'
    try:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', PDBConstructionWarning)
            parser_obj = PDBParser(QUIET=True)
            structure = parser_obj.get_structure(pdb_id, ent_file)

            io = PDBIO()
            io.set_structure(structure)
            io.save(partial_file, ProteinSelect())
        os.replace(partial_file, output_file)
    finally:
        if os.path.exists(partial_file):
            os.remove(partial_file)

    # Delete the .ent file, unless the caller chose that very path as the
    # output (in which case it now holds the cleaned structure).
    if os.path.exists(ent_file) and not os.path.samefile(ent_file, output_file):
        os.remove(ent_file)

    return output_file


def main():
    """Parse command-line arguments and run the download/clean step."""
    parser = argparse.ArgumentParser(description='Download and clean a PDB file')
    parser.add_argument('pdb_id', type=str,
                        help='PDB ID to download (e.g., 3KAS)')
    parser.add_argument('--output', type=str, default=None,
                        help='Output filename (default: {data_dir}/{pdb_id}.pdb)')
    parser.add_argument('--data-dir', type=str, default='data',
                        help='Directory to store downloaded files (default: data)')

    args = parser.parse_args()

    # Clean PDB ID - remove common prefixes like "pdb_id="
    pdb_id = args.pdb_id.strip()
    if '=' in pdb_id:
        pdb_id = pdb_id.split('=')[-1].strip()
    pdb_id = pdb_id.upper()

    try:
        output_file = download_and_clean_pdb(pdb_id, args.output, args.data_dir)
        print(f"Target {pdb_id.lower()} is downloaded and cleaned as {output_file}!")
    except Exception as e:
        # Report the failure in one line, then re-raise so the traceback and a
        # non-zero exit status are preserved.
        print(f"Error processing {pdb_id}: {e}")
        raise


if __name__ == "__main__":
    main()