Spaces:
Sleeping
Sleeping
File size: 3,428 Bytes
32c275c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
import argparse
import os
import warnings
from Bio.PDB import PDBList, PDBParser, PDBIO, Select # type: ignore
from Bio.PDB.PDBExceptions import PDBConstructionWarning # type: ignore
# Silence Biopython's PDBConstructionWarning. This runs at import time and
# changes the process-wide warnings filter, not only this module's.
warnings.simplefilter('ignore', PDBConstructionWarning)
# Define a filter that drops water and selected sugar residues (HOH, WAT, NAG, MAN); every other residue is kept
class ProteinSelect(Select):
    """Selection filter for PDBIO.save that drops water and sugar residues.

    Only residues named HOH, WAT, NAG or MAN are rejected; every other
    residue is written out unchanged.
    """

    def accept_residue(self, residue):
        """Return False for water/sugar residues and True for everything else."""
        dropped_names = ('HOH', 'WAT', 'NAG', 'MAN')  # water and sugars
        if residue.get_resname() in dropped_names:
            return False
        return True
def download_and_clean_pdb(pdb_id, output_file=None, data_dir='data'):
    """
    Download a PDB entry and save a cleaned copy of it.

    The entry is fetched with PDBList (file_format='pdb'), parsed, and
    re-written through ProteinSelect. If a non-empty ``output_file`` already
    exists it is returned as-is and nothing is downloaded.

    Args:
        pdb_id: PDB ID to download (e.g., '3KAS'). Case and surrounding
            whitespace are ignored.
        output_file: Output filename (default: {data_dir}/{pdb_id.lower()}.pdb)
        data_dir: Directory to store downloaded files (default: 'data')

    Returns:
        str: Path to the cleaned PDB file

    Raises:
        FileNotFoundError: If the PDB file cannot be downloaded or found
        Exception: For other errors during processing
    """
    pdb_id = pdb_id.strip().lower()  # ' 3KAS ' -> '3kas'
    if output_file is None:
        output_file = f'{data_dir}/{pdb_id}.pdb'

    # Create data directory if it doesn't exist
    os.makedirs(data_dir, exist_ok=True)
    # A caller-supplied output path may live outside data_dir
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Check if final output file exists - if yes, return immediately
    if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
        print(f"📁 METHOD: Using existing output file - {output_file}")
        return output_file

    # Download .ent file using PDBList
    print(f"📡 METHOD: Downloading from PDB - {pdb_id}")
    ent_file = PDBList().retrieve_pdb_file(pdb_id, pdir=data_dir, file_format='pdb')
    if not ent_file or not os.path.exists(ent_file):
        # The returned path is unusable; fall back to the conventional name
        expected_ent = os.path.join(data_dir, f'pdb{pdb_id}.ent')
        if not os.path.exists(expected_ent):
            raise FileNotFoundError(f"❌ Failed to download PDB file for {pdb_id}. The PDB ID may not exist.")
        ent_file = expected_ent

    # Write to a side file first so an interrupted save can never leave a
    # truncated output that the existence check above would later accept.
    partial_file = f'{output_file}.part'
    try:
        # Parse and save the clean version
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', PDBConstructionWarning)
            structure = PDBParser(QUIET=True).get_structure(pdb_id, ent_file)
        io = PDBIO()
        io.set_structure(structure)
        io.save(partial_file, ProteinSelect())
        os.replace(partial_file, output_file)
    finally:
        # Remove the side file if the save did not complete
        if os.path.exists(partial_file):
            os.remove(partial_file)
        # Delete the .ent file, even on failure, unless it is the output itself
        is_output = os.path.abspath(ent_file) == os.path.abspath(output_file)
        if os.path.exists(ent_file) and not is_output:
            os.remove(ent_file)

    return output_file
def normalize_pdb_id(raw_id):
    """Return the PDB ID from raw_id: trimmed, 'name=' prefix removed, upper-cased.

    Everything up to and including the last '=' is discarded, so
    'pdb_id=3kas' becomes '3KAS'. The result may be empty.
    """
    cleaned = raw_id.strip()
    if '=' in cleaned:
        cleaned = cleaned.split('=')[-1].strip()
    return cleaned.upper()


def main(argv=None):
    """Command-line entry point: parse arguments, then download and clean one entry.

    Args:
        argv: Argument list to parse; None makes argparse read sys.argv[1:].

    Raises:
        SystemExit: On invalid arguments, including an empty PDB ID.
        Exception: Any error from download_and_clean_pdb is reported on
            stdout and then re-raised.
    """
    parser = argparse.ArgumentParser(description='Download and clean a PDB file')
    parser.add_argument('pdb_id', type=str, help='PDB ID to download (e.g., 3KAS)')
    parser.add_argument('--output', type=str, default=None, help='Output filename (default: {data-dir}/{pdb_id}.pdb, with the PDB ID lower-cased)')
    parser.add_argument('--data-dir', type=str, default='data', help='Directory to store downloaded files (default: data)')
    args = parser.parse_args(argv)

    # Clean PDB ID - remove common prefixes like "pdb_id="
    pdb_id = normalize_pdb_id(args.pdb_id)
    if not pdb_id:
        # Nothing left to look up; fail before attempting a download
        parser.error('PDB ID is empty')

    try:
        output_file = download_and_clean_pdb(pdb_id, args.output, args.data_dir)
        print(f"Target {pdb_id.lower()} is downloaded and cleaned as {output_file}!")
    except Exception as e:
        print(f"Error processing {pdb_id}: {str(e)}")
        raise


if __name__ == "__main__":
    main()