File size: 3,428 Bytes
32c275c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import argparse
import os
import warnings
from Bio.PDB import PDBList, PDBParser, PDBIO, Select # type: ignore
from Bio.PDB.PDBExceptions import PDBConstructionWarning # type: ignore

# Suppress PDB construction warnings. This filter is installed at import time,
# so it applies process-wide to any code that imports this module.
warnings.simplefilter('ignore', PDBConstructionWarning)

# Residue filter used when saving the cleaned structure. By default it drops
# water and two common sugar residues; any other residue (including other
# ligands and ions) is kept unless its name is explicitly excluded.
class ProteinSelect(Select):
    """Selection filter that rejects residues by residue name.

    Pass an instance to ``PDBIO.save`` to skip the excluded residues when
    writing a structure.

    Args:
        excluded_resnames: Optional iterable of residue names (or a single
            name) to drop. When omitted, ``DEFAULT_EXCLUDED`` is used, which
            matches the historical behaviour of this class.
    """

    # Water (HOH, WAT) and sugars (NAG, MAN) removed by default.
    DEFAULT_EXCLUDED = frozenset({'HOH', 'WAT', 'NAG', 'MAN'})

    def __init__(self, excluded_resnames=None):
        super().__init__()
        if excluded_resnames is None:
            self.excluded_resnames = self.DEFAULT_EXCLUDED
        else:
            # A bare string would otherwise be iterated character by character.
            if isinstance(excluded_resnames, str):
                excluded_resnames = [excluded_resnames]
            self.excluded_resnames = frozenset(
                name.strip() for name in excluded_resnames
            )

    def accept_residue(self, residue):
        """Return True when the residue's name is not in the excluded set."""
        # Strip so space-padded residue names still compare equal.
        return residue.get_resname().strip() not in self.excluded_resnames

def download_and_clean_pdb(pdb_id, output_file=None, data_dir='data'):
    """
    Download a PDB entry and write a cleaned copy of it.

    If the output file already exists and is non-empty it is reused and
    nothing is downloaded. Otherwise the entry is fetched with ``PDBList``,
    parsed, filtered through ``ProteinSelect`` and saved. The intermediate
    ``.ent`` download is removed afterwards, whether or not cleaning succeeds.

    Args:
        pdb_id: PDB ID to download (e.g., '3KAS'). Case and surrounding
            whitespace are ignored.
        output_file: Output filename (default: {data_dir}/{pdb_id.lower()}.pdb)
        data_dir: Directory to store downloaded files (default: 'data')

    Returns:
        str: Path to the cleaned PDB file

    Raises:
        ValueError: If pdb_id is empty after stripping whitespace
        FileNotFoundError: If the PDB file cannot be downloaded or found
        Exception: For other errors during processing
    """
    pdb_id = pdb_id.strip().lower()  # ' 3KAS ' -> '3kas'
    if not pdb_id:
        raise ValueError("PDB ID must not be empty.")
    if output_file is None:
        output_file = f'{data_dir}/{pdb_id}.pdb'

    # Create data directory if it doesn't exist
    os.makedirs(data_dir, exist_ok=True)

    # Check if final output file exists - if yes, return immediately
    if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
        print(f"📁 METHOD: Using existing output file - {output_file}")
        return output_file

    # An explicit output_file may live outside data_dir; make sure its
    # parent directory exists before anything is written there.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Download .ent file using PDBList
    print(f"📡 METHOD: Downloading from PDB - {pdb_id}")
    pdbl = PDBList()
    ent_file = pdbl.retrieve_pdb_file(pdb_id, pdir=data_dir, file_format='pdb')

    # Fall back to the conventional file name if no usable path was returned.
    if not ent_file or not os.path.exists(ent_file):
        expected_ent = f'{data_dir}/pdb{pdb_id}.ent'
        if not os.path.exists(expected_ent):
            raise FileNotFoundError(f"❌ Failed to download PDB file for {pdb_id}. The PDB ID may not exist.")
        ent_file = expected_ent

    # Write to a temporary name first and move it into place only on success,
    # so an interrupted save never leaves a partial file that the cache check
    # above would later accept as a finished result.
    partial_output = f'{output_file}.tmp'
    try:
        # Parse and save the clean version
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', PDBConstructionWarning)
            parser_obj = PDBParser(QUIET=True)
            structure = parser_obj.get_structure(pdb_id, ent_file)
        io = PDBIO()
        io.set_structure(structure)
        io.save(partial_output, ProteinSelect())
        os.replace(partial_output, output_file)
    finally:
        # Delete the .ent file and any leftover partial output, even when
        # parsing or saving failed.
        for leftover in (partial_output, ent_file):
            if os.path.exists(leftover):
                os.remove(leftover)

    return output_file

if __name__ == "__main__":
    # Command-line entry point: parse arguments, normalise the PDB ID and
    # run the download/clean step, reporting the outcome on stdout.
    parser = argparse.ArgumentParser(description='Download and clean a PDB file')
    parser.add_argument('pdb_id', type=str, help='PDB ID to download (e.g., 3KAS)')
    # The help text mirrors the default actually used by
    # download_and_clean_pdb when --output is omitted.
    parser.add_argument(
        '--output',
        type=str,
        default=None,
        help='Output filename (default: {data_dir}/{pdb_id}.pdb, with the PDB ID lowercased)',
    )
    parser.add_argument('--data-dir', type=str, default='data', help='Directory to store downloaded files (default: data)')

    args = parser.parse_args()

    # Clean PDB ID - remove common prefixes like "pdb_id="
    pdb_id = args.pdb_id.strip()
    if '=' in pdb_id:
        pdb_id = pdb_id.split('=')[-1].strip()
    pdb_id = pdb_id.upper()

    try:
        output_file = download_and_clean_pdb(pdb_id, args.output, args.data_dir)
        print(f"Target {pdb_id.lower()} is downloaded and cleaned as {output_file}!")
    except Exception as e:
        # Report a short summary, then re-raise so the traceback and a
        # non-zero exit status are preserved.
        print(f"Error processing {pdb_id}: {e}")
        raise