cryptobank_prediction

Sleeping

App Files Files Community

ThorbenF committed on Dec 3, 2024

Commit

a28eeb5

1 Parent(s): a653779

Update

Browse files

Files changed (2) hide show

.ipynb_checkpoints/app-checkpoint.py +84 -51
app.py +84 -51

.ipynb_checkpoints/app-checkpoint.py CHANGED Viewed

@@ -23,10 +23,9 @@ from scipy.special import expit
 import requests
 # Biopython imports
-from Bio.PDB import PDBParser, Select
 from Bio.PDB.DSSP import DSSP
-from gradio_molecule3d import Molecule3D
 # Configuration
 checkpoint = 'ThorbenF/prot_t5_xl_uniref50'
@@ -38,6 +37,79 @@ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 model.to(device)
 model.eval()
 def create_dataset(tokenizer, seqs, labels, checkpoint):
     tokenized = tokenizer(seqs, max_length=max_length, padding=False, truncation=True)
     dataset = Dataset.from_dict(tokenized)
@@ -138,59 +210,20 @@ def fetch_pdb(pdb_id):
         print(f"Error fetching PDB: {e}")
         return None
-def extract_protein_sequence(pdb_path):
-    """
-    Extract the longest protein sequence from a PDB file
-    """
-    parser = PDBParser(QUIET=1)
-    structure = parser.get_structure('protein', pdb_path)
-    class ProteinSelect(Select):
-        def accept_residue(self, residue):
-            # Only accept standard amino acids
-            standard_aa = set('ACDEFGHIKLMNPQRSTVWY')
-            return residue.get_resname() in standard_aa
-    # Find the longest protein chain
-    longest_sequence = ""
-    longest_chain = None
-    for model in structure:
-        for chain in model:
-            sequence = ""
-            for residue in chain:
-                if Select().accept_residue(residue):
-                    sequence += residue.get_resname()
-            # Convert 3-letter amino acid codes to 1-letter
-            aa_dict = {
-                'ALA':'A', 'CYS':'C', 'ASP':'D', 'GLU':'E', 'PHE':'F',
-                'GLY':'G', 'HIS':'H', 'ILE':'I', 'LYS':'K', 'LEU':'L',
-                'MET':'M', 'ASN':'N', 'PRO':'P', 'GLN':'Q', 'ARG':'R',
-                'SER':'S', 'THR':'T', 'VAL':'V', 'TRP':'W', 'TYR':'Y'
-            }
-            one_letter_sequence = ''.join([aa_dict.get(res, 'X') for res in sequence])
-            # Track the longest sequence
-            if len(one_letter_sequence) > len(longest_sequence) and \
-               10 < len(one_letter_sequence) < 1500:
-                longest_sequence = one_letter_sequence
-                longest_chain = chain
-    return longest_sequence, longest_chain, pdb_path
 def process_pdb(pdb_id):
     # Fetch PDB file
-    pdb_path = fetch_pdb(pdb_id)
-    if not pdb_path:
-        return "Failed to fetch PDB file", pdb_path
     # Extract protein sequence and chain
-    protein_sequence, chain, pdb_file = extract_protein_sequence(pdb_path)
     if not protein_sequence:
-        return "No suitable protein sequence found", pdb_file
     # Predict binding sites
     sequence, normalized_scores = predict_protein_sequence(protein_sequence)
@@ -198,7 +231,7 @@ def process_pdb(pdb_id):
     # Prepare result string
     result_str = "\n".join([f"{aa}: {score:.2f}" for aa, score in zip(sequence, normalized_scores)])
-    return result_str, pdb_file
 # Create Gradio interface
 with gr.Blocks() as demo:
@@ -246,4 +279,4 @@ with gr.Blocks() as demo:
         outputs=[predictions_output, molecule_output]
     )
-demo.launch(share=True)

 import requests
 # Biopython imports
+from Bio.PDB import PDBParser, Select, PDBIO
 from Bio.PDB.DSSP import DSSP
+import Bio.PDB.PDBList as PDBList
 # Configuration
 checkpoint = 'ThorbenF/prot_t5_xl_uniref50'
 model.to(device)
 model.eval()
+def is_valid_sequence_length(length: int) -> bool:
+    """Check if sequence length is within valid range."""
+    # Both bounds are inclusive: lengths 100 and 1500 are accepted.
+    # NOTE(review): no call to this helper appears in the visible hunks;
+    # extract_protein_sequence uses its own `10 < len(sequence) < 1500`
+    # test, which has a different lower bound and an exclusive upper
+    # bound -- confirm which range is intended.
+    return 100 <= length <= 1500
+def is_nucleic_acid_chain(chain) -> bool:
+    """Check if chain contains nucleic acids."""
+    # Residue names treated as nucleotides (RNA one-letter names and
+    # DNA 'D'-prefixed names).
+    # NOTE(review): 'UNK' is conventionally the PDB name for an unknown
+    # amino acid rather than a nucleotide -- confirm it belongs here.
+    nucleic_acids = {'A', 'C', 'G', 'T', 'U', 'DA', 'DC', 'DG', 'DT', 'DU', 'UNK'}
+    # A single matching residue is enough to flag the whole chain; names
+    # are stripped of surrounding whitespace before the lookup.
+    return any(residue.get_resname().strip() in nucleic_acids for residue in chain)
+def extract_protein_sequence(pdb_path):
+    """
+    Extract the longest protein sequence from a PDB file with improved logic
+    """
+    # Returns a 3-tuple (one_letter_sequence, chain, path_of_written_file),
+    # or (None, None, pdb_path) when no chain passes the length filter.
+    # QUIET=1 is passed to the parser (presumably to silence parser
+    # warnings -- see the Biopython PDBParser docs).
+    parser = PDBParser(QUIET=1)
+    structure = parser.get_structure('protein', pdb_path)
+    # Three-letter residue name -> one-letter code. Residues whose name is
+    # not a key of this table are skipped when building the sequence.
+    aa_dict = {
+        # Standard amino acids (20 canonical)
+        'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E', 'PHE': 'F',
+        'GLY': 'G', 'HIS': 'H', 'ILE': 'I', 'LYS': 'K', 'LEU': 'L',
+        'MET': 'M', 'ASN': 'N', 'PRO': 'P', 'GLN': 'Q', 'ARG': 'R',
+        'SER': 'S', 'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y',
+        # Modified residues, mapped to their parent amino acid
+        'MSE': 'M',  # Selenomethionine
+        'SEP': 'S',  # Phosphoserine
+        'TPO': 'T',  # Phosphothreonine
+        'CSO': 'C',  # S-hydroxycysteine
+        'PTR': 'Y',  # Phosphotyrosine
+        'HYP': 'P',  # Hydroxyproline
+    }
+    # Water/solvent and small-molecule residue names.
+    # NOTE(review): this set is never referenced in this function; the
+    # `res_name in aa_dict` test below already drops these residues.
+    ligand_exclusion_set = {'HOH', 'WAT', 'DOD', 'SO4', 'PO4', 'GOL', 'ACT', 'EDO'}
+    # Find the longest protein chain (every model in the structure is scanned)
+    longest_sequence = ""
+    longest_chain = None
+    for model in structure:
+        for chain in model:
+            # Skip any chain that is_nucleic_acid_chain flags
+            if is_nucleic_acid_chain(chain):
+                continue
+            # Extract and convert sequence
+            sequence = ""
+            for residue in chain:
+                # Keep only residues listed in aa_dict; anything else is
+                # dropped without a placeholder character
+                res_name = residue.get_resname().strip()
+                if res_name in aa_dict:
+                    sequence += aa_dict[res_name]
+            # Accept lengths strictly between 10 and 1500; the strict `>`
+            # means the first chain seen wins a tie on length
+            if (10 < len(sequence) < 1500 and
+                len(sequence) > len(longest_sequence)):
+                longest_sequence = sequence
+                longest_chain = chain
+    if not longest_sequence:
+        return None, None, pdb_path
+    # Write a copy of the structure next to the input file.
+    # NOTE(review): the object handed to PDBIO is the chain's grandparent
+    # (chain -> model -> structure per the loops above), so the entire
+    # structure is written, not only the selected chain.
+    if longest_chain:
+        io = PDBIO()
+        io.set_structure(longest_chain.get_parent().get_parent())
+        # NOTE(review): if pdb_path contains no '.pdb' substring, replace()
+        # returns it unchanged and save() overwrites the input file.
+        # process_pdb gets its path from PDBList.retrieve_pdb_file --
+        # verify the extension that call actually produces.
+        filtered_pdb_path = pdb_path.replace('.pdb', '_filtered.pdb')
+        io.save(filtered_pdb_path)
+        return longest_sequence, longest_chain, filtered_pdb_path
+    # NOTE(review): appears unreachable -- longest_chain is assigned
+    # together with longest_sequence, which is non-empty at this point.
+    return longest_sequence, longest_chain, pdb_path
 def create_dataset(tokenizer, seqs, labels, checkpoint):
     tokenized = tokenizer(seqs, max_length=max_length, padding=False, truncation=True)
     dataset = Dataset.from_dict(tokenized)
         print(f"Error fetching PDB: {e}")
         return None
 def process_pdb(pdb_id):
     # Fetch PDB file
+    # Use PDBList to download the file if it doesn't exist locally
+    pdbl = PDBList.PDBList()
+    pdb_path = pdbl.retrieve_pdb_file(pdb_id, pdir='pdb_files', file_format='pdb')
+    if not pdb_path or not os.path.exists(pdb_path):
+        return "Failed to fetch PDB file", None
     # Extract protein sequence and chain
+    protein_sequence, chain, filtered_pdb_path = extract_protein_sequence(pdb_path)
     if not protein_sequence:
+        return "No suitable protein sequence found", None
     # Predict binding sites
     sequence, normalized_scores = predict_protein_sequence(protein_sequence)
     # Prepare result string
     result_str = "\n".join([f"{aa}: {score:.2f}" for aa, score in zip(sequence, normalized_scores)])
+    return result_str, filtered_pdb_path
 # Create Gradio interface
 with gr.Blocks() as demo:
         outputs=[predictions_output, molecule_output]
     )
+demo.launch()

app.py CHANGED Viewed

@@ -23,10 +23,9 @@ from scipy.special import expit
 import requests
 # Biopython imports
-from Bio.PDB import PDBParser, Select
 from Bio.PDB.DSSP import DSSP
-from gradio_molecule3d import Molecule3D
 # Configuration
 checkpoint = 'ThorbenF/prot_t5_xl_uniref50'
@@ -38,6 +37,79 @@ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 model.to(device)
 model.eval()
 def create_dataset(tokenizer, seqs, labels, checkpoint):
     tokenized = tokenizer(seqs, max_length=max_length, padding=False, truncation=True)
     dataset = Dataset.from_dict(tokenized)
@@ -138,59 +210,20 @@ def fetch_pdb(pdb_id):
         print(f"Error fetching PDB: {e}")
         return None
-def extract_protein_sequence(pdb_path):
-    """
-    Extract the longest protein sequence from a PDB file
-    """
-    parser = PDBParser(QUIET=1)
-    structure = parser.get_structure('protein', pdb_path)
-    class ProteinSelect(Select):
-        def accept_residue(self, residue):
-            # Only accept standard amino acids
-            standard_aa = set('ACDEFGHIKLMNPQRSTVWY')
-            return residue.get_resname() in standard_aa
-    # Find the longest protein chain
-    longest_sequence = ""
-    longest_chain = None
-    for model in structure:
-        for chain in model:
-            sequence = ""
-            for residue in chain:
-                if Select().accept_residue(residue):
-                    sequence += residue.get_resname()
-            # Convert 3-letter amino acid codes to 1-letter
-            aa_dict = {
-                'ALA':'A', 'CYS':'C', 'ASP':'D', 'GLU':'E', 'PHE':'F',
-                'GLY':'G', 'HIS':'H', 'ILE':'I', 'LYS':'K', 'LEU':'L',
-                'MET':'M', 'ASN':'N', 'PRO':'P', 'GLN':'Q', 'ARG':'R',
-                'SER':'S', 'THR':'T', 'VAL':'V', 'TRP':'W', 'TYR':'Y'
-            }
-            one_letter_sequence = ''.join([aa_dict.get(res, 'X') for res in sequence])
-            # Track the longest sequence
-            if len(one_letter_sequence) > len(longest_sequence) and \
-               10 < len(one_letter_sequence) < 1500:
-                longest_sequence = one_letter_sequence
-                longest_chain = chain
-    return longest_sequence, longest_chain, pdb_path
 def process_pdb(pdb_id):
     # Fetch PDB file
-    pdb_path = fetch_pdb(pdb_id)
-    if not pdb_path:
-        return "Failed to fetch PDB file", pdb_path
     # Extract protein sequence and chain
-    protein_sequence, chain, pdb_file = extract_protein_sequence(pdb_path)
     if not protein_sequence:
-        return "No suitable protein sequence found", pdb_file
     # Predict binding sites
     sequence, normalized_scores = predict_protein_sequence(protein_sequence)
@@ -198,7 +231,7 @@ def process_pdb(pdb_id):
     # Prepare result string
     result_str = "\n".join([f"{aa}: {score:.2f}" for aa, score in zip(sequence, normalized_scores)])
-    return result_str, pdb_file
 # Create Gradio interface
 with gr.Blocks() as demo:
@@ -246,4 +279,4 @@ with gr.Blocks() as demo:
         outputs=[predictions_output, molecule_output]
     )
-demo.launch(share=True)

 import requests
 # Biopython imports
+from Bio.PDB import PDBParser, Select, PDBIO
 from Bio.PDB.DSSP import DSSP
+import Bio.PDB.PDBList as PDBList
 # Configuration
 checkpoint = 'ThorbenF/prot_t5_xl_uniref50'
 model.to(device)
 model.eval()
+def is_valid_sequence_length(length: int) -> bool:
+    """Check if sequence length is within valid range."""
+    # Both bounds are inclusive: lengths 100 and 1500 are accepted.
+    # NOTE(review): no call to this helper appears in the visible hunks;
+    # extract_protein_sequence uses its own `10 < len(sequence) < 1500`
+    # test, which has a different lower bound and an exclusive upper
+    # bound -- confirm which range is intended.
+    return 100 <= length <= 1500
+def is_nucleic_acid_chain(chain) -> bool:
+    """Check if chain contains nucleic acids."""
+    # Residue names treated as nucleotides (RNA one-letter names and
+    # DNA 'D'-prefixed names).
+    # NOTE(review): 'UNK' is conventionally the PDB name for an unknown
+    # amino acid rather than a nucleotide -- confirm it belongs here.
+    nucleic_acids = {'A', 'C', 'G', 'T', 'U', 'DA', 'DC', 'DG', 'DT', 'DU', 'UNK'}
+    # A single matching residue is enough to flag the whole chain; names
+    # are stripped of surrounding whitespace before the lookup.
+    return any(residue.get_resname().strip() in nucleic_acids for residue in chain)
+def extract_protein_sequence(pdb_path):
+    """
+    Extract the longest protein sequence from a PDB file with improved logic
+    """
+    # Returns a 3-tuple (one_letter_sequence, chain, path_of_written_file),
+    # or (None, None, pdb_path) when no chain passes the length filter.
+    # QUIET=1 is passed to the parser (presumably to silence parser
+    # warnings -- see the Biopython PDBParser docs).
+    parser = PDBParser(QUIET=1)
+    structure = parser.get_structure('protein', pdb_path)
+    # Three-letter residue name -> one-letter code. Residues whose name is
+    # not a key of this table are skipped when building the sequence.
+    aa_dict = {
+        # Standard amino acids (20 canonical)
+        'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E', 'PHE': 'F',
+        'GLY': 'G', 'HIS': 'H', 'ILE': 'I', 'LYS': 'K', 'LEU': 'L',
+        'MET': 'M', 'ASN': 'N', 'PRO': 'P', 'GLN': 'Q', 'ARG': 'R',
+        'SER': 'S', 'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y',
+        # Modified residues, mapped to their parent amino acid
+        'MSE': 'M',  # Selenomethionine
+        'SEP': 'S',  # Phosphoserine
+        'TPO': 'T',  # Phosphothreonine
+        'CSO': 'C',  # S-hydroxycysteine
+        'PTR': 'Y',  # Phosphotyrosine
+        'HYP': 'P',  # Hydroxyproline
+    }
+    # Water/solvent and small-molecule residue names.
+    # NOTE(review): this set is never referenced in this function; the
+    # `res_name in aa_dict` test below already drops these residues.
+    ligand_exclusion_set = {'HOH', 'WAT', 'DOD', 'SO4', 'PO4', 'GOL', 'ACT', 'EDO'}
+    # Find the longest protein chain (every model in the structure is scanned)
+    longest_sequence = ""
+    longest_chain = None
+    for model in structure:
+        for chain in model:
+            # Skip any chain that is_nucleic_acid_chain flags
+            if is_nucleic_acid_chain(chain):
+                continue
+            # Extract and convert sequence
+            sequence = ""
+            for residue in chain:
+                # Keep only residues listed in aa_dict; anything else is
+                # dropped without a placeholder character
+                res_name = residue.get_resname().strip()
+                if res_name in aa_dict:
+                    sequence += aa_dict[res_name]
+            # Accept lengths strictly between 10 and 1500; the strict `>`
+            # means the first chain seen wins a tie on length
+            if (10 < len(sequence) < 1500 and
+                len(sequence) > len(longest_sequence)):
+                longest_sequence = sequence
+                longest_chain = chain
+    if not longest_sequence:
+        return None, None, pdb_path
+    # Write a copy of the structure next to the input file.
+    # NOTE(review): the object handed to PDBIO is the chain's grandparent
+    # (chain -> model -> structure per the loops above), so the entire
+    # structure is written, not only the selected chain.
+    if longest_chain:
+        io = PDBIO()
+        io.set_structure(longest_chain.get_parent().get_parent())
+        # NOTE(review): if pdb_path contains no '.pdb' substring, replace()
+        # returns it unchanged and save() overwrites the input file.
+        # process_pdb gets its path from PDBList.retrieve_pdb_file --
+        # verify the extension that call actually produces.
+        filtered_pdb_path = pdb_path.replace('.pdb', '_filtered.pdb')
+        io.save(filtered_pdb_path)
+        return longest_sequence, longest_chain, filtered_pdb_path
+    # NOTE(review): appears unreachable -- longest_chain is assigned
+    # together with longest_sequence, which is non-empty at this point.
+    return longest_sequence, longest_chain, pdb_path
 def create_dataset(tokenizer, seqs, labels, checkpoint):
     tokenized = tokenizer(seqs, max_length=max_length, padding=False, truncation=True)
     dataset = Dataset.from_dict(tokenized)
         print(f"Error fetching PDB: {e}")
         return None
 def process_pdb(pdb_id):
     # Fetch PDB file
+    # Use PDBList to download the file if it doesn't exist locally
+    pdbl = PDBList.PDBList()
+    pdb_path = pdbl.retrieve_pdb_file(pdb_id, pdir='pdb_files', file_format='pdb')
+    if not pdb_path or not os.path.exists(pdb_path):
+        return "Failed to fetch PDB file", None
     # Extract protein sequence and chain
+    protein_sequence, chain, filtered_pdb_path = extract_protein_sequence(pdb_path)
     if not protein_sequence:
+        return "No suitable protein sequence found", None
     # Predict binding sites
     sequence, normalized_scores = predict_protein_sequence(protein_sequence)
     # Prepare result string
     result_str = "\n".join([f"{aa}: {score:.2f}" for aa, score in zip(sequence, normalized_scores)])
+    return result_str, filtered_pdb_path
 # Create Gradio interface
 with gr.Blocks() as demo:
         outputs=[predictions_output, molecule_output]
     )
+demo.launch()