cryptobank_prediction

Sleeping

App Files Community

ThorbenFroehlking committed on Apr 10, 2025

Commit

17ad0e5

1 Parent(s): 5dbe94b

Updated

Browse files

Files changed (3) hide show

.ipynb_checkpoints/app-checkpoint.py +158 -144
app-Copy1.py +89 -22
app.py +158 -144

.ipynb_checkpoints/app-checkpoint.py CHANGED Viewed

@@ -27,10 +27,7 @@ from datasets import Dataset
 from scipy.special import expit
 # Load model and move to device
-#checkpoint = 'ThorbenF/prot_t5_xl_uniref50'
-#checkpoint = 'ThorbenF/prot_t5_xl_uniref50_cryptic'
 checkpoint = 'ThorbenF/prot_t5_xl_uniref50_database'
 max_length = 1500
 model, tokenizer = load_model(checkpoint, max_length)
@@ -45,45 +42,32 @@ def normalize_scores(scores):
 def read_mol(pdb_path):
     """Read PDB file and return its content as a string"""
-    try:
-        with open(pdb_path, 'r') as f:
-            return f.read()
-    except FileNotFoundError:
-        print(f"File not found: {pdb_path}")
-        raise
-    except Exception as e:
-        print(f"Error reading file {pdb_path}: {str(e)}")
-        raise
-def fetch_structure(pdb_id: str, output_dir: str = ".") -> Optional[str]:
     """
     Fetch the structure file for a given PDB ID. Prioritizes CIF files.
     If a structure file already exists locally, it uses that.
     """
     file_path = download_structure(pdb_id, output_dir)
-    if file_path:
-        return file_path
-    else:
-        return None
-def download_structure(pdb_id: str, output_dir: str) -> Optional[str]:
     """
     Attempt to download the structure file in CIF or PDB format.
-    Returns the path to the downloaded file, or None if download fails.
     """
     for ext in ['.cif', '.pdb']:
         file_path = os.path.join(output_dir, f"{pdb_id}{ext}")
         if os.path.exists(file_path):
             return file_path
         url = f"https://files.rcsb.org/download/{pdb_id}{ext}"
-        try:
-            response = requests.get(url, timeout=10)
-            if response.status_code == 200:
-                with open(file_path, 'wb') as f:
-                    f.write(response.content)
-                return file_path
-        except Exception as e:
-            print(f"Download error for {pdb_id}{ext}: {e}")
     return None
 def convert_cif_to_pdb(cif_path: str, output_dir: str = ".") -> str:
@@ -100,8 +84,6 @@ def convert_cif_to_pdb(cif_path: str, output_dir: str = ".") -> str:
 def fetch_pdb(pdb_id):
     pdb_path = fetch_structure(pdb_id)
-    if not pdb_path:
-        return None
     _, ext = os.path.splitext(pdb_path)
     if ext == '.cif':
         pdb_path = convert_cif_to_pdb(pdb_path)
@@ -111,11 +93,9 @@ def create_chain_specific_pdb(input_pdb: str, chain_id: str, residue_scores: lis
     """
     Create a PDB file with only the selected chain and residues, replacing B-factor with prediction scores
     """
-    # Read the original PDB file
     parser = PDBParser(QUIET=True)
     structure = parser.get_structure('protein', input_pdb)
-    # Prepare a new structure with only the specified chain and selected residues
     output_pdb = f"{os.path.splitext(input_pdb)[0]}_{chain_id}_predictions_scores.pdb"
     # Create scores dictionary for easy lookup
@@ -148,8 +128,57 @@ def create_chain_specific_pdb(input_pdb: str, chain_id: str, residue_scores: lis
     return output_pdb
-def process_pdb(pdb_id_or_file, segment, score_type='normalized'):
     # Determine if input is a PDB ID or file path
     if pdb_id_or_file.endswith('.pdb'):
         pdb_path = pdb_id_or_file
@@ -158,51 +187,37 @@ def process_pdb(pdb_id_or_file, segment, score_type='normalized'):
         pdb_id = pdb_id_or_file
         pdb_path = fetch_pdb(pdb_id)
-    if not pdb_path:
-        return "Failed to fetch PDB file", None, None
     # Determine the file format and choose the appropriate parser
     _, ext = os.path.splitext(pdb_path)
     parser = MMCIFParser(QUIET=True) if ext == '.cif' else PDBParser(QUIET=True)
-    try:
-        # Parse the structure file
-        structure = parser.get_structure('protein', pdb_path)
-    except Exception as e:
-        return f"Error parsing structure file: {e}", None, None
     # Extract the specified chain
-    try:
-        chain = structure[0][segment]
-    except KeyError:
-        return "Invalid Chain ID", None, None
     protein_residues = [res for res in chain if is_aa(res)]
     sequence = "".join(seq1(res.resname) for res in protein_residues)
     sequence_id = [res.id[1] for res in protein_residues]
-    visualized_sequence = "".join(seq1(res.resname) for res in protein_residues)
-    if sequence != visualized_sequence:
-        raise ValueError("The visualized sequence does not match the prediction sequence")
     input_ids = tokenizer(" ".join(sequence), return_tensors="pt").input_ids.to(device)
     with torch.no_grad():
         outputs = model(input_ids).logits.detach().cpu().numpy().squeeze()
     # Calculate scores and normalize them
-    scores = expit(outputs[:, 1] - outputs[:, 0])
-    normalized_scores = normalize_scores(scores)
     # Choose which scores to use based on score_type
-    display_scores = normalized_scores if score_type == 'normalized' else scores
     # Zip residues with scores to track the residue ID and score
     residue_scores = [(resi, score) for resi, score in zip(sequence_id, display_scores)]
     # Also save both score types for later use
-    raw_residue_scores = [(resi, score) for resi, score in zip(sequence_id, scores)]
     norm_residue_scores = [(resi, score) for resi, score in zip(sequence_id, normalized_scores)]
     # Define the score brackets
     score_brackets = {
@@ -223,79 +238,35 @@ def process_pdb(pdb_id_or_file, segment, score_type='normalized'):
                 residues_by_bracket[bracket].append(resi)
                 break
-    # Preparing the result string
     current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    result_str = f"Prediction for PDB: {pdb_id}, Chain: {segment}\nDate: {current_time}\n\n"
-    result_str += "Residues by Score Brackets:\n\n"
-    # Add residues for each bracket
-    for bracket, residues in residues_by_bracket.items():
-        result_str += f"Bracket {bracket}:\n"
-        result_str += "Columns: Residue Name, Residue Number, One-letter Code, Normalized Score\n"
-        result_str += "\n".join([
-            f"{res.resname} {res.id[1]} {sequence[i]} {normalized_scores[i]:.2f}"
-            for i, res in enumerate(protein_residues) if res.id[1] in residues
-        ])
-        result_str += "\n\n"
     # Create chain-specific PDB with scores in B-factor
     scored_pdb = create_chain_specific_pdb(pdb_path, segment, residue_scores, protein_residues)
     # Molecule visualization with updated script with color mapping
-    mol_vis = molecule(pdb_path, residue_scores, segment)#, color_map)
-    # Improved PyMOL command suggestions
-    current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    pymol_commands = f"Prediction for PDB: {pdb_id}, Chain: {segment}\nDate: {current_time}\n\n"
-    pymol_commands += f"""
-    # PyMOL Visualization Commands
-    fetch {pdb_id}, protein
-    hide everything, all
-    show cartoon, chain {segment}
-    color white, chain {segment}
-    """
-    # Define colors for each score bracket
-    bracket_colors = {
-        "0.0-0.2": "white",
-        "0.2-0.4": "lightorange",
-        "0.4-0.6": "orange",
-        "0.6-0.8": "red",
-        "0.8-1.0": "firebrick"
-    }
-    # Add PyMOL commands for each score bracket
-    for bracket, residues in residues_by_bracket.items():
-        if residues:  # Only add commands if there are residues in this bracket
-            color = bracket_colors[bracket]
-            resi_list = '+'.join(map(str, residues))
-            pymol_commands += f"""
-    select bracket_{bracket.replace('.', '').replace('-', '_')}, resi {resi_list} and chain {segment}
-    show sticks, bracket_{bracket.replace('.', '').replace('-', '_')}
-    color {color}, bracket_{bracket.replace('.', '').replace('-', '_')}
-    """
-    # Create prediction and scored PDB files
-    prediction_file = f"{pdb_id}_binding_site_residues.txt"
     with open(prediction_file, "w") as f:
         f.write(result_str)
-    return pymol_commands, mol_vis, [prediction_file, scored_pdb],raw_residue_scores,norm_residue_scores
 def molecule(input_pdb, residue_scores=None, segment='A'):
-    # Check if the file exists
-    if not os.path.isfile(input_pdb):
-        return f"<p>Error: PDB file not found at {input_pdb}</p>"
-    try:
-        # Read PDB file content
-        mol = read_mol(input_pdb)
-    except Exception as e:
-        return f"<p>Error reading PDB file: {str(e)}</p>"
-    # More granular scoring for visualization
-    #mol = read_mol(input_pdb)  # Read PDB file content
     # Prepare high-scoring residues script if scores are provided
     high_score_script = ""
     if residue_scores is not None:
@@ -491,9 +462,9 @@ with gr.Blocks(css="""
     Score dependent colorcoding:
     - 0.0-0.2: white
     - 0.2–0.4: light orange
-    - 0.4–0.6: orange
-    - 0.6–0.8: red
-    - 0.8–1.0: firebrick
     """)
     predictions_output = gr.Textbox(label="Visualize Prediction with PyMol")
     gr.Markdown("### Download:\n- List of predicted binding site residues\n- PDB with score in beta factor column")
@@ -504,6 +475,7 @@ with gr.Blocks(css="""
     norm_scores_state = gr.State(None)
     last_pdb_path = gr.State(None)
     last_segment = gr.State(None)
     def process_interface(mode, pdb_id, pdb_file, chain_id, score_type_val):
         selected_score_type = 'normalized' if score_type_val == "Normalized Scores" else 'raw'
@@ -511,12 +483,10 @@ with gr.Blocks(css="""
         # First get the actual PDB file path
         if mode == "PDB ID":
             pdb_path = fetch_pdb(pdb_id)  # Get the actual file path
-            if not pdb_path:
-                return "Failed to fetch PDB file", None, None, None, None, None, None
-            pymol_cmd, mol_vis, files, raw_scores, norm_scores = process_pdb(pdb_path, chain_id, selected_score_type)
             # Store the actual file path, not just the PDB ID
-            return pymol_cmd, mol_vis, files, raw_scores, norm_scores, pdb_path, chain_id
         elif mode == "Upload File":
             _, ext = os.path.splitext(pdb_file.name)
             file_path = os.path.join('./', f"{_}{ext}")
@@ -525,24 +495,70 @@ with gr.Blocks(css="""
             else:
                 pdb_path = file_path
-            pymol_cmd, mol_vis, files, raw_scores, norm_scores = process_pdb(pdb_path, chain_id, selected_score_type)
-            return pymol_cmd, mol_vis, files, raw_scores, norm_scores, pdb_path, chain_id
-        else:
-            return "Error: Invalid mode selected", None, None, None, None, None, None
-    def update_visualization(score_type_val, raw_scores, norm_scores, pdb_path, segment):
-        if raw_scores is None or norm_scores is None or pdb_path is None or segment is None:
-            return None
-        # Verify the file exists
-        if not os.path.exists(pdb_path):
-            return f"Error: File not found at {pdb_path}"
         # Choose scores based on radio button selection
-        selected_scores = norm_scores if score_type_val == "Normalized Scores" else raw_scores
         # Generate visualization with selected scores
-        return molecule(pdb_path, selected_scores, segment)
     def fetch_interface(mode, pdb_id, pdb_file):
         if mode == "PDB ID":
@@ -555,8 +571,6 @@ with gr.Blocks(css="""
             else:
                 pdb_path= file_path
             return pdb_path
-        else:
-            return "Error: Invalid mode selected"
     def toggle_mode(selected_mode):
         if selected_mode == "PDB ID":
@@ -574,14 +588,14 @@ with gr.Blocks(css="""
         process_interface,
         inputs=[mode, pdb_input, pdb_file, segment_input, score_type],
         outputs=[predictions_output, molecule_output, download_output,
-                raw_scores_state, norm_scores_state, last_pdb_path, last_segment]
     )
-    # Update visualization when score type changes
     score_type.change(
-        update_visualization,
-        inputs=[score_type, raw_scores_state, norm_scores_state, last_pdb_path, last_segment],
-        outputs=[molecule_output]
     )
     visualize_btn.click(
@@ -600,4 +614,4 @@ with gr.Blocks(css="""
         inputs=[pdb_input, segment_input],
         outputs=[predictions_output, molecule_output, download_output]
     )
-demo.launch(share=True)

 from scipy.special import expit
 # Load model and move to device
 checkpoint = 'ThorbenF/prot_t5_xl_uniref50_database'
 max_length = 1500
 model, tokenizer = load_model(checkpoint, max_length)
 def read_mol(pdb_path):
     """Read PDB file and return its content as a string"""
+    with open(pdb_path, 'r') as f:
+        return f.read()
+def fetch_structure(pdb_id: str, output_dir: str = ".") -> str:
     """
     Fetch the structure file for a given PDB ID. Prioritizes CIF files.
     If a structure file already exists locally, it uses that.
     """
     file_path = download_structure(pdb_id, output_dir)
+    return file_path
+def download_structure(pdb_id: str, output_dir: str) -> str:
     """
     Attempt to download the structure file in CIF or PDB format.
+    Returns the path to the downloaded file.
     """
     for ext in ['.cif', '.pdb']:
         file_path = os.path.join(output_dir, f"{pdb_id}{ext}")
         if os.path.exists(file_path):
             return file_path
         url = f"https://files.rcsb.org/download/{pdb_id}{ext}"
+        response = requests.get(url, timeout=10)
+        if response.status_code == 200:
+            with open(file_path, 'wb') as f:
+                f.write(response.content)
+            return file_path
     return None
 def convert_cif_to_pdb(cif_path: str, output_dir: str = ".") -> str:
 def fetch_pdb(pdb_id):
     pdb_path = fetch_structure(pdb_id)
     _, ext = os.path.splitext(pdb_path)
     if ext == '.cif':
         pdb_path = convert_cif_to_pdb(pdb_path)
     """
     Create a PDB file with only the selected chain and residues, replacing B-factor with prediction scores
     """
     parser = PDBParser(QUIET=True)
     structure = parser.get_structure('protein', input_pdb)
     output_pdb = f"{os.path.splitext(input_pdb)[0]}_{chain_id}_predictions_scores.pdb"
     # Create scores dictionary for easy lookup
     return output_pdb
+def generate_pymol_commands(pdb_id, segment, residues_by_bracket, current_time, score_type):
+    """Generate PyMOL commands based on score type"""
+    pymol_commands = f"Prediction for PDB: {pdb_id}, Chain: {segment}\nDate: {current_time}\nScore Type: {score_type}\n\n"
+    pymol_commands += f"""
+    # PyMOL Visualization Commands
+    fetch {pdb_id}, protein
+    hide everything, all
+    show cartoon, chain {segment}
+    color white, chain {segment}
+    """
+    # Define colors for each score bracket
+    bracket_colors = {
+        "0.0-0.2": "white",
+        "0.2-0.4": "lightorange",
+        "0.4-0.6": "yelloworange",
+        "0.6-0.8": "orange",
+        "0.8-1.0": "red"
+    }
+    # Add PyMOL commands for each score bracket
+    for bracket, residues in residues_by_bracket.items():
+        if residues:  # Only add commands if there are residues in this bracket
+            color = bracket_colors[bracket]
+            resi_list = '+'.join(map(str, residues))
+            pymol_commands += f"""
+    select bracket_{bracket.replace('.', '').replace('-', '_')}, resi {resi_list} and chain {segment}
+    show sticks, bracket_{bracket.replace('.', '').replace('-', '_')}
+    color {color}, bracket_{bracket.replace('.', '').replace('-', '_')}
+    """
+    return pymol_commands
+def generate_results_text(pdb_id, segment, residues_by_bracket, protein_residues, sequence, scores, current_time, score_type):
+    """Generate results text based on score type"""
+    result_str = f"Prediction for PDB: {pdb_id}, Chain: {segment}\nDate: {current_time}\nScore Type: {score_type}\n\n"
+    result_str += "Residues by Score Brackets:\n\n"
+    # Add residues for each bracket
+    for bracket, residues in residues_by_bracket.items():
+        result_str += f"Bracket {bracket}:\n"
+        result_str += f"Columns: Residue Name, Residue Number, One-letter Code, {score_type} Score\n"
+        result_str += "\n".join([
+            f"{res.resname} {res.id[1]} {sequence[i]} {scores[i]:.2f}"
+            for i, res in enumerate(protein_residues) if res.id[1] in residues
+        ])
+        result_str += "\n\n"
+    return result_str
+def process_pdb(pdb_id_or_file, segment, score_type='normalized'):
     # Determine if input is a PDB ID or file path
     if pdb_id_or_file.endswith('.pdb'):
         pdb_path = pdb_id_or_file
         pdb_id = pdb_id_or_file
         pdb_path = fetch_pdb(pdb_id)
     # Determine the file format and choose the appropriate parser
     _, ext = os.path.splitext(pdb_path)
     parser = MMCIFParser(QUIET=True) if ext == '.cif' else PDBParser(QUIET=True)
+    # Parse the structure file
+    structure = parser.get_structure('protein', pdb_path)
     # Extract the specified chain
+    chain = structure[0][segment]
     protein_residues = [res for res in chain if is_aa(res)]
     sequence = "".join(seq1(res.resname) for res in protein_residues)
     sequence_id = [res.id[1] for res in protein_residues]
     input_ids = tokenizer(" ".join(sequence), return_tensors="pt").input_ids.to(device)
     with torch.no_grad():
         outputs = model(input_ids).logits.detach().cpu().numpy().squeeze()
     # Calculate scores and normalize them
+    raw_scores = expit(outputs[:, 1] - outputs[:, 0])
+    normalized_scores = normalize_scores(raw_scores)
     # Choose which scores to use based on score_type
+    display_scores = normalized_scores if score_type == 'normalized' else raw_scores
     # Zip residues with scores to track the residue ID and score
     residue_scores = [(resi, score) for resi, score in zip(sequence_id, display_scores)]
     # Also save both score types for later use
+    raw_residue_scores = [(resi, score) for resi, score in zip(sequence_id, raw_scores)]
     norm_residue_scores = [(resi, score) for resi, score in zip(sequence_id, normalized_scores)]
     # Define the score brackets
     score_brackets = {
                 residues_by_bracket[bracket].append(resi)
                 break
+    # Generate timestamp
     current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    # Generate result text and PyMOL commands based on score type
+    display_score_type = "Normalized" if score_type == 'normalized' else "Raw"
+    result_str = generate_results_text(pdb_id, segment, residues_by_bracket, protein_residues, sequence,
+                                      display_scores, current_time, display_score_type)
+    pymol_commands = generate_pymol_commands(pdb_id, segment, residues_by_bracket, current_time, display_score_type)
     # Create chain-specific PDB with scores in B-factor
     scored_pdb = create_chain_specific_pdb(pdb_path, segment, residue_scores, protein_residues)
     # Molecule visualization with updated script with color mapping
+    mol_vis = molecule(pdb_path, residue_scores, segment)
+    # Create prediction file
+    prediction_file = f"{pdb_id}_{display_score_type.lower()}_binding_site_residues.txt"
     with open(prediction_file, "w") as f:
         f.write(result_str)
+    scored_pdb_name = f"{pdb_id}_{segment}_{display_score_type.lower()}_predictions_scores.pdb"
+    os.rename(scored_pdb, scored_pdb_name)
+    return pymol_commands, mol_vis, [prediction_file, scored_pdb_name], raw_residue_scores, norm_residue_scores, pdb_id, segment
 def molecule(input_pdb, residue_scores=None, segment='A'):
+    # Read PDB file content
+    mol = read_mol(input_pdb)
     # Prepare high-scoring residues script if scores are provided
     high_score_script = ""
     if residue_scores is not None:
     Score dependent colorcoding:
     - 0.0-0.2: white
     - 0.2–0.4: light orange
+    - 0.4–0.6: yellow orange
+    - 0.6–0.8: orange
+    - 0.8–1.0: red
     """)
     predictions_output = gr.Textbox(label="Visualize Prediction with PyMol")
     gr.Markdown("### Download:\n- List of predicted binding site residues\n- PDB with score in beta factor column")
     norm_scores_state = gr.State(None)
     last_pdb_path = gr.State(None)
     last_segment = gr.State(None)
+    last_pdb_id = gr.State(None)
     def process_interface(mode, pdb_id, pdb_file, chain_id, score_type_val):
         selected_score_type = 'normalized' if score_type_val == "Normalized Scores" else 'raw'
         # First get the actual PDB file path
         if mode == "PDB ID":
             pdb_path = fetch_pdb(pdb_id)  # Get the actual file path
+            pymol_cmd, mol_vis, files, raw_scores, norm_scores, pdb_id_result, segment = process_pdb(pdb_path, chain_id, selected_score_type)
             # Store the actual file path, not just the PDB ID
+            return pymol_cmd, mol_vis, files, raw_scores, norm_scores, pdb_path, chain_id, pdb_id_result
         elif mode == "Upload File":
             _, ext = os.path.splitext(pdb_file.name)
             file_path = os.path.join('./', f"{_}{ext}")
             else:
                 pdb_path = file_path
+            pymol_cmd, mol_vis, files, raw_scores, norm_scores, pdb_id_result, segment = process_pdb(pdb_path, chain_id, selected_score_type)
+            return pymol_cmd, mol_vis, files, raw_scores, norm_scores, pdb_path, chain_id, pdb_id_result
+    def update_visualization_and_files(score_type_val, raw_scores, norm_scores, pdb_path, segment, pdb_id):
+        if raw_scores is None or norm_scores is None or pdb_path is None or segment is None or pdb_id is None:
+            return None, None, None
         # Choose scores based on radio button selection
+        selected_score_type = 'normalized' if score_type_val == "Normalized Scores" else 'raw'
+        selected_scores = norm_scores if selected_score_type == 'normalized' else raw_scores
         # Generate visualization with selected scores
+        mol_vis = molecule(pdb_path, selected_scores, segment)
+        # Generate PyMOL commands and downloadable files
+        # Get structure for residue info
+        _, ext = os.path.splitext(pdb_path)
+        parser = MMCIFParser(QUIET=True) if ext == '.cif' else PDBParser(QUIET=True)
+        structure = parser.get_structure('protein', pdb_path)
+        chain = structure[0][segment]
+        protein_residues = [res for res in chain if is_aa(res)]
+        sequence = "".join(seq1(res.resname) for res in protein_residues)
+        # Define score brackets
+        score_brackets = {
+            "0.0-0.2": (0.0, 0.2),
+            "0.2-0.4": (0.2, 0.4),
+            "0.4-0.6": (0.4, 0.6),
+            "0.6-0.8": (0.6, 0.8),
+            "0.8-1.0": (0.8, 1.0)
+        }
+        # Initialize a dictionary to store residues by bracket
+        residues_by_bracket = {bracket: [] for bracket in score_brackets}
+        # Categorize residues into brackets
+        for resi, score in selected_scores:
+            for bracket, (lower, upper) in score_brackets.items():
+                if lower <= score < upper:
+                    residues_by_bracket[bracket].append(resi)
+                    break
+        # Generate timestamp
+        current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        # Generate result text and PyMOL commands based on score type
+        display_score_type = "Normalized" if selected_score_type == 'normalized' else "Raw"
+        scores_array = [score for _, score in selected_scores]
+        result_str = generate_results_text(pdb_id, segment, residues_by_bracket, protein_residues, sequence,
+                                           scores_array, current_time, display_score_type)
+        pymol_commands = generate_pymol_commands(pdb_id, segment, residues_by_bracket, current_time, display_score_type)
+        # Create chain-specific PDB with scores in B-factor
+        scored_pdb = create_chain_specific_pdb(pdb_path, segment, selected_scores, protein_residues)
+        # Create prediction file
+        prediction_file = f"{pdb_id}_{display_score_type.lower()}_binding_site_residues.txt"
+        with open(prediction_file, "w") as f:
+            f.write(result_str)
+        scored_pdb_name = f"{pdb_id}_{segment}_{display_score_type.lower()}_predictions_scores.pdb"
+        os.rename(scored_pdb, scored_pdb_name)
+        return mol_vis, pymol_commands, [prediction_file, scored_pdb_name]
     def fetch_interface(mode, pdb_id, pdb_file):
         if mode == "PDB ID":
             else:
                 pdb_path= file_path
             return pdb_path
     def toggle_mode(selected_mode):
         if selected_mode == "PDB ID":
         process_interface,
         inputs=[mode, pdb_input, pdb_file, segment_input, score_type],
         outputs=[predictions_output, molecule_output, download_output,
+                raw_scores_state, norm_scores_state, last_pdb_path, last_segment, last_pdb_id]
     )
+    # Update visualization, PyMOL commands, and files when score type changes
     score_type.change(
+        update_visualization_and_files,
+        inputs=[score_type, raw_scores_state, norm_scores_state, last_pdb_path, last_segment, last_pdb_id],
+        outputs=[molecule_output, predictions_output, download_output]
     )
     visualize_btn.click(
         inputs=[pdb_input, segment_input],
         outputs=[predictions_output, molecule_output, download_output]
     )
+demo.launch(share=True)

app-Copy1.py CHANGED Viewed

@@ -45,8 +45,15 @@ def normalize_scores(scores):
 def read_mol(pdb_path):
     """Read PDB file and return its content as a string"""
-    with open(pdb_path, 'r') as f:
-        return f.read()
 def fetch_structure(pdb_id: str, output_dir: str = ".") -> Optional[str]:
     """
@@ -141,7 +148,8 @@ def create_chain_specific_pdb(input_pdb: str, chain_id: str, residue_scores: lis
     return output_pdb
-def process_pdb(pdb_id_or_file, segment):
     # Determine if input is a PDB ID or file path
     if pdb_id_or_file.endswith('.pdb'):
         pdb_path = pdb_id_or_file
@@ -180,14 +188,20 @@ def process_pdb(pdb_id_or_file, segment):
     input_ids = tokenizer(" ".join(sequence), return_tensors="pt").input_ids.to(device)
     with torch.no_grad():
         outputs = model(input_ids).logits.detach().cpu().numpy().squeeze()
     # Calculate scores and normalize them
     scores = expit(outputs[:, 1] - outputs[:, 0])
     normalized_scores = normalize_scores(scores)
     # Zip residues with scores to track the residue ID and score
-    residue_scores = [(resi, score) for resi, score in zip(sequence_id, normalized_scores)]
     # Define the score brackets
@@ -236,7 +250,7 @@ def process_pdb(pdb_id_or_file, segment):
     pymol_commands += f"""
     # PyMOL Visualization Commands
-    load {os.path.abspath(pdb_path)}, protein
     hide everything, all
     show cartoon, chain {segment}
     color white, chain {segment}
@@ -266,11 +280,21 @@ def process_pdb(pdb_id_or_file, segment):
     with open(prediction_file, "w") as f:
         f.write(result_str)
-    return pymol_commands, mol_vis, [prediction_file,scored_pdb]
 def molecule(input_pdb, residue_scores=None, segment='A'):
     # More granular scoring for visualization
-    mol = read_mol(input_pdb)  # Read PDB file content
     # Prepare high-scoring residues script if scores are provided
     high_score_script = ""
@@ -410,7 +434,6 @@ def molecule(input_pdb, residue_scores=None, segment='A'):
     # Return the HTML content within an iframe safely encoded for special characters
     return f'<iframe width="100%" height="700" srcdoc="{html_content.replace(chr(34), "&quot;").replace(chr(39), "&#39;")}"></iframe>'
-# Gradio UI
 with gr.Blocks(css="""
     /* Customize Gradio button colors */
     #visualize-btn, #predict-btn {
@@ -455,32 +478,71 @@ with gr.Blocks(css="""
         info="Choose in which chain to predict binding sites.")
         prediction_btn = gr.Button("Predict Binding Site", elem_id="predict-btn")
     molecule_output = gr.HTML(label="Protein Structure")
     explanation_vis = gr.Markdown("""
     Score dependent colorcoding:
     - 0.0-0.2: white
     - 0.2–0.4: light orange
     - 0.4–0.6: orange
-    - 0.6–0.8: orangered
-    - 0.8–1.0: red
     """)
     predictions_output = gr.Textbox(label="Visualize Prediction with PyMol")
     gr.Markdown("### Download:\n- List of predicted binding site residues\n- PDB with score in beta factor column")
     download_output = gr.File(label="Download Files", file_count="multiple")
-    def process_interface(mode, pdb_id, pdb_file, chain_id):
         if mode == "PDB ID":
-            return process_pdb(pdb_id, chain_id)
         elif mode == "Upload File":
             _, ext = os.path.splitext(pdb_file.name)
             file_path = os.path.join('./', f"{_}{ext}")
             if ext == '.cif':
                 pdb_path = convert_cif_to_pdb(file_path)
             else:
-                pdb_path= file_path
-            return process_pdb(pdb_path, chain_id)
         else:
-            return "Error: Invalid mode selected", None, None
     def fetch_interface(mode, pdb_id, pdb_file):
         if mode == "PDB ID":
@@ -488,12 +550,10 @@ with gr.Blocks(css="""
         elif mode == "Upload File":
             _, ext = os.path.splitext(pdb_file.name)
             file_path = os.path.join('./', f"{_}{ext}")
-            #print(ext)
             if ext == '.cif':
                 pdb_path = convert_cif_to_pdb(file_path)
             else:
                 pdb_path= file_path
-            #print(pdb_path)
             return pdb_path
         else:
             return "Error: Invalid mode selected"
@@ -512,8 +572,16 @@ with gr.Blocks(css="""
     prediction_btn.click(
         process_interface,
-        inputs=[mode, pdb_input, pdb_file, segment_input],
-        outputs=[predictions_output, molecule_output, download_output]
     )
     visualize_btn.click(
@@ -532,5 +600,4 @@ with gr.Blocks(css="""
         inputs=[pdb_input, segment_input],
         outputs=[predictions_output, molecule_output, download_output]
     )
 demo.launch(share=True)

 def read_mol(pdb_path):
+    """Return the text content of the structure file at ``pdb_path``.
+
+    Any failure is reported on stdout and then re-raised unchanged, so the
+    caller still receives the original exception.
+    """
+    try:
+        with open(pdb_path, 'r') as handle:
+            contents = handle.read()
+    except FileNotFoundError:
+        print(f"File not found: {pdb_path}")
+        raise
+    except Exception as err:
+        print(f"Error reading file {pdb_path}: {str(err)}")
+        raise
+    return contents
 def fetch_structure(pdb_id: str, output_dir: str = ".") -> Optional[str]:
     """
     return output_pdb
+def process_pdb(pdb_id_or_file, segment, score_type='normalized'):
     # Determine if input is a PDB ID or file path
     if pdb_id_or_file.endswith('.pdb'):
         pdb_path = pdb_id_or_file
     input_ids = tokenizer(" ".join(sequence), return_tensors="pt").input_ids.to(device)
     with torch.no_grad():
         outputs = model(input_ids).logits.detach().cpu().numpy().squeeze()
     # Calculate scores and normalize them
     scores = expit(outputs[:, 1] - outputs[:, 0])
     normalized_scores = normalize_scores(scores)
+    # Choose which scores to use based on score_type
+    display_scores = normalized_scores if score_type == 'normalized' else scores
     # Zip residues with scores to track the residue ID and score
+    residue_scores = [(resi, score) for resi, score in zip(sequence_id, display_scores)]
+    # Also save both score types for later use
+    raw_residue_scores = [(resi, score) for resi, score in zip(sequence_id, scores)]
+    norm_residue_scores = [(resi, score) for resi, score in zip(sequence_id, normalized_scores)]
     # Define the score brackets
     pymol_commands += f"""
     # PyMOL Visualization Commands
+    fetch {pdb_id}, protein
     hide everything, all
     show cartoon, chain {segment}
     color white, chain {segment}
     with open(prediction_file, "w") as f:
         f.write(result_str)
+    return pymol_commands, mol_vis, [prediction_file, scored_pdb],raw_residue_scores,norm_residue_scores
 def molecule(input_pdb, residue_scores=None, segment='A'):
+    # Check if the file exists
+    if not os.path.isfile(input_pdb):
+        return f"<p>Error: PDB file not found at {input_pdb}</p>"
+    try:
+        # Read PDB file content
+        mol = read_mol(input_pdb)
+    except Exception as e:
+        return f"<p>Error reading PDB file: {str(e)}</p>"
     # More granular scoring for visualization
+    #mol = read_mol(input_pdb)  # Read PDB file content
     # Prepare high-scoring residues script if scores are provided
     high_score_script = ""
     # Return the HTML content within an iframe safely encoded for special characters
     return f'<iframe width="100%" height="700" srcdoc="{html_content.replace(chr(34), "&quot;").replace(chr(39), "&#39;")}"></iframe>'
 with gr.Blocks(css="""
     /* Customize Gradio button colors */
     #visualize-btn, #predict-btn {
         info="Choose in which chain to predict binding sites.")
         prediction_btn = gr.Button("Predict Binding Site", elem_id="predict-btn")
+    # Add score type selector
+    score_type = gr.Radio(
+        choices=["Normalized Scores", "Raw Scores"],
+        value="Normalized Scores",
+        label="Score Visualization Type",
+        info="Choose which score type to visualize"
+    )
     molecule_output = gr.HTML(label="Protein Structure")
     explanation_vis = gr.Markdown("""
     Score dependent colorcoding:
     - 0.0-0.2: white
     - 0.2–0.4: light orange
     - 0.4–0.6: orange
+    - 0.6–0.8: red
+    - 0.8–1.0: firebrick
     """)
     predictions_output = gr.Textbox(label="Visualize Prediction with PyMol")
     gr.Markdown("### Download:\n- List of predicted binding site residues\n- PDB with score in beta factor column")
     download_output = gr.File(label="Download Files", file_count="multiple")
+    # Store these as state variables so we can switch between them
+    raw_scores_state = gr.State(None)
+    norm_scores_state = gr.State(None)
+    last_pdb_path = gr.State(None)
+    last_segment = gr.State(None)
+    def process_interface(mode, pdb_id, pdb_file, chain_id, score_type_val):
+        """Run a prediction for the selected input mode and collect the UI outputs.
+
+        Returns a 7-tuple: the five values produced by process_pdb (PyMOL text,
+        structure view, download file list, raw residue scores, normalized
+        residue scores) followed by the structure file path that was used and
+        the chain ID. On failure the first element is an error message and the
+        remaining six are None.
+        """
+        failure_tail = (None,) * 6
+        # First get the actual PDB file path
+        if mode == "PDB ID":
+            pdb_path = fetch_pdb(pdb_id)
+            if not pdb_path:
+                return ("Failed to fetch PDB file",) + failure_tail
+        elif mode == "Upload File":
+            # Without this guard a missing upload fails on `pdb_file.name`
+            # with an AttributeError instead of a readable message.
+            if pdb_file is None:
+                return ("Error: No file uploaded",) + failure_tail
+            stem, ext = os.path.splitext(pdb_file.name)
+            file_path = os.path.join('./', f"{stem}{ext}")
+            # CIF uploads are converted to PDB before prediction.
+            pdb_path = convert_cif_to_pdb(file_path) if ext == '.cif' else file_path
+        else:
+            return ("Error: Invalid mode selected",) + failure_tail
+        selected_score_type = 'normalized' if score_type_val == "Normalized Scores" else 'raw'
+        pymol_cmd, mol_vis, files, raw_scores, norm_scores = process_pdb(pdb_path, chain_id, selected_score_type)
+        # Return the resolved file path, not just the PDB ID
+        return pymol_cmd, mol_vis, files, raw_scores, norm_scores, pdb_path, chain_id
+    def update_visualization(score_type_val, raw_scores, norm_scores, pdb_path, segment):
+        """Redraw the structure view using the currently selected score type.
+
+        Returns None while any stored value is still missing, an error string
+        when the stored structure file does not exist, and otherwise whatever
+        ``molecule`` returns for the chosen scores.
+        """
+        stored = (raw_scores, norm_scores, pdb_path, segment)
+        if any(value is None for value in stored):
+            return None
+        if os.path.exists(pdb_path):
+            # Any label other than "Normalized Scores" selects the raw scores.
+            if score_type_val == "Normalized Scores":
+                selected_scores = norm_scores
+            else:
+                selected_scores = raw_scores
+            return molecule(pdb_path, selected_scores, segment)
+        return f"Error: File not found at {pdb_path}"
     def fetch_interface(mode, pdb_id, pdb_file):
         if mode == "PDB ID":
         elif mode == "Upload File":
             _, ext = os.path.splitext(pdb_file.name)
             file_path = os.path.join('./', f"{_}{ext}")
             if ext == '.cif':
                 pdb_path = convert_cif_to_pdb(file_path)
             else:
                 pdb_path= file_path
             return pdb_path
         else:
             return "Error: Invalid mode selected"
     prediction_btn.click(
         process_interface,
+        inputs=[mode, pdb_input, pdb_file, segment_input, score_type],
+        outputs=[predictions_output, molecule_output, download_output,
+                raw_scores_state, norm_scores_state, last_pdb_path, last_segment]
+    )
+    # Update visualization when score type changes
+    score_type.change(
+        update_visualization,
+        inputs=[score_type, raw_scores_state, norm_scores_state, last_pdb_path, last_segment],
+        outputs=[molecule_output]
     )
     visualize_btn.click(
         inputs=[pdb_input, segment_input],
         outputs=[predictions_output, molecule_output, download_output]
     )
 demo.launch(share=True)

app.py CHANGED Viewed

@@ -27,10 +27,7 @@ from datasets import Dataset
 from scipy.special import expit
 # Load model and move to device
-#checkpoint = 'ThorbenF/prot_t5_xl_uniref50'
-#checkpoint = 'ThorbenF/prot_t5_xl_uniref50_cryptic'
 checkpoint = 'ThorbenF/prot_t5_xl_uniref50_database'
 max_length = 1500
 model, tokenizer = load_model(checkpoint, max_length)
@@ -45,45 +42,32 @@ def normalize_scores(scores):
 def read_mol(pdb_path):
     """Read PDB file and return its content as a string"""
-    try:
-        with open(pdb_path, 'r') as f:
-            return f.read()
-    except FileNotFoundError:
-        print(f"File not found: {pdb_path}")
-        raise
-    except Exception as e:
-        print(f"Error reading file {pdb_path}: {str(e)}")
-        raise
-def fetch_structure(pdb_id: str, output_dir: str = ".") -> Optional[str]:
     """
     Fetch the structure file for a given PDB ID. Prioritizes CIF files.
     If a structure file already exists locally, it uses that.
     """
     file_path = download_structure(pdb_id, output_dir)
-    if file_path:
-        return file_path
-    else:
-        return None
-def download_structure(pdb_id: str, output_dir: str) -> Optional[str]:
     """
     Attempt to download the structure file in CIF or PDB format.
-    Returns the path to the downloaded file, or None if download fails.
     """
     for ext in ['.cif', '.pdb']:
         file_path = os.path.join(output_dir, f"{pdb_id}{ext}")
         if os.path.exists(file_path):
             return file_path
         url = f"https://files.rcsb.org/download/{pdb_id}{ext}"
-        try:
-            response = requests.get(url, timeout=10)
-            if response.status_code == 200:
-                with open(file_path, 'wb') as f:
-                    f.write(response.content)
-                return file_path
-        except Exception as e:
-            print(f"Download error for {pdb_id}{ext}: {e}")
     return None
 def convert_cif_to_pdb(cif_path: str, output_dir: str = ".") -> str:
@@ -100,8 +84,6 @@ def convert_cif_to_pdb(cif_path: str, output_dir: str = ".") -> str:
 def fetch_pdb(pdb_id):
     pdb_path = fetch_structure(pdb_id)
-    if not pdb_path:
-        return None
     _, ext = os.path.splitext(pdb_path)
     if ext == '.cif':
         pdb_path = convert_cif_to_pdb(pdb_path)
@@ -111,11 +93,9 @@ def create_chain_specific_pdb(input_pdb: str, chain_id: str, residue_scores: lis
     """
     Create a PDB file with only the selected chain and residues, replacing B-factor with prediction scores
     """
-    # Read the original PDB file
     parser = PDBParser(QUIET=True)
     structure = parser.get_structure('protein', input_pdb)
-    # Prepare a new structure with only the specified chain and selected residues
     output_pdb = f"{os.path.splitext(input_pdb)[0]}_{chain_id}_predictions_scores.pdb"
     # Create scores dictionary for easy lookup
@@ -148,8 +128,57 @@ def create_chain_specific_pdb(input_pdb: str, chain_id: str, residue_scores: lis
     return output_pdb
-def process_pdb(pdb_id_or_file, segment, score_type='normalized'):
     # Determine if input is a PDB ID or file path
     if pdb_id_or_file.endswith('.pdb'):
         pdb_path = pdb_id_or_file
@@ -158,51 +187,37 @@ def process_pdb(pdb_id_or_file, segment, score_type='normalized'):
         pdb_id = pdb_id_or_file
         pdb_path = fetch_pdb(pdb_id)
-    if not pdb_path:
-        return "Failed to fetch PDB file", None, None
     # Determine the file format and choose the appropriate parser
     _, ext = os.path.splitext(pdb_path)
     parser = MMCIFParser(QUIET=True) if ext == '.cif' else PDBParser(QUIET=True)
-    try:
-        # Parse the structure file
-        structure = parser.get_structure('protein', pdb_path)
-    except Exception as e:
-        return f"Error parsing structure file: {e}", None, None
     # Extract the specified chain
-    try:
-        chain = structure[0][segment]
-    except KeyError:
-        return "Invalid Chain ID", None, None
     protein_residues = [res for res in chain if is_aa(res)]
     sequence = "".join(seq1(res.resname) for res in protein_residues)
     sequence_id = [res.id[1] for res in protein_residues]
-    visualized_sequence = "".join(seq1(res.resname) for res in protein_residues)
-    if sequence != visualized_sequence:
-        raise ValueError("The visualized sequence does not match the prediction sequence")
     input_ids = tokenizer(" ".join(sequence), return_tensors="pt").input_ids.to(device)
     with torch.no_grad():
         outputs = model(input_ids).logits.detach().cpu().numpy().squeeze()
     # Calculate scores and normalize them
-    scores = expit(outputs[:, 1] - outputs[:, 0])
-    normalized_scores = normalize_scores(scores)
     # Choose which scores to use based on score_type
-    display_scores = normalized_scores if score_type == 'normalized' else scores
     # Zip residues with scores to track the residue ID and score
     residue_scores = [(resi, score) for resi, score in zip(sequence_id, display_scores)]
     # Also save both score types for later use
-    raw_residue_scores = [(resi, score) for resi, score in zip(sequence_id, scores)]
     norm_residue_scores = [(resi, score) for resi, score in zip(sequence_id, normalized_scores)]
     # Define the score brackets
     score_brackets = {
@@ -223,79 +238,35 @@ def process_pdb(pdb_id_or_file, segment, score_type='normalized'):
                 residues_by_bracket[bracket].append(resi)
                 break
-    # Preparing the result string
     current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    result_str = f"Prediction for PDB: {pdb_id}, Chain: {segment}\nDate: {current_time}\n\n"
-    result_str += "Residues by Score Brackets:\n\n"
-    # Add residues for each bracket
-    for bracket, residues in residues_by_bracket.items():
-        result_str += f"Bracket {bracket}:\n"
-        result_str += "Columns: Residue Name, Residue Number, One-letter Code, Normalized Score\n"
-        result_str += "\n".join([
-            f"{res.resname} {res.id[1]} {sequence[i]} {normalized_scores[i]:.2f}"
-            for i, res in enumerate(protein_residues) if res.id[1] in residues
-        ])
-        result_str += "\n\n"
     # Create chain-specific PDB with scores in B-factor
     scored_pdb = create_chain_specific_pdb(pdb_path, segment, residue_scores, protein_residues)
     # Molecule visualization with updated script with color mapping
-    mol_vis = molecule(pdb_path, residue_scores, segment)#, color_map)
-    # Improved PyMOL command suggestions
-    current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    pymol_commands = f"Prediction for PDB: {pdb_id}, Chain: {segment}\nDate: {current_time}\n\n"
-    pymol_commands += f"""
-    # PyMOL Visualization Commands
-    fetch {pdb_id}, protein
-    hide everything, all
-    show cartoon, chain {segment}
-    color white, chain {segment}
-    """
-    # Define colors for each score bracket
-    bracket_colors = {
-        "0.0-0.2": "white",
-        "0.2-0.4": "lightorange",
-        "0.4-0.6": "orange",
-        "0.6-0.8": "red",
-        "0.8-1.0": "firebrick"
-    }
-    # Add PyMOL commands for each score bracket
-    for bracket, residues in residues_by_bracket.items():
-        if residues:  # Only add commands if there are residues in this bracket
-            color = bracket_colors[bracket]
-            resi_list = '+'.join(map(str, residues))
-            pymol_commands += f"""
-    select bracket_{bracket.replace('.', '').replace('-', '_')}, resi {resi_list} and chain {segment}
-    show sticks, bracket_{bracket.replace('.', '').replace('-', '_')}
-    color {color}, bracket_{bracket.replace('.', '').replace('-', '_')}
-    """
-    # Create prediction and scored PDB files
-    prediction_file = f"{pdb_id}_binding_site_residues.txt"
     with open(prediction_file, "w") as f:
         f.write(result_str)
-    return pymol_commands, mol_vis, [prediction_file, scored_pdb],raw_residue_scores,norm_residue_scores
 def molecule(input_pdb, residue_scores=None, segment='A'):
-    # Check if the file exists
-    if not os.path.isfile(input_pdb):
-        return f"<p>Error: PDB file not found at {input_pdb}</p>"
-    try:
-        # Read PDB file content
-        mol = read_mol(input_pdb)
-    except Exception as e:
-        return f"<p>Error reading PDB file: {str(e)}</p>"
-    # More granular scoring for visualization
-    #mol = read_mol(input_pdb)  # Read PDB file content
     # Prepare high-scoring residues script if scores are provided
     high_score_script = ""
     if residue_scores is not None:
@@ -491,9 +462,9 @@ with gr.Blocks(css="""
     Score dependent colorcoding:
     - 0.0-0.2: white
     - 0.2–0.4: light orange
-    - 0.4–0.6: orange
-    - 0.6–0.8: red
-    - 0.8–1.0: firebrick
     """)
     predictions_output = gr.Textbox(label="Visualize Prediction with PyMol")
     gr.Markdown("### Download:\n- List of predicted binding site residues\n- PDB with score in beta factor column")
@@ -504,6 +475,7 @@ with gr.Blocks(css="""
     norm_scores_state = gr.State(None)
     last_pdb_path = gr.State(None)
     last_segment = gr.State(None)
     def process_interface(mode, pdb_id, pdb_file, chain_id, score_type_val):
         selected_score_type = 'normalized' if score_type_val == "Normalized Scores" else 'raw'
@@ -511,12 +483,10 @@ with gr.Blocks(css="""
         # First get the actual PDB file path
         if mode == "PDB ID":
             pdb_path = fetch_pdb(pdb_id)  # Get the actual file path
-            if not pdb_path:
-                return "Failed to fetch PDB file", None, None, None, None, None, None
-            pymol_cmd, mol_vis, files, raw_scores, norm_scores = process_pdb(pdb_path, chain_id, selected_score_type)
             # Store the actual file path, not just the PDB ID
-            return pymol_cmd, mol_vis, files, raw_scores, norm_scores, pdb_path, chain_id
         elif mode == "Upload File":
             _, ext = os.path.splitext(pdb_file.name)
             file_path = os.path.join('./', f"{_}{ext}")
@@ -525,24 +495,70 @@ with gr.Blocks(css="""
             else:
                 pdb_path = file_path
-            pymol_cmd, mol_vis, files, raw_scores, norm_scores = process_pdb(pdb_path, chain_id, selected_score_type)
-            return pymol_cmd, mol_vis, files, raw_scores, norm_scores, pdb_path, chain_id
-        else:
-            return "Error: Invalid mode selected", None, None, None, None, None, None
-    def update_visualization(score_type_val, raw_scores, norm_scores, pdb_path, segment):
-        if raw_scores is None or norm_scores is None or pdb_path is None or segment is None:
-            return None
-        # Verify the file exists
-        if not os.path.exists(pdb_path):
-            return f"Error: File not found at {pdb_path}"
         # Choose scores based on radio button selection
-        selected_scores = norm_scores if score_type_val == "Normalized Scores" else raw_scores
         # Generate visualization with selected scores
-        return molecule(pdb_path, selected_scores, segment)
     def fetch_interface(mode, pdb_id, pdb_file):
         if mode == "PDB ID":
@@ -555,8 +571,6 @@ with gr.Blocks(css="""
             else:
                 pdb_path= file_path
             return pdb_path
-        else:
-            return "Error: Invalid mode selected"
     def toggle_mode(selected_mode):
         if selected_mode == "PDB ID":
@@ -574,14 +588,14 @@ with gr.Blocks(css="""
         process_interface,
         inputs=[mode, pdb_input, pdb_file, segment_input, score_type],
         outputs=[predictions_output, molecule_output, download_output,
-                raw_scores_state, norm_scores_state, last_pdb_path, last_segment]
     )
-    # Update visualization when score type changes
     score_type.change(
-        update_visualization,
-        inputs=[score_type, raw_scores_state, norm_scores_state, last_pdb_path, last_segment],
-        outputs=[molecule_output]
     )
     visualize_btn.click(
@@ -600,4 +614,4 @@ with gr.Blocks(css="""
         inputs=[pdb_input, segment_input],
         outputs=[predictions_output, molecule_output, download_output]
     )
-demo.launch(share=True)

 from scipy.special import expit
 # Load model and move to device
 checkpoint = 'ThorbenF/prot_t5_xl_uniref50_database'
 max_length = 1500
 model, tokenizer = load_model(checkpoint, max_length)
 def read_mol(pdb_path):
+    """Return the whole structure file at ``pdb_path`` as one string.
+
+    Errors raised while opening or reading the file propagate to the caller.
+    """
+    with open(pdb_path, 'r') as handle:
+        contents = handle.read()
+    return contents
+def fetch_structure(pdb_id: str, output_dir: str = ".") -> Optional[str]:
     """
     Fetch the structure file for a given PDB ID. Prioritizes CIF files.
     If a structure file already exists locally, it uses that.
+
+    Returns the path reported by download_structure, which is None when
+    no local copy exists and neither format could be downloaded.
     """
+    return download_structure(pdb_id, output_dir)
+def download_structure(pdb_id: str, output_dir: str) -> Optional[str]:
     """
     Attempt to download the structure file in CIF or PDB format.
+    Returns the path to the local or downloaded file, or None when neither
+    format exists locally or can be downloaded.
     """
+    # CIF is tried first; PDB is the fallback.
     for ext in ['.cif', '.pdb']:
         file_path = os.path.join(output_dir, f"{pdb_id}{ext}")
+        # Reuse a copy that is already on disk instead of downloading again.
         if os.path.exists(file_path):
             return file_path
         url = f"https://files.rcsb.org/download/{pdb_id}{ext}"
+        # Exceptions raised by requests.get are not caught here.
+        response = requests.get(url, timeout=10)
+        if response.status_code == 200:
+            with open(file_path, 'wb') as f:
+                f.write(response.content)
+            return file_path
     return None
 def convert_cif_to_pdb(cif_path: str, output_dir: str = ".") -> str:
 def fetch_pdb(pdb_id):
     pdb_path = fetch_structure(pdb_id)
     _, ext = os.path.splitext(pdb_path)
     if ext == '.cif':
         pdb_path = convert_cif_to_pdb(pdb_path)
     """
     Create a PDB file with only the selected chain and residues, replacing B-factor with prediction scores
     """
     parser = PDBParser(QUIET=True)
     structure = parser.get_structure('protein', input_pdb)
     output_pdb = f"{os.path.splitext(input_pdb)[0]}_{chain_id}_predictions_scores.pdb"
     # Create scores dictionary for easy lookup
     return output_pdb
+def generate_pymol_commands(pdb_id, segment, residues_by_bracket, current_time, score_type):
+    """Build the text shown to the user: a short header plus a PyMOL script.
+
+    The script fetches ``pdb_id``, shows chain ``segment`` as a white cartoon
+    and, for every non-empty bracket in ``residues_by_bracket``, selects its
+    residues, shows them as sticks and applies the bracket's color.
+    """
+    # Bracket label -> PyMOL color name
+    bracket_colors = {
+        "0.0-0.2": "white",
+        "0.2-0.4": "lightorange",
+        "0.4-0.6": "yelloworange",
+        "0.6-0.8": "orange",
+        "0.8-1.0": "red"
+    }
+    header = f"Prediction for PDB: {pdb_id}, Chain: {segment}\nDate: {current_time}\nScore Type: {score_type}\n\n"
+    setup = f"""
+    # PyMOL Visualization Commands
+    fetch {pdb_id}, protein
+    hide everything, all
+    show cartoon, chain {segment}
+    color white, chain {segment}
+    """
+    sections = [header, setup]
+    for bracket, residues in residues_by_bracket.items():
+        if not residues:
+            # Empty brackets contribute no commands
+            continue
+        color = bracket_colors[bracket]
+        resi_list = '+'.join(str(resi) for resi in residues)
+        # e.g. "0.8-1.0" -> "bracket_08_10"
+        selection = "bracket_" + bracket.replace('.', '').replace('-', '_')
+        sections.append(f"""
+    select {selection}, resi {resi_list} and chain {segment}
+    show sticks, {selection}
+    color {color}, {selection}
+    """)
+    return "".join(sections)
+def generate_results_text(pdb_id, segment, residues_by_bracket, protein_residues, sequence, scores, current_time, score_type):
+    """Build the downloadable report listing residues per score bracket.
+
+    ``protein_residues``, ``sequence`` and ``scores`` are indexed in parallel:
+    entry ``i`` of each describes the same residue. Each residue is read
+    through ``resname`` and ``id[1]`` (the residue number). ``score_type`` is
+    a display label used in the header and in the column title.
+    """
+    parts = [
+        f"Prediction for PDB: {pdb_id}, Chain: {segment}\nDate: {current_time}\nScore Type: {score_type}\n\n",
+        "Residues by Score Brackets:\n\n",
+    ]
+    # Add residues for each bracket
+    for bracket, residues in residues_by_bracket.items():
+        # Set membership keeps the scan over the chain linear per bracket.
+        wanted = set(residues)
+        rows = [
+            f"{res.resname} {res.id[1]} {sequence[i]} {scores[i]:.2f}"
+            for i, res in enumerate(protein_residues) if res.id[1] in wanted
+        ]
+        parts.append(f"Bracket {bracket}:\n")
+        parts.append(f"Columns: Residue Name, Residue Number, One-letter Code, {score_type} Score\n")
+        parts.append("\n".join(rows))
+        parts.append("\n\n")
+    return "".join(parts)
+def process_pdb(pdb_id_or_file, segment, score_type='normalized'):
     # Determine if input is a PDB ID or file path
     if pdb_id_or_file.endswith('.pdb'):
         pdb_path = pdb_id_or_file
         pdb_id = pdb_id_or_file
         pdb_path = fetch_pdb(pdb_id)
     # Determine the file format and choose the appropriate parser
     _, ext = os.path.splitext(pdb_path)
     parser = MMCIFParser(QUIET=True) if ext == '.cif' else PDBParser(QUIET=True)
+    # Parse the structure file
+    structure = parser.get_structure('protein', pdb_path)
     # Extract the specified chain
+    chain = structure[0][segment]
     protein_residues = [res for res in chain if is_aa(res)]
     sequence = "".join(seq1(res.resname) for res in protein_residues)
     sequence_id = [res.id[1] for res in protein_residues]
     input_ids = tokenizer(" ".join(sequence), return_tensors="pt").input_ids.to(device)
     with torch.no_grad():
         outputs = model(input_ids).logits.detach().cpu().numpy().squeeze()
     # Calculate scores and normalize them
+    raw_scores = expit(outputs[:, 1] - outputs[:, 0])
+    normalized_scores = normalize_scores(raw_scores)
     # Choose which scores to use based on score_type
+    display_scores = normalized_scores if score_type == 'normalized' else raw_scores
     # Zip residues with scores to track the residue ID and score
     residue_scores = [(resi, score) for resi, score in zip(sequence_id, display_scores)]
     # Also save both score types for later use
+    raw_residue_scores = [(resi, score) for resi, score in zip(sequence_id, raw_scores)]
     norm_residue_scores = [(resi, score) for resi, score in zip(sequence_id, normalized_scores)]
     # Define the score brackets
     score_brackets = {
                 residues_by_bracket[bracket].append(resi)
                 break
+    # Generate timestamp
     current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    # Generate result text and PyMOL commands based on score type
+    display_score_type = "Normalized" if score_type == 'normalized' else "Raw"
+    result_str = generate_results_text(pdb_id, segment, residues_by_bracket, protein_residues, sequence,
+                                      display_scores, current_time, display_score_type)
+    pymol_commands = generate_pymol_commands(pdb_id, segment, residues_by_bracket, current_time, display_score_type)
     # Create chain-specific PDB with scores in B-factor
     scored_pdb = create_chain_specific_pdb(pdb_path, segment, residue_scores, protein_residues)
     # Molecule visualization with updated script with color mapping
+    mol_vis = molecule(pdb_path, residue_scores, segment)
+    # Create prediction file
+    prediction_file = f"{pdb_id}_{display_score_type.lower()}_binding_site_residues.txt"
     with open(prediction_file, "w") as f:
         f.write(result_str)
+    scored_pdb_name = f"{pdb_id}_{segment}_{display_score_type.lower()}_predictions_scores.pdb"
+    os.rename(scored_pdb, scored_pdb_name)
+    return pymol_commands, mol_vis, [prediction_file, scored_pdb_name], raw_residue_scores, norm_residue_scores, pdb_id, segment
 def molecule(input_pdb, residue_scores=None, segment='A'):
+    # Read PDB file content
+    mol = read_mol(input_pdb)
     # Prepare high-scoring residues script if scores are provided
     high_score_script = ""
     if residue_scores is not None:
     Score dependent colorcoding:
     - 0.0-0.2: white
     - 0.2–0.4: light orange
+    - 0.4–0.6: yellow orange
+    - 0.6–0.8: orange
+    - 0.8–1.0: red
     """)
     predictions_output = gr.Textbox(label="Visualize Prediction with PyMol")
     gr.Markdown("### Download:\n- List of predicted binding site residues\n- PDB with score in beta factor column")
     norm_scores_state = gr.State(None)
     last_pdb_path = gr.State(None)
     last_segment = gr.State(None)
+    last_pdb_id = gr.State(None)
     def process_interface(mode, pdb_id, pdb_file, chain_id, score_type_val):
         selected_score_type = 'normalized' if score_type_val == "Normalized Scores" else 'raw'
         # First get the actual PDB file path
         if mode == "PDB ID":
             pdb_path = fetch_pdb(pdb_id)  # Get the actual file path
+            pymol_cmd, mol_vis, files, raw_scores, norm_scores, pdb_id_result, segment = process_pdb(pdb_path, chain_id, selected_score_type)
             # Store the actual file path, not just the PDB ID
+            return pymol_cmd, mol_vis, files, raw_scores, norm_scores, pdb_path, chain_id, pdb_id_result
         elif mode == "Upload File":
             _, ext = os.path.splitext(pdb_file.name)
             file_path = os.path.join('./', f"{_}{ext}")
             else:
                 pdb_path = file_path
+            pymol_cmd, mol_vis, files, raw_scores, norm_scores, pdb_id_result, segment = process_pdb(pdb_path, chain_id, selected_score_type)
+            return pymol_cmd, mol_vis, files, raw_scores, norm_scores, pdb_path, chain_id, pdb_id_result
+    def update_visualization_and_files(score_type_val, raw_scores, norm_scores, pdb_path, segment, pdb_id):
+        """Rebuild the view, PyMOL text and download files for the chosen score type.
+
+        Works from the values stored by the last prediction. Returns a 3-tuple
+        of the value returned by ``molecule``, the PyMOL command text and the
+        list ``[report file, scored PDB file]``; returns three Nones while any
+        stored value is still missing.
+        """
+        if raw_scores is None or norm_scores is None or pdb_path is None or segment is None or pdb_id is None:
+            return None, None, None
         # Choose scores based on radio button selection
+        selected_score_type = 'normalized' if score_type_val == "Normalized Scores" else 'raw'
+        selected_scores = norm_scores if selected_score_type == 'normalized' else raw_scores
         # Generate visualization with selected scores
+        mol_vis = molecule(pdb_path, selected_scores, segment)
+        # Generate PyMOL commands and downloadable files
+        # Re-parse the structure: the report needs residue names, which the
+        # stored (residue number, score) pairs do not carry
+        _, ext = os.path.splitext(pdb_path)
+        parser = MMCIFParser(QUIET=True) if ext == '.cif' else PDBParser(QUIET=True)
+        structure = parser.get_structure('protein', pdb_path)
+        chain = structure[0][segment]
+        protein_residues = [res for res in chain if is_aa(res)]
+        sequence = "".join(seq1(res.resname) for res in protein_residues)
+        # Define score brackets
+        score_brackets = {
+            "0.0-0.2": (0.0, 0.2),
+            "0.2-0.4": (0.2, 0.4),
+            "0.4-0.6": (0.4, 0.6),
+            "0.6-0.8": (0.6, 0.8),
+            "0.8-1.0": (0.8, 1.0)
+        }
+        highest_bound = max(upper for _, upper in score_brackets.values())
+        # Initialize a dictionary to store residues by bracket
+        residues_by_bracket = {bracket: [] for bracket in score_brackets}
+        # Categorize residues into brackets
+        for resi, score in selected_scores:
+            for bracket, (lower, upper) in score_brackets.items():
+                # Upper bounds are exclusive, except for the top bracket:
+                # otherwise a score of exactly 1.0 would match no bracket
+                # and vanish from the report and the PyMOL script.
+                if lower <= score < upper or score == upper == highest_bound:
+                    residues_by_bracket[bracket].append(resi)
+                    break
+        # Generate timestamp
+        current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        # Generate result text and PyMOL commands based on score type
+        display_score_type = "Normalized" if selected_score_type == 'normalized' else "Raw"
+        scores_array = [score for _, score in selected_scores]
+        result_str = generate_results_text(pdb_id, segment, residues_by_bracket, protein_residues, sequence,
+                                           scores_array, current_time, display_score_type)
+        pymol_commands = generate_pymol_commands(pdb_id, segment, residues_by_bracket, current_time, display_score_type)
+        # Create chain-specific PDB with scores in B-factor
+        scored_pdb = create_chain_specific_pdb(pdb_path, segment, selected_scores, protein_residues)
+        # Create prediction file
+        prediction_file = f"{pdb_id}_{display_score_type.lower()}_binding_site_residues.txt"
+        with open(prediction_file, "w") as f:
+            f.write(result_str)
+        # Give the scored PDB a name that includes the score type
+        scored_pdb_name = f"{pdb_id}_{segment}_{display_score_type.lower()}_predictions_scores.pdb"
+        os.rename(scored_pdb, scored_pdb_name)
+        return mol_vis, pymol_commands, [prediction_file, scored_pdb_name]
     def fetch_interface(mode, pdb_id, pdb_file):
         if mode == "PDB ID":
             else:
                 pdb_path= file_path
             return pdb_path
     def toggle_mode(selected_mode):
         if selected_mode == "PDB ID":
         process_interface,
         inputs=[mode, pdb_input, pdb_file, segment_input, score_type],
         outputs=[predictions_output, molecule_output, download_output,
+                raw_scores_state, norm_scores_state, last_pdb_path, last_segment, last_pdb_id]
     )
+    # Update visualization, PyMOL commands, and files when score type changes
     score_type.change(
+        update_visualization_and_files,
+        inputs=[score_type, raw_scores_state, norm_scores_state, last_pdb_path, last_segment, last_pdb_id],
+        outputs=[molecule_output, predictions_output, download_output]
     )
     visualize_btn.click(
         inputs=[pdb_input, segment_input],
         outputs=[predictions_output, molecule_output, download_output]
     )
+demo.launch(share=True)