# Hugging Face Space: samsl/rocketshp (status: Sleeping)
# File size: 15,027 Bytes; revision: cd1d940

import os
import tempfile

from matplotlib.path import Path

import gradio as gr
from gradio_molecule3d import Molecule3D
import numpy as np
import json
import torch
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.colors as mcolors
from matplotlib.cm import ScalarMappable
from matplotlib.colors import Normalize
from biotite.sequence import io as seqio
from biotite.structure import io, to_sequence, spread_residue_wise, filter_amino_acids
from biotite.database import rcsb
from rocketshp import RocketSHP, load_sequence, load_structure
from rocketshp.network import (
    build_allosteric_network,
    cluster_network,
    calculate_centrality,
)


def plot_predictions(
    rmsf: np.ndarray,
    gcc_lmi: np.ndarray,
    shp: np.ndarray,
    title: str = "RocketSHP Predictions",
    font_scale: float = 1.0,
):
    """Draw the RMSF, GCC-LMI and SHP panels and save them as a PNG.

    Args:
        rmsf: Values drawn as a line plot against their index.
        gcc_lmi: 2-D array shown as an image with the color scale fixed
            to [0, 1].
        shp: 2-D array; its transpose is shown as an image with the color
            scale fixed to [0, 1].
        title: Figure-level title.
        font_scale: Multiplier applied to the 12 pt base size used for the
            general font, legend, axis labels and axis titles.

    Returns:
        A ``(figure, path)`` tuple: the matplotlib figure and the path of a
        temporary ``.png`` file holding the same image. The file is created
        with ``delete=False``, so the caller owns its cleanup.
    """
    base_size = 12 * font_scale
    style_overrides = {
        "font.size": base_size,
        "legend.fontsize": base_size,
        "axes.labelsize": base_size,
        "axes.titlesize": base_size,
    }

    with plt.style.context(style_overrides):
        fig = plt.figure(figsize=(6, 6))
        gs = fig.add_gridspec(2, 2)
        ax1 = fig.add_subplot(gs[0, 0])
        ax2 = fig.add_subplot(gs[0, 1])
        ax3 = fig.add_subplot(gs[1, :])

        fig.suptitle(title)

        ax1.plot(rmsf, label="RMSF")
        ax1.set_title("RMSF")
        ax1.set_xlabel("Residue Index")
        ax1.set_ylabel("RMSF (Å)")
        ax1.spines["top"].set_visible(False)
        ax1.spines["right"].set_visible(False)

        ax2.imshow(gcc_lmi, cmap="viridis", aspect="equal", vmin=0, vmax=1)
        ax2.set_title("GCC-LMI")
        ax2.set_xlabel("Residue Index")
        ax2.set_ylabel("Residue Index")

        ax3.imshow(shp.T, cmap="binary", vmin=0, vmax=1, interpolation="none")
        ax3.set_title("SHP")
        ax3.set_xlabel("Residue Index")
        ax3.set_ylabel("Structure Token\nIndex")
        # Inverted limits put index 0 at the top.
        # NOTE(review): the fixed 21/-1 range presumably matches the number
        # of structure tokens in `shp` - confirm against the model output.
        ax3.set_ylim(21, -1)

        # Operate on `fig` explicitly instead of pyplot's global "current
        # figure", which is shared process-wide.
        fig.tight_layout()

        # Only the unique path is needed; close the handle right away so the
        # descriptor is not leaked and the file can be reopened for writing.
        with tempfile.NamedTemporaryFile(
            mode="wb", delete=False, suffix=".png"
        ) as plot_file:
            plot_path = plot_file.name
        fig.savefig(plot_path)

        return fig, plot_path


def download_predictions(job_name, rmsf, gcc_lmi, shp):
    """Serialize predictions to a temporary JSON file and return its path.

    Args:
        job_name: Value stored under the ``"model"`` key.
        rmsf: Array-like exposing ``.tolist()``; stored under ``"rmsf"``.
        gcc_lmi: Array-like exposing ``.tolist()``; stored under ``"gcc_lmi"``.
        shp: Array-like exposing ``.tolist()``; stored under ``"shp"``.

    Returns:
        Path of the written ``.json`` file. It is created with
        ``delete=False``, so the caller owns its cleanup.
    """
    json_content = {
        "model": job_name,
        "rmsf": rmsf.tolist(),
        "gcc_lmi": gcc_lmi.tolist(),
        "shp": shp.tolist(),
    }

    # The context manager flushes and closes the file before the path is
    # handed out, so readers never see a partially written document.
    with tempfile.NamedTemporaryFile(
        mode="w+", delete=False, suffix=".json"
    ) as outfile:
        outfile.write(json.dumps(json_content))

    return outfile.name


def toggle_inputs(model):
    """Return visibility updates for the inputs matching the chosen model.

    Model names containing ``"seq"`` or ``"mini"`` show the two sequence
    inputs and hide the structure widgets; any other name does the opposite.

    Returns:
        Five ``gr.update`` objects, in order: sequence input, FASTA upload,
        structure input, structure upload, structure output.
    """
    uses_sequence = "seq" in model or "mini" in model
    uses_structure = not uses_sequence

    return (
        gr.update(visible=uses_sequence),  # sequence input
        gr.update(visible=uses_sequence),  # fasta upload
        gr.update(visible=uses_structure),  # structure input
        gr.update(visible=uses_structure),  # structure upload
        gr.update(visible=uses_structure),  # structure output
    )


def predict_rocketshp(
    model_variant: str,
    sequence: str | None,
    sequence_file: str | None,
    structure_code: str | None,
    structure_file: str | None,
):
    """Run a RocketSHP model on the provided sequence or structure input.

    Model variants whose name contains ``"seq"`` or ``"mini"`` use only a
    sequence (an uploaded file takes precedence over the text box). All
    other variants use a structure (an uploaded file takes precedence over
    a PDB code, which is otherwise fetched from RCSB); only the chain of the
    first amino-acid atom is kept.

    Args:
        model_variant: Checkpoint identifier passed to
            ``RocketSHP.load_from_checkpoint``.
        sequence: Sequence typed into the text box, if any.
        sequence_file: Path of an uploaded sequence file, if any.
        structure_code: PDB identifier typed into the text box, if any.
        structure_file: Path of an uploaded structure file, if any.

    Returns:
        A 10-tuple ``(rmsf, gcc_lmi, shp, ca_dist, sequence, json_path,
        plot_path, figure, structure_path, seq_display_tuples)``.
        ``structure_path`` is ``None`` for sequence-only models; otherwise
        it is a temporary ``.pdb`` file whose ``b_factor`` annotation holds
        the per-residue RMSF.

    Raises:
        gr.Error: If the required input is missing, the PDB entry cannot be
            fetched, the structure contains no amino acids, or the model
            call fails.
    """
    print(f"sequence text: {sequence}")
    print(f"sequence file: {sequence_file}")
    print(f"structure code: {structure_code}")
    print(f"structure file: {structure_file}")
    print(f"model variant: {model_variant}")

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    is_sequence_model = "seq" in model_variant or "mini" in model_variant

    # Treat None and whitespace-only text the same as an empty text box.
    sequence = (sequence or "").strip()
    structure_code = (structure_code or "").strip()

    if is_sequence_model:
        if sequence_file is not None:
            if sequence:
                gr.Warning("Sequence file provided, ignoring text box.")

            sequence = str(seqio.load_sequence(sequence_file))
            print(sequence)

        elif not sequence:
            raise gr.Error("Sequence input is required for the selected model.")

        struct_features = None
        seq_features = load_sequence(sequence, device=device)

    else:
        if structure_file is None:
            if not structure_code:
                raise gr.Error("Structure input is required for the selected model.")

            # The download only has to live until it is parsed, so scope the
            # directory to this block rather than relying on garbage
            # collection to remove it.
            with tempfile.TemporaryDirectory() as structure_tmp_dir:
                try:
                    structure_file = rcsb.fetch(
                        structure_code,
                        "pdb",
                        target_path=structure_tmp_dir,
                    )
                except Exception as e:
                    raise gr.Error(
                        f"Could not fetch PDB entry {structure_code}: {str(e)}"
                    ) from e
                print(structure_tmp_dir)
                print(structure_file)
                structure = io.load_structure(structure_file)
        else:
            if structure_code:
                gr.Warning(f"PDB file provided, ignoring PDB code {structure_code}.")
            structure = io.load_structure(structure_file)

        structure = structure[filter_amino_acids(structure)]
        if len(structure.chain_id) == 0:
            # Without this guard the chain lookup below raises IndexError.
            raise gr.Error("No amino acid residues found in the provided structure.")

        # Keep only the chain that the first remaining atom belongs to.
        chain_id = structure.chain_id[0]
        structure = structure[structure.chain_id == chain_id]

        struct_features = load_structure(structure, device=device)
        sequence = str(to_sequence(structure)[0][0])
        seq_features = load_sequence(sequence, device=device)

    # Load the model
    model = RocketSHP.load_from_checkpoint(model_variant).to(device)

    # Make predictions
    with torch.no_grad():
        try:
            dynamics_pred = model(
                {
                    "seq_feats": seq_features,
                    "struct_feats": struct_features,
                }
            )
        except Exception as e:
            # Chain the cause so the original traceback stays in the logs.
            raise gr.Error(f"Error during model prediction: {str(e)}") from e

    # Extract predictions
    rmsf = dynamics_pred["rmsf"].squeeze().cpu().numpy()
    gcc_lmi = dynamics_pred["gcc_lmi"].squeeze().cpu().numpy()
    shp = dynamics_pred["shp"].squeeze().cpu().numpy()
    ca_dist = dynamics_pred["ca_dist"].squeeze().cpu().numpy()

    fig, plot_file_name = plot_predictions(
        rmsf,
        gcc_lmi,
        shp,
        title=f"RocketSHP Predictions (model={model_variant})",
    )

    json_file_name = download_predictions(model_variant, rmsf, gcc_lmi, shp)

    if is_sequence_model:
        out_structure_file_name = None
    else:
        # Reserve a unique path and close the handle immediately; the
        # structure writer reopens the path itself.
        with tempfile.NamedTemporaryFile(
            mode="w+", delete=False, suffix=".pdb"
        ) as out_structure_file:
            out_structure_file_name = out_structure_file.name

        # Store the per-residue RMSF on every atom via the b_factor column.
        bfactors = spread_residue_wise(structure, rmsf)
        structure.set_annotation("b_factor", bfactors)
        io.save_structure(out_structure_file_name, structure)

    # One (residue letter, RMSF) pair per position for the highlighted text.
    seq_display_tuples = list(zip(sequence, rmsf))

    return (
        rmsf,
        gcc_lmi,
        shp,
        ca_dist,
        sequence,
        json_file_name,
        plot_file_name,
        fig,
        out_structure_file_name,
        seq_display_tuples,
    )


def visualize_network(
    sequence: str,
    gcc_lmi: np.ndarray,
    ca_dist: np.ndarray,
    ca_threshold: float = 12.0,
    cluster_k: int = 5,
    progress=gr.Progress(),
):
    """Build, cluster and plot the network derived from the predictions.

    Args:
        sequence: Residue letters; paired position-by-position with the
            cluster labels and centrality values.
        gcc_lmi: Matrix passed to ``build_allosteric_network``.
        ca_dist: Distance matrix passed to ``build_allosteric_network``.
        ca_threshold: Forwarded as ``distance_cutoff``.
        cluster_k: Forwarded to ``cluster_network`` as ``k``.
        progress: Gradio progress reporter.

    Returns:
        ``(figure, bc_highlight, comm_highlight, csv_path)``: the two-panel
        matplotlib figure, ``(residue, normalized centrality)`` pairs,
        ``(residue, "Cluster N")`` pairs, and the path of a temporary CSV
        file with one row per residue.

    Raises:
        gr.Error: If no sequence or GCC-LMI data is available yet.
    """
    # Before the first prediction both states hold empty placeholders.
    if not sequence or gcc_lmi is None or len(gcc_lmi) == 0:
        raise gr.Error(
            "No valid GCC-LMI data available for network visualization, please run RocketSHP first."
        )

    # Build network from GCC-LMI predictions and distance mask
    progress(0.1, desc="Building allosteric network...")
    network = build_allosteric_network(gcc_lmi, ca_dist, distance_cutoff=ca_threshold)

    # Apply clustering to identify communities
    progress(0.2, desc="Clustering network...")
    # Materialize once: the result is iterated and counted below.
    communities = list(cluster_network(network, k=cluster_k))

    # Calculate betweenness centrality
    progress(0.8, desc="Calculating centrality...")
    centralities = calculate_centrality(network)
    betweenness_centrality = centralities["betweenness"]

    progress(0.9, desc="Generating plot...")
    fig, ax = plt.subplots(2, 1, figsize=(10, 8))

    pos = nx.spring_layout(network)

    palette = [mcolors.to_hex(color) for color in plt.cm.tab10.colors]

    # NOTE(review): colors and labels are laid out in community order, which
    # assumes the communities list the nodes in the same order as the
    # network's nodes and the residues of `sequence` - confirm against
    # `cluster_network`.
    cluster_color = []
    cluster_label = []
    for i, cluster in enumerate(communities):
        # Cycle through the palette so that more clusters than colors never
        # leaves nodes without a color entry.
        cluster_color.extend([palette[i % len(palette)]] * len(cluster))
        cluster_label.extend([i] * len(cluster))

    nx.draw(
        network,
        pos,
        with_labels=True,
        node_color=betweenness_centrality,
        edge_color="gray",
        ax=ax[0],
        cmap="coolwarm",
    )
    nx.draw(
        network,
        pos,
        with_labels=True,
        node_color=cluster_color,
        edge_color="gray",
        ax=ax[1],
    )

    bc_min = betweenness_centrality.min()
    bc_max = betweenness_centrality.max()

    # For ax[0] - Betweenness Centrality
    ax[0].set_title("Betweenness Centrality")
    norm = Normalize(vmin=bc_min, vmax=bc_max)
    sm = ScalarMappable(cmap="coolwarm", norm=norm)
    sm.set_array([])  # Required for colorbar
    fig.colorbar(sm, ax=ax[0])

    # For ax[1] - Clusters: one legend entry per community actually found.
    ax[1].set_title("Network Clusters")
    legend_elements = [
        mpatches.Patch(facecolor=palette[i % len(palette)], label=f"Cluster {i + 1}")
        for i in range(len(communities))
    ]
    ax[1].legend(handles=legend_elements)

    # Use the figure directly rather than pyplot's global current figure.
    fig.tight_layout()

    # Min-max scale to [0, 1]; a constant centrality would otherwise divide
    # by zero and produce NaNs.
    bc_span = bc_max - bc_min
    if bc_span > 0:
        normalize_centrality = (betweenness_centrality - bc_min) / bc_span
    else:
        normalize_centrality = np.zeros_like(betweenness_centrality, dtype=float)

    comm_highlight = [
        (aa, f"Cluster {i + 1}") for aa, i in zip(sequence, cluster_label)
    ]
    bc_highlight = list(zip(sequence, normalize_centrality))

    # Closing the file via the context manager guarantees the rows are
    # flushed to disk before the path is returned.
    with tempfile.NamedTemporaryFile(
        mode="w+", delete=False, suffix=".csv"
    ) as out_cluster_file:
        out_cluster_file.write(
            "Residue_Index,Amino_Acid,Cluster,Betweenness Centrality\n"
        )
        for i, (aa, cluster_id, bet) in enumerate(
            zip(sequence, cluster_label, betweenness_centrality)
        ):
            out_cluster_file.write(f"{i + 1},{aa},Cluster_{cluster_id + 1},{bet}\n")

    out_cluster_file_name = out_cluster_file.name

    progress(1.0, desc="Done")

    return fig, bc_highlight, comm_highlight, out_cluster_file_name


# Representation settings handed to the Molecule3D viewer via `reps=`.
# A single cartoon representation is defined for model 0 with empty chain
# and residue-name selectors. The "color" entry is the source text of a
# JavaScript function that maps an atom's B-factor linearly from blue (low)
# to red (high) over the hard-coded range 0-100.
# NOTE(review): predict_rocketshp writes RMSF values into the b_factor
# annotation; confirm that the 0-100 range suits those values and that the
# viewer evaluates function strings for "color".
reps = [
    {
        "model": 0,
        "chain": "",
        "resname": "",
        "style": "cartoon",
        "color": """
function(atom) {
    var b = atom.b || 0;
    // Map B-factor to color (adjust min/max as needed)
    var min_b = 0;
    var max_b = 100;
    var normalized = (b - min_b) / (max_b - min_b);

    // Blue (low) to Red (high)
    var r = Math.floor(normalized * 255);
    var b_color = Math.floor((1 - normalized) * 255);
    return 'rgb(' + r + ', 0, ' + b_color + ')';
}
        """,
        # "residue_range": "",
        "around": 0,
        "byres": False,
        # "visible": False,
        "opacity": 1,
    }
]

# Top-level Gradio application. Components are created inside the `with`
# block below; the three event bindings at the end connect them to the
# handler functions defined above.
rocketshp_gradio = gr.Blocks(title="RocketSHP")
# , theme=gr.themes.Monochrome())

with rocketshp_gradio:
    gr.Markdown("""

    # RocketSHP 🚀

    RocketSHP enables ultra-fast prediction of protein dynamics and flexibility from amino acid sequences and/or protein structures. Trained on thousands of molecular dynamics trajectories, it predicts multiple dynamics-related features simultaneously:

    - Root-Mean-Square Fluctuations (RMSF)
    - Generalized Correlation Coefficients with Linear Mutual Information (GCC-LMI)
    - Structural Heterogeneity Profiles (SHP)

    This approach bridges the gap between static structural biology and dynamic functional understanding, providing a computational tool that complements experimental approaches at unprecedented speed and scale.

    - 📄: [Paper](https://www.biorxiv.org/content/10.1101/2025.06.12.659353v1)
    - 💻: [GitHub](https://github.com/flatironinstitute/RocketSHP/tree/main)

    """)

    # State holders that carry the latest prediction from predict_rocketshp
    # to visualize_network. Each starts as an empty list until a prediction
    # has been run.
    rmsf = gr.State([])
    gcc = gr.State([])
    shp = gr.State([])
    ca_dist = gr.State([])
    sequence = gr.State([])

    # Model choice; toggle_inputs switches the visible inputs when it changes.
    model_variant = gr.Dropdown(
        label="Select RocketSHP Model",
        choices=["latest", "v1_seq", "v1_mini"],
        value="latest",
    )

    # Structure inputs are visible initially, matching the default "latest"
    # model, which toggle_inputs treats as a structure model.
    structure_input = gr.Textbox(label="Enter PDB ID")
    structure_upload = gr.File(
        label="Upload Structure File (PDB or MMCIF)",
        file_types=[".pdb", ".cif"],
    )

    # Sequence inputs start hidden; toggle_inputs reveals them for model
    # names containing "seq" or "mini".
    sequence_input = gr.Textbox(label="Paste FASTA Sequence", visible=False)
    sequence_upload = gr.File(
        label="Upload FASTA File",
        file_types=[".fasta", ".fa"],
        visible=False,
    )

    predict_button = gr.Button("Run RocketSHP")

    with gr.Tabs():
        with gr.Tab("View Results"):
            # Receives the (residue, RMSF) pairs returned by predict_rocketshp.
            seq_display = gr.HighlightedText(label="RMSF per Residue")

            # Receives the output structure path returned by predict_rocketshp
            # (None for sequence-only models).
            mol_display = Molecule3D(
                confidenceLabel="RMSF",
                label="Structure",
                reps=reps,
                show_label=True,
            )

            fig_display = gr.Plot(label="Prediction Plots")

        with gr.Tab("Allosteric Network"):
            # Passed to visualize_network as `ca_threshold`.
            ca_threshold = gr.Slider(
                label="Cα Distance Cutoff (Å)",
                minimum=4.0,
                maximum=12.0,
                step=0.1,
                value=8.0,
            )
            # Passed to visualize_network as `cluster_k`.
            cluster_k = gr.Slider(
                label="Number of Clusters (k)",
                minimum=2,
                maximum=10,
                step=1,
                value=5,
            )
            network_button = gr.Button("Visualize Network")

            net_fig = gr.Plot(label="Allosteric Network")

            # Maps the "Cluster N" labels emitted by visualize_network to the
            # tab10 colors, in palette order.
            htext_cmap = {
                f"Cluster {i + 1}": mcolors.to_hex(color)
                for i, color in enumerate(plt.cm.tab10.colors)
            }

            seq_betweenness = gr.HighlightedText(label="Betweenness Centrality")
            seq_clusters = gr.HighlightedText(
                label="Network Clusters", combine_adjacent=True, color_map=htext_cmap
            )

        with gr.Tab("Downloads"):
            # File outputs filled by predict_rocketshp (JSON results, plot)
            # and visualize_network (cluster CSV).
            download_file = gr.File(label="Download Results")
            fig_file = gr.File(label="Download Plot")
            clusters_file = gr.File(label="Download Network Clusters")

    # The output order must match the tuple returned by toggle_inputs.
    model_variant.change(
        toggle_inputs,
        inputs=model_variant,
        outputs=[
            sequence_input,
            sequence_upload,
            structure_input,
            structure_upload,
            mol_display,
        ],
    )

    # The input order must match predict_rocketshp's parameters and the
    # output order must match its returned 10-tuple.
    predict_button.click(
        predict_rocketshp,
        inputs=[
            model_variant,
            sequence_input,
            sequence_upload,
            structure_input,
            structure_upload,
        ],
        outputs=[
            rmsf,
            gcc,
            shp,
            ca_dist,
            sequence,
            download_file,
            fig_file,
            fig_display,
            mol_display,
            seq_display,
        ],
    )

    # Reads the stored prediction state plus the two sliders; the output
    # order must match visualize_network's returned 4-tuple.
    network_button.click(
        visualize_network,
        inputs=[sequence, gcc, ca_dist, ca_threshold, cluster_k],
        outputs=[net_fig, seq_betweenness, seq_clusters, clusters_file],
    )


# Start the Gradio server only when this file is executed as a script;
# share=False is passed explicitly to launch().
if __name__ == "__main__":
    rocketshp_gradio.launch(share=False)