Spaces:

samsl
/

rocketshp

Sleeping

App Files Files Community

samsl committed on Oct 24, 2025

Commit

cd1d940

1 Parent(s): 01bc174

Initial app

Browse files

Files changed (5) hide show

.python-version +1 -0
README.md +5 -4
app.py +487 -0
pyproject.toml +39 -0
requirements.txt +29 -0

.python-version ADDED Viewed

	@@ -0,0 +1 @@


1	+ 3.11

README.md CHANGED Viewed

@@ -1,14 +1,15 @@
 ---
 title: Rocketshp
-emoji: 📉
-colorFrom: green
 colorTo: blue
 sdk: gradio
 sdk_version: 5.49.1
 app_file: app.py
-pinned: false
 license: mit
 short_description: Fast structural heterogeneity estimation
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: Rocketshp
+emoji: 🚀
+colorFrom: purple
 colorTo: blue
 sdk: gradio
 sdk_version: 5.49.1
+python_version: 3.11
 app_file: app.py
+pinned: true
 license: mit
 short_description: Fast structural heterogeneity estimation
 ---
+Check out the configuration reference at <https://huggingface.co/docs/hub/spaces-config-reference>

app.py ADDED Viewed

	@@ -0,0 +1,487 @@

+import os
+import tempfile
+from matplotlib.path import Path
+import gradio as gr
+from gradio_molecule3d import Molecule3D
+import numpy as np
+import json
+import torch
+import networkx as nx
+import matplotlib.pyplot as plt
+import matplotlib.patches as mpatches
+import matplotlib.colors as mcolors
+from matplotlib.cm import ScalarMappable
+from matplotlib.colors import Normalize
+from biotite.sequence import io as seqio
+from biotite.structure import io, to_sequence, spread_residue_wise, filter_amino_acids
+from biotite.database import rcsb
+from rocketshp import RocketSHP, load_sequence, load_structure
+from rocketshp.network import (
+    build_allosteric_network,
+    cluster_network,
+    calculate_centrality,
+)
+def plot_predictions(
+    rmsf: np.ndarray,
+    gcc_lmi: np.ndarray,
+    shp: np.ndarray,
+    title: str = "RocketSHP Predictions",
+    font_scale: float = 1.0,
+):
+    with plt.style.context(
+        {
+            "font.size": 12 * font_scale,
+            "legend.fontsize": 12 * font_scale,
+            "axes.labelsize": 12 * font_scale,
+            "axes.titlesize": 12 * font_scale,
+        }
+    ):
+        plot_file = tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=".png")
+        fig = plt.figure(figsize=(6, 6))
+        gs = fig.add_gridspec(2, 2)
+        ax1 = fig.add_subplot(gs[0, 0])
+        ax2 = fig.add_subplot(gs[0, 1])
+        ax3 = fig.add_subplot(gs[1, :])
+        fig.suptitle(title)
+        ax1.plot(rmsf, label="RMSF")
+        ax1.set_title("RMSF")
+        ax1.set_xlabel("Residue Index")
+        ax1.set_ylabel("RMSF (Å)")
+        ax1.spines["top"].set_visible(False)
+        ax1.spines["right"].set_visible(False)
+        ax2.imshow(gcc_lmi, cmap="viridis", aspect="equal", vmin=0, vmax=1)
+        ax2.set_title("GCC-LMI")
+        ax2.set_xlabel("Residue Index")
+        ax2.set_ylabel("Residue Index")
+        ax3.imshow(shp.T, cmap="binary", vmin=0, vmax=1, interpolation="none")
+        ax3.set_title("SHP")
+        ax3.set_xlabel("Residue Index")
+        ax3.set_ylabel("Structure Token\nIndex")
+        ax3.set_ylim(21, -1)
+        plt.tight_layout()
+        plt.savefig(plot_file.name)
+        return fig, plot_file.name
+def download_predictions(job_name, rmsf, gcc_lmi, shp):
+    outfile = tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".json")
+    json_content = {
+        "model": job_name,
+        "rmsf": rmsf.tolist(),
+        "gcc_lmi": gcc_lmi.tolist(),
+        "shp": shp.tolist(),
+    }
+    outfile.write(json.dumps(json_content))
+    return outfile.name
+def toggle_inputs(model):
+    if "seq" in model or "mini" in model:
+        return (
+            gr.update(visible=True),  # sequence input
+            gr.update(visible=True),  # fasta upload
+            gr.update(visible=False),  # structure input
+            gr.update(visible=False),  # structure upload
+            gr.update(visible=False),  # structure output
+        )
+    return (
+        gr.update(visible=False),  # sequence input
+        gr.update(visible=False),  # fasta upload
+        gr.update(visible=True),  # structure input
+        gr.update(visible=True),  # structure upload
+        gr.update(visible=True),  # structure output
+    )
+def predict_rocketshp(
+    model_variant: str,
+    sequence: str | None,
+    sequence_file: str | None,
+    structure_code: str | None,
+    structure_file: str | None,
+):
+    print(f"sequence text: {sequence}")
+    print(f"sequence file: {sequence_file}")
+    print(f"structure code: {structure_code}")
+    print(f"structure file: {structure_file}")
+    print(f"model variant: {model_variant}")
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    is_sequence_model = "seq" in model_variant or "mini" in model_variant
+    if is_sequence_model:
+        if sequence_file is not None:
+            if sequence != "":
+                gr.Warning("Sequence file provided, ignoring text box.")
+            sequence = str(seqio.load_sequence(sequence_file))
+            print(sequence)
+        elif sequence == "":
+            raise gr.Error("Sequence input is required for the selected model.")
+        struct_features = None
+        seq_features = load_sequence(sequence, device=device)
+    else:
+        if structure_file is None:
+            if structure_code == "":
+                raise gr.Error("Structure input is required for the selected model.")
+            structure_tmp_dir = tempfile.TemporaryDirectory()
+            structure_file = rcsb.fetch(
+                structure_code,
+                "pdb",
+                target_path=structure_tmp_dir.name,
+            )
+            print(structure_tmp_dir)
+            print(structure_file)
+        elif structure_code != "":
+            gr.Warning(f"PDB file provided, ignoring PDB code {structure_code}.")
+        structure = io.load_structure(structure_file)
+        structure = structure[filter_amino_acids(structure)]
+        chain_id = structure.chain_id[0]
+        structure = structure[structure.chain_id == chain_id]
+        struct_features = load_structure(structure, device=device)
+        sequence = str(to_sequence(structure)[0][0])
+        seq_features = load_sequence(sequence, device=device)
+    # Load the model
+    model = RocketSHP.load_from_checkpoint(model_variant).to(device)
+    # Make predictions
+    with torch.no_grad():
+        try:
+            dynamics_pred = model(
+                {
+                    "seq_feats": seq_features,
+                    "struct_feats": struct_features,
+                }
+            )
+        except Exception as e:
+            raise gr.Error(f"Error during model prediction: {str(e)}")
+    # Extract predictions
+    rmsf = dynamics_pred["rmsf"].squeeze().cpu().numpy()
+    gcc_lmi = dynamics_pred["gcc_lmi"].squeeze().cpu().numpy()
+    shp = dynamics_pred["shp"].squeeze().cpu().numpy()
+    ca_dist = dynamics_pred["ca_dist"].squeeze().cpu().numpy()
+    fig, plot_file_name = plot_predictions(
+        rmsf,
+        gcc_lmi,
+        shp,
+        title=f"RocketSHP Predictions (model={model_variant})",
+    )
+    json_file_name = download_predictions(model_variant, rmsf, gcc_lmi, shp)
+    if is_sequence_model:
+        out_structure_file_name = None
+    else:
+        out_structure_file = tempfile.NamedTemporaryFile(
+            mode="w+", delete=False, suffix=".pdb"
+        )
+        bfactors = spread_residue_wise(structure, rmsf)
+        structure.set_annotation("b_factor", bfactors)
+        io.save_structure(out_structure_file.name, structure)
+        out_structure_file_name = out_structure_file.name
+    seq_display_tuples = [*zip(list(sequence), rmsf)]
+    return (
+        rmsf,
+        gcc_lmi,
+        shp,
+        ca_dist,
+        sequence,
+        json_file_name,
+        plot_file_name,
+        fig,
+        out_structure_file_name,
+        seq_display_tuples,
+    )
+def visualize_network(
+    sequence: str,
+    gcc_lmi: np.ndarray,
+    ca_dist: np.ndarray,
+    ca_threshold: float = 12.0,
+    cluster_k: int = 5,
+    progress=gr.Progress(),
+):
+    if sequence == "!=" or not len(gcc_lmi):
+        raise gr.Error(
+            "No valid GCC-LMI data available for network visualization, please run RocketSHP first."
+        )
+    # Build network from GCC-LMI predictions and distance mask
+    progress(0.1, desc="Building allosteric network...")
+    network = build_allosteric_network(gcc_lmi, ca_dist, distance_cutoff=ca_threshold)
+    # Apply clustering to identify communities
+    progress(0.2, desc="Clustering network...")
+    communities = cluster_network(network, k=cluster_k)
+    # Calculate betweenness centrality
+    progress(0.8, desc="Calculating centrality...")
+    centralities = calculate_centrality(network)
+    betweenness_centrality = centralities["betweenness"]
+    progress(0.9, desc="Generating plot...")
+    fig, ax = plt.subplots(2, 1, figsize=(10, 8))
+    pos = nx.spring_layout(network)
+    cmap = plt.cm.tab10  # or whatever colormap you're using
+    cluster_color = []
+    cluster_label = []
+    for i, (cluster, color) in enumerate(zip(communities, cmap.colors, strict=False)):
+        hex_color = mcolors.to_hex(color)
+        cluster_color.extend([hex_color] * len(cluster))
+        cluster_label.extend([i] * len(cluster))
+    nx.draw(
+        network,
+        pos,
+        with_labels=True,
+        node_color=betweenness_centrality,
+        edge_color="gray",
+        ax=ax[0],
+        cmap="coolwarm",
+    )
+    nx.draw(
+        network,
+        pos,
+        with_labels=True,
+        node_color=cluster_color,
+        edge_color="gray",
+        ax=ax[1],
+    )
+    # For ax[0] - Betweenness Centrality
+    ax[0].set_title("Betweenness Centrality")
+    norm = Normalize(vmin=min(betweenness_centrality), vmax=max(betweenness_centrality))
+    sm = ScalarMappable(cmap="coolwarm", norm=norm)
+    sm.set_array([])  # Required for colorbar
+    plt.colorbar(sm, ax=ax[0])
+    # For ax[1] - Clusters
+    ax[1].set_title("Network Clusters")
+    unique_clusters = [cmap.colors[i] for i in range(cluster_k)]
+    legend_elements = [
+        mpatches.Patch(facecolor=color, label=f"Cluster {i + 1}")
+        for i, color in enumerate(unique_clusters)
+    ]
+    ax[1].legend(handles=legend_elements)
+    plt.tight_layout()
+    progress(1.0, desc="Done")
+    normalize_centrality = (betweenness_centrality - betweenness_centrality.min()) / (
+        betweenness_centrality.max() - betweenness_centrality.min()
+    )
+    comm_highlight = [
+        (aa, f"Cluster {i + 1}") for aa, i in zip(list(sequence), cluster_label)
+    ]
+    bc_highlight = [*zip(list(sequence), normalize_centrality)]
+    out_cluster_file = tempfile.NamedTemporaryFile(
+        mode="w+", delete=False, suffix=".csv"
+    )
+    out_cluster_file.write("Residue_Index,Amino_Acid,Cluster,Betweenness Centrality\n")
+    for i, (aa, cluster_id, bet) in enumerate(
+        zip(list(sequence), cluster_label, betweenness_centrality)
+    ):
+        out_cluster_file.write(f"{i + 1},{aa},Cluster_{cluster_id + 1},{bet}\n")
+    out_cluster_file_name = out_cluster_file.name
+    return fig, bc_highlight, comm_highlight, out_cluster_file_name
+reps = [  # Representation settings handed to Molecule3D(reps=...) in the UI below
+    {
+        "model": 0,
+        "chain": "",
+        "resname": "",
+        "style": "cartoon",
+        "color": """
+function(atom) {
+    var b = atom.b || 0;
+    // Map B-factor to color (adjust min/max as needed)
+    var min_b = 0;
+    var max_b = 100;
+    var normalized = (b - min_b) / (max_b - min_b);
+    // Blue (low) to Red (high)
+    var r = Math.floor(normalized * 255);
+    var b_color = Math.floor((1 - normalized) * 255);
+    return 'rgb(' + r + ', 0, ' + b_color + ')';
+}
+        """,  # NOTE(review): maps atom.b from [0, 100] without clamping; predict_rocketshp stores RMSF as b_factor -- confirm this range suits those values
+        # "residue_range": "",
+        "around": 0,
+        "byres": False,
+        # "visible": False,
+        "opacity": 1,
+    }
+]
+rocketshp_gradio = gr.Blocks(title="RocketSHP")  # Root container; all components and event wiring are declared inside the `with` below
+# , theme=gr.themes.Monochrome())
+with rocketshp_gradio:
+    gr.Markdown("""
+    # RocketSHP 🚀
+    RocketSHP enables ultra-fast prediction of protein dynamics and flexibility from amino acid sequences and/or protein structures. Trained on thousands of molecular dynamics trajectories, it predicts multiple dynamics-related features simultaneously:
+    - Root-Mean-Square Fluctuations (RMSF)
+    - Generalized Correlation Coefficients with Linear Mutual Information (GCC-LMI)
+    - Structural Heterogeneity Profiles (SHP)
+    This approach bridges the gap between static structural biology and dynamic functional understanding, providing a computational tool that complements experimental approaches at unprecedented speed and scale.
+    - 📄: [Paper](https://www.biorxiv.org/content/10.1101/2025.06.12.659353v1)
+    - 💻: [GitHub](https://github.com/flatironinstitute/RocketSHP/tree/main)
+    """)
+    rmsf = gr.State([])  # State slots written by predict_rocketshp; sequence, gcc and ca_dist are read back by visualize_network
+    gcc = gr.State([])
+    shp = gr.State([])
+    ca_dist = gr.State([])
+    sequence = gr.State([])
+    model_variant = gr.Dropdown(  # Changing this triggers toggle_inputs (wired further down)
+        label="Select RocketSHP Model",
+        choices=["latest", "v1_seq", "v1_mini"],
+        value="latest",
+    )
+    structure_input = gr.Textbox(label="Enter PDB ID")  # Structure widgets start visible: the default "latest" contains neither "seq" nor "mini"
+    structure_upload = gr.File(
+        label="Upload Structure File (PDB or MMCIF)",
+        file_types=[".pdb", ".cif"],
+    )
+    sequence_input = gr.Textbox(label="Paste FASTA Sequence", visible=False)  # Sequence widgets start hidden; toggle_inputs shows them for "seq"/"mini" models
+    sequence_upload = gr.File(
+        label="Upload FASTA File",
+        file_types=[".fasta", ".fa"],
+        visible=False,
+    )
+    predict_button = gr.Button("Run RocketSHP")
+    with gr.Tabs():
+        with gr.Tab("View Results"):
+            seq_display = gr.HighlightedText(label="RMSF per Residue")  # Receives the (residue, rmsf) pairs from predict_rocketshp
+            mol_display = Molecule3D(  # Receives the output structure path; hidden by toggle_inputs for sequence models
+                confidenceLabel="RMSF",
+                label="Structure",
+                reps=reps,
+                show_label=True,
+            )
+            fig_display = gr.Plot(label="Prediction Plots")
+        with gr.Tab("Allosteric Network"):
+            ca_threshold = gr.Slider(  # Passed to visualize_network as ca_threshold
+                label="Cα Distance Cutoff (Å)",
+                minimum=4.0,
+                maximum=12.0,
+                step=0.1,
+                value=8.0,
+            )
+            cluster_k = gr.Slider(  # Passed to visualize_network as cluster_k
+                label="Number of Clusters (k)",
+                minimum=2,
+                maximum=10,
+                step=1,
+                value=5,
+            )
+            network_button = gr.Button("Visualize Network")
+            net_fig = gr.Plot(label="Allosteric Network")
+            htext_cmap = {  # "Cluster N" label -> hex colour, one entry per tab10 colour; same label format as visualize_network emits
+                f"Cluster {i + 1}": mcolors.to_hex(color)
+                for i, color in enumerate(plt.cm.tab10.colors)
+            }
+            seq_betweenness = gr.HighlightedText(label="Betweenness Centrality")
+            seq_clusters = gr.HighlightedText(
+                label="Network Clusters", combine_adjacent=True, color_map=htext_cmap
+            )
+        with gr.Tab("Downloads"):
+            download_file = gr.File(label="Download Results")
+            fig_file = gr.File(label="Download Plot")
+            clusters_file = gr.File(label="Download Network Clusters")
+    model_variant.change(  # Output order must match the five updates returned by toggle_inputs
+        toggle_inputs,
+        inputs=model_variant,
+        outputs=[
+            sequence_input,
+            sequence_upload,
+            structure_input,
+            structure_upload,
+            mol_display,
+        ],
+    )
+    predict_button.click(  # Input order matches predict_rocketshp's parameters; output order matches its 10-item return tuple
+        predict_rocketshp,
+        inputs=[
+            model_variant,
+            sequence_input,
+            sequence_upload,
+            structure_input,
+            structure_upload,
+        ],
+        outputs=[
+            rmsf,
+            gcc,
+            shp,
+            ca_dist,
+            sequence,
+            download_file,
+            fig_file,
+            fig_display,
+            mol_display,
+            seq_display,
+        ],
+    )
+    network_button.click(  # Uses the state saved by the prediction step plus both sliders; outputs match visualize_network's 4-item return
+        visualize_network,
+        inputs=[sequence, gcc, ca_dist, ca_threshold, cluster_k],
+        outputs=[net_fig, seq_betweenness, seq_clusters, clusters_file],
+    )
+if __name__ == "__main__":  # Launch the app only when run as a script
+    rocketshp_gradio.launch(share=False)

pyproject.toml ADDED Viewed

	@@ -0,0 +1,39 @@

+[project]
+name = "rocketshp-space"
+version = "0.1.0"
+description = "Gradio Space for RocketSHP: fast structural heterogeneity estimation"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    "biopython>=1.79",
+    "biotite==0.41.2",
+    "datasets>=4.3.0",
+    "esm==3.1.3",
+    "gradio>=5.49.1",
+    "gradio-molecule3d>=0.0.7",
+    "h5py>=3.15.1",
+    "huggingface-hub>=0.36.0",
+    "lightning>=2.4.0",
+    "loguru>=0.7.3",
+    "matplotlib>=3.10.7",
+    "mdanalysis>=2.9.0",
+    "mdanalysisdata>=0.9.0",
+    "neptune>=1.13.0",
+    "nglview>=4.0",
+    "numpy>=1.23.5",
+    "numpy-indexed>=0.3.7",
+    "omegaconf>=2.3.0",
+    "openpyxl>=3.1.5",
+    "pandas>=2.3.3",
+    "python-dateutil>=2.9.0.post0",
+    "python-dotenv>=1.1.1",
+    "scikit-learn>=1.7.2",
+    "scipy>=1.16.2",
+    "seaborn>=0.13.2",
+    "statsmodels>=0.14.5",
+    "tokenizers>=0.20.3",
+    "torchmetrics>=1.8.2",
+    "tqdm>=4.67.1",
+    "transformers>=4.46.3",
+    "typer>=0.20.0",
+]

requirements.txt ADDED Viewed

	@@ -0,0 +1,29 @@

+biopython>=1.79
+biotite==0.41.2
+datasets
+esm==3.1.3
+h5py
+huggingface_hub
+lightning>=2.4.0
+loguru
+matplotlib
+neptune>=1.13.0
+nglview
+numpy>=1.23.5
+numpy-indexed>=0.3.7
+omegaconf
+pandas
+python-dateutil
+python-dotenv
+scikit_learn
+scipy
+seaborn
+statsmodels
+tokenizers
+torchmetrics
+transformers
+tqdm
+typer
+mdanalysis>=2.9.0
+mdanalysisdata>=0.9.0
+openpyxl>=3.1.5