Spaces:

InstaDeepAI
/

ntv3_tracks

Running on Zero

App Files Community

bernardo-de-almeida committed on Dec 15, 2025

Commit

f7c9069

1 Parent(s): 7163437

feat: improve demo

Browse files

Files changed (4) hide show

README.md +1 -12
app.py +540 -97
ntv3_tracks_pipeline.py +71 -52
requirements.txt +1 -0

README.md CHANGED Viewed

@@ -11,15 +11,4 @@ pinned: false
 # NTv3 Tracks Demo
-This Space deploys the custom Hugging Face `Pipeline` in `ntv3_tracks_pipeline.py` and provides both:
-- a UI
-- a REST API (`/api/predict`, auto-generated by Gradio)
-## Environment variables (optional)
-- `MODEL_ID` (default: `InstaDeepAI/NTv3_100M`)
-- `DEFAULT_SPECIES` (default: `human`)
-## Notes
-Genome-coordinate mode may download and decompress large FASTA files. For a lightweight demo, send a DNA sequence directly via `seq`.


11
12	# NTv3 Tracks Demo
13
14	+ This Space deploys the custom Hugging Face `Pipeline` in `ntv3_tracks_pipeline.py`.

app.py CHANGED Viewed

@@ -1,33 +1,195 @@
 import os
 import numpy as np
 import gradio as gr
-# local file in the Space repo
-from ntv3_tracks_pipeline import load_ntv3_tracks_pipeline
-MODEL_ID = os.environ.get("MODEL_ID", "InstaDeepAI/NTv3_650M_pos")
-DEFAULT_SPECIES = os.environ.get("DEFAULT_SPECIES", "human")
 HF_TOKEN = (
-    os.environ.get("HF_TOKEN")
-    or os.environ.get("HUGGINGFACEHUB_API_TOKEN")  # also common in Spaces
 )
-# Load once at startup (Space container)
-pipe = load_ntv3_tracks_pipeline(
-    model=MODEL_ID,
-    device="auto",
-    default_species=DEFAULT_SPECIES,
-    token=HF_TOKEN,
-    verbose=False,
-)
-def _downsample_1d(arr: np.ndarray, max_points: int):
-    if max_points is None or max_points <= 0 or arr.shape[0] <= max_points:
-        return arr, 1
-    stride = int(np.ceil(arr.shape[0] / max_points))
-    return arr[::stride], stride
 def predict(
     seq: str,
     species: str,
@@ -35,125 +197,406 @@ def predict(
     start: int,
     end: int,
     use_coords: bool,
-    tracks: str,
-    elements: str,
-    max_points: int,
 ):
-    """
-    Returns JSON-serializable dict (Gradio also exposes this at /api/predict by default).
-    """
     if use_coords:
         if not chrom:
             raise gr.Error("chrom is required when use_coords=True")
-        if start is None or end is None or end <= start:
             raise gr.Error("start/end must be set and end > start when use_coords=True")
         inputs = {"chrom": chrom, "start": int(start), "end": int(end), "species": species}
     else:
-        if not seq or len(seq.strip()) == 0:
             raise gr.Error("seq is required when use_coords=False")
         inputs = {"seq": seq.strip(), "species": species}
     out = pipe(inputs)
-    # Parse selection lists
-    track_ids = [t.strip() for t in tracks.split(",") if t.strip()] if tracks else []
-    element_names = [e.strip() for e in elements.split(",") if e.strip()] if elements else []
-    # Bigwig tracks
-    bigwig_names = out.bigwig_track_names or []
-    bw = out.bigwig_tracks_logits  # (L, T)
-    bw_selected = {}
-    for tid in track_ids:
-        if tid not in bigwig_names:
-            continue
-        idx = bigwig_names.index(tid)
-        y, stride = _downsample_1d(bw[:, idx], max_points)
-        bw_selected[tid] = {"values": y.astype(float).tolist(), "stride": int(stride)}
-    # BED elements (positive class probability)
-    bed_selected = {}
-    if out.bed_element_names is not None and element_names:
-        logits = out.bed_tracks_logits  # (L, E, C)
-        # softmax over last axis
-        logits = logits - logits.max(axis=-1, keepdims=True)
-        probs = np.exp(logits) / np.exp(logits).sum(axis=-1, keepdims=True)
-        for ename in element_names:
-            if ename not in out.bed_element_names:
-                continue
-            eidx = out.bed_element_names.index(ename)
-            y, stride = _downsample_1d(probs[:, eidx, 1], max_points)
-            bed_selected[ename] = {"values": y.astype(float).tolist(), "stride": int(stride)}
     meta = {
-        "model_id": MODEL_ID,
         "species": out.species,
         "assembly": out.assembly,
         "chrom": out.chrom,
-        "start": out.start,
-        "end": out.end,
-        "window_len": out.window_len,
         "pred_start": out.pred_start,
         "pred_end": out.pred_end,
     }
-    return {
-        "meta": meta,
-        "bigwig_track_names_count": len(bigwig_names),
-        "bigwig_selected": bw_selected,
-        "bed_selected": bed_selected,
-    }
 with gr.Blocks(title="NTv3 Tracks Demo") as demo:
     gr.Markdown(
-        """# NTv3 tracks demo (Space)
-This Space runs your `NTv3TracksPipeline` and exposes:
-- an interactive UI
-- a REST API (Gradio auto-generated endpoint)
-**Tip:** For reliable, fast demos, pass a DNA **sequence** directly. Genome-coordinate mode may download a whole genome FASTA.
-"""
     )
     with gr.Row():
-        use_coords = gr.Checkbox(value=False, label="Use genome coords instead of seq")
-        species = gr.Dropdown(choices=["human","mouse","drosophila_melanogaster"], value=DEFAULT_SPECIES, label="species")
-    seq = gr.Textbox(lines=4, label="DNA sequence (A/C/G/T/N)")
     with gr.Row():
-        chrom = gr.Textbox(label="chrom (e.g. chr1)")
-        start = gr.Number(label="start", value=0, precision=0)
-        end = gr.Number(label="end", value=1024, precision=0)
-    tracks = gr.Textbox(label="BigWig track IDs to return (comma-separated)", placeholder="ENCSR... , ENCSR...")
-    elements = gr.Textbox(label="BED element names to return (comma-separated)", placeholder="e.g. CTCF, H3K27ac")
-    max_points = gr.Slider(100, 5000, value=1000, step=100, label="Max points per returned series (downsample)")
-    btn = gr.Button("Predict")
-    out = gr.JSON(label="Output JSON")
-    btn.click(
-        fn=predict,
-        inputs=[seq, species, chrom, start, end, use_coords, tracks, elements, max_points],
-        outputs=[out],
     )
-    gr.Markdown(
-        """## API usage
-After you deploy, Gradio exposes an endpoint like:
-- `POST https://<your-space>.hf.space/api/predict`
-with JSON body:
-```json
-{"data": ["ACGT...", "human", "", 0, 0, false, "ENCSR...", "CTCF", 1000]}
-```
-The response is a JSON dict with `meta`, plus any requested tracks/elements.
-"""
     )
 if __name__ == "__main__":
-    demo.launch()

 import os
+import uuid
+import tempfile
 import numpy as np
 import gradio as gr
+import matplotlib.pyplot as plt
+import asyncio
+from ntv3_tracks_pipeline import load_ntv3_tracks_pipeline, BED_ELEMENT_COLORS
+# -----------------------------
+# Env / auth
+# -----------------------------
+MODEL_ID = os.environ.get("MODEL_ID", "InstaDeepAI/NTv3_100M_pos")
+DEFAULT_SPECIES = os.environ.get("DEFAULT_SPECIES", "human")
 HF_TOKEN = (
+    os.environ.get("NTV3_HF_TOKEN")
+    or os.environ.get("HF_TOKEN")
+    or os.environ.get("HUGGINGFACEHUB_API_TOKEN")
 )
+if HF_TOKEN is None:
+    raise RuntimeError("Missing Hugging Face token. Set NTV3_HF_TOKEN as a Space Secret.")
+asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
+PLOT_TARGET_POINTS = int(os.environ.get("PLOT_TARGET_POINTS", "1500"))
+SEARCH_MAX_RESULTS = int(os.environ.get("SEARCH_MAX_RESULTS", "50"))
+# -----------------------------
+# Load pipeline (reloadable)
+# -----------------------------
+pipe = None
+current_model_id = MODEL_ID
+def load_pipeline(model_id: str, species: str = DEFAULT_SPECIES):
+    """Load or reload the pipeline with a new model."""
+    global pipe, current_model_id
+    pipe = load_ntv3_tracks_pipeline(
+        model=model_id,
+        token=HF_TOKEN,
+        device="auto",
+        default_species=species,
+        verbose=False,
+    )
+    current_model_id = model_id
+    return pipe
+# Load initial pipeline
+load_pipeline(MODEL_ID, DEFAULT_SPECIES)
+# -----------------------------
+# Helpers
+# -----------------------------
+def _softmax_last(x: np.ndarray) -> np.ndarray:
+    x = x - x.max(axis=-1, keepdims=True)
+    ex = np.exp(x)
+    return ex / ex.sum(axis=-1, keepdims=True)
+def _global_stride(L: int, target: int) -> int:
+    if target <= 0 or L <= target:
+        return 1
+    return int(np.ceil(L / target))
+def _make_tracks_figure(x: np.ndarray, series: list[tuple[str, np.ndarray]]):
+    if not series:
+        raise gr.Error("Nothing to plot (no tracks/elements selected).")
+    n = len(series)
+    fig, axes = plt.subplots(n, 1, figsize=(18, 1.35 * n), sharex=True)
+    if n == 1:
+        axes = [axes]
+    # Define color schemes
+    bigwig_color = "#4A90E2"  # Blue
+    for ax, (title, y) in zip(axes, series):
+        # Determine color based on track type
+        if title in BED_ELEMENT_COLORS:
+            color = BED_ELEMENT_COLORS[title]
+        else:
+            color = bigwig_color
+        ax.fill_between(x, y, color=color, alpha=0.3, linewidth=0)
+        ax.plot(x, y, color=color, linewidth=0.8)
+        ax.set_title(title, fontsize=10, loc="left")
+        ax.grid(alpha=0.2)
+        ax.set_yticks([])
+        ax.spines["top"].set_visible(False)
+        ax.spines["right"].set_visible(False)
+    axes[-1].set_xlabel("Genomic position / index")
+    fig.tight_layout()
+    return fig
+def _save_fig_png(fig) -> str:
+    tmpdir = tempfile.gettempdir()
+    out_path = os.path.join(tmpdir, f"ntv3_tracks_{uuid.uuid4().hex}.png")
+    fig.savefig(out_path, dpi=200, bbox_inches="tight")
+    return out_path
+# Cache track lists per species so search is instant after first load
+_BIGWIG_CACHE: dict[str, list[str]] = {}
+def _get_bigwig_names(species: str) -> list[str]:
+    if species not in _BIGWIG_CACHE:
+        _BIGWIG_CACHE[species] = pipe.available_bigwig_track_names(species)
+    return _BIGWIG_CACHE[species]
+def _rank_search(query: str, names: list[str], limit: int) -> list[str]:
+    """
+    Return up to `limit` candidate track IDs matching `query` using a fast,
+    low-overhead ranking suitable for very large `names` lists.
+    Matching & ranking rules:
+      1) Case-insensitive match.
+      2) Items whose ID *starts with* the query are ranked first.
+      3) Remaining items that merely *contain* the query are ranked after.
+      4) Results preserve the original relative order within each group
+         (stable w.r.t. the input `names` order).
+      5) If `query` is empty/whitespace, returns an empty list to avoid
+         flooding the UI with a huge default list.
+    Notes:
+      - `limit` only caps the number of returned results; it does not prevent
+        short queries (e.g. "E") from producing many matches—if you want that,
+        add a minimum query length check (e.g. `if len(q) < 2: return []`).
+      - Time complexity is O(len(names)) per call.
+    """
+    q = (query or "").strip().lower()
+    if not q:
+        return []  # don’t spam a giant default list
+    starts = []
+    contains = []
+    for n in names:
+        nl = n.lower()
+        if nl.startswith(q):
+            starts.append(n)
+        elif q in nl:
+            contains.append(n)
+    out = starts + contains
+    return out[:limit]
+def search_bigwigs(species: str, query: str):
+    names = _get_bigwig_names(species)
+    results = _rank_search(query, names, SEARCH_MAX_RESULTS)
+    return gr.update(choices=results, value=[])
+def add_selected(current_selected: list[str], to_add: list[str]):
+    cur = list(dict.fromkeys(current_selected or []))  # preserve order, unique
+    for x in (to_add or []):
+        if x not in cur:
+            cur.append(x)
+    return gr.update(choices=cur, value=cur)  # show + keep all checked
+def remove_selected(current_selected: list[str], to_remove: list[str]):
+    cur = [x for x in (current_selected or []) if x not in set(to_remove or [])]
+    return gr.update(choices=cur, value=cur)
+def update_coords_on_species_change(species: str):
+    """Update coordinates when species changes."""
+    coords = DEFAULT_COORDS.get(species, DEFAULT_COORDS["human"])
+    return coords["chrom"], coords["start"], coords["end"]
+def reset_on_species_change(species: str):
+    # Clear results + selected when species changes (avoids mismatched IDs)
+    _get_bigwig_names(species)  # warms cache
+    return (
+        gr.update(value=""),          # query textbox
+        gr.update(choices=[], value=[]),  # results list
+        gr.update(choices=[], value=[]),  # selected list
+    )
+# -----------------------------
+# Predict
+# -----------------------------
 def predict(
     seq: str,
     species: str,
     start: int,
     end: int,
     use_coords: bool,
+    bigwig_selected: list[str],
+    bed_elements: list[str],
 ):
     if use_coords:
         if not chrom:
             raise gr.Error("chrom is required when use_coords=True")
+        if start is None or end is None or int(end) <= int(start):
             raise gr.Error("start/end must be set and end > start when use_coords=True")
         inputs = {"chrom": chrom, "start": int(start), "end": int(end), "species": species}
     else:
+        if not seq or not seq.strip():
             raise gr.Error("seq is required when use_coords=False")
         inputs = {"seq": seq.strip(), "species": species}
     out = pipe(inputs)
+    bw_names = out.bigwig_track_names or []
+    bw = out.bigwig_tracks_logits
+    bed_names = out.bed_element_names or []
+    bed_logits = out.bed_tracks_logits
+    if bw is None or not bw_names:
+        raise gr.Error("No BigWig tracks available in model output.")
+    # Defaults if user picked none
+    if not bigwig_selected:
+        default_bigwig_tracks = [
+            "ENCSR056HPM",  # K562 RNA-seq
+            "ENCSR921NMD",  # K562 DNAse
+            "ENCSR000DWD",  # K562 H3k4me3
+            "ENCSR000AKO",  # K562 CTCF
+            "ENCSR561FEE_P",  # HepG2 RNA-seq
+            "ENCSR000EJV",  # HepG2 DNAse
+            "ENCSR000AMP",  # HepG2 H3k4me3
+            "ENCSR000BIE",  # HepG2 CTCF
+        ]
+        # Filter to only include tracks that are available for this species/assembly
+        bigwig_selected = [tid for tid in default_bigwig_tracks if tid in bw_names]
+    if (not bed_elements) and bed_names:
+        default_bed_elements = ["protein_coding_gene", "exon", "intron"]
+        # Filter to only include elements that are available
+        bed_elements = [elem for elem in default_bed_elements if elem in bed_names]
+    # Validate (important for API usage)
+    missing_tracks = [t for t in bigwig_selected if t not in bw_names]
+    if missing_tracks:
+        raise gr.Error(f"Unknown BigWig track id(s): {missing_tracks}")
+    missing_elems = [e for e in bed_elements if e not in bed_names]
+    if missing_elems:
+        raise gr.Error(f"Unknown BED element(s): {missing_elems}")
+    L = bw.shape[0]
+    stride = _global_stride(L, PLOT_TARGET_POINTS)
+    x0 = int(out.pred_start or 0)
+    x1 = int(out.pred_end or (x0 + L))
+    x = np.linspace(x0, x1, num=L, endpoint=False)[::stride]
+    series: list[tuple[str, np.ndarray]] = []
+    for tid in bigwig_selected:
+        idx = bw_names.index(tid)
+        series.append((tid, bw[:, idx][::stride].astype(float)))
+    if bed_logits is not None and bed_elements:
+        probs = _softmax_last(bed_logits)
+        for ename in bed_elements:
+            eidx = bed_names.index(ename)
+            series.append((ename, probs[:, eidx, 1][::stride].astype(float)))
+    fig = _make_tracks_figure(x, series)
+    region = f"{out.chrom}:{out.pred_start}-{out.pred_end}" if out.chrom else f"{x0}-{x1}"
+    if out.assembly:
+        region += f" ({out.assembly})"
+    fig.axes[-1].set_xlabel(region)
+    png_path = _save_fig_png(fig)
     meta = {
+        "model_id": current_model_id,
         "species": out.species,
         "assembly": out.assembly,
         "chrom": out.chrom,
         "pred_start": out.pred_start,
         "pred_end": out.pred_end,
+        "bigwig_selected": bigwig_selected,
+        "bed_selected": bed_elements,
+        "plot_stride": stride,
+        "plot_target_points": PLOT_TARGET_POINTS,
     }
+    return fig, png_path, meta
+# -----------------------------
+# UI styling (CSS) and client-side download-icon setup (JS)
+# -----------------------------
+CSS = """
+#tracks_plot { position: relative; width: 100% !important; max-width: 100% !important; }
+#tracks_plot .wrap, #tracks_plot .plot-container { width: 100% !important; max-width: 100% !important; }
+#tracks_plot_download {
+  position: absolute;
+  top: 10px;
+  right: 12px;
+  z-index: 50;
+  background: rgba(0,0,0,0.55);
+  border: 1px solid rgba(255,255,255,0.15);
+  border-radius: 10px;
+  padding: 6px 8px;
+  cursor: pointer;
+  user-select: none;
+}
+#tracks_plot_download:hover { background: rgba(0,0,0,0.7); }
+#tracks_plot_download svg { width: 18px; height: 18px; display: block; fill: white; }
+#export_png_hidden { display: none !important; }
+#predict_btn {
+  background-color: #FF6B35 !important;
+  color: white !important;
+  border: none !important;
+}
+#predict_btn:hover {
+  background-color: #E55A2B !important;
+}
+#intro_markdown {
+  font-size: 1.3em !important;
+  line-height: 1.7 !important;
+}
+#intro_markdown h1 {
+  font-size: 2.8em !important;
+  margin-bottom: 0.6em !important;
+}
+#intro_markdown h2, #intro_markdown h3 {
+  font-size: 1.8em !important;
+}
+#intro_markdown p, #intro_markdown li {
+  font-size: 1.2em !important;
+}
+"""
+JS = """
+function addDownloadIcon() {
+  const plot = document.querySelector("#tracks_plot");
+  if (!plot) return;
+  if (document.querySelector("#tracks_plot_download")) return;
+  const btn = document.createElement("div");
+  btn.id = "tracks_plot_download";
+  btn.title = "Download PNG";
+  btn.innerHTML = `
+    <svg viewBox="0 0 24 24" aria-hidden="true">
+      <path d="M5 20h14v-2H5v2zm7-18v10.17l3.59-3.58L17 10l-5 5-5-5 1.41-1.41L11 12.17V2h1z"/>
+    </svg>
+  `;
+  btn.onclick = () => {
+    const link = document.querySelector("#export_png_hidden a");
+    if (link) link.click();
+  };
+  plot.appendChild(btn);
+}
+function setup() {
+  addDownloadIcon();
+  const obs = new MutationObserver(() => addDownloadIcon());
+  obs.observe(document.body, { childList: true, subtree: true });
+}
+setup();
+"""
+# BED list is small enough to keep as dropdown
+_init_bed = pipe.available_bed_element_names()
+# Default BigWig tracks
+DEFAULT_BIGWIG_TRACKS = [
+    "ENCSR056HPM",  # K562 RNA-seq
+    "ENCSR921NMD",  # K562 DNAse
+    "ENCSR000DWD",  # K562 H3k4me3
+    "ENCSR000AKO",  # K562 CTCF
+    "ENCSR561FEE_P",  # HepG2 RNA-seq
+    "ENCSR000EJV",  # HepG2 DNAse
+    "ENCSR000AMP",  # HepG2 H3k4me3
+    "ENCSR000BIE",  # HepG2 CTCF
+]
+# Default BED elements
+DEFAULT_BED_ELEMENTS = ["protein_coding_gene", "exon", "intron"]
+# Get available BigWig tracks for default species and filter defaults
+_init_bigwig = _get_bigwig_names(DEFAULT_SPECIES)
+_init_bigwig_selected = [tid for tid in DEFAULT_BIGWIG_TRACKS if tid in _init_bigwig]
+# Filter default BED elements to only those available
+_init_bed_selected = [elem for elem in DEFAULT_BED_ELEMENTS if elem in _init_bed]
+# Default coordinates per species
+DEFAULT_COORDS = {
+    "human": {"chrom": "chr19", "start": 6_700_000, "end": 6_831_072},
+    "mouse": {"chrom": "chr1", "start": 100_000, "end": 200_000},
+    "drosophila_melanogaster": {"chrom": "chr2L", "start": 1_000_000, "end": 2_000_000},
+}
+# Get default coordinates for default species
+_default_coords = DEFAULT_COORDS.get(DEFAULT_SPECIES, DEFAULT_COORDS["human"])
+# Default coordinates per species
+DEFAULT_COORDS = {
+    "human": {"chrom": "chr19", "start": 6_700_000, "end": 6_831_072},
+    "mouse": {"chrom": "chr1", "start": 0, "end": 32_768},
+    "drosophila_melanogaster": {"chrom": "chr2L", "start": 0, "end": 32_768},
+}
+# Get default coordinates for default species
+_default_coords = DEFAULT_COORDS.get(DEFAULT_SPECIES, DEFAULT_COORDS["human"])
 with gr.Blocks(title="NTv3 Tracks Demo") as demo:
     gr.Markdown(
+        """
+# 🧬 NTv3 Tracks Demo
+**Predict functional genomics tracks and genome annotation elements from DNA sequences using NTv3 (Nucleotide Transformer v3).**
+This demo allows you to:
+- **Input**: Provide a DNA sequence directly or specify genomic coordinates (chromosome, start, end)
+- **Select tracks**: Choose from hundreds of BigWig functional tracks (e.g., RNA-seq, ChIP-seq, DNase) and genome annotation elements (e.g., exons, introns, promoters)
+- **Visualize**: View NTv3 predictions across the input sequence
+""",
+        elem_id="intro_markdown",
     )
+    gr.Markdown("## Select NTv3 post-trained model")
+    # Model display names (without InstaDeepAI/ prefix) and their full IDs
+    MODEL_OPTIONS = {
+        "NTv3 650M (pos)": "InstaDeepAI/NTv3_650M_pos",
+        "NTv3 100M (pos)": "InstaDeepAI/NTv3_100M_pos",
+    }
+    # Reverse mapping: full ID -> display name
+    MODEL_ID_TO_DISPLAY = {v: k for k, v in MODEL_OPTIONS.items()}
+    # Get display name for current model
+    current_display_name = MODEL_ID_TO_DISPLAY.get(current_model_id, "NTv3 100M (pos)")
+    model_selector = gr.Dropdown(
+        choices=list(MODEL_OPTIONS.keys()),
+        value=current_display_name,
+        label="Model",
+    )
+    model_status = gr.Markdown("", visible=False)
+    gr.Markdown("## Input sequence (Genomic coordinate or DNA sequence)")
     with gr.Row():
+        species = gr.Dropdown(
+            ["human", "mouse", "drosophila_melanogaster"],
+            value=DEFAULT_SPECIES,
+            label="Species",
+        )
+        use_coords = gr.Checkbox(True, label="Use genome coordinates")
     with gr.Row():
+        chrom = gr.Textbox(label="Chromosome", value=_default_coords["chrom"])
+        start = gr.Number(label="Start", value=_default_coords["start"], precision=0)
+        end = gr.Number(label="End", value=_default_coords["end"], precision=0)
+    seq = gr.Textbox(lines=4, label="Input DNA sequence", placeholder="ACGT...")
+    def change_model(display_name: str, species: str):
+        """Reload pipeline with new model."""
+        try:
+            # Convert display name to full model ID
+            if display_name in MODEL_OPTIONS:
+                model_id = MODEL_OPTIONS[display_name]
+            else:
+                # Fallback: assume it's already a model ID or custom value
+                model_id = display_name
+            load_pipeline(model_id, species)
+            # Update available tracks/elements
+            _get_bigwig_names(species)  # warm cache
+            return gr.update(value="✅ Model loaded successfully"), gr.update(visible=True)
+        except Exception as e:
+            return gr.update(value=f"❌ Error loading model: {str(e)}"), gr.update(visible=True)
+    model_selector.change(
+        fn=change_model,
+        inputs=[model_selector, species],
+        outputs=[model_status, model_status],
+    )
+    gr.Markdown("## Select functional tracks")
+    bigwig_selected = gr.CheckboxGroup(
+        choices=_init_bigwig_selected,
+        value=_init_bigwig_selected,
+        label="Selected functional tracks (used for prediction)",
     )
+    bigwig_query = gr.Textbox(
+        label="Search functional tracks (auto-search while typing)",
+        placeholder="Type to search… (e.g. ENCSR056HPM for K562 RNA-seq)",
+    )
+    bigwig_results = gr.CheckboxGroup(
+        choices=[],
+        label="Results (click to add to Selected)",
+    )
+    with gr.Row():
+        bigwig_clear_btn = gr.Button("Clear results")
+        bigwig_remove_btn = gr.Button("Remove checked from Selected")
+    gr.Markdown("## Select genome annotation elements")
+    bed_elements = gr.Dropdown(
+        choices=_init_bed,
+        value=_init_bed_selected if _init_bed_selected else [],
+        multiselect=True,
+        label="Genome annotation elements (search + select)",
+    )
+    btn = gr.Button("Predict", elem_id="predict_btn")
+    gr.Markdown("## NTv3 predictions for selected tracks and elements")
+    plot = gr.Plot(label="", elem_id="tracks_plot")
+    export_png = gr.File(elem_id="export_png_hidden", interactive=False)
+    with gr.Accordion("Meta (click to expand)", open=False):
+        meta = gr.JSON(label="Meta")
+    # --- wiring (live search + auto-add) ---
+    # Live search on every keystroke
+    bigwig_query.input(
+        fn=search_bigwigs,
+        inputs=[species, bigwig_query],
+        outputs=[bigwig_results],
+    )
+    # Auto-add: whenever user checks items in results, add them to Selected,
+    # then clear results selection (so it feels like "click to add")
+    def _auto_add(selected_now: list[str], results_checked: list[str]):
+        upd = add_selected(selected_now, results_checked)  # reuses your function
+        # clear checks in results, keep choices
+        return upd, gr.update(value=[])
+    bigwig_results.change(
+        fn=_auto_add,
+        inputs=[bigwig_selected, bigwig_results],
+        outputs=[bigwig_selected, bigwig_results],
+    )
+    # Clear results list (handy when query is short)
+    def _clear_results():
+        return gr.update(choices=[], value=[]), gr.update(value="")
+    bigwig_clear_btn.click(
+        fn=_clear_results,
+        inputs=[],
+        outputs=[bigwig_results, bigwig_query],
+    )
+    # Remove: check items in Selected, then click Remove
+    bigwig_remove_btn.click(
+        fn=remove_selected,
+        inputs=[bigwig_selected, bigwig_selected],
+        outputs=[bigwig_selected],
+    )
+    species.change(
+        fn=reset_on_species_change,
+        inputs=[species],
+        outputs=[bigwig_query, bigwig_results, bigwig_selected],
+    )
+    # Update coordinates when species changes
+    species.change(
+        fn=update_coords_on_species_change,
+        inputs=[species],
+        outputs=[chrom, start, end],
+    )
+    btn.click(
+        fn=predict,
+        inputs=[seq, species, chrom, start, end, use_coords, bigwig_selected, bed_elements],
+        outputs=[plot, export_png, meta],
+        api_name="predict",
     )
 if __name__ == "__main__":
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        ssr_mode=False,
+        show_error=True,
+        allowed_paths=[tempfile.gettempdir()],
+        css=CSS,
+        js=JS,
+    )

ntv3_tracks_pipeline.py CHANGED Viewed

@@ -24,11 +24,6 @@ try:
 except Exception:
     plt = None
-try:
-    import seaborn as sns
-except Exception:
-    sns = None
 # ---------------------------------------------------------------------
 # Assembly <-> species mapping
@@ -66,29 +61,42 @@ ASSEMBLY_TO_SPECIES = {
 }
 SPECIES_TO_ASSEMBLY = {v: k for k, v in ASSEMBLY_TO_SPECIES.items()}
-# Minimal UCSC FASTA sources (extend as needed)
-ASSEMBLY_TO_UCSC_FA_GZ = {
-    "hg38": "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz",
-    "mm10": "https://hgdownload.soe.ucsc.edu/goldenPath/mm10/bigZips/mm10.fa.gz",
-    "dm6":  "https://hgdownload.soe.ucsc.edu/goldenPath/dm6/bigZips/dm6.fa.gz",
 }
 def _sanitize_dna(seq: str) -> str:
     seq = seq.upper()
     return "".join(ch if ch in ("A", "C", "G", "T", "N") else "N" for ch in seq)
-def _download_file(url: str, dst: Path) -> None:
     if requests is None:
         raise ImportError("requests is required for genome download. Install with: pip install requests")
-    dst.parent.mkdir(parents=True, exist_ok=True)
-    with requests.get(url, stream=True, timeout=60) as r:
-        r.raise_for_status()
-        with open(dst, "wb") as f:
-            for chunk in r.iter_content(chunk_size=1024 * 1024):
-                if chunk:
-                    f.write(chunk)
 def _ensure_fasta_for_assembly(assembly: str, cache_dir: Union[str, Path]) -> Path:
@@ -112,11 +120,6 @@ def _ensure_fasta_for_assembly(assembly: str, cache_dir: Union[str, Path]) -> Pa
             f"Either pass fasta_path explicitly, or extend ASSEMBLY_TO_UCSC_FA_GZ."
         )
-    url = ASSEMBLY_TO_UCSC_FA_GZ[assembly]
-    if not gz_path.exists():
-        print(f"Downloading {url} -> {gz_path}")
-        _download_file(url, gz_path)
     import gzip
     print(f"Decompressing {gz_path} -> {fa_path}")
     with gzip.open(gz_path, "rb") as fin, open(fa_path, "wb") as fout:
@@ -128,19 +131,6 @@ def _ensure_fasta_for_assembly(assembly: str, cache_dir: Union[str, Path]) -> Pa
     return fa_path
-def _fetch_from_fasta(fasta_path: Union[str, Path], chrom: str, start: int, end: int) -> str:
-    if Fasta is None:
-        raise ImportError("pyfaidx is required for fasta windows. Install with: pip install pyfaidx")
-    fasta_path = Path(fasta_path)
-    if fasta_path.suffix == ".gz":
-        raise ValueError(f"Got '{fasta_path}' (gz). Please pass an uncompressed .fa (auto-download returns .fa).")
-    fasta = Fasta(str(fasta_path), rebuild=True)
-    return _sanitize_dna(fasta[chrom][start:end].seq)
 def _pick_device(device: Union[str, int, torch.device]) -> torch.device:
     # Handle torch.device objects
     if isinstance(device, torch.device):
@@ -191,8 +181,6 @@ def _plot_tracks_fillbetween(
 ):
     if plt is None:
         raise ImportError("matplotlib is required for plotting. Install with: pip install matplotlib")
-    if sns is None:
-        raise ImportError("seaborn is required for notebook-style plots. Install with: pip install seaborn")
     n = len(tracks)
     if n == 0:
@@ -205,10 +193,25 @@ def _plot_tracks_fillbetween(
     any_track = next(iter(tracks.values()))
     x = np.linspace(start, end, num=len(any_track), endpoint=False)
     for ax, (title, y) in zip(axes, tracks.items()):
-        ax.fill_between(x, y)
-        ax.set_title(title)
-        sns.despine(top=True, right=True, bottom=True)
     label = f"{chrom}:{start}-{end}" if chrom is not None else f"{start}-{end}"
     if assembly is not None:
@@ -263,12 +266,6 @@ class NTv3TracksPipeline(Pipeline):
         self.pred_center_fraction = float(pred_center_fraction)
         self.pred_center_offset_fraction = float(pred_center_offset_fraction)
-        if self.default_species not in SPECIES_TO_ASSEMBLY:
-            raise ValueError(
-                f"default_species='{self.default_species}' is not supported. "
-                f"Supported species: {sorted(SPECIES_TO_ASSEMBLY.keys())}"
-            )
         if isinstance(model, str):
             self.config = AutoConfig.from_pretrained(model, trust_remote_code=trust_remote_code, token=token)
             self.model = AutoModel.from_pretrained(model, trust_remote_code=trust_remote_code, token=token)
@@ -350,6 +347,30 @@ class NTv3TracksPipeline(Pipeline):
                 return torch.device("cpu")
         return dev
     def preprocess(self, inputs: Dict[str, Any], **kwargs: Any) -> Dict[str, Any]:
         species, assembly = self._resolve_species_and_assembly(inputs)
@@ -365,10 +386,8 @@ class NTv3TracksPipeline(Pipeline):
             start = int(inputs["start"])
             end = int(inputs["end"])
             window_len = end - start
-            fasta_path = inputs.get("fasta_path")
-            if fasta_path is None:
-                fasta_path = _ensure_fasta_for_assembly(assembly, self.genome_cache_dir)
-            seq = _fetch_from_fasta(fasta_path, chrom, start, end)
         # Tokenize with padding
         batch = self.tokenizer([seq], add_special_tokens=False, padding=True, pad_to_multiple_of=128, return_tensors="pt")

 except Exception:
     plt = None
 # ---------------------------------------------------------------------
 # Assembly <-> species mapping
 }
 SPECIES_TO_ASSEMBLY = {v: k for k, v in ASSEMBLY_TO_SPECIES.items()}
+# BED element name -> hex plot color (shared between pipeline and app); track titles not listed here are drawn with the default BigWig color
+BED_ELEMENT_COLORS = {
+    "protein_coding_gene": "#E74C3C",  # Red
+    "lncRNA": "#2ECC71",  # Green
+    "exon": "#9B59B6",  # Purple
+    "intron": "#F39C12",  # Orange
+    "splice_donor": "#1ABC9C",  # Teal
+    "splice_acceptor": "#E67E22",  # Dark orange
+    "CTCF-bound": "#3498DB",  # Light blue
+    "polyA_signal": "#95A5A6",  # Gray
+    "enhancer_Tissue_specific": "#D35400",  # Burnt orange
+    "enhancer_Tissue_invariant": "#16A085",  # Dark teal
+    "promoter_Tissue_specific": "#C0392B",  # Dark red
+    "promoter_Tissue_invariant": "#27AE60",  # Dark green
+    "5UTR+": "#8E44AD",  # Dark purple
+    "5UTR-": "#D68910",  # Dark orange 2
+    "3UTR+": "#138D75",  # Dark teal 2
+    "3UTR-": "#2874A6",  # Dark blue
+    "skipped_exon": "#7D3C98",  # Purple 2
+    "always_on_exon": "#A93226",  # Red 2
+    "start_codon": "#196F3D",  # Green 2
+    "stop_codon": "#B9770E",  # Brown
+    "ORF": "#1F618D",  # Blue 2
 }
 def _sanitize_dna(seq: str) -> str:
     seq = seq.upper()
     return "".join(ch if ch in ("A", "C", "G", "T", "N") else "N" for ch in seq)
+def _get_dna_sequence(assembly: str, chrom: str, start: int, end: int) -> str:
     if requests is None:
         raise ImportError("requests is required for genome download. Install with: pip install requests")
+    url = f"https://api.genome.ucsc.edu/getData/sequence?genome={assembly};chrom={chrom};start={start};end={end}"
+    seq = requests.get(url).json()["dna"].upper()
+    return seq
 def _ensure_fasta_for_assembly(assembly: str, cache_dir: Union[str, Path]) -> Path:
             f"Either pass fasta_path explicitly, or extend ASSEMBLY_TO_UCSC_FA_GZ."
         )
     import gzip
     print(f"Decompressing {gz_path} -> {fa_path}")
     with gzip.open(gz_path, "rb") as fin, open(fa_path, "wb") as fout:
     return fa_path
 def _pick_device(device: Union[str, int, torch.device]) -> torch.device:
     # Handle torch.device objects
     if isinstance(device, torch.device):
 ):
     if plt is None:
         raise ImportError("matplotlib is required for plotting. Install with: pip install matplotlib")
     n = len(tracks)
     if n == 0:
     any_track = next(iter(tracks.values()))
     x = np.linspace(start, end, num=len(any_track), endpoint=False)
+    # Define color schemes
+    # BigWig tracks: use blue/gray tones
+    bigwig_color = "#4A90E2"  # Blue
     for ax, (title, y) in zip(axes, tracks.items()):
+        # Determine color based on track type
+        if title in BED_ELEMENT_COLORS:
+            color = BED_ELEMENT_COLORS[title]
+        else:
+            color = bigwig_color
+        ax.fill_between(x, y, color=color, alpha=0.3, linewidth=0)
+        ax.plot(x, y, color=color, linewidth=0.8)
+        ax.set_title(title, fontsize=10, loc="left")
+        ax.grid(alpha=0.2)
+        ax.set_yticks([])
+        # minimal "despine"
+        ax.spines["top"].set_visible(False)
+        ax.spines["right"].set_visible(False)
     label = f"{chrom}:{start}-{end}" if chrom is not None else f"{start}-{end}"
     if assembly is not None:
         self.pred_center_fraction = float(pred_center_fraction)
         self.pred_center_offset_fraction = float(pred_center_offset_fraction)
         if isinstance(model, str):
             self.config = AutoConfig.from_pretrained(model, trust_remote_code=trust_remote_code, token=token)
             self.model = AutoModel.from_pretrained(model, trust_remote_code=trust_remote_code, token=token)
                 return torch.device("cpu")
         return dev
+    def available_bigwig_track_names(self, species: str | None = None) -> list[str]:
+        """
+        Return BigWig track IDs for the assembly corresponding to `species`.
+        No model forward pass.
+        """
+        sp = species or self.default_species
+        assembly = SPECIES_TO_ASSEMBLY.get(sp)
+        if assembly is None:
+            raise ValueError(f"Unknown species={sp}. Supported: {sorted(SPECIES_TO_ASSEMBLY.keys())}")
+        if assembly not in self.config.bigwigs_per_file_assembly:
+            raise ValueError(
+                f"Assembly {assembly} not found in checkpoint config. "
+                f"Available: {list(self.config.bigwigs_per_file_assembly.keys())}"
+            )
+        return list(self.config.bigwigs_per_file_assembly[assembly])
+    def available_bed_element_names(self) -> List[str]:
+        """
+        Return BED element names available in this checkpoint (no forward pass).
+        """
+        return list(self.bed_element_names or [])
     def preprocess(self, inputs: Dict[str, Any], **kwargs: Any) -> Dict[str, Any]:
         species, assembly = self._resolve_species_and_assembly(inputs)
             start = int(inputs["start"])
             end = int(inputs["end"])
             window_len = end - start
+            seq = _get_dna_sequence(assembly, chrom, start, end)
+            seq = _sanitize_dna(seq)
         # Tokenize with padding
         batch = self.tokenizer([seq], add_special_tokens=False, padding=True, pad_to_multiple_of=128, return_tensors="pt")

requirements.txt CHANGED Viewed

@@ -4,3 +4,4 @@ numpy
 gradio>=4.0.0
 pyfaidx
 requests

 gradio>=4.0.0
 pyfaidx
 requests
+matplotlib