Spaces:

InstaDeepAI
/

ntv3_tracks

Running on Zero

App Files Files Community

bernardo-de-almeida committed on Dec 17, 2025

Commit

9dd80fe

1 Parent(s): eec0acd

feat: make robust to no GPU

Browse files

Files changed (6) hide show

README.md +1 -1
app.py +337 -174
bigwig_export.py +33 -30
data/functional_tracks_metadata.csv +1 -1
ntv3_tracks_pipeline.py +123 -71
requirements.txt +5 -5

README.md CHANGED Viewed

@@ -11,4 +11,4 @@ pinned: false
 # NTv3 Tracks Demo
-This Space deploys the custom Hugging Face `Pipeline` in `ntv3_tracks_pipeline.py`.


11
12	# NTv3 Tracks Demo
13
14	+ This Space deploys the custom Hugging Face `Pipeline` in `ntv3_tracks_pipeline.py`.

app.py CHANGED Viewed

@@ -4,26 +4,22 @@ import tempfile
 import time
 import uuid
 from pathlib import Path
-import torch
-import numpy as np
-import gradio as gr
-import spaces
-# Set matplotlib to use non-interactive backend before importing pyplot
-# This is required for Gradio which runs on worker threads
 import matplotlib
-matplotlib.use('Agg')
 import matplotlib.pyplot as plt
 from ntv3_tracks_pipeline import (
-    load_ntv3_tracks_pipeline,
-    BED_ELEMENT_COLORS,
     ASSEMBLY_TO_SPECIES,
     SPECIES_WITH_COORDINATE_SUPPORT,
 )
-from bigwig_export import create_bigwig_zip, _softmax_last
 # -----------------------------
 # Env / auth
@@ -36,7 +32,9 @@ HF_TOKEN = (
     or os.environ.get("HUGGINGFACEHUB_API_TOKEN")
 )
 if HF_TOKEN is None:
-    raise RuntimeError("Missing Hugging Face token. Set NTV3_HF_TOKEN as a Space Secret.")
 # asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
@@ -49,6 +47,7 @@ SEARCH_MAX_RESULTS = int(os.environ.get("SEARCH_MAX_RESULTS", "50"))
 pipe = None
 current_model_id = MODEL_ID
 def load_pipeline(model_id: str, species: str = DEFAULT_SPECIES):
     """Load or reload the pipeline with a new model."""
     global pipe, current_model_id
@@ -62,6 +61,7 @@ def load_pipeline(model_id: str, species: str = DEFAULT_SPECIES):
     current_model_id = model_id
     return pipe
 # Load initial pipeline
 load_pipeline(MODEL_ID, DEFAULT_SPECIES)
@@ -73,6 +73,7 @@ load_pipeline(MODEL_ID, DEFAULT_SPECIES)
 _t0 = None
 _tlast = None
 def tprint(msg: str):
     "Function to print timing information"
     global _t0, _tlast
@@ -87,6 +88,21 @@ def tprint(msg: str):
     print(f"[timing] {msg}: {now - _tlast:.3f}s (total {now - _t0:.3f}s)")
     _tlast = now
 def _global_stride(L: int, target: int) -> int:
     if target <= 0 or L <= target:
         return 1
@@ -111,7 +127,7 @@ def _make_tracks_figure(x: np.ndarray, series: list[tuple[str, np.ndarray]]):
             color = BED_ELEMENT_COLORS[title]
         else:
             color = bigwig_color
         ax.fill_between(x, y, color=color, alpha=0.3, linewidth=0)
         ax.plot(x, y, color=color, linewidth=0.8)
         ax.set_title(title, fontsize=10, loc="left")
@@ -143,52 +159,52 @@ def _load_track_metadata() -> dict[str, str]:
     """Load track metadata from CSV and create display name mapping."""
     if _TRACK_METADATA_CACHE:
         return _TRACK_METADATA_CACHE
     csv_path = Path(__file__).parent / "data" / "functional_tracks_metadata.csv"
     if not csv_path.exists():
         return {}
     metadata = {}
     try:
-        with open(csv_path, 'r', encoding='utf-8') as f:
             reader = csv.DictReader(f)
             for row in reader:
-                track_id = row['file_id']
-                tissue = row.get('tissue', '').strip()
-                assay = row.get('assay', '').strip()
-                experiment_target = row.get('experiment_target', '').strip()
-                biosample_type = row.get('biosample_type', '').strip()
-                strand = row.get('strand', '').strip()
                 # Build display name from available fields
                 parts = []
-                if biosample_type and biosample_type != 'tissue':
                     parts.append(biosample_type)
                 if tissue:
                     parts.append(tissue)
                 if assay:
                     # For RNA-seq, include strand information if available
                     if strand:
-                        if strand == 'plus':
-                            strand = '+'
-                        elif strand == 'minus':
-                            strand = '-'
                         parts.append(f"{assay} {strand}")
                     else:
                         parts.append(assay)
-                if experiment_target and experiment_target not in ('none', 'RNA-seq'):
                     parts.append(experiment_target)
                 if parts:
                     display_name = " - ".join(parts)
                 else:
                     display_name = track_id  # Fallback to ID if no metadata
                 metadata[track_id] = display_name
     except Exception as e:
         print(f"Warning: Could not load track metadata: {e}")
         return {}
     _TRACK_METADATA_CACHE.update(metadata)
     return metadata
@@ -235,7 +251,7 @@ def _get_species_with_bigwigs() -> set[str]:
     """Get set of species that have BigWig tracks available in the current model."""
     if pipe is None:
         return set()
     species_with_bigwigs = set()
     for species in ASSEMBLY_TO_SPECIES.values():
         if _has_bigwigs(species):
@@ -287,32 +303,38 @@ def search_bigwigs(species: str, query: str, current_selected: list[str]):
     if query is None:
         query = ""
     query_stripped = query.strip()
     # If query is empty, return empty results immediately (don't show all tracks)
     if not query_stripped:
         displayed_selected = current_selected or []
         show_selected = bool(displayed_selected)
         return (
-            gr.update(choices=[], value=[], interactive=True),  # empty results, explicitly clear checked state
-            gr.update(visible=show_selected, choices=displayed_selected, value=displayed_selected),  # show ALL selected tracks
         )
     names = _get_bigwig_names(species)
     # Search in both track IDs and display names
     metadata = _load_track_metadata()
     query_lower = query_stripped.lower()
     # Show selected tracks section if user is typing or has selections
     show_selected = bool(query_stripped) or bool(current_selected)
     # Show ALL selected tracks (not limited to 20)
     displayed_selected = current_selected or []
     # Extract track IDs from already selected tracks (to exclude them from results)
     selected_track_ids = set()
     if current_selected:
         selected_track_ids = {_extract_track_id(x) for x in current_selected}
     # Build list of (display_format, track_id) tuples for searching
     track_display_pairs = []
     for track_id in names:
@@ -322,20 +344,26 @@ def search_bigwigs(species: str, query: str, current_selected: list[str]):
         display_name = metadata.get(track_id, track_id)
         display_format = _format_track_for_display(track_id)
         track_display_pairs.append((display_format, track_id, display_name))
     # Filter by query (search in display name, display format, and track_id)
     matching = []
     for display_format, track_id, display_name in track_display_pairs:
-        if (query_lower in track_id.lower() or
-            query_lower in display_name.lower() or
-            query_lower in display_format.lower()):
             matching.append(display_format)
     # Limit search results
     results = matching[:SEARCH_MAX_RESULTS]
     return (
-        gr.update(choices=results, value=[], interactive=True),  # results - limited to SEARCH_MAX_RESULTS, explicitly clear checked state
-        gr.update(visible=show_selected, choices=displayed_selected, value=displayed_selected),  # show ALL selected tracks
     )
@@ -344,16 +372,16 @@ def add_selected(current_selected: list[str], to_add: list[str]):
     # Extract track IDs from current selection (in case they're in display format)
     cur_ids = [_extract_track_id(x) for x in (current_selected or [])]
     cur_display = [_format_track_for_display(tid) for tid in cur_ids]
     # Extract track IDs from items to add
     to_add_ids = [_extract_track_id(x) for x in (to_add or [])]
     # Add new track IDs
     for tid in to_add_ids:
         if tid not in cur_ids:
             cur_ids.append(tid)
             cur_display.append(_format_track_for_display(tid))
     # Show ALL selected tracks (no limit)
     return gr.update(choices=cur_display, value=cur_display)  # show all selected tracks
@@ -371,29 +399,34 @@ def update_coords_on_species_change(species: str):
     coords = DEFAULT_COORDS.get(species, DEFAULT_COORDS["human"])
     return coords["chrom"], coords["start"], coords["end"]
 def reset_on_species_change(species: str):
     # Clear results + selected when species changes (avoids mismatched IDs)
     try:
         track_ids = _get_bigwig_names(species)  # warms cache if available
         # Format available tracks for display
         formatted_tracks = [_format_track_for_display(tid) for tid in track_ids]
         # Get default tracks for this species (filter to what's available)
         default_track_ids = [tid for tid in DEFAULT_BIGWIG_TRACKS if tid in track_ids]
-        default_formatted = [_format_track_for_display(tid) for tid in default_track_ids]
         # Show selected tracks section if there are default tracks
         show_selected = bool(default_formatted)
         return (
-            gr.update(value=""),          # query textbox
             gr.update(choices=[], value=[]),  # results list
-            gr.update(choices=formatted_tracks, value=default_formatted, visible=show_selected),  # selected list with defaults
         )
     except (ValueError, AttributeError):
         # Species doesn't have bigwigs, that's okay
         return (
-            gr.update(value=""),          # query textbox
             gr.update(choices=[], value=[]),  # results list
             gr.update(choices=[], value=[], visible=False),  # selected list (hidden)
         )
@@ -402,7 +435,7 @@ def reset_on_species_change(species: str):
 # -----------------------------
 # Predict
 # -----------------------------
-@spaces.GPU
 def predict(
     seq: str,
     species: str,
@@ -418,13 +451,13 @@ def predict(
     # Debug: verify species is being passed
     if not species:
         raise gr.Error("Species parameter is missing. Please select a species.")
     # Extract track IDs from display format if needed
     bigwig_selected = [_extract_track_id(tid) for tid in bigwig_selected]
     # Determine if using coordinates based on input_type radio button
     use_coords = input_type == "Use genomic coordinates"
     if use_coords:
         # Check if this species supports coordinate-based fetching
         if species not in SPECIES_WITH_COORDINATE_SUPPORT:
@@ -437,7 +470,12 @@ def predict(
             raise gr.Error("chrom is required when use_coords=True")
         if start is None or end is None or int(end) <= int(start):
             raise gr.Error("start/end must be set and end > start when use_coords=True")
-        inputs = {"chrom": chrom, "start": int(start), "end": int(end), "species": species}
     else:
         if not seq or not seq.strip():
             raise gr.Error("seq is required when use_coords=False")
@@ -445,7 +483,9 @@ def predict(
     # Verify species is in inputs before calling pipeline
     if "species" not in inputs:
-        raise gr.Error(f"Internal error: species not found in inputs dict. Inputs: {list(inputs.keys())}")
     tprint("inputs prepared")
@@ -474,12 +514,16 @@ def predict(
     # Check if we have any tracks/elements to plot
     has_bigwigs = bw is not None and len(bw_names) > 0
     has_bed = bed_logits is not None and len(bed_names) > 0
     if not has_bigwigs and not has_bed:
-        raise gr.Error("No BigWig tracks or BED elements available for this species in the current model.")
     if not has_bigwigs and bigwig_selected:
-        raise gr.Error("No BigWig tracks available for this species, but BigWig tracks were selected. Please deselect BigWig tracks or choose a different species.")
     # Defaults if user picked none
     if has_bigwigs and not bigwig_selected:
@@ -495,7 +539,7 @@ def predict(
         ]
         # Filter to only include tracks that are available for this species/assembly
         bigwig_selected = [tid for tid in default_bigwig_tracks if tid in bw_names]
     if (not bed_elements) and bed_names:
         default_bed_elements = ["protein_coding_gene", "exon", "intron"]
         # Filter to only include elements that are available
@@ -519,7 +563,7 @@ def predict(
         L = bed_logits.shape[0]
     else:
         raise gr.Error("No data available for plotting.")
     stride = _global_stride(L, PLOT_TARGET_POINTS)
     x0 = int(out.pred_start or 0)
@@ -527,7 +571,7 @@ def predict(
     x = np.linspace(x0, x1, num=L, endpoint=False)[::stride]
     series: list[tuple[str, np.ndarray]] = []
     # Add BigWig tracks if available and selected
     if has_bigwigs and bigwig_selected:
         for tid in bigwig_selected:
@@ -545,7 +589,9 @@ def predict(
     fig = _make_tracks_figure(x, series)
     tprint("figure created")
-    region = f"{out.chrom}:{out.pred_start}-{out.pred_end}" if out.chrom else f"{x0}-{x1}"
     if out.assembly:
         region += f" ({out.assembly})"
     fig.axes[-1].set_xlabel(region)
@@ -846,9 +892,13 @@ DEFAULT_BED_ELEMENTS = ["protein_coding_gene", "exon", "intron"]
 # Get available BigWig tracks for default species and filter defaults
 _init_bigwig = _get_bigwig_names(DEFAULT_SPECIES)
-_init_bigwig_selected_ids = [tid for tid in DEFAULT_BIGWIG_TRACKS if tid in _init_bigwig]
 # Format for display
-_init_bigwig_selected = [_format_track_for_display(tid) for tid in _init_bigwig_selected_ids]
 # Filter default BED elements to only those available
 _init_bed_selected = [elem for elem in DEFAULT_BED_ELEMENTS if elem in _init_bed]
@@ -864,11 +914,13 @@ DEFAULT_COORDS = {
 # Get default coordinates for default species
 _default_coords = DEFAULT_COORDS.get(DEFAULT_SPECIES, DEFAULT_COORDS["human"])
 # Format species names for display (replace underscores with spaces, capitalize)
 def _format_species_name(species: str) -> str:
     """Format species name for display."""
     return species.replace("_", " ").title()
 # Get all available species and format them
 _all_species = sorted(ASSEMBLY_TO_SPECIES.values())
 _all_species_formatted = [_format_species_name(s) for s in _all_species]
@@ -876,12 +928,18 @@ _all_species_list = ", ".join(_all_species_formatted)
 # Get species with BigWig tracks
 _species_with_bigwigs = _get_species_with_bigwigs()
-_bigwig_species_formatted = sorted([_format_species_name(s) for s in _species_with_bigwigs])
-_bigwig_species_list = ", ".join(_bigwig_species_formatted) if _bigwig_species_formatted else "None (BED elements only)"
 with gr.Blocks(title="NTv3 Tracks Demo") as demo:
     gr.Markdown(
-    f"""
 <div class="intro-hero">
 <div class="intro-title">
@@ -933,34 +991,33 @@ with gr.Blocks(title="NTv3 Tracks Demo") as demo:
 </div>
 """,
-    elem_id="intro_markdown",
-)
     gr.Markdown("## Select NTv3 post-trained model")
     # Model display names (without InstaDeepAI/ prefix) and their full IDs
     MODEL_OPTIONS = {
         "NTv3 650M (pos)": "InstaDeepAI/NTv3_650M_pos",
         "NTv3 100M (pos)": "InstaDeepAI/NTv3_100M_pos",
     }
     # Reverse mapping: full ID -> display name
     MODEL_ID_TO_DISPLAY = {v: k for k, v in MODEL_OPTIONS.items()}
     # Get display name for current model
     current_display_name = MODEL_ID_TO_DISPLAY.get(current_model_id, "NTv3 100M (pos)")
     model_selector = gr.Dropdown(
         choices=list(MODEL_OPTIONS.keys()),
         value=current_display_name,
         label="Model",
     )
     model_status = gr.Markdown("", visible=False)
     gr.Markdown("## Input DNA sequence")
     # Get all available species from the pipeline
     all_species = sorted(ASSEMBLY_TO_SPECIES.values())
@@ -969,35 +1026,47 @@ with gr.Blocks(title="NTv3 Tracks Demo") as demo:
         value=DEFAULT_SPECIES,
         label="Species",
     )
     # Radio buttons for input type selection
     is_supported_default = DEFAULT_SPECIES in SPECIES_WITH_COORDINATE_SUPPORT
-    initial_input_type = "Use genomic coordinates" if is_supported_default else "Enter DNA sequence"
     input_type = gr.Radio(
         choices=["Use genomic coordinates", "Enter DNA sequence"],
         value=initial_input_type,
         label="Input method",
         visible=is_supported_default,  # Only show if species supports coordinates
     )
     # Coordinates section - visible only when "Use genomic coordinates" is selected
-    with gr.Group(visible=is_supported_default and initial_input_type == "Use genomic coordinates", elem_id="coords_group") as coords_group:
-        gr.Markdown("**Genomic coordinates** (supported species: " + ", ".join(sorted(SPECIES_WITH_COORDINATE_SUPPORT)) + ")")
         with gr.Row():
             chrom = gr.Textbox(label="Chromosome", value=_default_coords["chrom"])
-            start = gr.Number(label="Start", value=_default_coords["start"], precision=0)
             end = gr.Number(label="End", value=_default_coords["end"], precision=0)
     # DNA sequence section - visible only when "Enter DNA sequence" is selected
     # Using Textbox directly (not wrapped in Group) to avoid visual border/line
     seq = gr.Textbox(
-        lines=4,
-        label="Input DNA sequence",
         placeholder="ACGT...",
         visible=initial_input_type == "Enter DNA sequence",
-        elem_id="dna_sequence_input"
     )
     def change_model(display_name: str, species: str):
         """Reload pipeline with new model."""
         try:
@@ -1007,14 +1076,18 @@ with gr.Blocks(title="NTv3 Tracks Demo") as demo:
             else:
                 # Fallback: assume it's already a model ID or custom value
                 model_id = display_name
             load_pipeline(model_id, species)
             # Update available tracks/elements
             _get_bigwig_names(species)  # warm cache
-            return gr.update(value="✅ Model loaded successfully"), gr.update(visible=True)
         except Exception as e:
-            return gr.update(value=f"❌ Error loading model: {str(e)}"), gr.update(visible=True)
     model_selector.change(
         fn=change_model,
         inputs=[model_selector, species],
@@ -1022,7 +1095,7 @@ with gr.Blocks(title="NTv3 Tracks Demo") as demo:
     )
     gr.Markdown("## Select functional tracks")
     # Button to download tracks metadata
     def get_metadata_file_path():
         """Return path to metadata CSV file for download."""
@@ -1030,7 +1103,7 @@ with gr.Blocks(title="NTv3 Tracks Demo") as demo:
         if csv_path.exists():
             return str(csv_path)
         return None
     metadata_file_path = get_metadata_file_path()
     download_metadata_btn = gr.Button(
         "📋 Download metadata for all functional tracks",
@@ -1041,19 +1114,19 @@ with gr.Blocks(title="NTv3 Tracks Demo") as demo:
         label="Tracks metadata",
         visible=False,
     )
     def download_metadata():
         """Build the update for the metadata download component.

         Returns a ``gr.update`` that sets the file value to
         ``metadata_file_path`` and makes the component visible when that path
         is set and exists on disk; otherwise returns an update that hides it.
         """
         file_available = bool(metadata_file_path) and Path(metadata_file_path).exists()
         if not file_available:
             return gr.update(visible=False)
         return gr.update(value=metadata_file_path, visible=True)
     download_metadata_btn.click(
         fn=download_metadata,
         inputs=[],
         outputs=[metadata_download_file],
     )
     bigwig_no_tracks_msg = gr.Markdown(
         "⚠️ No functional genomic tracks available for this species in the current model.",
         visible=False,
@@ -1063,7 +1136,9 @@ with gr.Blocks(title="NTv3 Tracks Demo") as demo:
         choices=_init_bigwig_selected,
         value=_init_bigwig_selected,
         label="Selected functional tracks (used for prediction)",
-        visible=bool(_init_bigwig_selected),  # Show if there are default tracks, otherwise hidden
     )
     bigwig_query = gr.Textbox(
@@ -1081,7 +1156,7 @@ with gr.Blocks(title="NTv3 Tracks Demo") as demo:
         bigwig_remove_btn = gr.Button("Remove all selected")
     gr.Markdown("## Select genome annotation elements")
     bed_elements = gr.Dropdown(
         choices=_init_bed,
         value=_init_bed_selected if _init_bed_selected else [],
@@ -1092,17 +1167,21 @@ with gr.Blocks(title="NTv3 Tracks Demo") as demo:
     btn = gr.Button("Predict", elem_id="predict_btn")
     gr.Markdown("## NTv3 predictions for selected tracks and elements")
-    gr.Markdown("Note: NTv3 predictions are for the 37.5% center of the input sequence.")
     plot = gr.Plot(label="", elem_id="tracks_plot")
     export_png = gr.File(elem_id="export_png_hidden", interactive=False)
     # State to store prediction output and selections for BigWig export
     prediction_state = gr.State(value=None)
     bigwig_selected_state = gr.State(value=[])
     bed_elements_state = gr.State(value=[])
-    download_bigwig_btn = gr.Button("📥 Download tracks as BigWig files (ZIP)", variant="secondary")
     export_bigwig = gr.File(label="Download BigWig files", visible=False)
     with gr.Accordion("Meta (click to expand)", open=False):
@@ -1124,24 +1203,26 @@ with gr.Blocks(title="NTv3 Tracks Demo") as demo:
     )
     # Helper function to get search results choices directly (without gr.update wrapper)
-    def _get_search_results_choices(species: str, query: str, current_selected: list[str]) -> list[str]:
         """Get search results choices as a list, excluding selected tracks."""
         if query is None:
             query = ""
         query_stripped = query.strip()
         if not query_stripped:
             return []
         names = _get_bigwig_names(species)
         metadata = _load_track_metadata()
         query_lower = query_stripped.lower()
         # Extract track IDs from already selected tracks
         selected_track_ids = set()
         if current_selected:
             selected_track_ids = {_extract_track_id(x) for x in current_selected}
         # Build and filter results
         matching = []
         for track_id in names:
@@ -1149,46 +1230,70 @@ with gr.Blocks(title="NTv3 Tracks Demo") as demo:
                 continue
             display_name = metadata.get(track_id, track_id)
             display_format = _format_track_for_display(track_id)
-            if (query_lower in track_id.lower() or
-                query_lower in display_name.lower() or
-                query_lower in display_format.lower()):
                 matching.append(display_format)
         return matching[:SEARCH_MAX_RESULTS]
     # Auto-add: whenever user checks items in results, add them to Selected,
     # then clear results selection (so it feels like "click to add")
-    def _auto_add(selected_now: list[str], results_checked: list[str], current_query: str, current_results: list[str], current_species: str):
         upd = add_selected(selected_now, results_checked)  # reuses your function
         # Show selected tracks section if there are selections
         show_selected = bool(upd["value"])
         # Get the new search results choices directly (excluding all selected tracks)
-        new_choices = _get_search_results_choices(current_species, current_query, upd["value"])
         # Create a completely fresh update with explicit empty value to prevent any checked state
         # Force Gradio to clear checked state by explicitly setting value to empty list
         # Use a workaround: set choices to empty first, then to new_choices to force a complete refresh
         # But since we can only return one update, we'll ensure value is explicitly empty
         # and that we're not preserving any state from the previous update
         # Ensure no items from results_checked are in new_choices (they should already be filtered, but double-check)
         checked_track_ids = {_extract_track_id(x) for x in results_checked}
-        new_choices_filtered = [c for c in new_choices if _extract_track_id(c) not in checked_track_ids]
         # Create update with explicit empty value - this should force Gradio to clear all checked items
         fresh_update = gr.update(
             choices=new_choices_filtered,
             value=[],  # CRITICAL: Explicitly empty list to clear all checked state
         )
         return gr.update(**upd, visible=show_selected), fresh_update
     # Use a wrapper that ensures results are cleared before updating
-    def _auto_add_wrapper(selected_now: list[str], results_checked: list[str], current_query: str, current_results: list[str], current_species: str):
         # First, get the updates
-        selected_update, results_update = _auto_add(selected_now, results_checked, current_query, current_results, current_species)
         # Force the results update to have an explicit empty value
         # Extract choices from results_update if it's a dict-like object
         if isinstance(results_update, dict):
@@ -1197,21 +1302,26 @@ with gr.Blocks(title="NTv3 Tracks Demo") as demo:
             # If it's a gr.update object, we need to access it differently
             # Try to get choices from the update
             try:
-                results_choices = results_update.choices if hasattr(results_update, 'choices') else []
             except:
                 # Fallback: get choices from the search function directly
                 results_choices = _get_search_results_choices(
-                    current_species,
-                    current_query,
-                    selected_now + results_checked if isinstance(selected_now, list) and isinstance(results_checked, list) else []
                 )
         # Create a completely fresh update with explicit empty value
         # This should force Gradio to clear all checked items
         fresh_results_update = gr.update(choices=results_choices, value=[])
         return selected_update, fresh_results_update
     bigwig_results.change(
         fn=_auto_add_wrapper,
         inputs=[bigwig_selected, bigwig_results, bigwig_query, bigwig_results, species],
@@ -1219,20 +1329,24 @@ with gr.Blocks(title="NTv3 Tracks Demo") as demo:
     )
     # Update selected tracks immediately when user unchecks items
-    def _update_selected_tracks(selected_value: list[str], current_query: str, current_species: str):
         """Update selected tracks when user checks/unchecks items directly."""
         # selected_value contains only the currently checked items
         # Update choices to match the current selections (so unchecked items are removed)
         show_selected = bool(selected_value)
         # Also update search results to reflect the new selection (tracks that were unchecked can now appear in results)
         search_updates = search_bigwigs(current_species, current_query, selected_value)
         return (
-            gr.update(choices=selected_value, value=selected_value, visible=show_selected),  # Update selected tracks
             search_updates[0],  # Update search results
         )
     bigwig_selected.change(
         fn=_update_selected_tracks,
         inputs=[bigwig_selected, bigwig_query, species],
@@ -1261,7 +1375,7 @@ with gr.Blocks(title="NTv3 Tracks Demo") as demo:
         inputs=[species],
         outputs=[bigwig_query, bigwig_results, bigwig_selected],
     )
     # Update coordinates visibility and values when species changes
     def update_on_species_change(species: str, input_type_val: str):
         """Update coordinates visibility and values when species changes."""
@@ -1272,15 +1386,19 @@ with gr.Blocks(title="NTv3 Tracks Demo") as demo:
         use_coords = input_type_val == "Use genomic coordinates"
         show_coords = is_supported and use_coords
         show_seq = not show_coords
         # Format available tracks for display if species has bigwigs
         if has_bigwigs:
             try:
                 track_ids = _get_bigwig_names(species)
                 formatted_tracks = [_format_track_for_display(tid) for tid in track_ids]
                 # Get default tracks for this species (filter to what's available)
-                default_track_ids = [tid for tid in DEFAULT_BIGWIG_TRACKS if tid in track_ids]
-                default_formatted = [_format_track_for_display(tid) for tid in default_track_ids]
                 # Show selected tracks section if there are default tracks
                 show_selected_tracks = bool(default_formatted)
             except:
@@ -1291,29 +1409,42 @@ with gr.Blocks(title="NTv3 Tracks Demo") as demo:
             formatted_tracks = []
             default_formatted = []
             show_selected_tracks = False
         return (
             gr.update(visible=show_coords, value=coords["chrom"]),
             gr.update(visible=show_coords, value=coords["start"]),
             gr.update(visible=show_coords, value=coords["end"]),
-            gr.update(visible=is_supported, value="Use genomic coordinates" if is_supported else "Enter DNA sequence"),  # Update input_type radio
             gr.update(visible=show_coords),  # Show/hide coords_group
-            gr.update(visible=show_seq),     # Show/hide seq
-            gr.update(visible=not has_bigwigs),  # Show "no tracks" message if no bigwigs
-            gr.update(visible=show_selected_tracks, choices=formatted_tracks, value=default_formatted),  # Show bigwig selection with defaults if available
             gr.update(visible=has_bigwigs),  # Show bigwig query if available
             gr.update(visible=has_bigwigs),  # Show bigwig results if available
             gr.update(visible=has_bigwigs),  # Show bigwig buttons if available
         )
     # Update input type radio visibility and value when species changes
     def update_input_type_on_species_change(species: str):
         """Update input type radio when species changes."""
         is_supported = species in SPECIES_WITH_COORDINATE_SUPPORT
         # If species doesn't support coordinates, default to sequence input
-        default_value = "Use genomic coordinates" if is_supported else "Enter DNA sequence"
         return gr.update(visible=is_supported, value=default_value)
     # Update input visibility when radio button changes
     def update_input_visibility(input_type_val: str, species: str):
         """Update input visibility when radio button changes."""
@@ -1321,15 +1452,21 @@ with gr.Blocks(title="NTv3 Tracks Demo") as demo:
         if input_type_val == "Enter DNA sequence":
             # Hide coordinates, show sequence
             return (
-                gr.update(visible=False),  # coords_group - always hide when sequence is selected
-                gr.update(visible=True),   # seq - always show when sequence is selected
             )
         elif input_type_val == "Use genomic coordinates":
             # Show coordinates only if species supports it
             is_supported = species in SPECIES_WITH_COORDINATE_SUPPORT
             return (
-                gr.update(visible=is_supported),  # coords_group - show only if supported
-                gr.update(visible=not is_supported),  # seq - hide when coordinates are shown
             )
         else:
             # Fallback: hide both (shouldn't happen)
@@ -1337,22 +1474,31 @@ with gr.Blocks(title="NTv3 Tracks Demo") as demo:
                 gr.update(visible=False),
                 gr.update(visible=False),
             )
     species.change(
         fn=update_input_type_on_species_change,
         inputs=[species],
         outputs=[input_type],
     )
     species.change(
         fn=update_on_species_change,
         inputs=[species, input_type],
         outputs=[
-            chrom, start, end, input_type, coords_group, seq,
-            bigwig_no_tracks_msg, bigwig_selected, bigwig_query, bigwig_results, bigwig_buttons_row
         ],
     )
     input_type.change(
         fn=update_input_visibility,
         inputs=[input_type, species],
@@ -1361,21 +1507,39 @@ with gr.Blocks(title="NTv3 Tracks Demo") as demo:
     btn.click(
         fn=predict,
-        inputs=[seq, species, chrom, start, end, input_type, bigwig_selected, bed_elements],
-        outputs=[plot, export_png, meta, prediction_state, bigwig_selected_state, bed_elements_state],
         api_name="predict",
     )
     def download_bigwig_zip(out, bw_selected, bed_selected):
         """Create and return BigWig zip file."""
         try:
             zip_path = create_bigwig_zip(out, bw_selected, bed_selected)
             return gr.update(value=zip_path, visible=True)
         except ImportError as e:
-            raise gr.Error("pyBigWig is required for BigWig export. Install with: pip install pyBigWig")
         except Exception as e:
             raise gr.Error(f"Error creating BigWig files: {str(e)}")
     download_bigwig_btn.click(
         fn=download_bigwig_zip,
         inputs=[prediction_state, bigwig_selected_state, bed_elements_state],
@@ -1392,4 +1556,3 @@ if __name__ == "__main__":
         css=CSS,
         js=JS,
     )

 import time
 import uuid
 from pathlib import Path
+import gradio as gr
 import matplotlib
 import matplotlib.pyplot as plt
+import numpy as np
+import torch
+from bigwig_export import _softmax_last, create_bigwig_zip
 from ntv3_tracks_pipeline import (
     ASSEMBLY_TO_SPECIES,
+    BED_ELEMENT_COLORS,
     SPECIES_WITH_COORDINATE_SUPPORT,
+    load_ntv3_tracks_pipeline,
 )
+matplotlib.use("Agg")
 # -----------------------------
 # Env / auth
     or os.environ.get("HUGGINGFACEHUB_API_TOKEN")
 )
 if HF_TOKEN is None:
+    raise RuntimeError(
+        "Missing Hugging Face token. Set NTV3_HF_TOKEN as a Space Secret."
+    )
 # asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
 pipe = None
 current_model_id = MODEL_ID
 def load_pipeline(model_id: str, species: str = DEFAULT_SPECIES):
     """Load or reload the pipeline with a new model."""
     global pipe, current_model_id
     current_model_id = model_id
     return pipe
 # Load initial pipeline
 load_pipeline(MODEL_ID, DEFAULT_SPECIES)
 _t0 = None
 _tlast = None
 def tprint(msg: str):
     "Function to print timing information"
     global _t0, _tlast
     print(f"[timing] {msg}: {now - _tlast:.3f}s (total {now - _t0:.3f}s)")
     _tlast = now
# GPU decorator: use the `spaces.GPU` decorator when the `spaces` package is
# usable, otherwise fall back to a no-op so the app still runs without it.
try:
    import spaces

    gpu = spaces.GPU
except Exception:  # `spaces` missing or failing to initialise

    def gpu(*args, **kwargs):
        """No-op replacement for the GPU decorator.

        Works in both decorator forms:

        * bare ``@gpu`` -- the decorated function arrives as the single
          positional argument and is returned unchanged;
        * factory ``@gpu(...)`` -- a pass-through decorator is returned.
        """
        # Bare usage (this is how `predict` is decorated): without this
        # branch the function would be replaced by `wrap` itself and every
        # call to it would fail.
        if len(args) == 1 and not kwargs and callable(args[0]):
            return args[0]

        def wrap(fn):
            return fn

        return wrap
 def _global_stride(L: int, target: int) -> int:
     if target <= 0 or L <= target:
         return 1
             color = BED_ELEMENT_COLORS[title]
         else:
             color = bigwig_color
         ax.fill_between(x, y, color=color, alpha=0.3, linewidth=0)
         ax.plot(x, y, color=color, linewidth=0.8)
         ax.set_title(title, fontsize=10, loc="left")
     """Load track metadata from CSV and create display name mapping."""
     if _TRACK_METADATA_CACHE:
         return _TRACK_METADATA_CACHE
     csv_path = Path(__file__).parent / "data" / "functional_tracks_metadata.csv"
     if not csv_path.exists():
         return {}
     metadata = {}
     try:
+        with open(csv_path, encoding="utf-8") as f:
             reader = csv.DictReader(f)
             for row in reader:
+                track_id = row["file_id"]
+                tissue = row.get("tissue", "").strip()
+                assay = row.get("assay", "").strip()
+                experiment_target = row.get("experiment_target", "").strip()
+                biosample_type = row.get("biosample_type", "").strip()
+                strand = row.get("strand", "").strip()
                 # Build display name from available fields
                 parts = []
+                if biosample_type and biosample_type != "tissue":
                     parts.append(biosample_type)
                 if tissue:
                     parts.append(tissue)
                 if assay:
                     # For RNA-seq, include strand information if available
                     if strand:
+                        if strand == "plus":
+                            strand = "+"
+                        elif strand == "minus":
+                            strand = "-"
                         parts.append(f"{assay} {strand}")
                     else:
                         parts.append(assay)
+                if experiment_target and experiment_target not in ("none", "RNA-seq"):
                     parts.append(experiment_target)
                 if parts:
                     display_name = " - ".join(parts)
                 else:
                     display_name = track_id  # Fallback to ID if no metadata
                 metadata[track_id] = display_name
     except Exception as e:
         print(f"Warning: Could not load track metadata: {e}")
         return {}
     _TRACK_METADATA_CACHE.update(metadata)
     return metadata
     """Get set of species that have BigWig tracks available in the current model."""
     if pipe is None:
         return set()
     species_with_bigwigs = set()
     for species in ASSEMBLY_TO_SPECIES.values():
         if _has_bigwigs(species):
     if query is None:
         query = ""
     query_stripped = query.strip()
     # If query is empty, return empty results immediately (don't show all tracks)
     if not query_stripped:
         displayed_selected = current_selected or []
         show_selected = bool(displayed_selected)
         return (
+            gr.update(
+                choices=[], value=[], interactive=True
+            ),  # empty results, explicitly clear checked state
+            gr.update(
+                visible=show_selected,
+                choices=displayed_selected,
+                value=displayed_selected,
+            ),  # show ALL selected tracks
         )
     names = _get_bigwig_names(species)
     # Search in both track IDs and display names
     metadata = _load_track_metadata()
     query_lower = query_stripped.lower()
     # Show selected tracks section if user is typing or has selections
     show_selected = bool(query_stripped) or bool(current_selected)
     # Show ALL selected tracks (not limited to 20)
     displayed_selected = current_selected or []
     # Extract track IDs from already selected tracks (to exclude them from results)
     selected_track_ids = set()
     if current_selected:
         selected_track_ids = {_extract_track_id(x) for x in current_selected}
     # Build list of (display_format, track_id) tuples for searching
     track_display_pairs = []
     for track_id in names:
         display_name = metadata.get(track_id, track_id)
         display_format = _format_track_for_display(track_id)
         track_display_pairs.append((display_format, track_id, display_name))
     # Filter by query (search in display name, display format, and track_id)
     matching = []
     for display_format, track_id, display_name in track_display_pairs:
+        if (
+            query_lower in track_id.lower()
+            or query_lower in display_name.lower()
+            or query_lower in display_format.lower()
+        ):
             matching.append(display_format)
     # Limit search results
     results = matching[:SEARCH_MAX_RESULTS]
     return (
+        gr.update(
+            choices=results, value=[], interactive=True
+        ),  # results - limited to SEARCH_MAX_RESULTS, explicitly clear checked state
+        gr.update(
+            visible=show_selected, choices=displayed_selected, value=displayed_selected
+        ),  # show ALL selected tracks
     )
     # Extract track IDs from current selection (in case they're in display format)
     cur_ids = [_extract_track_id(x) for x in (current_selected or [])]
     cur_display = [_format_track_for_display(tid) for tid in cur_ids]
     # Extract track IDs from items to add
     to_add_ids = [_extract_track_id(x) for x in (to_add or [])]
     # Add new track IDs
     for tid in to_add_ids:
         if tid not in cur_ids:
             cur_ids.append(tid)
             cur_display.append(_format_track_for_display(tid))
     # Show ALL selected tracks (no limit)
     return gr.update(choices=cur_display, value=cur_display)  # show all selected tracks
     coords = DEFAULT_COORDS.get(species, DEFAULT_COORDS["human"])
     return coords["chrom"], coords["start"], coords["end"]
 def reset_on_species_change(species: str):
     """Reset the track-search widgets after the species selection changes.

     Results and selections are cleared so that track IDs from the previous
     species cannot linger (avoids mismatched IDs).

     Returns a 3-tuple of Gradio updates, in this order: the query textbox
     (emptied), the search-results list (emptied) and the selected-tracks
     list (all tracks of ``species`` as choices, the available default
     tracks as value; hidden when there are no defaults or lookup fails).
     """
     try:
         available_ids = _get_bigwig_names(species)  # warms cache if available
         all_labels = [_format_track_for_display(tid) for tid in available_ids]
         # Keep only the default tracks that exist for this species.
         default_labels = []
         for tid in DEFAULT_BIGWIG_TRACKS:
             if tid in available_ids:
                 default_labels.append(_format_track_for_display(tid))
         selected_update = gr.update(
             choices=all_labels,
             value=default_labels,
             visible=bool(default_labels),
         )
     except (ValueError, AttributeError):
         # Presumably raised when the species has no BigWig tracks; that is
         # fine, the selected list is simply emptied and hidden.
         selected_update = gr.update(choices=[], value=[], visible=False)
     return (
         gr.update(value=""),  # query textbox
         gr.update(choices=[], value=[]),  # results list
         selected_update,  # selected list
     )
 # -----------------------------
 # Predict
 # -----------------------------
+@gpu
 def predict(
     seq: str,
     species: str,
     # Debug: verify species is being passed
     if not species:
         raise gr.Error("Species parameter is missing. Please select a species.")
     # Extract track IDs from display format if needed
     bigwig_selected = [_extract_track_id(tid) for tid in bigwig_selected]
     # Determine if using coordinates based on input_type radio button
     use_coords = input_type == "Use genomic coordinates"
     if use_coords:
         # Check if this species supports coordinate-based fetching
         if species not in SPECIES_WITH_COORDINATE_SUPPORT:
             raise gr.Error("chrom is required when use_coords=True")
         if start is None or end is None or int(end) <= int(start):
             raise gr.Error("start/end must be set and end > start when use_coords=True")
+        inputs = {
+            "chrom": chrom,
+            "start": int(start),
+            "end": int(end),
+            "species": species,
+        }
     else:
         if not seq or not seq.strip():
             raise gr.Error("seq is required when use_coords=False")
     # Verify species is in inputs before calling pipeline
     if "species" not in inputs:
+        raise gr.Error(
+            f"Internal error: species not found in inputs dict. Inputs: {list(inputs.keys())}"
+        )
     tprint("inputs prepared")
     # Check if we have any tracks/elements to plot
     has_bigwigs = bw is not None and len(bw_names) > 0
     has_bed = bed_logits is not None and len(bed_names) > 0
     if not has_bigwigs and not has_bed:
+        raise gr.Error(
+            "No BigWig tracks or BED elements available for this species in the current model."
+        )
     if not has_bigwigs and bigwig_selected:
+        raise gr.Error(
+            "No BigWig tracks available for this species, but BigWig tracks were selected. Please deselect BigWig tracks or choose a different species."
+        )
     # Defaults if user picked none
     if has_bigwigs and not bigwig_selected:
         ]
         # Filter to only include tracks that are available for this species/assembly
         bigwig_selected = [tid for tid in default_bigwig_tracks if tid in bw_names]
     if (not bed_elements) and bed_names:
         default_bed_elements = ["protein_coding_gene", "exon", "intron"]
         # Filter to only include elements that are available
         L = bed_logits.shape[0]
     else:
         raise gr.Error("No data available for plotting.")
     stride = _global_stride(L, PLOT_TARGET_POINTS)
     x0 = int(out.pred_start or 0)
     x = np.linspace(x0, x1, num=L, endpoint=False)[::stride]
     series: list[tuple[str, np.ndarray]] = []
     # Add BigWig tracks if available and selected
     if has_bigwigs and bigwig_selected:
         for tid in bigwig_selected:
     fig = _make_tracks_figure(x, series)
     tprint("figure created")
+    region = (
+        f"{out.chrom}:{out.pred_start}-{out.pred_end}" if out.chrom else f"{x0}-{x1}"
+    )
     if out.assembly:
         region += f" ({out.assembly})"
     fig.axes[-1].set_xlabel(region)
 # Get available BigWig tracks for default species and filter defaults
 _init_bigwig = _get_bigwig_names(DEFAULT_SPECIES)
+_init_bigwig_selected_ids = [
+    tid for tid in DEFAULT_BIGWIG_TRACKS if tid in _init_bigwig
+]
 # Format for display
+_init_bigwig_selected = [
+    _format_track_for_display(tid) for tid in _init_bigwig_selected_ids
+]
 # Filter default BED elements to only those available
 _init_bed_selected = [elem for elem in DEFAULT_BED_ELEMENTS if elem in _init_bed]
 # Get default coordinates for default species
 _default_coords = DEFAULT_COORDS.get(DEFAULT_SPECIES, DEFAULT_COORDS["human"])
 # Format species names for display (replace underscores with spaces, capitalize)
 def _format_species_name(species: str) -> str:
     """Format species name for display."""
     return species.replace("_", " ").title()
 # Get all available species and format them
 _all_species = sorted(ASSEMBLY_TO_SPECIES.values())
 _all_species_formatted = [_format_species_name(s) for s in _all_species]
 # Get species with BigWig tracks
 _species_with_bigwigs = _get_species_with_bigwigs()
+_bigwig_species_formatted = sorted(
+    [_format_species_name(s) for s in _species_with_bigwigs]
+)
+_bigwig_species_list = (
+    ", ".join(_bigwig_species_formatted)
+    if _bigwig_species_formatted
+    else "None (BED elements only)"
+)
 with gr.Blocks(title="NTv3 Tracks Demo") as demo:
     gr.Markdown(
+        f"""
 <div class="intro-hero">
 <div class="intro-title">
 </div>
 """,
+        elem_id="intro_markdown",
+    )
     gr.Markdown("## Select NTv3 post-trained model")
     # Model display names (without InstaDeepAI/ prefix) and their full IDs
     MODEL_OPTIONS = {
         "NTv3 650M (pos)": "InstaDeepAI/NTv3_650M_pos",
         "NTv3 100M (pos)": "InstaDeepAI/NTv3_100M_pos",
     }
     # Reverse mapping: full ID -> display name
     MODEL_ID_TO_DISPLAY = {v: k for k, v in MODEL_OPTIONS.items()}
     # Get display name for current model
     current_display_name = MODEL_ID_TO_DISPLAY.get(current_model_id, "NTv3 100M (pos)")
     model_selector = gr.Dropdown(
         choices=list(MODEL_OPTIONS.keys()),
         value=current_display_name,
         label="Model",
     )
     model_status = gr.Markdown("", visible=False)
     gr.Markdown("## Input DNA sequence")
     # Get all available species from the pipeline
     all_species = sorted(ASSEMBLY_TO_SPECIES.values())
         value=DEFAULT_SPECIES,
         label="Species",
     )
     # Radio buttons for input type selection
     is_supported_default = DEFAULT_SPECIES in SPECIES_WITH_COORDINATE_SUPPORT
+    initial_input_type = (
+        "Use genomic coordinates" if is_supported_default else "Enter DNA sequence"
+    )
     input_type = gr.Radio(
         choices=["Use genomic coordinates", "Enter DNA sequence"],
         value=initial_input_type,
         label="Input method",
         visible=is_supported_default,  # Only show if species supports coordinates
     )
     # Coordinates section - visible only when "Use genomic coordinates" is selected
+    with gr.Group(
+        visible=is_supported_default
+        and initial_input_type == "Use genomic coordinates",
+        elem_id="coords_group",
+    ) as coords_group:
+        gr.Markdown(
+            "**Genomic coordinates** (supported species: "
+            + ", ".join(sorted(SPECIES_WITH_COORDINATE_SUPPORT))
+            + ")"
+        )
         with gr.Row():
             chrom = gr.Textbox(label="Chromosome", value=_default_coords["chrom"])
+            start = gr.Number(
+                label="Start", value=_default_coords["start"], precision=0
+            )
             end = gr.Number(label="End", value=_default_coords["end"], precision=0)
     # DNA sequence section - visible only when "Enter DNA sequence" is selected
     # Using Textbox directly (not wrapped in Group) to avoid visual border/line
     seq = gr.Textbox(
+        lines=4,
+        label="Input DNA sequence",
         placeholder="ACGT...",
         visible=initial_input_type == "Enter DNA sequence",
+        elem_id="dna_sequence_input",
     )
     def change_model(display_name: str, species: str):
         """Reload pipeline with new model."""
         try:
             else:
                 # Fallback: assume it's already a model ID or custom value
                 model_id = display_name
             load_pipeline(model_id, species)
             # Update available tracks/elements
             _get_bigwig_names(species)  # warm cache
+            return gr.update(value="✅ Model loaded successfully"), gr.update(
+                visible=True
+            )
         except Exception as e:
+            return gr.update(value=f"❌ Error loading model: {str(e)}"), gr.update(
+                visible=True
+            )
     model_selector.change(
         fn=change_model,
         inputs=[model_selector, species],
     )
     gr.Markdown("## Select functional tracks")
     # Button to download tracks metadata
     def get_metadata_file_path():
         """Return path to metadata CSV file for download."""
         if csv_path.exists():
             return str(csv_path)
         return None
     metadata_file_path = get_metadata_file_path()
     download_metadata_btn = gr.Button(
         "📋 Download metadata for all functional tracks",
         label="Tracks metadata",
         visible=False,
     )
     def download_metadata():
         """Expose the metadata CSV through the download component.

         Returns a Gradio update that shows the file when
         ``metadata_file_path`` is set and exists on disk, and hides the
         component otherwise.
         """
         if not metadata_file_path or not Path(metadata_file_path).exists():
             return gr.update(visible=False)
         return gr.update(value=metadata_file_path, visible=True)
     download_metadata_btn.click(
         fn=download_metadata,
         inputs=[],
         outputs=[metadata_download_file],
     )
     bigwig_no_tracks_msg = gr.Markdown(
         "⚠️ No functional genomic tracks available for this species in the current model.",
         visible=False,
         choices=_init_bigwig_selected,
         value=_init_bigwig_selected,
         label="Selected functional tracks (used for prediction)",
+        visible=bool(
+            _init_bigwig_selected
+        ),  # Show if there are default tracks, otherwise hidden
     )
     bigwig_query = gr.Textbox(
         bigwig_remove_btn = gr.Button("Remove all selected")
     gr.Markdown("## Select genome annotation elements")
     bed_elements = gr.Dropdown(
         choices=_init_bed,
         value=_init_bed_selected if _init_bed_selected else [],
     btn = gr.Button("Predict", elem_id="predict_btn")
     gr.Markdown("## NTv3 predictions for selected tracks and elements")
+    gr.Markdown(
+        "Note: NTv3 predictions are for the 37.5% center of the input sequence."
+    )
     plot = gr.Plot(label="", elem_id="tracks_plot")
     export_png = gr.File(elem_id="export_png_hidden", interactive=False)
     # State to store prediction output and selections for BigWig export
     prediction_state = gr.State(value=None)
     bigwig_selected_state = gr.State(value=[])
     bed_elements_state = gr.State(value=[])
+    download_bigwig_btn = gr.Button(
+        "📥 Download tracks as BigWig files (ZIP)", variant="secondary"
+    )
     export_bigwig = gr.File(label="Download BigWig files", visible=False)
     with gr.Accordion("Meta (click to expand)", open=False):
     )
     # Helper function to get search results choices directly (without gr.update wrapper)
+    def _get_search_results_choices(
+        species: str, query: str, current_selected: list[str]
+    ) -> list[str]:
         """Get search results choices as a list, excluding selected tracks."""
         if query is None:
             query = ""
         query_stripped = query.strip()
         if not query_stripped:
             return []
         names = _get_bigwig_names(species)
         metadata = _load_track_metadata()
         query_lower = query_stripped.lower()
         # Extract track IDs from already selected tracks
         selected_track_ids = set()
         if current_selected:
             selected_track_ids = {_extract_track_id(x) for x in current_selected}
         # Build and filter results
         matching = []
         for track_id in names:
                 continue
             display_name = metadata.get(track_id, track_id)
             display_format = _format_track_for_display(track_id)
+            if (
+                query_lower in track_id.lower()
+                or query_lower in display_name.lower()
+                or query_lower in display_format.lower()
+            ):
                 matching.append(display_format)
         return matching[:SEARCH_MAX_RESULTS]
     # Auto-add: whenever user checks items in results, add them to Selected,
     # then clear results selection (so it feels like "click to add")
def _auto_add(
    selected_now: list[str],
    results_checked: list[str],
    current_query: str,
    current_results: list[str],
    current_species: str,
):
    """Move the checked search results into the selected-tracks list.

    Returns a pair of Gradio updates ``(selected_update, results_update)``.
    The results update lists the matches for the current query minus
    anything just checked, and always carries ``value=[]`` so that no result
    stays ticked ("click to add" behaviour). ``current_results`` is accepted
    but not used.
    """
    merged = add_selected(selected_now, results_checked)  # reuses add_selected
    now_selected = merged["value"]

    # Matches for the current query, computed against the updated selection.
    candidates = _get_search_results_choices(
        current_species, current_query, now_selected
    )

    # Double-check: drop anything that was just checked, even though the
    # search helper should already have excluded it.
    just_checked = {_extract_track_id(item) for item in results_checked}
    remaining = []
    for choice in candidates:
        if _extract_track_id(choice) in just_checked:
            continue
        remaining.append(choice)

    # CRITICAL: the explicit empty value clears every checked item.
    results_update = gr.update(choices=remaining, value=[])
    selected_update = gr.update(**merged, visible=bool(now_selected))
    return selected_update, results_update
     # Use a wrapper that ensures results are cleared before updating
def _auto_add_wrapper(
    selected_now: list[str],
    results_checked: list[str],
    current_query: str,
    current_results: list[str],
    current_species: str,
):
    """Handle ``bigwig_results.change``: add checked results, then uncheck them.

    Delegates to ``_auto_add`` and rebuilds the search-results update so its
    value is always an explicit empty list.

    Returns
    -------
    tuple
        ``(selected_update, results_update)`` Gradio updates.
    """
    selected_update, results_update = _auto_add(
        selected_now,
        results_checked,
        current_query,
        current_results,
        current_species,
    )

    # gr.update() yields a mapping (``_auto_add`` indexes one with
    # ``upd["value"]``), so the remaining choices must be read by key. The
    # earlier attribute lookup never matched on a dict, which wiped the
    # results list, and it left the choices unbound for non-dict updates.
    if isinstance(results_update, dict):
        remaining = results_update.get("choices")
    else:
        remaining = getattr(results_update, "choices", None)

    if remaining is None:
        # The update carried no choices: recompute them from the query,
        # excluding everything that is (or just became) selected.
        remaining = _get_search_results_choices(
            current_species,
            current_query,
            list(selected_now or []) + list(results_checked or []),
        )

    # Fresh update with an explicit empty value so nothing stays checked.
    return selected_update, gr.update(choices=remaining, value=[])
     bigwig_results.change(
         fn=_auto_add_wrapper,
         inputs=[bigwig_selected, bigwig_results, bigwig_query, bigwig_results, species],
     )
     # Update selected tracks immediately when user unchecks items
def _update_selected_tracks(
    selected_value: list[str], current_query: str, current_species: str
):
    """Sync the selected-tracks list after the user checks or unchecks items.

    ``selected_value`` holds only the items that are still checked; it becomes
    both the choices and the value of the selected list, so unchecked items
    disappear. The search results are recomputed as well, because a track
    that was unchecked may now show up in them again.

    Returns ``(selected_update, results_update)``.
    """
    # Re-run the search first; only its results update (first element) is used.
    refreshed_results = search_bigwigs(current_species, current_query, selected_value)[0]
    selected_update = gr.update(
        choices=selected_value,
        value=selected_value,
        visible=bool(selected_value),
    )
    return selected_update, refreshed_results
     bigwig_selected.change(
         fn=_update_selected_tracks,
         inputs=[bigwig_selected, bigwig_query, species],
         inputs=[species],
         outputs=[bigwig_query, bigwig_results, bigwig_selected],
     )
     # Update coordinates visibility and values when species changes
     def update_on_species_change(species: str, input_type_val: str):
         """Update coordinates visibility and values when species changes."""
         use_coords = input_type_val == "Use genomic coordinates"
         show_coords = is_supported and use_coords
         show_seq = not show_coords
         # Format available tracks for display if species has bigwigs
         if has_bigwigs:
             try:
                 track_ids = _get_bigwig_names(species)
                 formatted_tracks = [_format_track_for_display(tid) for tid in track_ids]
                 # Get default tracks for this species (filter to what's available)
+                default_track_ids = [
+                    tid for tid in DEFAULT_BIGWIG_TRACKS if tid in track_ids
+                ]
+                default_formatted = [
+                    _format_track_for_display(tid) for tid in default_track_ids
+                ]
                 # Show selected tracks section if there are default tracks
                 show_selected_tracks = bool(default_formatted)
             except:
             formatted_tracks = []
             default_formatted = []
             show_selected_tracks = False
         return (
             gr.update(visible=show_coords, value=coords["chrom"]),
             gr.update(visible=show_coords, value=coords["start"]),
             gr.update(visible=show_coords, value=coords["end"]),
+            gr.update(
+                visible=is_supported,
+                value="Use genomic coordinates"
+                if is_supported
+                else "Enter DNA sequence",
+            ),  # Update input_type radio
             gr.update(visible=show_coords),  # Show/hide coords_group
+            gr.update(visible=show_seq),  # Show/hide seq
+            gr.update(
+                visible=not has_bigwigs
+            ),  # Show "no tracks" message if no bigwigs
+            gr.update(
+                visible=show_selected_tracks,
+                choices=formatted_tracks,
+                value=default_formatted,
+            ),  # Show bigwig selection with defaults if available
             gr.update(visible=has_bigwigs),  # Show bigwig query if available
             gr.update(visible=has_bigwigs),  # Show bigwig results if available
             gr.update(visible=has_bigwigs),  # Show bigwig buttons if available
         )
     # Update input type radio visibility and value when species changes
     def update_input_type_on_species_change(species: str):
         """Update the input-method radio after a species change.

         Species with coordinate support get a visible radio preset to
         "Use genomic coordinates"; for any other species the radio is
         hidden and set to "Enter DNA sequence".
         """
         if species in SPECIES_WITH_COORDINATE_SUPPORT:
             return gr.update(visible=True, value="Use genomic coordinates")
         # No coordinate support: fall back to sequence input.
         return gr.update(visible=False, value="Enter DNA sequence")
     # Update input visibility when radio button changes
     def update_input_visibility(input_type_val: str, species: str):
         """Update input visibility when radio button changes."""
         if input_type_val == "Enter DNA sequence":
             # Hide coordinates, show sequence
             return (
+                gr.update(
+                    visible=False
+                ),  # coords_group - always hide when sequence is selected
+                gr.update(visible=True),  # seq - always show when sequence is selected
             )
         elif input_type_val == "Use genomic coordinates":
             # Show coordinates only if species supports it
             is_supported = species in SPECIES_WITH_COORDINATE_SUPPORT
             return (
+                gr.update(
+                    visible=is_supported
+                ),  # coords_group - show only if supported
+                gr.update(
+                    visible=not is_supported
+                ),  # seq - hide when coordinates are shown
             )
         else:
             # Fallback: hide both (shouldn't happen)
                 gr.update(visible=False),
                 gr.update(visible=False),
             )
     species.change(
         fn=update_input_type_on_species_change,
         inputs=[species],
         outputs=[input_type],
     )
     species.change(
         fn=update_on_species_change,
         inputs=[species, input_type],
         outputs=[
+            chrom,
+            start,
+            end,
+            input_type,
+            coords_group,
+            seq,
+            bigwig_no_tracks_msg,
+            bigwig_selected,
+            bigwig_query,
+            bigwig_results,
+            bigwig_buttons_row,
         ],
     )
     input_type.change(
         fn=update_input_visibility,
         inputs=[input_type, species],
     btn.click(
         fn=predict,
+        inputs=[
+            seq,
+            species,
+            chrom,
+            start,
+            end,
+            input_type,
+            bigwig_selected,
+            bed_elements,
+        ],
+        outputs=[
+            plot,
+            export_png,
+            meta,
+            prediction_state,
+            bigwig_selected_state,
+            bed_elements_state,
+        ],
         api_name="predict",
     )
     def download_bigwig_zip(out, bw_selected, bed_selected):
         """Create the BigWig zip file and return it for download.

         Parameters
         ----------
         out
             Stored prediction output (from ``prediction_state``).
         bw_selected
             Selected BigWig track IDs (from ``bigwig_selected_state``).
         bed_selected
             Selected BED element names (from ``bed_elements_state``).

         Returns
         -------
         A Gradio update that points the file component at the zip and
         makes it visible.

         Raises
         ------
         gr.Error
             If the export raises ImportError (reported as pyBigWig being
             required) or if zip creation fails for any other reason. The
             original exception is chained as the cause so the server
             traceback keeps the root error.
         """
         try:
             zip_path = create_bigwig_zip(out, bw_selected, bed_selected)
         except ImportError as e:
             raise gr.Error(
                 "pyBigWig is required for BigWig export. Install with: pip install pyBigWig"
             ) from e
         except Exception as e:
             raise gr.Error(f"Error creating BigWig files: {str(e)}") from e
         return gr.update(value=zip_path, visible=True)
     download_bigwig_btn.click(
         fn=download_bigwig_zip,
         inputs=[prediction_state, bigwig_selected_state, bed_elements_state],
         css=CSS,
         js=JS,
     )

bigwig_export.py CHANGED Viewed

@@ -3,8 +3,8 @@ BigWig export functionality for NTv3 tracks.
 """
 import os
-import uuid
 import tempfile
 import zipfile
 from typing import TYPE_CHECKING
@@ -33,7 +33,7 @@ def create_bigwig_zip(
 ) -> str:
     """
     Create BigWig files for selected tracks and save them in a zip file.
     Parameters
     ----------
     out : NTv3TracksOutput
@@ -42,12 +42,12 @@ def create_bigwig_zip(
         List of BigWig track IDs to export.
     bed_elements : list[str]
         List of BED element names to export.
     Returns
     -------
     str
         Path to the created zip file containing BigWig files.
     Raises
     ------
     ImportError
@@ -56,46 +56,50 @@ def create_bigwig_zip(
         If no predictions are available or no tracks are selected.
     """
     if pyBigWig is None:
-        raise ImportError("pyBigWig is required for BigWig export. Install with: pip install pyBigWig")
     if out is None:
         raise ValueError("No predictions available. Please run a prediction first.")
     bw_names = out.bigwig_track_names or []
     bw_logits = out.bigwig_tracks_logits
     bed_names = out.bed_element_names or []
     bed_logits = out.bed_tracks_logits
     if bw_logits is None or not bw_names:
         raise ValueError("No BigWig tracks available in model output.")
     # Get genomic coordinates
     chrom = out.chrom
     if chrom is None:
-        raise ValueError("Chromosome information not available. Use genomic coordinates for BigWig export.")
     start = out.start
     end = out.end
     window_len = out.window_len or (end - start)
     # Calculate prediction region (center 37.5%)
     pred_start = out.pred_start or (start + int(window_len * 0.3125))
     pred_end = out.pred_end or (pred_start + int(window_len * 0.375))
     # Create temporary directory for BigWig files
     tmpdir = tempfile.gettempdir()
     output_dir = os.path.join(tmpdir, f"bigwig_outputs_{uuid.uuid4().hex}")
     os.makedirs(output_dir, exist_ok=True)
     # Prepare track data list
     track_data_list = []
     # Add BigWig tracks
     for track_id in bigwig_selected:
         if track_id in bw_names:
             idx = bw_names.index(track_id)
             track_data_list.append(("bigwig", track_id, idx, None))
     # Add BED elements (as probabilities)
     if bed_logits is not None and bed_elements:
         probs = _softmax_last(bed_logits)
@@ -104,10 +108,10 @@ def create_bigwig_zip(
                 eidx = bed_names.index(elem_name)
                 # Store as bed element with probability data
                 track_data_list.append(("bed", elem_name, eidx, probs[:, eidx, 1]))
     if not track_data_list:
         raise ValueError("No tracks selected for export.")
     # Create BigWig files
     created_files = []
     for track_type, track_id, track_idx, bed_probs in track_data_list:
@@ -119,39 +123,39 @@ def create_bigwig_zip(
                 continue
             track_data = bed_probs.astype(np.float32)
             display_name = track_id
         # Clean filename
         clean_name = display_name.replace(" ", "_").replace("/", "_").replace("-", "_")
         bw_filename = os.path.join(output_dir, f"{clean_name}.bw")
         # Create BigWig file
         bw = pyBigWig.open(bw_filename, "w")
         # Add header - use end of genomic window as chromosome size
         bw.addHeader([(chrom, end)])
         # Add entries
         num_positions = len(track_data)
         starts = np.arange(pred_start, pred_start + num_positions, dtype=np.int64)
         ends = starts + 1
         values = track_data.tolist()
         bw.addEntries(
             chroms=[chrom] * len(starts),
             starts=starts.tolist(),
             ends=ends.tolist(),
-            values=values
         )
         bw.close()
         created_files.append(bw_filename)
     # Create zip file
     zip_path = os.path.join(tmpdir, f"ntv3_tracks_{uuid.uuid4().hex}.zip")
-    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
         for bw_file in created_files:
             zipf.write(bw_file, os.path.basename(bw_file))
     # Clean up individual BigWig files
     for bw_file in created_files:
         try:
@@ -162,6 +166,5 @@ def create_bigwig_zip(
         os.rmdir(output_dir)
     except:
         pass
-    return zip_path

 """
 import os
 import tempfile
+import uuid
 import zipfile
 from typing import TYPE_CHECKING
 ) -> str:
     """
     Create BigWig files for selected tracks and save them in a zip file.
     Parameters
     ----------
     out : NTv3TracksOutput
         List of BigWig track IDs to export.
     bed_elements : list[str]
         List of BED element names to export.
     Returns
     -------
     str
         Path to the created zip file containing BigWig files.
     Raises
     ------
     ImportError
         If no predictions are available or no tracks are selected.
     """
     if pyBigWig is None:
+        raise ImportError(
+            "pyBigWig is required for BigWig export. Install with: pip install pyBigWig"
+        )
     if out is None:
         raise ValueError("No predictions available. Please run a prediction first.")
     bw_names = out.bigwig_track_names or []
     bw_logits = out.bigwig_tracks_logits
     bed_names = out.bed_element_names or []
     bed_logits = out.bed_tracks_logits
     if bw_logits is None or not bw_names:
         raise ValueError("No BigWig tracks available in model output.")
     # Get genomic coordinates
     chrom = out.chrom
     if chrom is None:
+        raise ValueError(
+            "Chromosome information not available. Use genomic coordinates for BigWig export."
+        )
     start = out.start
     end = out.end
     window_len = out.window_len or (end - start)
     # Calculate prediction region (center 37.5%)
     pred_start = out.pred_start or (start + int(window_len * 0.3125))
     pred_end = out.pred_end or (pred_start + int(window_len * 0.375))
     # Create temporary directory for BigWig files
     tmpdir = tempfile.gettempdir()
     output_dir = os.path.join(tmpdir, f"bigwig_outputs_{uuid.uuid4().hex}")
     os.makedirs(output_dir, exist_ok=True)
     # Prepare track data list
     track_data_list = []
     # Add BigWig tracks
     for track_id in bigwig_selected:
         if track_id in bw_names:
             idx = bw_names.index(track_id)
             track_data_list.append(("bigwig", track_id, idx, None))
     # Add BED elements (as probabilities)
     if bed_logits is not None and bed_elements:
         probs = _softmax_last(bed_logits)
                 eidx = bed_names.index(elem_name)
                 # Store as bed element with probability data
                 track_data_list.append(("bed", elem_name, eidx, probs[:, eidx, 1]))
     if not track_data_list:
         raise ValueError("No tracks selected for export.")
     # Create BigWig files
     created_files = []
     for track_type, track_id, track_idx, bed_probs in track_data_list:
                 continue
             track_data = bed_probs.astype(np.float32)
             display_name = track_id
         # Clean filename
         clean_name = display_name.replace(" ", "_").replace("/", "_").replace("-", "_")
         bw_filename = os.path.join(output_dir, f"{clean_name}.bw")
         # Create BigWig file
         bw = pyBigWig.open(bw_filename, "w")
         # Add header - use end of genomic window as chromosome size
         bw.addHeader([(chrom, end)])
         # Add entries
         num_positions = len(track_data)
         starts = np.arange(pred_start, pred_start + num_positions, dtype=np.int64)
         ends = starts + 1
         values = track_data.tolist()
         bw.addEntries(
             chroms=[chrom] * len(starts),
             starts=starts.tolist(),
             ends=ends.tolist(),
+            values=values,
         )
         bw.close()
         created_files.append(bw_filename)
     # Create zip file
     zip_path = os.path.join(tmpdir, f"ntv3_tracks_{uuid.uuid4().hex}.zip")
+    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
         for bw_file in created_files:
             zipf.write(bw_file, os.path.basename(bw_file))
     # Clean up individual BigWig files
     for bw_file in created_files:
         try:
         os.rmdir(output_dir)
     except:
         pass
+    return zip_path

data/functional_tracks_metadata.csv CHANGED Viewed

@@ -15887,4 +15887,4 @@ GSM874952,Unknown,,TF ChIP-seq,,RPB2,mouse,geo
 GSM874953,Unknown,,TF ChIP-seq,,RPB2,mouse,geo
 GSM874954,Unknown,,TF ChIP-seq,,RPB2,mouse,geo
 GSM874955,Unknown,,TF ChIP-seq,,RPB2,mouse,geo
-GSM874956,Unknown,,TF ChIP-seq,,RPB2,mouse,geo

 GSM874953,Unknown,,TF ChIP-seq,,RPB2,mouse,geo
 GSM874954,Unknown,,TF ChIP-seq,,RPB2,mouse,geo
 GSM874955,Unknown,,TF ChIP-seq,,RPB2,mouse,geo
+GSM874956,Unknown,,TF ChIP-seq,,RPB2,mouse,geo

ntv3_tracks_pipeline.py CHANGED Viewed

@@ -2,7 +2,7 @@ from __future__ import annotations
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Union
 import numpy as np
 import torch
@@ -109,6 +109,7 @@ BED_ELEMENT_COLORS = {
     "ORF": "#1F618D",  # Blue 2
 }
 def _sanitize_dna(seq: str) -> str:
     seq = seq.upper()
     return "".join(ch if ch in ("A", "C", "G", "T", "N") else "N" for ch in seq)
@@ -117,24 +118,26 @@ def _sanitize_dna(seq: str) -> str:
 def _get_dna_sequence(assembly: str, chrom: str, start: int, end: int) -> str:
     """
     Fetch DNA sequence from API based on assembly, chromosome, and coordinates.
     Uses ASSEMBLY_TO_API_URL_TEMPLATE to determine the API URL format for each assembly.
     Falls back to DEFAULT_API_URL_TEMPLATE if assembly is not in the mapping.
     """
     if requests is None:
-        raise ImportError("requests is required for genome download. Install with: pip install requests")
     # Get API URL template for this assembly, or use default
     url_template = ASSEMBLY_TO_API_URL_TEMPLATE.get(assembly, DEFAULT_API_URL_TEMPLATE)
     # Format the URL with the provided parameters
     url = url_template.format(assembly=assembly, chrom=chrom, start=start, end=end)
     seq = requests.get(url).json()["dna"].upper()
     return seq
-def _ensure_fasta_for_assembly(assembly: str, cache_dir: Union[str, Path]) -> Path:
     """
     Download <assembly>.fa.gz, decompress to <assembly>.fa, return the .fa path.
     pyfaidx works reliably on uncompressed FASTA.
@@ -156,6 +159,7 @@ def _ensure_fasta_for_assembly(assembly: str, cache_dir: Union[str, Path]) -> Pa
         )
     import gzip
     print(f"Decompressing {gz_path} -> {fa_path}")
     with gzip.open(gz_path, "rb") as fin, open(fa_path, "wb") as fout:
         while True:
@@ -166,11 +170,12 @@ def _ensure_fasta_for_assembly(assembly: str, cache_dir: Union[str, Path]) -> Pa
     return fa_path
-def _pick_device(device: Union[str, int, torch.device]) -> torch.device:
     # Handle torch.device objects
     if isinstance(device, torch.device):
         return device
     # Handle integer device IDs (transformers pipeline convention)
     if isinstance(device, int):
         if device == -1:
@@ -182,7 +187,7 @@ def _pick_device(device: Union[str, int, torch.device]) -> torch.device:
                 return torch.device("cpu")
         else:
             raise ValueError(f"Invalid device integer: {device}")
     # Handle string device names
     if isinstance(device, str):
         d = device.lower()
@@ -194,9 +199,13 @@ def _pick_device(device: Union[str, int, torch.device]) -> torch.device:
             return torch.device("cpu")
         if d in ("cuda", "cpu", "mps"):
             return torch.device(d)
-        raise ValueError("device must be one of: 'auto', 'cpu', 'cuda', 'mps', or an integer")
-    raise ValueError(f"device must be a string, integer, or torch.device, got {type(device)}")
 def _softmax_last(x: np.ndarray) -> np.ndarray:
@@ -206,16 +215,18 @@ def _softmax_last(x: np.ndarray) -> np.ndarray:
 def _plot_tracks_fillbetween(
-    tracks: Dict[str, np.ndarray],
-    chrom: Optional[str],
     start: int,
     end: int,
-    assembly: Optional[str],
     height: float = 1.0,
     figsize_x: float = 20.0,
 ):
     if plt is None:
-        raise ImportError("matplotlib is required for plotting. Install with: pip install matplotlib")
     n = len(tracks)
     if n == 0:
@@ -238,7 +249,7 @@ def _plot_tracks_fillbetween(
             color = BED_ELEMENT_COLORS[title]
         else:
             color = bigwig_color
         ax.fill_between(x, y, color=color, alpha=0.3, linewidth=0)
         ax.plot(x, y, color=color, linewidth=0.8)
         ax.set_title(title, fontsize=10, loc="left")
@@ -260,29 +271,31 @@ def _plot_tracks_fillbetween(
 @dataclass
 class NTv3TracksOutput:
     bigwig_tracks_logits: np.ndarray  # (L_pred, T)
-    bed_tracks_logits: np.ndarray     # (L_pred, E, C)
     mlm_logits: np.ndarray
-    chrom: Optional[str] = None
-    start: Optional[int] = None
-    end: Optional[int] = None
-    species: Optional[str] = None
-    assembly: Optional[str] = None
-    bigwig_track_names: Optional[List[str]] = None  # from cfg.bigwigs_per_file_assembly[assembly]
-    bed_element_names: Optional[List[str]] = None
-    window_len: Optional[int] = None
-    pred_start: Optional[int] = None
-    pred_end: Optional[int] = None
 class NTv3TracksPipeline(Pipeline):
     def __init__(
         self,
-        model: Union[str, torch.nn.Module],
-        tokenizer: Optional[Union[str, Any]] = None,
         trust_remote_code: bool = True,
-        token: Optional[str] = None,
         default_species: str = "human",
-        genome_cache_dir: Union[str, Path] = "~/.cache/ntv3/genomes",
         device: str = "auto",
         mps_force_cpu: bool = True,
         mps_force_cpu_length: int = 16384,
@@ -302,24 +315,36 @@ class NTv3TracksPipeline(Pipeline):
         self.pred_center_offset_fraction = float(pred_center_offset_fraction)
         if isinstance(model, str):
-            self.config = AutoConfig.from_pretrained(model, trust_remote_code=trust_remote_code, token=token)
-            self.model = AutoModel.from_pretrained(model, trust_remote_code=trust_remote_code, token=token)
         else:
             self.model = model
             self.config = getattr(model, "config", None)
         if tokenizer is None:
             if not self.model_id:
-                raise ValueError("If passing a model module, pass tokenizer explicitly.")
-            self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, trust_remote_code=trust_remote_code, token=token)
         elif isinstance(tokenizer, str):
-            self.tokenizer = AutoTokenizer.from_pretrained(tokenizer, trust_remote_code=trust_remote_code, token=token)
         else:
             self.tokenizer = tokenizer
         # Extract model_id from config if not already set (following ntv3_gff_pipeline.py pattern)
         if self.model_id is None and self.config is not None:
-            self.model_id = getattr(self.config, "_name_or_path", None) or getattr(self.config, "name_or_path", None)
         # Load species_tokenizer (following ntv3_gff_pipeline.py pattern)
         if self.model_id:
@@ -332,19 +357,22 @@ class NTv3TracksPipeline(Pipeline):
         else:
             self.species_tokenizer = kwargs.get("species_tokenizer", None)
             if self.species_tokenizer is None:
-                raise ValueError("Pass species_tokenizer=... when constructing with a model module.")
         # bed names (your notebooks refer to bed_element_names)
-        self.bed_element_names = (
-            getattr(self.config, "bed_elements_names", None)
-            or getattr(self.config, "bed_element_names", None)
-        )
         self._target_device = _pick_device(device)
         self.model.to(self._target_device)
         self.model.eval()
-        super().__init__(model=self.model, tokenizer=self.tokenizer, device=-1, **kwargs)
     def _sanitize_parameters(self, **kwargs):
         return {}, {}, {}
@@ -352,10 +380,12 @@ class NTv3TracksPipeline(Pipeline):
     def _get_model_device(self) -> torch.device:
         return next(self.model.parameters()).device
-    def _resolve_species_and_assembly(self, inputs: Dict[str, Any]) -> tuple[str, str]:
         species = inputs.get("species", self.default_species)
         if species not in SPECIES_TO_ASSEMBLY:
-            raise ValueError(f"Unsupported species='{species}'. Supported species: {sorted(SPECIES_TO_ASSEMBLY.keys())}")
         assembly = SPECIES_TO_ASSEMBLY[species]
         cfg_assemblies = list(self.config.bigwigs_per_file_assembly.keys())
@@ -366,8 +396,9 @@ class NTv3TracksPipeline(Pipeline):
             )
         return species, assembly
-    def _maybe_force_cpu_for_mps_long(self, input_ids_cpu: torch.Tensor) -> torch.device:
         dev = self._get_model_device()
         if self.mps_force_cpu and dev.type == "mps":
             seq_len = int(input_ids_cpu.shape[-1])
@@ -390,7 +421,9 @@ class NTv3TracksPipeline(Pipeline):
         sp = species or self.default_species
         assembly = SPECIES_TO_ASSEMBLY.get(sp)
         if assembly is None:
-            raise ValueError(f"Unknown species={sp}. Supported: {sorted(SPECIES_TO_ASSEMBLY.keys())}")
         if assembly not in self.config.bigwigs_per_file_assembly:
             raise ValueError(
@@ -400,13 +433,13 @@ class NTv3TracksPipeline(Pipeline):
         return list(self.config.bigwigs_per_file_assembly[assembly])
-    def available_bed_element_names(self) -> List[str]:
         """
         Return BED element names available in this checkpoint (no forward pass).
         """
         return list(self.bed_element_names or [])
-    def preprocess(self, inputs: Dict[str, Any], **kwargs: Any) -> Dict[str, Any]:
         species, assembly = self._resolve_species_and_assembly(inputs)
         # Resolve sequence
@@ -425,7 +458,13 @@ class NTv3TracksPipeline(Pipeline):
             seq = _sanitize_dna(seq)
         # Tokenize with padding
-        batch = self.tokenizer([seq], add_special_tokens=False, padding=True, pad_to_multiple_of=128, return_tensors="pt")
         input_ids_cpu = batch["input_ids"]
         # MPS-long fallback decision
@@ -435,7 +474,9 @@ class NTv3TracksPipeline(Pipeline):
         input_ids = input_ids_cpu.to(device)
         # Species tokenization - match batch size
         batch_size = input_ids.shape[0]
-        species_ids = self.species_tokenizer([species] * batch_size, add_special_tokens=False, return_tensors="pt")
         species_ids_tensor = species_ids["input_ids"].to(device)
         # Prediction interval (not used for slicing logits, just x-axis)
@@ -465,7 +506,7 @@ class NTv3TracksPipeline(Pipeline):
     def forward(self, model_inputs, **forward_params):
         return self._forward(model_inputs, **forward_params)
-    def _forward(self, model_inputs: Dict[str, Any], **kwargs: Any) -> Dict[str, Any]:
         meta = model_inputs.pop("meta")
         if self.verbose:
             print(f"Running on device: {self._get_model_device()}")
@@ -478,7 +519,9 @@ class NTv3TracksPipeline(Pipeline):
         out["meta"] = meta
         return out
-    def postprocess(self, model_outputs: Dict[str, Any], **kwargs: Any) -> NTv3TracksOutput:
         meta = model_outputs.pop("meta", {})
         def to_np(x):
@@ -490,16 +533,16 @@ class NTv3TracksPipeline(Pipeline):
         # Normalize shapes to remove batch/(optional assembly) dims
         if bigwig_np.ndim == 3:
-            bigwig_np = bigwig_np[0]          # (L, T)
         elif bigwig_np.ndim == 4:
-            bigwig_np = bigwig_np[0, 0]       # (L, T) if (B, A, L, T)
         else:
             raise ValueError(f"Unexpected bigwig_tracks_logits ndim: {bigwig_np.ndim}")
         if bed_np.ndim == 4:
-            bed_np = bed_np[0]                # (L, E, C)
         elif bed_np.ndim == 5:
-            bed_np = bed_np[0, 0]             # (L, E, C) if (B, A, L, E, C)
         else:
             raise ValueError(f"Unexpected bed_tracks_logits ndim: {bed_np.ndim}")
@@ -527,8 +570,8 @@ class NTv3TracksPipeline(Pipeline):
         inputs,
         *args,
         plot: bool = False,
-        tracks_to_plot: Optional[Dict[str, str]] = None,   # title -> track_id (ENCSR...)
-        elements_to_plot: Optional[List[str]] = None,       # element names
         plot_height: float = 1.0,
         plot_figsize_x: float = 20.0,
         **kwargs,
@@ -540,7 +583,9 @@ class NTv3TracksPipeline(Pipeline):
         if plot:
             if out.bigwig_track_names is None:
-                raise ValueError("bigwig_track_names missing; expected cfg.bigwigs_per_file_assembly[assembly].")
             if out.bed_element_names is None:
                 raise ValueError("bed element names missing from config.")
             tracks_to_plot = tracks_to_plot or {}
@@ -550,14 +595,18 @@ class NTv3TracksPipeline(Pipeline):
             bed_element_names = out.bed_element_names
             # Validate
-            missing_tracks = [tid for tid in tracks_to_plot.values() if tid not in bigwig_names]
             if missing_tracks:
                 raise ValueError(
                     f"The following tracks are not available in bigwig_names: {missing_tracks}\n"
                     f"First 50 available: {bigwig_names[:50]}{'...' if len(bigwig_names) > 50 else ''}"
                 )
-            missing_elements = [e for e in elements_to_plot if e not in bed_element_names]
             if missing_elements:
                 raise ValueError(
                     f"The following elements are not available in bed_element_names: {missing_elements}\n"
@@ -565,14 +614,14 @@ class NTv3TracksPipeline(Pipeline):
                 )
             # Build bigwig tracks dict (title -> y)
-            bigwig_tracks: Dict[str, np.ndarray] = {}
             bigwig = out.bigwig_tracks_logits  # (L_pred, T)
             for title, track_id in tracks_to_plot.items():
                 track_idx = bigwig_names.index(track_id)
                 bigwig_tracks[title] = bigwig[:, track_idx]
             # Bed positive class probabilities (title -> y)
-            bed_probs: Dict[str, np.ndarray] = {}
             probs = _softmax_last(out.bed_tracks_logits)  # (L_pred, E, C)
             for element_name in elements_to_plot:
                 element_idx = bed_element_names.index(element_name)
@@ -581,8 +630,10 @@ class NTv3TracksPipeline(Pipeline):
             all_tracks = {**bigwig_tracks, **bed_probs}
             plot_start = int(out.pred_start or 0)
-            plot_end = int(out.pred_end or (plot_start + len(next(iter(all_tracks.values())))))
             _plot_tracks_fillbetween(
                 all_tracks,
                 chrom=out.chrom,
@@ -595,6 +646,7 @@ class NTv3TracksPipeline(Pipeline):
         return out
 def load_ntv3_tracks_pipeline(
     model: str,
     device: str = "auto",
@@ -618,4 +670,4 @@ def load_ntv3_tracks_pipeline(
         device=device,
         **pipeline_kwargs,
     )
-    return pipe

 from dataclasses import dataclass
 from pathlib import Path
+from typing import Any
 import numpy as np
 import torch
     "ORF": "#1F618D",  # Blue 2
 }
 def _sanitize_dna(seq: str) -> str:
     seq = seq.upper()
     return "".join(ch if ch in ("A", "C", "G", "T", "N") else "N" for ch in seq)
 def _get_dna_sequence(
     assembly: str, chrom: str, start: int, end: int, timeout: float = 30.0
 ) -> str:
     """
     Fetch DNA sequence from API based on assembly, chromosome, and coordinates.

     Uses ASSEMBLY_TO_API_URL_TEMPLATE to determine the API URL format for each assembly.
     Falls back to DEFAULT_API_URL_TEMPLATE if assembly is not in the mapping.

     Parameters
     ----------
     assembly : str
         Assembly name; used both to pick the URL template and to fill it.
     chrom : str
         Chromosome name substituted into the URL template.
     start : int
         Start coordinate substituted into the URL template.
     end : int
         End coordinate substituted into the URL template.
     timeout : float
         Seconds to wait for the remote API before giving up.

     Returns
     -------
     str
         The upper-cased value of the ``"dna"`` field of the JSON response.

     Raises
     ------
     ImportError
         If ``requests`` is not installed.
     requests.exceptions.RequestException
         If the request times out, fails, or returns an HTTP error status.
     """
     if requests is None:
         raise ImportError(
             "requests is required for genome download. Install with: pip install requests"
         )
     # Get API URL template for this assembly, or use default
     url_template = ASSEMBLY_TO_API_URL_TEMPLATE.get(assembly, DEFAULT_API_URL_TEMPLATE)
     # Format the URL with the provided parameters
     url = url_template.format(assembly=assembly, chrom=chrom, start=start, end=end)
     # requests waits indefinitely unless a timeout is given; bound the wait
     # so a stalled API cannot hang the caller.
     response = requests.get(url, timeout=timeout)
     # Fail with the HTTP status instead of an opaque JSON/KeyError later on.
     response.raise_for_status()
     return response.json()["dna"].upper()
+def _ensure_fasta_for_assembly(assembly: str, cache_dir: str | Path) -> Path:
     """
     Download <assembly>.fa.gz, decompress to <assembly>.fa, return the .fa path.
     pyfaidx works reliably on uncompressed FASTA.
         )
     import gzip
     print(f"Decompressing {gz_path} -> {fa_path}")
     with gzip.open(gz_path, "rb") as fin, open(fa_path, "wb") as fout:
         while True:
     return fa_path
+def _pick_device(device: str | int | torch.device) -> torch.device:
     # Handle torch.device objects
     if isinstance(device, torch.device):
         return device
     # Handle integer device IDs (transformers pipeline convention)
     if isinstance(device, int):
         if device == -1:
                 return torch.device("cpu")
         else:
             raise ValueError(f"Invalid device integer: {device}")
     # Handle string device names
     if isinstance(device, str):
         d = device.lower()
             return torch.device("cpu")
         if d in ("cuda", "cpu", "mps"):
             return torch.device(d)
+        raise ValueError(
+            "device must be one of: 'auto', 'cpu', 'cuda', 'mps', or an integer"
+        )
+    raise ValueError(
+        f"device must be a string, integer, or torch.device, got {type(device)}"
+    )
 def _softmax_last(x: np.ndarray) -> np.ndarray:
 def _plot_tracks_fillbetween(
+    tracks: dict[str, np.ndarray],
+    chrom: str | None,
     start: int,
     end: int,
+    assembly: str | None,
     height: float = 1.0,
     figsize_x: float = 20.0,
 ):
     if plt is None:
+        raise ImportError(
+            "matplotlib is required for plotting. Install with: pip install matplotlib"
+        )
     n = len(tracks)
     if n == 0:
             color = BED_ELEMENT_COLORS[title]
         else:
             color = bigwig_color
         ax.fill_between(x, y, color=color, alpha=0.3, linewidth=0)
         ax.plot(x, y, color=color, linewidth=0.8)
         ax.set_title(title, fontsize=10, loc="left")
 @dataclass
 class NTv3TracksOutput:
     """Result of one pipeline prediction together with its metadata.

     The three logits arrays are required positional fields; every metadata
     field is optional and defaults to ``None``.
     """

     bigwig_tracks_logits: np.ndarray  # (L_pred, T)
     bed_tracks_logits: np.ndarray  # (L_pred, E, C)
     mlm_logits: np.ndarray
     # Genomic location of the input window.
     # NOTE(review): presumably None when a raw sequence was supplied - confirm.
     chrom: str | None = None
     start: int | None = None
     end: int | None = None
     species: str | None = None
     assembly: str | None = None
     # from cfg.bigwigs_per_file_assembly[assembly]
     bigwig_track_names: list[str] | None = None
     bed_element_names: list[str] | None = None
     window_len: int | None = None
     # Bounds of the predicted region, used as the plot x-axis range.
     pred_start: int | None = None
     pred_end: int | None = None
 class NTv3TracksPipeline(Pipeline):
     def __init__(
         self,
+        model: str | torch.nn.Module,
+        tokenizer: str | Any | None = None,
         trust_remote_code: bool = True,
+        token: str | None = None,
         default_species: str = "human",
+        genome_cache_dir: str | Path = "~/.cache/ntv3/genomes",
         device: str = "auto",
         mps_force_cpu: bool = True,
         mps_force_cpu_length: int = 16384,
         self.pred_center_offset_fraction = float(pred_center_offset_fraction)
         if isinstance(model, str):
+            self.config = AutoConfig.from_pretrained(
+                model, trust_remote_code=trust_remote_code, token=token
+            )
+            self.model = AutoModel.from_pretrained(
+                model, trust_remote_code=trust_remote_code, token=token
+            )
         else:
             self.model = model
             self.config = getattr(model, "config", None)
         if tokenizer is None:
             if not self.model_id:
+                raise ValueError(
+                    "If passing a model module, pass tokenizer explicitly."
+                )
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                self.model_id, trust_remote_code=trust_remote_code, token=token
+            )
         elif isinstance(tokenizer, str):
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                tokenizer, trust_remote_code=trust_remote_code, token=token
+            )
         else:
             self.tokenizer = tokenizer
         # Extract model_id from config if not already set (following ntv3_gff_pipeline.py pattern)
         if self.model_id is None and self.config is not None:
+            self.model_id = getattr(self.config, "_name_or_path", None) or getattr(
+                self.config, "name_or_path", None
+            )
         # Load species_tokenizer (following ntv3_gff_pipeline.py pattern)
         if self.model_id:
         else:
             self.species_tokenizer = kwargs.get("species_tokenizer", None)
             if self.species_tokenizer is None:
+                raise ValueError(
+                    "Pass species_tokenizer=... when constructing with a model module."
+                )
         # bed names (your notebooks refer to bed_element_names)
+        self.bed_element_names = getattr(
+            self.config, "bed_elements_names", None
+        ) or getattr(self.config, "bed_element_names", None)
         self._target_device = _pick_device(device)
         self.model.to(self._target_device)
         self.model.eval()
+        super().__init__(
+            model=self.model, tokenizer=self.tokenizer, device=-1, **kwargs
+        )
     def _sanitize_parameters(self, **kwargs):
         return {}, {}, {}
     def _get_model_device(self) -> torch.device:
         return next(self.model.parameters()).device
+    def _resolve_species_and_assembly(self, inputs: dict[str, Any]) -> tuple[str, str]:
         species = inputs.get("species", self.default_species)
         if species not in SPECIES_TO_ASSEMBLY:
+            raise ValueError(
+                f"Unsupported species='{species}'. Supported species: {sorted(SPECIES_TO_ASSEMBLY.keys())}"
+            )
         assembly = SPECIES_TO_ASSEMBLY[species]
         cfg_assemblies = list(self.config.bigwigs_per_file_assembly.keys())
             )
         return species, assembly
+    def _maybe_force_cpu_for_mps_long(
+        self, input_ids_cpu: torch.Tensor
+    ) -> torch.device:
         dev = self._get_model_device()
         if self.mps_force_cpu and dev.type == "mps":
             seq_len = int(input_ids_cpu.shape[-1])
         sp = species or self.default_species
         assembly = SPECIES_TO_ASSEMBLY.get(sp)
         if assembly is None:
+            raise ValueError(
+                f"Unknown species={sp}. Supported: {sorted(SPECIES_TO_ASSEMBLY.keys())}"
+            )
         if assembly not in self.config.bigwigs_per_file_assembly:
             raise ValueError(
         return list(self.config.bigwigs_per_file_assembly[assembly])
+    def available_bed_element_names(self) -> list[str]:
         """
         Return BED element names available in this checkpoint (no forward pass).
         """
         return list(self.bed_element_names or [])
+    def preprocess(self, inputs: dict[str, Any], **kwargs: Any) -> dict[str, Any]:
         species, assembly = self._resolve_species_and_assembly(inputs)
         # Resolve sequence
             seq = _sanitize_dna(seq)
         # Tokenize with padding
+        batch = self.tokenizer(
+            [seq],
+            add_special_tokens=False,
+            padding=True,
+            pad_to_multiple_of=128,
+            return_tensors="pt",
+        )
         input_ids_cpu = batch["input_ids"]
         # MPS-long fallback decision
         input_ids = input_ids_cpu.to(device)
         # Species tokenization - match batch size
         batch_size = input_ids.shape[0]
+        species_ids = self.species_tokenizer(
+            [species] * batch_size, add_special_tokens=False, return_tensors="pt"
+        )
         species_ids_tensor = species_ids["input_ids"].to(device)
         # Prediction interval (not used for slicing logits, just x-axis)
     def forward(self, model_inputs, **forward_params):
         """Delegate directly to ``self._forward`` and return its result unchanged."""
         outputs = self._forward(model_inputs, **forward_params)
         return outputs
+    def _forward(self, model_inputs: dict[str, Any], **kwargs: Any) -> dict[str, Any]:
         meta = model_inputs.pop("meta")
         if self.verbose:
             print(f"Running on device: {self._get_model_device()}")
         out["meta"] = meta
         return out
+    def postprocess(
+        self, model_outputs: dict[str, Any], **kwargs: Any
+    ) -> NTv3TracksOutput:
         meta = model_outputs.pop("meta", {})
         def to_np(x):
         # Normalize shapes to remove batch/(optional assembly) dims
         if bigwig_np.ndim == 3:
+            bigwig_np = bigwig_np[0]  # (L, T)
         elif bigwig_np.ndim == 4:
+            bigwig_np = bigwig_np[0, 0]  # (L, T) if (B, A, L, T)
         else:
             raise ValueError(f"Unexpected bigwig_tracks_logits ndim: {bigwig_np.ndim}")
         if bed_np.ndim == 4:
+            bed_np = bed_np[0]  # (L, E, C)
         elif bed_np.ndim == 5:
+            bed_np = bed_np[0, 0]  # (L, E, C) if (B, A, L, E, C)
         else:
             raise ValueError(f"Unexpected bed_tracks_logits ndim: {bed_np.ndim}")
         inputs,
         *args,
         plot: bool = False,
+        tracks_to_plot: dict[str, str] | None = None,  # title -> track_id (ENCSR...)
+        elements_to_plot: list[str] | None = None,  # element names
         plot_height: float = 1.0,
         plot_figsize_x: float = 20.0,
         **kwargs,
         if plot:
             if out.bigwig_track_names is None:
+                raise ValueError(
+                    "bigwig_track_names missing; expected cfg.bigwigs_per_file_assembly[assembly]."
+                )
             if out.bed_element_names is None:
                 raise ValueError("bed element names missing from config.")
             tracks_to_plot = tracks_to_plot or {}
             bed_element_names = out.bed_element_names
             # Validate
+            missing_tracks = [
+                tid for tid in tracks_to_plot.values() if tid not in bigwig_names
+            ]
             if missing_tracks:
                 raise ValueError(
                     f"The following tracks are not available in bigwig_names: {missing_tracks}\n"
                     f"First 50 available: {bigwig_names[:50]}{'...' if len(bigwig_names) > 50 else ''}"
                 )
+            missing_elements = [
+                e for e in elements_to_plot if e not in bed_element_names
+            ]
             if missing_elements:
                 raise ValueError(
                     f"The following elements are not available in bed_element_names: {missing_elements}\n"
                 )
             # Build bigwig tracks dict (title -> y)
+            bigwig_tracks: dict[str, np.ndarray] = {}
             bigwig = out.bigwig_tracks_logits  # (L_pred, T)
             for title, track_id in tracks_to_plot.items():
                 track_idx = bigwig_names.index(track_id)
                 bigwig_tracks[title] = bigwig[:, track_idx]
             # Bed positive class probabilities (title -> y)
+            bed_probs: dict[str, np.ndarray] = {}
             probs = _softmax_last(out.bed_tracks_logits)  # (L_pred, E, C)
             for element_name in elements_to_plot:
                 element_idx = bed_element_names.index(element_name)
             all_tracks = {**bigwig_tracks, **bed_probs}
             plot_start = int(out.pred_start or 0)
+            plot_end = int(
+                out.pred_end or (plot_start + len(next(iter(all_tracks.values()))))
+            )
             _plot_tracks_fillbetween(
                 all_tracks,
                 chrom=out.chrom,
         return out
 def load_ntv3_tracks_pipeline(
     model: str,
     device: str = "auto",
         device=device,
         **pipeline_kwargs,
     )
+    return pipe

requirements.txt CHANGED Viewed

@@ -1,8 +1,8 @@
-transformers>=4.41.0
-torch
-numpy
 gradio>=4.0.0
-pyfaidx
-requests
 matplotlib
 pyBigWig

 gradio>=4.0.0
 matplotlib
+numpy
 pyBigWig
+pyfaidx
+requests
+torch
+transformers>=4.41.0