Spaces:

InstaDeepAI
/

ntv3_tracks

Running on Zero

App Files Community

bernardo-de-almeida committed on Dec 19, 2025

Commit

a56975c

1 Parent(s): 7fad7e0

fix: pipeline

Browse files

Files changed (2) hide show

app.py +3 -3
ntv3_tracks_pipeline.py +13 -39

app.py CHANGED Viewed

@@ -26,7 +26,7 @@ matplotlib.use("Agg")
 # -----------------------------
 # Env / auth
 # -----------------------------
-MODEL_ID = os.environ.get("MODEL_ID", "InstaDeepAI/NTv3_650M_pos")
 DEFAULT_SPECIES = os.environ.get("DEFAULT_SPECIES", "human")
 HF_TOKEN = (
     os.environ.get("NTV3_HF_TOKEN")
@@ -887,8 +887,8 @@ with gr.Blocks(title="NTv3 Tracks Demo") as demo:
     # Model display names (without InstaDeepAI/ prefix) and their full IDs
     MODEL_OPTIONS = {
-        "NTv3 650M (post)": "InstaDeepAI/NTv3_650M_pos",
-        "NTv3 100M (post)": "InstaDeepAI/NTv3_100M_pos",
     }
     # Reverse mapping: full ID -> display name

 # -----------------------------
 # Env / auth
 # -----------------------------
+MODEL_ID = os.environ.get("MODEL_ID", "InstaDeepAI/NTv3_650M_post")
 DEFAULT_SPECIES = os.environ.get("DEFAULT_SPECIES", "human")
 HF_TOKEN = (
     os.environ.get("NTV3_HF_TOKEN")
     # Model display names (without InstaDeepAI/ prefix) and their full IDs
     MODEL_OPTIONS = {
+        "NTv3 650M (post)": "InstaDeepAI/NTv3_650M_post",
+        "NTv3 100M (post)": "InstaDeepAI/NTv3_100M_post",
     }
     # Reverse mapping: full ID -> display name

ntv3_tracks_pipeline.py CHANGED Viewed

@@ -279,7 +279,7 @@ class NTv3TracksOutput:
     species: str | None = None
     assembly: str | None = None
     bigwig_track_names: list[str] | None = (
-        None  # from cfg.bigwigs_per_file_assembly[assembly]
     )
     bed_element_names: list[str] | None = None
     window_len: int | None = None
@@ -347,21 +347,6 @@ class NTv3TracksPipeline(Pipeline):
                 self.config, "name_or_path", None
             )
-        # Load species_tokenizer (following ntv3_gff_pipeline.py pattern)
-        if self.model_id:
-            self.species_tokenizer = AutoTokenizer.from_pretrained(
-                self.model_id,
-                subfolder="species_tokenizer",
-                trust_remote_code=trust_remote_code,
-                token=token,
-            )
-        else:
-            self.species_tokenizer = kwargs.get("species_tokenizer", None)
-            if self.species_tokenizer is None:
-                raise ValueError(
-                    "Pass species_tokenizer=... when constructing with a model module."
-                )
         # bed names (your notebooks refer to bed_element_names)
         self.bed_element_names = getattr(
             self.config, "bed_elements_names", None
@@ -380,20 +365,13 @@ class NTv3TracksPipeline(Pipeline):
         Return BigWig track IDs for the assembly corresponding to `species`.
         No model forward pass.
         """
-        sp = species or self.default_species
-        assembly = SPECIES_TO_ASSEMBLY.get(sp)
-        if assembly is None:
             raise ValueError(
-                f"Unknown species={sp}. Supported: {sorted(SPECIES_TO_ASSEMBLY.keys())}"
             )
-        if assembly not in self.config.bigwigs_per_file_assembly:
-            raise ValueError(
-                f"Assembly {assembly} not found in checkpoint config. "
-                f"Available: {list(self.config.bigwigs_per_file_assembly.keys())}"
-            )
-        return list(self.config.bigwigs_per_file_assembly[assembly])
     def available_bed_element_names(self) -> list[str]:
         """
@@ -416,12 +394,11 @@ class NTv3TracksPipeline(Pipeline):
             )
         assembly = SPECIES_TO_ASSEMBLY[species]
-        cfg_assemblies = list(self.config.bigwigs_per_file_assembly.keys())
-        if assembly not in cfg_assemblies:
             raise ValueError(
-                f"Species '{species}' maps to assembly '{assembly}', "
-                f"but that assembly is not available in this checkpoint. "
-                f"Available assemblies: {cfg_assemblies}"
             )
         return species, assembly
@@ -478,17 +455,15 @@ class NTv3TracksPipeline(Pipeline):
         input_ids = input_ids_cpu.to(device)
         # Species tokenization - match batch size
         batch_size = input_ids.shape[0]
-        species_ids = self.species_tokenizer(
-            [species] * batch_size, add_special_tokens=False, return_tensors="pt"
-        )
-        species_ids_tensor = species_ids["input_ids"].to(device)
         # Prediction interval (not used for slicing logits, just x-axis)
         pred_start = start + int(window_len * self.pred_center_offset_fraction)
         pred_end = pred_start + int(window_len * self.pred_center_fraction)
         # ✅ The source of truth for track IDs/names (your note)
-        bigwig_track_names = list(self.config.bigwigs_per_file_assembly[assembly])
         return {
             "input_ids": input_ids,
@@ -564,7 +539,6 @@ class NTv3TracksPipeline(Pipeline):
             out = self.model(
                 input_ids=model_inputs["input_ids"],
                 species_ids=model_inputs["species_ids"],
-                return_dict=True,
             )
         out["meta"] = meta
         return out
@@ -589,7 +563,7 @@ class NTv3TracksPipeline(Pipeline):
             if out.bigwig_track_names is None:
                 raise ValueError(
                     "bigwig_track_names missing; expected "
-                    "cfg.bigwigs_per_file_assembly[assembly]."
                 )
             if out.bed_element_names is None:
                 raise ValueError("bed element names missing from config.")

     species: str | None = None
     assembly: str | None = None
     bigwig_track_names: list[str] | None = (
+        None  # from cfg.bigwigs_per_species[species]
     )
     bed_element_names: list[str] | None = None
     window_len: int | None = None
                 self.config, "name_or_path", None
             )
         # bed names (your notebooks refer to bed_element_names)
         self.bed_element_names = getattr(
             self.config, "bed_elements_names", None
         Return BigWig track IDs for the assembly corresponding to `species`.
         No model forward pass.
         """
+        if species not in self.config.bigwigs_per_species:
             raise ValueError(
+                f"Species {species} not found in checkpoint config. "
+                f"Available: {list(self.config.bigwigs_per_species.keys())}"
             )
+        return list(self.config.bigwigs_per_species[species])
     def available_bed_element_names(self) -> list[str]:
         """
             )
         assembly = SPECIES_TO_ASSEMBLY[species]
+        cfg_species = list(self.config.bigwigs_per_species.keys())
+        if species not in cfg_species:
             raise ValueError(
+                f"Species '{species}' is not available in this checkpoint. "
+                f"Available species: {cfg_species}"
             )
         return species, assembly
         input_ids = input_ids_cpu.to(device)
         # Species tokenization - match batch size
         batch_size = input_ids.shape[0]
+        species_ids = self.model.encode_species([species] * batch_size)
+        species_ids_tensor = species_ids.to(device)
         # Prediction interval (not used for slicing logits, just x-axis)
         pred_start = start + int(window_len * self.pred_center_offset_fraction)
         pred_end = pred_start + int(window_len * self.pred_center_fraction)
         # ✅ The source of truth for track IDs/names (your note)
+        bigwig_track_names = list(self.config.bigwigs_per_species[species])
         return {
             "input_ids": input_ids,
             out = self.model(
                 input_ids=model_inputs["input_ids"],
                 species_ids=model_inputs["species_ids"],
             )
         out["meta"] = meta
         return out
             if out.bigwig_track_names is None:
                 raise ValueError(
                     "bigwig_track_names missing; expected "
+                    "cfg.bigwigs_per_species[species]."
                 )
             if out.bed_element_names is None:
                 raise ValueError("bed element names missing from config.")