Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
b21c184
1
Parent(s):
36bd60e
feat: clean track names
Browse files- bigwig_export.py +167 -0
bigwig_export.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
BigWig export functionality for NTv3 tracks.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import uuid
|
| 7 |
+
import tempfile
|
| 8 |
+
import zipfile
|
| 9 |
+
from typing import TYPE_CHECKING
|
| 10 |
+
|
| 11 |
+
import numpy as np
|
| 12 |
+
|
| 13 |
+
try:
|
| 14 |
+
import pyBigWig
|
| 15 |
+
except ImportError:
|
| 16 |
+
pyBigWig = None
|
| 17 |
+
|
| 18 |
+
if TYPE_CHECKING:
|
| 19 |
+
from ntv3_tracks_pipeline import NTv3TracksOutput
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _softmax_last(x: np.ndarray) -> np.ndarray:
    """Return the softmax of ``x`` taken along its final axis.

    The maximum along the last axis is subtracted before exponentiating so
    that large inputs do not overflow; the shift cancels out in the ratio.
    """
    exp_shifted = np.exp(x - x.max(axis=-1, keepdims=True))
    normaliser = exp_shifted.sum(axis=-1, keepdims=True)
    return exp_shifted / normaliser
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def _first_index_by_name(names) -> dict:
    """Map each name to the index of its first occurrence in ``names``."""
    lookup = {}
    for position, name in enumerate(names):
        # setdefault keeps the first index, matching list.index() semantics.
        lookup.setdefault(name, position)
    return lookup


def _unique_bigwig_stem(display_name: str, taken: set) -> str:
    """Return a sanitised file stem for ``display_name`` not already in ``taken``."""
    base = display_name.replace(" ", "_").replace("/", "_").replace("-", "_")
    stem = base
    suffix = 2
    # Distinct track names can sanitise to the same stem (e.g. "a-b" / "a b");
    # add a numeric suffix instead of silently overwriting the earlier file.
    while stem in taken:
        stem = f"{base}_{suffix}"
        suffix += 1
    taken.add(stem)
    return stem


def _write_bigwig_track(path: str, chrom: str, chrom_size: int, first_pos: int, values: np.ndarray) -> None:
    """Write ``values`` as consecutive 1-bp intervals starting at ``first_pos``."""
    bw = pyBigWig.open(path, "w")
    try:
        bw.addHeader([(chrom, chrom_size)])

        starts = np.arange(first_pos, first_pos + len(values), dtype=np.int64)
        ends = starts + 1

        bw.addEntries(
            chroms=[chrom] * len(starts),
            starts=starts.tolist(),
            ends=ends.tolist(),
            values=values.tolist(),
        )
    finally:
        # Always release the handle, even if the header or entries are rejected.
        bw.close()


def create_bigwig_zip(
    out: "NTv3TracksOutput",
    bigwig_selected: list[str],
    bed_elements: list[str],
) -> str:
    """
    Create BigWig files for selected tracks and save them in a zip file.

    Each selected BigWig track is written from its column of
    ``out.bigwig_tracks_logits``; each selected BED element is written from
    index 1 of the softmax over the last axis of ``out.bed_tracks_logits``.
    Values are written one per base starting at the prediction start.
    Selected names that are not present in the model output are ignored.
    Intermediate ``.bw`` files and their directory are removed (best effort)
    whether or not the export succeeds.

    Parameters
    ----------
    out : NTv3TracksOutput
        The prediction output from the pipeline.
    bigwig_selected : list[str]
        List of BigWig track IDs to export. ``None`` is treated as empty.
    bed_elements : list[str]
        List of BED element names to export. ``None`` is treated as empty.

    Returns
    -------
    str
        Path to the created zip file containing BigWig files.

    Raises
    ------
    ImportError
        If pyBigWig is not installed.
    ValueError
        If no predictions are available, the output has no BigWig tracks or
        chromosome, or no selected track matches the model output.
    """
    if pyBigWig is None:
        raise ImportError("pyBigWig is required for BigWig export. Install with: pip install pyBigWig")

    if out is None:
        raise ValueError("No predictions available. Please run a prediction first.")

    bw_names = out.bigwig_track_names or []
    bw_logits = out.bigwig_tracks_logits
    bed_names = out.bed_element_names or []
    bed_logits = out.bed_tracks_logits

    if bw_logits is None or not bw_names:
        raise ValueError("No BigWig tracks available in model output.")

    # Get genomic coordinates
    chrom = out.chrom
    if chrom is None:
        raise ValueError("Chromosome information not available. Use genomic coordinates for BigWig export.")

    start = out.start
    end = out.end
    window_len = out.window_len or (end - start)

    # Without an explicit prediction start, fall back to the start of the
    # central 37.5% of the window (i.e. 31.25% in from the window start).
    pred_start = out.pred_start or (start + int(window_len * 0.3125))

    # Collect (display name, float32 values) for everything to export, before
    # touching the filesystem, so a failed selection leaves nothing behind.
    tracks = []

    bw_index = _first_index_by_name(bw_names)
    for track_id in bigwig_selected or []:
        idx = bw_index.get(track_id)
        if idx is not None:
            tracks.append((track_id, bw_logits[:, idx].astype(np.float32)))

    # Add BED elements (as probabilities)
    if bed_logits is not None and bed_elements:
        probs = _softmax_last(bed_logits)
        bed_index = _first_index_by_name(bed_names)
        for elem_name in bed_elements:
            eidx = bed_index.get(elem_name)
            if eidx is not None:
                # NOTE(review): class index 1 is presumably "element present" - confirm.
                tracks.append((elem_name, probs[:, eidx, 1].astype(np.float32)))

    if not tracks:
        raise ValueError("No tracks selected for export.")

    tmpdir = tempfile.gettempdir()
    output_dir = os.path.join(tmpdir, f"bigwig_outputs_{uuid.uuid4().hex}")
    os.makedirs(output_dir, exist_ok=True)

    created_files = []
    used_stems = set()
    try:
        for display_name, track_data in tracks:
            stem = _unique_bigwig_stem(display_name, used_stems)
            bw_filename = os.path.join(output_dir, f"{stem}.bw")
            # Register before writing so a partially written file is cleaned up too.
            created_files.append(bw_filename)
            # Use end of genomic window as chromosome size in the header.
            _write_bigwig_track(bw_filename, chrom, end, pred_start, track_data)

        # Create zip file
        zip_path = os.path.join(tmpdir, f"ntv3_tracks_{uuid.uuid4().hex}.zip")
        with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
            for bw_file in created_files:
                zipf.write(bw_file, os.path.basename(bw_file))
    finally:
        # Best-effort cleanup of the intermediate files; only filesystem
        # errors are ignored so real failures and interrupts still propagate.
        for bw_file in created_files:
            try:
                os.remove(bw_file)
            except OSError:
                pass
        try:
            os.rmdir(output_dir)
        except OSError:
            pass

    return zip_path
|
| 167 |
+
|