bernardo-de-almeida committed on
Commit
b21c184
·
1 Parent(s): 36bd60e

feat: clean track names

Browse files
Files changed (1) hide show
  1. bigwig_export.py +167 -0
bigwig_export.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BigWig export functionality for NTv3 tracks.
3
+ """
4
+
5
+ import os
6
+ import uuid
7
+ import tempfile
8
+ import zipfile
9
+ from typing import TYPE_CHECKING
10
+
11
+ import numpy as np
12
+
13
+ try:
14
+ import pyBigWig
15
+ except ImportError:
16
+ pyBigWig = None
17
+
18
+ if TYPE_CHECKING:
19
+ from ntv3_tracks_pipeline import NTv3TracksOutput
20
+
21
+
22
+ def _softmax_last(x: np.ndarray) -> np.ndarray:
23
+ """Compute softmax over the last dimension."""
24
+ x = x - x.max(axis=-1, keepdims=True)
25
+ ex = np.exp(x)
26
+ return ex / ex.sum(axis=-1, keepdims=True)
27
+
28
+
29
def create_bigwig_zip(
    out: "NTv3TracksOutput",
    bigwig_selected: list[str],
    bed_elements: list[str],
) -> str:
    """
    Create BigWig files for selected tracks and save them in a zip file.

    Each selected BigWig track is written from ``out.bigwig_tracks_logits``
    and each selected BED element from the softmax of
    ``out.bed_tracks_logits``. Every track is written as one value per base,
    starting at the prediction start coordinate. The individual ``.bw`` files
    live in a temporary directory that is removed before returning, including
    when writing fails part-way.

    Parameters
    ----------
    out : NTv3TracksOutput
        The prediction output from the pipeline.
    bigwig_selected : list[str]
        List of BigWig track IDs to export. IDs not present in
        ``out.bigwig_track_names`` are ignored; ``None`` is treated as empty.
    bed_elements : list[str]
        List of BED element names to export. Names not present in
        ``out.bed_element_names`` are ignored.

    Returns
    -------
    str
        Path to the created zip file containing BigWig files.

    Raises
    ------
    ImportError
        If pyBigWig is not installed.
    ValueError
        If no predictions are available or no tracks are selected.
    """
    if pyBigWig is None:
        raise ImportError("pyBigWig is required for BigWig export. Install with: pip install pyBigWig")

    if out is None:
        raise ValueError("No predictions available. Please run a prediction first.")

    bw_names = out.bigwig_track_names or []
    bw_logits = out.bigwig_tracks_logits
    bed_names = out.bed_element_names or []
    bed_logits = out.bed_tracks_logits

    if bw_logits is None or not bw_names:
        raise ValueError("No BigWig tracks available in model output.")

    # Get genomic coordinates
    chrom = out.chrom
    if chrom is None:
        raise ValueError("Chromosome information not available. Use genomic coordinates for BigWig export.")

    start = out.start
    end = out.end

    # Compare against None explicitly: 0 is a valid genomic coordinate and
    # must not trigger the fallback.
    pred_start = out.pred_start
    if pred_start is None:
        window_len = out.window_len or (end - start)
        # Fallback: the prediction region is the centre 37.5% of the window,
        # so it begins 31.25% of the way in.
        pred_start = start + int(window_len * 0.3125)

    # Collect (name, per-position values) for every requested track that the
    # model output actually contains, skipping repeated selections.
    tracks = []

    bw_index = _first_index_by_name(bw_names)
    seen_bigwig = set()
    for track_id in bigwig_selected or []:
        idx = bw_index.get(track_id)
        if idx is None or track_id in seen_bigwig:
            continue
        seen_bigwig.add(track_id)
        tracks.append((track_id, bw_logits[:, idx]))

    # Add BED elements (as probabilities)
    if bed_logits is not None and bed_elements:
        probs = _softmax_last(bed_logits)
        bed_index = _first_index_by_name(bed_names)
        seen_bed = set()
        for elem_name in bed_elements:
            eidx = bed_index.get(elem_name)
            if eidx is None or elem_name in seen_bed:
                continue
            seen_bed.add(elem_name)
            # Index 1 of the softmax axis is exported; presumably the
            # "element present" class - confirm against the model head.
            tracks.append((elem_name, probs[:, eidx, 1]))

    if not tracks:
        raise ValueError("No tracks selected for export.")

    zip_path = os.path.join(tempfile.gettempdir(), f"ntv3_tracks_{uuid.uuid4().hex}.zip")

    # The context manager removes the directory and the intermediate .bw
    # files on every exit path, so nothing is leaked when a write fails.
    with tempfile.TemporaryDirectory(prefix="bigwig_outputs_") as output_dir:
        created_files = []
        used_names = set()

        for track_id, raw_values in tracks:
            track_data = raw_values.astype(np.float32)

            # Clean filename
            clean_name = track_id.replace(" ", "_").replace("/", "_").replace("-", "_")

            # Different track names can clean to the same filename (e.g.
            # "a-b" and "a_b", or a BigWig track and a BED element sharing a
            # name). Add a numeric suffix instead of overwriting the earlier
            # file. Lower-casing the key also covers case-insensitive
            # filesystems.
            unique_name = clean_name
            suffix = 2
            while unique_name.lower() in used_names:
                unique_name = f"{clean_name}_{suffix}"
                suffix += 1
            used_names.add(unique_name.lower())

            bw_filename = os.path.join(output_dir, f"{unique_name}.bw")

            # Use the end of the genomic window as chromosome size, enlarged
            # if the written positions would otherwise run past it
            # (presumably pyBigWig rejects entries beyond the declared size).
            chrom_size = int(max(end, pred_start + len(track_data)))

            _write_bigwig_file(bw_filename, chrom, chrom_size, pred_start, track_data)
            created_files.append(bw_filename)

        # Create zip file
        try:
            with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
                for bw_file in created_files:
                    zipf.write(bw_file, os.path.basename(bw_file))
        except Exception:
            # Do not leave a truncated archive behind; removal is best-effort.
            try:
                os.remove(zip_path)
            except OSError:
                pass
            raise

    return zip_path


def _first_index_by_name(names) -> dict:
    """Map each name to the index of its first occurrence (like ``list.index``)."""
    index = {}
    for position, name in enumerate(names):
        index.setdefault(name, position)
    return index


def _write_bigwig_file(path, chrom, chrom_size, first_pos, track_data) -> None:
    """Write ``track_data`` as one value per base, starting at ``first_pos``, to a new BigWig file."""
    starts = np.arange(first_pos, first_pos + len(track_data), dtype=np.int64)
    ends = starts + 1

    bw = pyBigWig.open(path, "w")
    try:
        bw.addHeader([(chrom, chrom_size)])
        bw.addEntries(
            chroms=[chrom] * len(starts),
            starts=starts.tolist(),
            ends=ends.tolist(),
            values=track_data.tolist(),
        )
    finally:
        # Close even when a write fails so the handle is not leaked.
        bw.close()
167
+