bernardo-de-almeida committed on
Commit
36bd60e
·
1 Parent(s): 25532d4

feat: clean track names

Browse files
Files changed (2) hide show
  1. app.py +181 -15
  2. data/functional_tracks_metadata.csv +0 -0
app.py CHANGED
@@ -1,6 +1,8 @@
1
  import os
2
  import uuid
3
  import tempfile
 
 
4
  import numpy as np
5
  import gradio as gr
6
  import asyncio
@@ -114,6 +116,85 @@ def _save_fig_png(fig) -> str:
114
  # Cache track lists per species so search is instant after first load
115
  _BIGWIG_CACHE: dict[str, list[str]] = {}
116
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
  def _get_bigwig_names(species: str) -> list[str]:
119
  if species not in _BIGWIG_CACHE:
@@ -182,20 +263,52 @@ def _rank_search(query: str, names: list[str], limit: int) -> list[str]:
182
 
183
 
184
  def search_bigwigs(species: str, query: str):
 
185
  names = _get_bigwig_names(species)
186
- results = _rank_search(query, names, SEARCH_MAX_RESULTS)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  return gr.update(choices=results, value=[])
188
 
189
 
190
  def add_selected(current_selected: list[str], to_add: list[str]):
191
- cur = list(dict.fromkeys(current_selected or [])) # preserve order, unique
192
- for x in (to_add or []):
193
- if x not in cur:
194
- cur.append(x)
195
- return gr.update(choices=cur, value=cur) # show + keep all checked
 
 
 
 
 
 
 
 
 
 
196
 
197
 
198
  def remove_selected(current_selected: list[str], to_remove: list[str]):
 
199
  cur = [x for x in (current_selected or []) if x not in set(to_remove or [])]
200
  return gr.update(choices=cur, value=cur)
201
 
@@ -208,15 +321,21 @@ def update_coords_on_species_change(species: str):
208
  def reset_on_species_change(species: str):
209
  # Clear results + selected when species changes (avoids mismatched IDs)
210
  try:
211
- _get_bigwig_names(species) # warms cache if available
 
 
 
 
 
 
 
212
  except (ValueError, AttributeError):
213
  # Species doesn't have bigwigs, that's okay
214
- pass
215
- return (
216
- gr.update(value=""), # query textbox
217
- gr.update(choices=[], value=[]), # results list
218
- gr.update(choices=[], value=[]), # selected list
219
- )
220
 
221
 
222
  # -----------------------------
@@ -236,6 +355,9 @@ def predict(
236
  if not species:
237
  raise gr.Error("Species parameter is missing. Please select a species.")
238
 
 
 
 
239
  if use_coords:
240
  # Check if this species supports coordinate-based fetching
241
  if species not in SPECIES_WITH_COORDINATE_SUPPORT:
@@ -583,7 +705,9 @@ DEFAULT_BED_ELEMENTS = ["protein_coding_gene", "exon", "intron"]
583
 
584
  # Get available BigWig tracks for default species and filter defaults
585
  _init_bigwig = _get_bigwig_names(DEFAULT_SPECIES)
586
- _init_bigwig_selected = [tid for tid in DEFAULT_BIGWIG_TRACKS if tid in _init_bigwig]
 
 
587
 
588
  # Filter default BED elements to only those available
589
  _init_bed_selected = [elem for elem in DEFAULT_BED_ELEMENTS if elem in _init_bed]
@@ -745,6 +869,37 @@ with gr.Blocks(title="NTv3 Tracks Demo") as demo:
745
 
746
  gr.Markdown("## Select functional tracks")
747
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
748
  bigwig_no_tracks_msg = gr.Markdown(
749
  "⚠️ No functional genomic tracks available for this species in the current model.",
750
  visible=False,
@@ -850,6 +1005,17 @@ with gr.Blocks(title="NTv3 Tracks Demo") as demo:
850
  coords = DEFAULT_COORDS.get(species, DEFAULT_COORDS["human"])
851
  # Show coordinates only if species is supported AND use_coords is True
852
  show_coords = is_supported and use_coords_val
 
 
 
 
 
 
 
 
 
 
 
853
  return (
854
  gr.update(visible=show_coords, value=coords["chrom"]),
855
  gr.update(visible=show_coords, value=coords["start"]),
@@ -857,7 +1023,7 @@ with gr.Blocks(title="NTv3 Tracks Demo") as demo:
857
  gr.update(value=is_supported, visible=is_supported), # Show/hide and enable use_coords only if supported
858
  gr.update(visible=show_coords), # Show/hide the row
859
  gr.update(visible=not has_bigwigs), # Show "no tracks" message if no bigwigs
860
- gr.update(visible=has_bigwigs), # Show bigwig selection if available
861
  gr.update(visible=has_bigwigs), # Show bigwig query if available
862
  gr.update(visible=has_bigwigs), # Show bigwig results if available
863
  gr.update(visible=has_bigwigs), # Show bigwig buttons if available
 
1
  import os
2
  import uuid
3
  import tempfile
4
+ import csv
5
+ from pathlib import Path
6
  import numpy as np
7
  import gradio as gr
8
  import asyncio
 
116
  # Cache track lists per species so search is instant after first load
117
  _BIGWIG_CACHE: dict[str, list[str]] = {}
118
 
119
# Cache for track metadata (track_id -> display_name); filled on the first successful load
_TRACK_METADATA_CACHE: dict[str, str] = {}


def _clean_field(row: dict, column: str) -> str:
    """Return the stripped text of ``column`` in ``row``, or '' when absent/None."""
    # csv.DictReader fills columns missing from a short row with None, so a
    # plain ``row.get(column, '')`` is not enough to guarantee a string.
    return (row.get(column) or "").strip()


def _build_track_display_name(row: dict, track_id: str) -> str:
    """Compose the human-readable label for one metadata row."""
    tissue = _clean_field(row, "tissue")
    assay = _clean_field(row, "assay")
    experiment_target = _clean_field(row, "experiment_target")
    biosample_type = _clean_field(row, "biosample_type")
    strand = _clean_field(row, "strand")

    parts = []
    # The literal biosample type "tissue" is left out of the label
    if biosample_type and biosample_type != "tissue":
        parts.append(biosample_type)
    if tissue:
        parts.append(tissue)
    if assay:
        # "plus"/"minus" are shortened to "+"/"-"; any other strand label is kept verbatim
        strand = {"plus": "+", "minus": "-"}.get(strand, strand)
        parts.append(f"{assay} {strand}" if strand else assay)
    if experiment_target and experiment_target not in ("none", "RNA-seq"):
        parts.append(experiment_target)

    # Fall back to the raw ID when the row carries no usable metadata
    return " - ".join(parts) if parts else track_id


def _load_track_metadata() -> dict[str, str]:
    """Load track metadata from CSV and create display name mapping.

    Reads ``data/functional_tracks_metadata.csv`` next to this module and maps
    each ``file_id`` to a label built from its biosample type, tissue, assay,
    strand and experiment target. A non-empty result is stored in
    ``_TRACK_METADATA_CACHE`` and reused by later calls.

    Returns:
        Mapping of track ID to display name. Empty when the file is missing,
        unreadable, or lacks a ``file_id`` column; a warning is printed in the
        latter two cases.
    """
    if _TRACK_METADATA_CACHE:
        return _TRACK_METADATA_CACHE

    csv_path = Path(__file__).parent / "data" / "functional_tracks_metadata.csv"
    if not csv_path.exists():
        return {}

    metadata: dict[str, str] = {}
    try:
        # utf-8-sig also reads plain UTF-8 and strips a BOM that would otherwise
        # corrupt the first header name; newline="" is what the csv module expects.
        with open(csv_path, "r", encoding="utf-8-sig", newline="") as f:
            reader = csv.DictReader(f)
            if "file_id" not in (reader.fieldnames or ()):
                print("Warning: Could not load track metadata: missing 'file_id' column")
                return {}
            for row in reader:
                track_id = _clean_field(row, "file_id")
                if not track_id:
                    continue  # a row without an ID cannot be mapped to a track
                metadata[track_id] = _build_track_display_name(row, track_id)
    except (OSError, UnicodeError, csv.Error) as e:
        print(f"Warning: Could not load track metadata: {e}")
        return {}

    _TRACK_METADATA_CACHE.update(metadata)
    return metadata
175
+
176
+
177
def _get_track_display_name(track_id: str) -> str:
    """Look up the human-readable label for ``track_id``.

    The ID itself is returned when the metadata mapping has no entry for it.
    """
    return _load_track_metadata().get(track_id, track_id)
181
+
182
+
183
def _format_track_for_display(track_id: str) -> str:
    """Render a track for the UI as ``'display_name (track_id)'``.

    When the display name is identical to the ID (no metadata available),
    only the bare ID is shown.
    """
    label = _get_track_display_name(track_id)
    return track_id if label == track_id else f"{label} ({track_id})"
189
+
190
+
191
def _extract_track_id(display_value: str) -> str:
    """Recover the track ID from a UI string.

    Accepts either ``'display_name (track_id)'`` or a bare track ID. The ID is
    taken from the last ``" ("`` up to the closing parenthesis; any value that
    does not match that shape is returned unchanged.
    """
    _, opener, tail = display_value.rpartition(" (")
    if not opener or not display_value.endswith(")"):
        # No "name (id)" wrapper, so the value is assumed to already be the ID
        return display_value
    # Drop the trailing ")" that closes the wrapper
    return tail[:-1]
197
+
198
 
199
  def _get_bigwig_names(species: str) -> list[str]:
200
  if species not in _BIGWIG_CACHE:
 
263
 
264
 
265
def search_bigwigs(species: str, query: str):
    """Search BigWig tracks and return formatted display names.

    A track matches when the query is a case-insensitive substring of its
    display string. That string is either the bare track ID or
    ``'display_name (track_id)'``, so one test covers both the ID and the
    display name.

    Args:
        species: Species whose track list is searched.
        query: Free-text query; ``None`` or blank matches every track.

    Returns:
        A ``gr.update`` that sets the result choices (at most
        ``SEARCH_MAX_RESULTS`` entries, in track-list order) and clears the
        current selection.
    """
    needle = (query or "").strip().lower()

    results = []
    for track_id in _get_bigwig_names(species):
        # Stop formatting tracks as soon as the result list is full
        if len(results) >= SEARCH_MAX_RESULTS:
            break
        label = _format_track_for_display(track_id)
        if needle in label.lower():
            results.append(label)

    return gr.update(choices=results, value=[])
290
 
291
 
292
def add_selected(current_selected: list[str], to_add: list[str]):
    """Add tracks to selected list, converting display format to track IDs if needed.

    Both inputs may hold bare track IDs or ``'display_name (track_id)'``
    strings; either may be ``None``. Entries are compared by track ID, so the
    same track is never listed twice regardless of how it was spelled.

    Returns:
        A ``gr.update`` whose choices and value are the merged selection in
        display format: existing entries first, new ones appended in order.
    """
    merged = [*(current_selected or []), *(to_add or [])]
    # dict.fromkeys de-duplicates while keeping first-seen order
    unique_ids = dict.fromkeys(_extract_track_id(item) for item in merged)
    labels = [_format_track_for_display(tid) for tid in unique_ids]
    return gr.update(choices=labels, value=labels)  # show + keep all checked
308
 
309
 
310
def remove_selected(current_selected: list[str], to_remove: list[str]):
    """Remove tracks from selected list.

    Entries are matched by track ID, so a track is removed whether it is given
    as a bare ID or as ``'display_name (track_id)'``. Either argument may be
    ``None``.

    Returns:
        A ``gr.update`` whose choices and value are the remaining entries, in
        their original order and spelling.
    """
    # Build the lookup set once instead of once per remaining item
    drop_ids = {_extract_track_id(item) for item in (to_remove or [])}
    remaining = [
        item
        for item in (current_selected or [])
        if _extract_track_id(item) not in drop_ids
    ]
    return gr.update(choices=remaining, value=remaining)
314
 
 
321
def reset_on_species_change(species: str):
    """Reset the track-search widgets after the species changes.

    Returns three ``gr.update`` objects, in order: an emptied query textbox,
    an emptied results list, and the selected list with nothing checked. The
    selected list's choices are the species' tracks in display format, or
    empty when looking them up raises ``ValueError``/``AttributeError``.
    """
    # Clearing everything avoids track IDs from the previous species lingering
    try:
        # Fetching the names also warms the per-species cache
        selectable = [
            _format_track_for_display(tid) for tid in _get_bigwig_names(species)
        ]
    except (ValueError, AttributeError):
        # Species doesn't have bigwigs, that's okay
        selectable = []

    return (
        gr.update(value=""),  # query textbox
        gr.update(choices=[], value=[]),  # results list
        gr.update(choices=selectable, value=[]),  # selected list
    )
 
339
 
340
 
341
  # -----------------------------
 
355
  if not species:
356
  raise gr.Error("Species parameter is missing. Please select a species.")
357
 
358
+ # Extract track IDs from display format if needed
359
+ bigwig_selected = [_extract_track_id(tid) for tid in bigwig_selected]
360
+
361
  if use_coords:
362
  # Check if this species supports coordinate-based fetching
363
  if species not in SPECIES_WITH_COORDINATE_SUPPORT:
 
705
 
706
  # Get available BigWig tracks for default species and filter defaults
707
  _init_bigwig = _get_bigwig_names(DEFAULT_SPECIES)
708
+ _init_bigwig_selected_ids = [tid for tid in DEFAULT_BIGWIG_TRACKS if tid in _init_bigwig]
709
+ # Format for display
710
+ _init_bigwig_selected = [_format_track_for_display(tid) for tid in _init_bigwig_selected_ids]
711
 
712
  # Filter default BED elements to only those available
713
  _init_bed_selected = [elem for elem in DEFAULT_BED_ELEMENTS if elem in _init_bed]
 
869
 
870
  gr.Markdown("## Select functional tracks")
871
 
872
+ # Button to download tracks metadata
873
def get_metadata_file_path():
    """Return the metadata CSV path as a string, or None when the file is absent."""
    candidate = Path(__file__).parent.joinpath("data", "functional_tracks_metadata.csv")
    return str(candidate) if candidate.exists() else None
879
+
880
+ metadata_file_path = get_metadata_file_path()
881
+ download_metadata_btn = gr.Button(
882
+ "📋 Download metadata for all functional tracks",
883
+ variant="secondary",
884
+ visible=metadata_file_path is not None,
885
+ )
886
+ metadata_download_file = gr.File(
887
+ label="Tracks metadata",
888
+ visible=False,
889
+ )
890
+
891
def download_metadata():
    """Reveal the file widget with the metadata CSV, or keep it hidden if the file is gone."""
    # Re-check existence at click time; the path was resolved when the UI was built
    available = bool(metadata_file_path) and Path(metadata_file_path).exists()
    if not available:
        return gr.update(visible=False)
    return gr.update(value=metadata_file_path, visible=True)
896
+
897
+ download_metadata_btn.click(
898
+ fn=download_metadata,
899
+ inputs=[],
900
+ outputs=[metadata_download_file],
901
+ )
902
+
903
  bigwig_no_tracks_msg = gr.Markdown(
904
  "⚠️ No functional genomic tracks available for this species in the current model.",
905
  visible=False,
 
1005
  coords = DEFAULT_COORDS.get(species, DEFAULT_COORDS["human"])
1006
  # Show coordinates only if species is supported AND use_coords is True
1007
  show_coords = is_supported and use_coords_val
1008
+
1009
+ # Format available tracks for display if species has bigwigs
1010
+ if has_bigwigs:
1011
+ try:
1012
+ track_ids = _get_bigwig_names(species)
1013
+ formatted_tracks = [_format_track_for_display(tid) for tid in track_ids]
1014
+ except:
1015
+ formatted_tracks = []
1016
+ else:
1017
+ formatted_tracks = []
1018
+
1019
  return (
1020
  gr.update(visible=show_coords, value=coords["chrom"]),
1021
  gr.update(visible=show_coords, value=coords["start"]),
 
1023
  gr.update(value=is_supported, visible=is_supported), # Show/hide and enable use_coords only if supported
1024
  gr.update(visible=show_coords), # Show/hide the row
1025
  gr.update(visible=not has_bigwigs), # Show "no tracks" message if no bigwigs
1026
+ gr.update(visible=has_bigwigs, choices=formatted_tracks, value=[]), # Show bigwig selection if available
1027
  gr.update(visible=has_bigwigs), # Show bigwig query if available
1028
  gr.update(visible=has_bigwigs), # Show bigwig results if available
1029
  gr.update(visible=has_bigwigs), # Show bigwig buttons if available
data/functional_tracks_metadata.csv ADDED
The diff for this file is too large to render. See raw diff