bernardo-de-almeida committed on
Commit
9daf043
·
1 Parent(s): d71c881

feat: add more species

Browse files
Files changed (2) hide show
  1. app.py +6 -12
  2. ntv3_tracks_pipeline.py +27 -1
app.py CHANGED
@@ -521,21 +521,12 @@ _init_bigwig_selected = [tid for tid in DEFAULT_BIGWIG_TRACKS if tid in _init_bi
521
  # Filter default BED elements to only those available
522
  _init_bed_selected = [elem for elem in DEFAULT_BED_ELEMENTS if elem in _init_bed]
523
 
524
- # Default coordinates per species
525
- DEFAULT_COORDS = {
526
- "human": {"chrom": "chr19", "start": 6_700_000, "end": 6_831_072},
527
- "mouse": {"chrom": "chr1", "start": 100_000, "end": 200_000},
528
- "drosophila_melanogaster": {"chrom": "chr2L", "start": 1_000_000, "end": 2_000_000},
529
- }
530
-
531
- # Get default coordinates for default species
532
- _default_coords = DEFAULT_COORDS.get(DEFAULT_SPECIES, DEFAULT_COORDS["human"])
533
-
534
  # Default coordinates per species
535
  DEFAULT_COORDS = {
536
  "human": {"chrom": "chr19", "start": 6_700_000, "end": 6_831_072},
537
  "mouse": {"chrom": "chr1", "start": 0, "end": 32_768},
538
  "drosophila_melanogaster": {"chrom": "chr2L", "start": 0, "end": 32_768},
 
539
  }
540
 
541
  # Get default coordinates for default species
@@ -552,6 +543,9 @@ with gr.Blocks(title="NTv3 Tracks Demo") as demo:
552
  Predict and visualize functional genomics signals directly from DNA using
553
  <strong>Nucleotide Transformer v3</strong>.
554
  </p>
 
 
 
555
  </div>
556
 
557
  <div class="intro-grid">
@@ -584,7 +578,7 @@ with gr.Blocks(title="NTv3 Tracks Demo") as demo:
584
 
585
  <div class="intro-tip">
586
  <span class="intro-tip-icon">💡</span>
587
- <span><strong>Tip:</strong> The demo include default settings that you can use to get started, taking ~ 1 minute to run.</span>
588
  </div>
589
 
590
  </div>
@@ -619,7 +613,7 @@ with gr.Blocks(title="NTv3 Tracks Demo") as demo:
619
 
620
  with gr.Row():
621
  species = gr.Dropdown(
622
- ["human", "mouse", "drosophila_melanogaster"],
623
  value=DEFAULT_SPECIES,
624
  label="Species",
625
  )
 
521
  # Filter default BED elements to only those available
522
  _init_bed_selected = [elem for elem in DEFAULT_BED_ELEMENTS if elem in _init_bed]
523
 
 
 
 
 
 
 
 
 
 
 
524
  # Default coordinates per species
525
  DEFAULT_COORDS = {
526
  "human": {"chrom": "chr19", "start": 6_700_000, "end": 6_831_072},
527
  "mouse": {"chrom": "chr1", "start": 0, "end": 32_768},
528
  "drosophila_melanogaster": {"chrom": "chr2L", "start": 0, "end": 32_768},
529
+ "arabidopsis_thaliana": {"chrom": "chr1", "start": 0, "end": 32_768},
530
  }
531
 
532
  # Get default coordinates for default species
 
543
  Predict and visualize functional genomics signals directly from DNA using
544
  <strong>Nucleotide Transformer v3</strong>.
545
  </p>
546
+ <p style="margin-top: 8px; font-size: 0.95rem; opacity: 0.85;">
547
+ <strong>Currently available species:</strong> Human, Mouse, Drosophila melanogaster, Arabidopsis thaliana, Gorilla
548
+ </p>
549
  </div>
550
 
551
  <div class="intro-grid">
 
578
 
579
  <div class="intro-tip">
580
  <span class="intro-tip-icon">💡</span>
581
+ <span><strong>Tip:</strong> The demo includes default settings that you can use to get started, taking ~ 1 minute to run.</span>
582
  </div>
583
 
584
  </div>
 
613
 
614
  with gr.Row():
615
  species = gr.Dropdown(
616
+ ["human", "mouse", "drosophila_melanogaster", "arabidopsis_thaliana", "gorilla_gorilla"],
617
  value=DEFAULT_SPECIES,
618
  label="Species",
619
  )
ntv3_tracks_pipeline.py CHANGED
@@ -61,6 +61,20 @@ ASSEMBLY_TO_SPECIES = {
61
  }
62
  SPECIES_TO_ASSEMBLY = {v: k for k, v in ASSEMBLY_TO_SPECIES.items()}
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  # BED element to color mapping (shared between pipeline and app)
65
  BED_ELEMENT_COLORS = {
66
  "protein_coding_gene": "#E74C3C", # Red
@@ -92,9 +106,21 @@ def _sanitize_dna(seq: str) -> str:
92
 
93
 
94
  def _get_dna_sequence(assembly: str, chrom: str, start: int, end: int) -> str:
 
 
 
 
 
 
95
  if requests is None:
96
  raise ImportError("requests is required for genome download. Install with: pip install requests")
97
- url = f"https://api.genome.ucsc.edu/getData/sequence?genome={assembly};chrom={chrom};start={start};end={end}"
 
 
 
 
 
 
98
  seq = requests.get(url).json()["dna"].upper()
99
  return seq
100
 
 
61
  }
62
  SPECIES_TO_ASSEMBLY = {v: k for k, v in ASSEMBLY_TO_SPECIES.items()}
63
 
64
# ---------------------------------------------------------------------
# Sequence API URL templates, looked up by assembly name
# ---------------------------------------------------------------------
# Fallback template (UCSC REST format) for every assembly that has no
# dedicated entry in ASSEMBLY_TO_API_URL_TEMPLATE.
# Placeholders: {assembly}, {chrom}, {start}, {end}.
DEFAULT_API_URL_TEMPLATE = (
    "https://api.genome.ucsc.edu/getData/sequence"
    "?genome={assembly};chrom={chrom};start={start};end={end}"
)

# Per-assembly overrides for assemblies whose URL format differs from the
# default. Keys are assembly names; values are templates that use the
# {chrom}, {start} and {end} placeholders. An override may leave out
# {assembly} when the genome name is fixed inside the URL itself.
ASSEMBLY_TO_API_URL_TEMPLATE = {
    # Arabidopsis thaliana (TAIR10): loaded through a hub URL, with the
    # genome name hard-coded as araTha1.
    "TAIR10": (
        "https://api.genome.ucsc.edu/getData/sequence"
        "?hubUrl=http://genome.ucsc.edu/goldenPath/help/examples/hubExamples/hubAssembly/plantAraTha1/hub.txt"
        ";genome=araTha1;chrom={chrom};start={start};end={end}"
    ),
}
76
+
77
+
78
  # BED element to color mapping (shared between pipeline and app)
79
  BED_ELEMENT_COLORS = {
80
  "protein_coding_gene": "#E74C3C", # Red
 
106
 
107
 
108
def _get_dna_sequence(
    assembly: str,
    chrom: str,
    start: int,
    end: int,
    *,
    timeout: float = 60.0,
) -> str:
    """
    Fetch a DNA sequence from a genome REST API for one assembly and region.

    The URL format is looked up in ASSEMBLY_TO_API_URL_TEMPLATE; assemblies
    that are not listed there fall back to DEFAULT_API_URL_TEMPLATE.

    Args:
        assembly: Assembly name. Used as the template lookup key and as the
            value of the ``{assembly}`` placeholder.
        chrom: Chromosome name substituted for ``{chrom}``.
        start: Region start substituted for ``{start}``.
        end: Region end substituted for ``{end}``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The upper-cased value of the ``"dna"`` field of the JSON response.

    Raises:
        ImportError: If ``requests`` is not installed.
        KeyError: If the JSON response has no ``"dna"`` field.
        requests.exceptions.RequestException: On connection failure or timeout.
    """
    if requests is None:
        raise ImportError("requests is required for genome download. Install with: pip install requests")

    # Pick the assembly-specific URL template, or the default one.
    url_template = ASSEMBLY_TO_API_URL_TEMPLATE.get(assembly, DEFAULT_API_URL_TEMPLATE)

    # Templates that do not contain {assembly} simply ignore that keyword.
    url = url_template.format(assembly=assembly, chrom=chrom, start=start, end=end)

    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(url, timeout=timeout)
    payload = response.json()

    try:
        dna = payload["dna"]
    except KeyError:
        # Keep the KeyError type callers may already handle, but say which
        # request failed. The "error" key is presumably where the API puts
        # its message -- TODO confirm against the API documentation.
        detail = payload.get("error", "no error message in response")
        raise KeyError(
            f"No 'dna' field in sequence API response for {assembly} "
            f"{chrom}:{start}-{end} (HTTP {response.status_code}): {detail}"
        ) from None

    return dna.upper()
126