Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
9daf043
1
Parent(s):
d71c881
feat: add more species
Browse files- app.py +6 -12
- ntv3_tracks_pipeline.py +27 -1
app.py
CHANGED
|
@@ -521,21 +521,12 @@ _init_bigwig_selected = [tid for tid in DEFAULT_BIGWIG_TRACKS if tid in _init_bi
|
|
| 521 |
# Filter default BED elements to only those available
|
| 522 |
_init_bed_selected = [elem for elem in DEFAULT_BED_ELEMENTS if elem in _init_bed]
|
| 523 |
|
| 524 |
-
# Default coordinates per species
|
| 525 |
-
DEFAULT_COORDS = {
|
| 526 |
-
"human": {"chrom": "chr19", "start": 6_700_000, "end": 6_831_072},
|
| 527 |
-
"mouse": {"chrom": "chr1", "start": 100_000, "end": 200_000},
|
| 528 |
-
"drosophila_melanogaster": {"chrom": "chr2L", "start": 1_000_000, "end": 2_000_000},
|
| 529 |
-
}
|
| 530 |
-
|
| 531 |
-
# Get default coordinates for default species
|
| 532 |
-
_default_coords = DEFAULT_COORDS.get(DEFAULT_SPECIES, DEFAULT_COORDS["human"])
|
| 533 |
-
|
| 534 |
# Default coordinates per species
|
| 535 |
DEFAULT_COORDS = {
|
| 536 |
"human": {"chrom": "chr19", "start": 6_700_000, "end": 6_831_072},
|
| 537 |
"mouse": {"chrom": "chr1", "start": 0, "end": 32_768},
|
| 538 |
"drosophila_melanogaster": {"chrom": "chr2L", "start": 0, "end": 32_768},
|
|
|
|
| 539 |
}
|
| 540 |
|
| 541 |
# Get default coordinates for default species
|
|
@@ -552,6 +543,9 @@ with gr.Blocks(title="NTv3 Tracks Demo") as demo:
|
|
| 552 |
Predict and visualize functional genomics signals directly from DNA using
|
| 553 |
<strong>Nucleotide Transformer v3</strong>.
|
| 554 |
</p>
|
|
|
|
|
|
|
|
|
|
| 555 |
</div>
|
| 556 |
|
| 557 |
<div class="intro-grid">
|
|
@@ -584,7 +578,7 @@ with gr.Blocks(title="NTv3 Tracks Demo") as demo:
|
|
| 584 |
|
| 585 |
<div class="intro-tip">
|
| 586 |
<span class="intro-tip-icon">💡</span>
|
| 587 |
-
<span><strong>Tip:</strong> The demo
|
| 588 |
</div>
|
| 589 |
|
| 590 |
</div>
|
|
@@ -619,7 +613,7 @@ with gr.Blocks(title="NTv3 Tracks Demo") as demo:
|
|
| 619 |
|
| 620 |
with gr.Row():
|
| 621 |
species = gr.Dropdown(
|
| 622 |
-
["human", "mouse", "drosophila_melanogaster"],
|
| 623 |
value=DEFAULT_SPECIES,
|
| 624 |
label="Species",
|
| 625 |
)
|
|
|
|
| 521 |
# Filter default BED elements to only those available
|
| 522 |
_init_bed_selected = [elem for elem in DEFAULT_BED_ELEMENTS if elem in _init_bed]
|
| 523 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 524 |
# Default coordinates per species
|
| 525 |
DEFAULT_COORDS = {
|
| 526 |
"human": {"chrom": "chr19", "start": 6_700_000, "end": 6_831_072},
|
| 527 |
"mouse": {"chrom": "chr1", "start": 0, "end": 32_768},
|
| 528 |
"drosophila_melanogaster": {"chrom": "chr2L", "start": 0, "end": 32_768},
|
| 529 |
+
"arabidopsis_thaliana": {"chrom": "chr1", "start": 0, "end": 32_768},
|
| 530 |
}
|
| 531 |
|
| 532 |
# Get default coordinates for default species
|
|
|
|
| 543 |
Predict and visualize functional genomics signals directly from DNA using
|
| 544 |
<strong>Nucleotide Transformer v3</strong>.
|
| 545 |
</p>
|
| 546 |
+
<p style="margin-top: 8px; font-size: 0.95rem; opacity: 0.85;">
|
| 547 |
+
<strong>Currently available species:</strong> Human, Mouse, Drosophila melanogaster, Arabidopsis thaliana, Gorilla
|
| 548 |
+
</p>
|
| 549 |
</div>
|
| 550 |
|
| 551 |
<div class="intro-grid">
|
|
|
|
| 578 |
|
| 579 |
<div class="intro-tip">
|
| 580 |
<span class="intro-tip-icon">💡</span>
|
| 581 |
+
<span><strong>Tip:</strong> The demo includes default settings that you can use to get started, taking ~ 1 minute to run.</span>
|
| 582 |
</div>
|
| 583 |
|
| 584 |
</div>
|
|
|
|
| 613 |
|
| 614 |
with gr.Row():
|
| 615 |
species = gr.Dropdown(
|
| 616 |
+
["human", "mouse", "drosophila_melanogaster", "arabidopsis_thaliana", "gorilla_gorilla"],
|
| 617 |
value=DEFAULT_SPECIES,
|
| 618 |
label="Species",
|
| 619 |
)
|
ntv3_tracks_pipeline.py
CHANGED
|
@@ -61,6 +61,20 @@ ASSEMBLY_TO_SPECIES = {
|
|
| 61 |
}
|
| 62 |
SPECIES_TO_ASSEMBLY = {v: k for k, v in ASSEMBLY_TO_SPECIES.items()}
|
| 63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
# BED element to color mapping (shared between pipeline and app)
|
| 65 |
BED_ELEMENT_COLORS = {
|
| 66 |
"protein_coding_gene": "#E74C3C", # Red
|
|
@@ -92,9 +106,21 @@ def _sanitize_dna(seq: str) -> str:
|
|
| 92 |
|
| 93 |
|
| 94 |
def _get_dna_sequence(assembly: str, chrom: str, start: int, end: int) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
if requests is None:
|
| 96 |
raise ImportError("requests is required for genome download. Install with: pip install requests")
|
| 97 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
seq = requests.get(url).json()["dna"].upper()
|
| 99 |
return seq
|
| 100 |
|
|
|
|
| 61 |
}
|
| 62 |
SPECIES_TO_ASSEMBLY = {v: k for k, v in ASSEMBLY_TO_SPECIES.items()}
|
| 63 |
|
| 64 |
+
# ---------------------------------------------------------------------
|
| 65 |
+
# Assembly -> API URL template mapping
|
| 66 |
+
# ---------------------------------------------------------------------
|
| 67 |
+
# Default API URL template (UCSC format) that works for most species
|
| 68 |
+
DEFAULT_API_URL_TEMPLATE = "https://api.genome.ucsc.edu/getData/sequence?genome={assembly};chrom={chrom};start={start};end={end}"
|
| 69 |
+
|
| 70 |
+
# For species whose sequence API needs a different URL format, add the assembly name to the mapping below.
|
| 71 |
+
# Each template should use {chrom}, {start}, and {end} as placeholders; {assembly} is also supplied when formatting.
|
| 72 |
+
ASSEMBLY_TO_API_URL_TEMPLATE = {
|
| 73 |
+
# Arabidopsis thaliana (TAIR10) - uses hub URL format
|
| 74 |
+
"TAIR10": "https://api.genome.ucsc.edu/getData/sequence?hubUrl=http://genome.ucsc.edu/goldenPath/help/examples/hubExamples/hubAssembly/plantAraTha1/hub.txt;genome=araTha1;chrom={chrom};start={start};end={end}",
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
|
| 78 |
# BED element to color mapping (shared between pipeline and app)
|
| 79 |
BED_ELEMENT_COLORS = {
|
| 80 |
"protein_coding_gene": "#E74C3C", # Red
|
|
|
|
| 106 |
|
| 107 |
|
| 108 |
def _get_dna_sequence(assembly: str, chrom: str, start: int, end: int) -> str:
    """
    Fetch the DNA sequence of a genomic interval from a web API, upper-cased.

    The URL template is looked up in ASSEMBLY_TO_API_URL_TEMPLATE by assembly
    name; assemblies without a dedicated entry fall back to
    DEFAULT_API_URL_TEMPLATE.

    Args:
        assembly: Assembly name. Selects the URL template and fills {assembly}.
        chrom: Chromosome name substituted for {chrom}.
        start: Interval start substituted for {start}.
        end: Interval end substituted for {end}.
            NOTE(review): coordinate convention (0-based / half-open) is
            decided by the remote API, not by this function - confirm
            against the API documentation.

    Returns:
        The "dna" field of the JSON response, converted to upper case.

    Raises:
        ImportError: If the optional ``requests`` dependency is not available.
        KeyError: If the JSON response contains no "dna" field (for example
            when the API answers with an error payload).
    """
    if requests is None:
        raise ImportError("requests is required for genome download. Install with: pip install requests")

    # Get API URL template for this assembly, or use default
    url_template = ASSEMBLY_TO_API_URL_TEMPLATE.get(assembly, DEFAULT_API_URL_TEMPLATE)

    # Format the URL with the provided parameters
    url = url_template.format(assembly=assembly, chrom=chrom, start=start, end=end)

    # Without a timeout, requests waits forever on a stalled connection,
    # which would hang the caller; bound the wait instead.
    response = requests.get(url, timeout=60)
    payload = response.json()

    try:
        dna = payload["dna"]
    except KeyError:
        # Keep the exception type callers already see, but say which region
        # failed and what the API answered instead of a bare KeyError('dna').
        raise KeyError(
            f"No 'dna' field in API response for {assembly} {chrom}:{start}-{end} "
            f"(HTTP {response.status_code}): {payload.get('error', payload)}"
        ) from None

    return dna.upper()
|
| 126 |
|