Spaces:
Build error
Build error
Ashkan Taghipour (The University of Western Australia) committed on
Commit ·
16e4ad5
1
Parent(s): ad78656
Initial deploy: Pigeon Pea Pangenome Atlas
Browse files- .gitattributes +5 -0
- .gitignore +7 -0
- README.md +6 -8
- app.py +266 -0
- data/89_line_PAV.txt +3 -0
- data/Pigenopea_89lines_pangenome_clean.all_leAED1_sort_byPerl_clean.gff +3 -0
- data/Pigenopea_89lines_pangenome_clean.fasta +3 -0
- data/Pigenopea_89lines_pangenome_clean_bg1k_protein.fasta +3 -0
- precomputed/cluster_markers.parquet +3 -0
- precomputed/genome_contig_index.json +3 -0
- precomputed/gff_gene_index.parquet +3 -0
- precomputed/hotspot_bins.parquet +3 -0
- precomputed/line_embedding.parquet +3 -0
- precomputed/line_similarity_topk.parquet +3 -0
- precomputed/line_stats.parquet +3 -0
- precomputed/pav_gene_frequency.parquet +3 -0
- precomputed/pav_matrix.parquet +3 -0
- precomputed/protein_index.parquet +3 -0
- requirements.txt +11 -0
- scripts/run_precompute.py +111 -0
- src/__init__.py +0 -0
- src/callbacks.py +524 -0
- src/data_loader.py +206 -0
- src/field_report.py +175 -0
- src/gene_card.py +148 -0
- src/precompute.py +259 -0
- src/state.py +37 -0
- src/utils.py +55 -0
- tests/__init__.py +0 -0
- tests/conftest.py +124 -0
- tests/test_callbacks.py +97 -0
- tests/test_data_loader.py +90 -0
- tests/test_field_report.py +63 -0
- tests/test_gene_card.py +54 -0
- tests/test_integration.py +69 -0
- tests/test_precompute.py +112 -0
- ui/__init__.py +0 -0
- ui/final.py +33 -0
- ui/gene_card_ui.py +30 -0
- ui/layout.py +108 -0
- ui/quest0.py +54 -0
- ui/quest1.py +51 -0
- ui/quest2.py +70 -0
- ui/quest3.py +45 -0
- ui/quest4.py +39 -0
- ui/theme.py +101 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
*.fasta filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
*.fasta.gz filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
*.gff filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
89_line_PAV.txt filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
genome_contig_index.json filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
CREDENTIALS_AND_CONFIG.txt
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.pyc
|
| 4 |
+
.env
|
| 5 |
+
*.egg-info/
|
| 6 |
+
.conda/
|
| 7 |
+
.pytest_cache/
|
README.md
CHANGED
|
@@ -1,13 +1,11 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
-
short_description:
|
| 11 |
---
|
| 12 |
-
|
| 13 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Pigeon Pea Pangenome Atlas
|
| 3 |
+
emoji: "\U0001F331"
|
| 4 |
+
colorFrom: green
|
| 5 |
+
colorTo: yellow
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 5.9.1
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
+
short_description: Interactive pangenome exploration of 89 pigeon pea lines
|
| 11 |
---
|
|
|
|
|
|
app.py
ADDED
|
@@ -0,0 +1,266 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Pigeon Pea Pangenome Atlas — Main entry point.
|
| 3 |
+
Gradio app for exploring the pangenome of 89 pigeon pea lines.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
import json
|
| 9 |
+
import pandas as pd
|
| 10 |
+
import gradio as gr
|
| 11 |
+
|
| 12 |
+
from src.state import AppState
|
| 13 |
+
from src.utils import logger, PRECOMPUTED_DIR
|
| 14 |
+
from src.data_loader import validate_joins
|
| 15 |
+
from src.callbacks import (
|
| 16 |
+
on_line_selected, on_start_journey,
|
| 17 |
+
build_umap_plot, on_umap_select, on_compare_party,
|
| 18 |
+
build_donut_chart, build_frequency_histogram, build_treasure_table,
|
| 19 |
+
on_pin_gene, on_gene_click_table,
|
| 20 |
+
build_hotspot_heatmap, on_contig_selected,
|
| 21 |
+
get_protein_stats_html, build_backpack_comparison, build_composition_heatmap,
|
| 22 |
+
on_open_gene_card, on_download_gene_report,
|
| 23 |
+
on_generate_report, build_data_health_html,
|
| 24 |
+
)
|
| 25 |
+
from ui.layout import build_app
|
| 26 |
+
|
| 27 |
+
# ===========================================================
|
| 28 |
+
# Load precomputed data
|
| 29 |
+
# ===========================================================
|
| 30 |
+
logger.info("Loading precomputed data...")
|
| 31 |
+
|
| 32 |
+
DATA = {}
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def load_data() -> None:
    """Load every precomputed artifact from PRECOMPUTED_DIR into the module-level DATA dict.

    Required parquet tables raise a clear FileNotFoundError when absent
    (previously a missing file surfaced as an opaque pandas/pyarrow error).
    The PAV matrix and the contig index are optional extras.
    """
    p = PRECOMPUTED_DIR

    # Required tables: key in DATA -> file name under PRECOMPUTED_DIR.
    required = {
        "gene_freq": "pav_gene_frequency.parquet",
        "line_stats": "line_stats.parquet",
        "embedding": "line_embedding.parquet",
        "similarity": "line_similarity_topk.parquet",
        "gff_index": "gff_gene_index.parquet",
        "protein": "protein_index.parquet",
        "hotspots": "hotspot_bins.parquet",
        "markers": "cluster_markers.parquet",
    }
    for key, filename in required.items():
        path = p / filename
        if not path.exists():
            # Fail fast with an actionable message.
            raise FileNotFoundError(
                f"Missing precomputed file: {path}. "
                "Run scripts/run_precompute.py first."
            )
        DATA[key] = pd.read_parquet(path)

    # Optional: full PAV matrix for live queries.
    pav_path = p / "pav_matrix.parquet"
    if pav_path.exists():
        DATA["pav"] = pd.read_parquet(pav_path)
        logger.info(f"PAV matrix loaded: {DATA['pav'].shape}")

    # Optional: contig index (JSON written by the precompute step).
    ci_path = p / "genome_contig_index.json"
    if ci_path.exists():
        with open(ci_path, encoding="utf-8") as f:
            DATA["contig_index"] = json.load(f)

    logger.info("All data loaded successfully")
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
load_data()

# ===========================================================
# Derive the dropdown/filter choices shown in the UI
# ===========================================================
# Every known line id, alphabetically.
line_choices = sorted(DATA["line_stats"]["line_id"])

# The 30 contigs carrying the most annotated genes.
contig_gene_counts = DATA["gff_index"]["contig_id"].value_counts()
contig_choices = contig_gene_counts.head(30).index.tolist()

# All genes that have protein-level data.
gene_choices = sorted(DATA["protein"]["gene_id"])

# Distinct countries, for the country filter.
country_list = sorted(DATA["line_stats"]["country"].unique())

# ===========================================================
# Build UI
# ===========================================================
demo, C = build_app(line_choices, contig_choices, gene_choices)
|
| 84 |
+
|
| 85 |
+
# ===========================================================
|
| 86 |
+
# Wire callbacks
|
| 87 |
+
# ===========================================================
|
| 88 |
+
with demo:

    # Populate the country filter with the countries derived at load time.
    C["country_filter"].choices = country_list

    # -- Data Health panel, computed once at build time --
    try:
        report = {
            "Total genes in PAV": len(DATA["gene_freq"]),
            "Lines": len(DATA["line_stats"]),
            "Genes with GFF annotation": len(DATA["gff_index"]),
            "Genes with protein data": len(DATA["protein"]),
            "Clusters found": DATA["embedding"]["cluster_id"].nunique(),
        }
        C["data_health_html"].value = build_data_health_html(report)
    except Exception as e:
        C["data_health_html"].value = f"<p>Error: {e}</p>"

    # -- Quest 0: pick a line --
    C["q0_line_dropdown"].change(
        fn=lambda line_id, state: on_line_selected(line_id, state, DATA),
        inputs=[C["q0_line_dropdown"], C["state"]],
        outputs=[C["q0_total_genes"], C["q0_unique_genes"],
                 C["q0_nearest_neighbor"], C["state"]],
    )

    C["q0_start_btn"].click(
        fn=on_start_journey,
        inputs=[C["state"]],
        outputs=[C["tabs"], C["state"]],
    )

    # -- Quest 1: UMAP exploration --
    C["q1_color_radio"].change(
        fn=lambda color_by, state: build_umap_plot(color_by, state, DATA),
        inputs=[C["q1_color_radio"], C["state"]],
        outputs=[C["q1_umap_plot"]],
    )

    # Render the UMAP when the tab is opened.
    C["q1_tab"].select(
        fn=lambda state: build_umap_plot("Country", state, DATA),
        inputs=[C["state"]],
        outputs=[C["q1_umap_plot"]],
    )

    C["q1_compare_btn"].click(
        fn=lambda state: on_compare_party(state, DATA),
        inputs=[C["state"]],
        # NOTE(review): the same component is listed twice, so the second
        # return value of on_compare_party overwrites the first — confirm
        # the intended second output target.
        outputs=[C["q1_comparison_plot"], C["q1_comparison_plot"]],
    )

    # -- Quest 2: core/shell/cloud analysis --
    def update_quest2(core_thresh, cloud_thresh, filter_type, state):
        """Recompute all three Quest 2 views from the current control values."""
        donut = build_donut_chart(core_thresh, cloud_thresh, DATA)
        hist = build_frequency_histogram(core_thresh, cloud_thresh, DATA)
        table = build_treasure_table(state, core_thresh, cloud_thresh, filter_type, DATA)
        return donut, hist, table

    # All four Quest 2 triggers share identical wiring; define the
    # input/output lists once instead of repeating them per event.
    q2_inputs = [C["q2_core_slider"], C["q2_cloud_slider"],
                 C["q2_filter_radio"], C["state"]]
    q2_outputs = [C["q2_donut_plot"], C["q2_histogram_plot"],
                  C["q2_treasure_table"]]

    for trigger in [C["q2_core_slider"], C["q2_cloud_slider"]]:
        trigger.change(fn=update_quest2, inputs=q2_inputs, outputs=q2_outputs)

    C["q2_filter_radio"].change(fn=update_quest2, inputs=q2_inputs, outputs=q2_outputs)

    # Render Quest 2 when its tab is opened.
    C["q2_tab"].select(fn=update_quest2, inputs=q2_inputs, outputs=q2_outputs)

    C["q2_pin_btn"].click(
        fn=lambda gene_id, state: on_pin_gene(gene_id, state),
        inputs=[C["q2_selected_gene_text"], C["state"]],
        outputs=[C["q2_backpack_display"], C["state"]],
    )

    # Table row click → select gene.
    # FIX: Gradio delivers the selection payload only to a parameter
    # annotated with gr.SelectData, and a lambda cannot carry annotations —
    # the original `lambda evt, state` therefore received the state value in
    # `evt` and failed. A named, annotated function is required here.
    def on_treasure_row_select(state, evt: gr.SelectData):
        """Return (selected gene id as text, unchanged state)."""
        gene = str(evt.value) if evt is not None and evt.value is not None else ""
        return gene, state

    C["q2_treasure_table"].select(
        fn=on_treasure_row_select,
        inputs=[C["state"]],
        outputs=[C["q2_selected_gene_text"], C["state"]],
    )

    # -- Quest 3: genome hotspots --
    C["q3_tab"].select(
        fn=lambda: build_hotspot_heatmap(DATA),
        inputs=[],
        outputs=[C["q3_heatmap_plot"]],
    )

    C["q3_contig_dropdown"].change(
        fn=lambda contig_id, state: on_contig_selected(contig_id, DATA, state),
        inputs=[C["q3_contig_dropdown"], C["state"]],
        outputs=[C["q3_track_plot"], C["q3_region_table"]],
    )

    # -- Quest 4: protein stats & backpack --
    C["q4_gene_dropdown"].change(
        fn=lambda gene_id: get_protein_stats_html(gene_id, DATA),
        inputs=[C["q4_gene_dropdown"]],
        outputs=[C["q4_protein_stats_html"]],
    )

    C["q4_tab"].select(
        fn=lambda state: (
            build_backpack_comparison(state, DATA),
            build_composition_heatmap(state, DATA),
        ),
        inputs=[C["state"]],
        outputs=[C["q4_comparison_bar_plot"], C["q4_composition_heatmap"]],
    )

    # -- Gene Card actions --
    C["gc_show_genome_btn"].click(
        fn=lambda state: (
            gr.Tabs(selected="quest3"),
            state.selected_gene if state and state.selected_gene else None,
        ),
        inputs=[C["state"]],
        outputs=[C["tabs"], C["q3_contig_dropdown"]],
    )

    C["gc_show_protein_btn"].click(
        fn=lambda state: (
            gr.Tabs(selected="quest4"),
            state.selected_gene if state and state.selected_gene else None,
        ),
        inputs=[C["state"]],
        outputs=[C["tabs"], C["q4_gene_dropdown"]],
    )

    C["gc_pin_card_btn"].click(
        fn=lambda state: on_pin_gene(state.selected_gene if state else "", state),
        inputs=[C["state"]],
        outputs=[C["q2_backpack_display"], C["state"]],
    )

    C["gc_download_gene_btn"].click(
        fn=lambda state: on_download_gene_report(state, DATA),
        inputs=[C["state"]],
        outputs=[C["gc_gene_report_file"]],
    )

    # -- Final Report --
    C["final_generate_btn"].click(
        fn=lambda state: on_generate_report(state, DATA),
        inputs=[C["state"]],
        outputs=[
            C["final_report_md"],
            C["final_download_json"],
            C["final_download_csv"],
            C["final_download_json"],  # visibility update
            C["final_download_csv"],   # visibility update
            C["final_achievements_html"],
            C["state"],
        ],
    )
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
# ===========================================================
|
| 263 |
+
# Launch
|
| 264 |
+
# ===========================================================
|
| 265 |
+
# Launch the Gradio server only when run as a script (not when imported,
# e.g. by the Spaces runtime or tests).
if __name__ == "__main__":
    demo.launch()
|
data/89_line_PAV.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f4eaac454bd3103e8738cf483fc45ac821c8ae0dc99d28c5abd383b0005ef38c
|
| 3 |
+
size 10437803
|
data/Pigenopea_89lines_pangenome_clean.all_leAED1_sort_byPerl_clean.gff
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a288ef682398b642d4024a09f55727d6391fb164b8b6d72603b13796b6b95eba
|
| 3 |
+
size 64791788
|
data/Pigenopea_89lines_pangenome_clean.fasta
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4a282a51d87323f1effd70ee43a9ab7b95dbc6584f4df0e6b3c0d095a0b992f7
|
| 3 |
+
size 637456599
|
data/Pigenopea_89lines_pangenome_clean_bg1k_protein.fasta
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:be9efd3242fffb67a569937fb4842027238c86a245f303b5521d48ec84c50389
|
| 3 |
+
size 18085493
|
precomputed/cluster_markers.parquet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b195070da682210273a73da98b3aa3bcf8146d1c9dd6aa1094ff0e7b17ca0485
|
| 3 |
+
size 6411
|
precomputed/genome_contig_index.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7184ff7812f745877dd62bda4310f7336906e3aa3658908fee9a767dbf53d04d
|
| 3 |
+
size 11022538
|
precomputed/gff_gene_index.parquet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1d48537f3aae0c63e4afb65831be8762f805a5215b7ff0fff64de44a7385d50e
|
| 3 |
+
size 1247901
|
precomputed/hotspot_bins.parquet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fbf2c937e61d39a60a1d51e20bcd6b0c5dd681429a763b43467d5d9e95fcfa59
|
| 3 |
+
size 132552
|
precomputed/line_embedding.parquet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:97492cd82529decb539371fde3696da357b670ec44ed1a4f32015fc6ad621ef8
|
| 3 |
+
size 4680
|
precomputed/line_similarity_topk.parquet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8339f144d99301b979159e14e8a413364727a8f93dcc4801945aee7671ef9e10
|
| 3 |
+
size 8063
|
precomputed/line_stats.parquet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:645aba02382a23a3cdb476bb3e24f9111fb806909d076732bead9662d8eba4b6
|
| 3 |
+
size 4672
|
precomputed/pav_gene_frequency.parquet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ecd91ec6198f2d8489177a03d81e9c62b6edcc4575994eb534c9c4507c38071d
|
| 3 |
+
size 376243
|
precomputed/pav_matrix.parquet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:24abc505527d02d5d1c465f0b93c1902833e95c224ad00f3ccc9ee881173729c
|
| 3 |
+
size 490920
|
precomputed/protein_index.parquet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6c67ced53bb82255c9027d4d492acf8945bcc0c28ec233a73ced14054a482649
|
| 3 |
+
size 1623053
|
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
plotly>=5.24
|
| 2 |
+
pandas>=2.2
|
| 3 |
+
pyarrow>=17.0
|
| 4 |
+
numpy>=1.26
|
| 5 |
+
scipy>=1.14
|
| 6 |
+
scikit-learn>=1.5
|
| 7 |
+
umap-learn>=0.5.7
|
| 8 |
+
biopython>=1.84
|
| 9 |
+
pyfaidx>=0.8
|
| 10 |
+
Pillow
|
| 11 |
+
reportlab
|
scripts/run_precompute.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
CLI script to generate all precomputed data.
|
| 4 |
+
Usage: python scripts/run_precompute.py --data-dir data/ --output-dir precomputed/
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import argparse
|
| 8 |
+
import sys
|
| 9 |
+
import os
|
| 10 |
+
import time
|
| 11 |
+
|
| 12 |
+
# Add project root to path
|
| 13 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 14 |
+
|
| 15 |
+
from src.data_loader import (
|
| 16 |
+
load_pav, parse_gff_genes, parse_protein_fasta,
|
| 17 |
+
build_contig_index, build_contig_name_mapping, validate_joins,
|
| 18 |
+
)
|
| 19 |
+
from src.precompute import (
|
| 20 |
+
compute_gene_frequency, compute_line_stats, compute_line_embedding,
|
| 21 |
+
compute_similarity_topk, build_gff_gene_parquet, build_protein_parquet,
|
| 22 |
+
save_contig_index, compute_hotspot_bins, compute_cluster_markers,
|
| 23 |
+
)
|
| 24 |
+
from src.utils import logger, find_file
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def main():
    """Run the full precompute pipeline.

    Phases: load raw inputs from --data-dir, validate cross-file joins,
    compute every derived table, and write the artifacts to --output-dir.
    Exits with status 1 when a required input file is missing.
    """
    parser = argparse.ArgumentParser(description="Precompute pangenome data")
    parser.add_argument("--data-dir", default="data/", help="Input data directory")
    parser.add_argument("--output-dir", default="precomputed/", help="Output directory")
    args = parser.parse_args()

    data_dir = os.path.abspath(args.data_dir)
    output_dir = os.path.abspath(args.output_dir)
    os.makedirs(output_dir, exist_ok=True)

    t_total = time.time()

    # 1. Load raw data
    logger.info("=== Phase 1: Loading raw data ===")
    from pathlib import Path  # function-scope: only needed by this script
    data_p = Path(data_dir)

    pav_path = os.path.join(data_dir, "89_line_PAV.txt")
    gff_files = list(data_p.glob("*.gff"))
    protein_files = list(data_p.glob("*protein*.fasta"))
    genome_files = [f for f in data_p.glob("*.fasta") if "protein" not in f.name]

    # Fail fast with clear messages. The PAV check is new: previously a
    # missing PAV file surfaced as an opaque error inside load_pav.
    if not os.path.exists(pav_path):
        logger.error(f"PAV file not found: {pav_path}")
        sys.exit(1)
    if not gff_files:
        logger.error("No GFF file found in data directory")
        sys.exit(1)
    if not protein_files:
        logger.error("No protein FASTA file found in data directory")
        sys.exit(1)

    pav = load_pav(pav_path)
    gff_genes = parse_gff_genes(str(gff_files[0]))
    protein_index = parse_protein_fasta(str(protein_files[0]))

    contig_index = {}
    if genome_files:
        contig_index = build_contig_index(str(genome_files[0]))
    else:
        logger.warning("No genome FASTA found; contig index will be empty")

    # Validation: report (but do not abort on) join mismatches.
    logger.info("=== Validation ===")
    contig_mapping = build_contig_name_mapping(gff_genes, contig_index)
    report = validate_joins(pav, gff_genes, protein_index, contig_index)
    for k, v in report.items():
        logger.info(f"  {k}: {v}")

    # 2. Compute derived data and write each artifact.
    logger.info("=== Phase 2: Computing derived data ===")

    gene_freq = compute_gene_frequency(pav)
    gene_freq.to_parquet(os.path.join(output_dir, "pav_gene_frequency.parquet"), index=False)

    line_stats = compute_line_stats(pav)
    line_stats.to_parquet(os.path.join(output_dir, "line_stats.parquet"), index=False)

    embedding = compute_line_embedding(pav)
    embedding.to_parquet(os.path.join(output_dir, "line_embedding.parquet"), index=False)

    similarity = compute_similarity_topk(pav, k=15)
    similarity.to_parquet(os.path.join(output_dir, "line_similarity_topk.parquet"), index=False)

    build_gff_gene_parquet(gff_genes, os.path.join(output_dir, "gff_gene_index.parquet"))
    build_protein_parquet(protein_index, os.path.join(output_dir, "protein_index.parquet"))
    save_contig_index(contig_index, contig_mapping, os.path.join(output_dir, "genome_contig_index.json"))

    hotspots = compute_hotspot_bins(gff_genes, gene_freq, contig_index)
    hotspots.to_parquet(os.path.join(output_dir, "hotspot_bins.parquet"), index=False)

    markers = compute_cluster_markers(pav, embedding)
    markers.to_parquet(os.path.join(output_dir, "cluster_markers.parquet"), index=False)

    # Also save the PAV matrix as parquet for efficient loading.
    # Index intentionally kept (unlike the tables above) — the matrix is
    # indexed by gene/line ids.
    pav.to_parquet(os.path.join(output_dir, "pav_matrix.parquet"))

    dt = time.time() - t_total
    logger.info(f"=== All precomputation done in {dt:.1f}s ===")

    # List the produced files with their sizes.
    for f in sorted(Path(output_dir).glob("*")):
        size_mb = f.stat().st_size / 1024 / 1024
        logger.info(f"  {f.name}: {size_mb:.2f} MB")


if __name__ == "__main__":
    main()
|
src/__init__.py
ADDED
|
File without changes
|
src/callbacks.py
ADDED
|
@@ -0,0 +1,524 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""All Gradio callbacks for the Pigeon Pea Pangenome Atlas."""
|
| 2 |
+
|
| 3 |
+
import gradio as gr
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from src.state import AppState
from src.gene_card import build_gene_card, render_gene_card_html, export_gene_report
from src.field_report import generate_field_report, export_report_json, export_report_csv
| 11 |
+
|
| 12 |
+
# Color palettes
|
| 13 |
+
CORE_COLORS = {"core": "#2E7D32", "shell": "#FFC107", "cloud": "#F44336", "unknown": "#9E9E9E"}
|
| 14 |
+
COUNTRY_COLORS = px.colors.qualitative.Set3
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# ============================================================
|
| 18 |
+
# Quest 0 Callbacks
|
| 19 |
+
# ============================================================
|
| 20 |
+
|
| 21 |
+
def on_line_selected(line_id: str, state: "AppState", data: dict) -> tuple:
    """
    Triggered by the Quest 0 dropdown change.

    Parameters:
        line_id: the chosen line id ("" when the dropdown is cleared).
        state: per-session AppState (created here when None).
        data: the module-level dict of precomputed tables.

    Returns: (total_genes, unique_genes, nearest_neighbor, updated_state)
    """
    if state is None:
        state = AppState()
    if not line_id:
        # FIX: the original `if not line_id or state is None` replaced an
        # EXISTING state with a fresh AppState whenever the dropdown was
        # cleared, wiping achievements/pins. Keep the state untouched.
        return "--", "--", "--", state
    state.selected_line = line_id

    line_stats = data["line_stats"]
    similarity = data["similarity"]

    row = line_stats[line_stats["line_id"] == line_id]
    if row.empty:
        return "--", "--", "--", state

    total_genes = str(int(row.iloc[0]["genes_present_count"]))
    unique_genes = str(int(row.iloc[0]["unique_genes_count"]))

    # Nearest neighbour by Jaccard score among the precomputed top-k rows.
    sim_rows = similarity[similarity["line_id"] == line_id]
    if not sim_rows.empty:
        top = sim_rows.nlargest(1, "jaccard_score").iloc[0]
        nearest = f"{top['neighbor_line_id']} ({top['jaccard_score']:.3f})"
    else:
        nearest = "--"

    return total_genes, unique_genes, nearest, state
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def on_start_journey(state: AppState) -> tuple:
    """Grant the "Explorer" achievement and jump the UI to Quest 1.

    Returns a (Tabs update, state) pair matching the click handler's outputs.
    """
    current = AppState() if state is None else state
    current.award("Explorer")
    return gr.Tabs(selected="quest1"), current
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
# ============================================================
|
| 60 |
+
# Quest 1 Callbacks
|
| 61 |
+
# ============================================================
|
| 62 |
+
|
| 63 |
+
def build_umap_plot(color_by: str, state: AppState, data: dict) -> go.Figure:
    """
    Scatter the 89 lines in UMAP space, coloured by country or cluster.

    If the player has already chosen a line it is overlaid as a red star so
    it stands out from the crowd.
    """
    merged = data["embedding"].merge(
        data["line_stats"][["line_id", "country"]], on="line_id", how="left"
    )
    merged["country"] = merged["country"].fillna("Unknown")

    if color_by == "Country":
        color_col, palette = "country", COUNTRY_COLORS
    else:
        color_col, palette = "cluster_id", px.colors.qualitative.Bold
        # Cast to str so plotly treats cluster ids as categories, not numbers.
        merged["cluster_id"] = merged["cluster_id"].astype(str)

    fig = px.scatter(
        merged, x="umap_x", y="umap_y", color=color_col,
        hover_data=["line_id", "country"],
        title="UMAP of 89 Pigeon Pea Lines",
        labels={"umap_x": "UMAP 1", "umap_y": "UMAP 2"},
        color_discrete_sequence=palette,
    )

    # Overlay the player's own line, if one is selected and present.
    if state and state.selected_line:
        mine = merged[merged["line_id"] == state.selected_line]
        if not mine.empty:
            fig.add_trace(go.Scatter(
                x=mine["umap_x"], y=mine["umap_y"],
                mode="markers",
                marker=dict(symbol="star", size=18, color="red", line=dict(width=2, color="black")),
                name=f"Your line: {state.selected_line}",
                hovertext=state.selected_line,
            ))

    fig.update_layout(
        plot_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=-0.3),
    )
    return fig
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def on_umap_select(selected_data, state: AppState) -> tuple:
    """
    Record the lines lassoed/box-selected on the UMAP plot.

    Stores the party on the state for Quest 1's comparison step and returns
    (summary_text, updated_state).
    """
    state = AppState() if state is None else state

    picked = []
    if selected_data and "points" in selected_data:
        for point in selected_data["points"]:
            label = point.get("hovertext", point.get("text", ""))
            if label:
                picked.append(label)

    state.selected_party = picked
    if not picked:
        return "None selected", state

    summary = f"Selected {len(picked)} lines: " + ", ".join(picked[:10])
    if len(picked) > 10:
        summary += f" ... +{len(picked) - 10} more"
    return summary, state
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def on_compare_party(state: AppState, data: dict) -> tuple:
    """
    Grouped bar chart comparing the player's line against the selected party.

    Counts genes shared with the party, genes only in the player's line, and
    genes only in the party. Returns (figure, panel_visible_flag).
    """
    def _placeholder(message: str) -> tuple:
        # Empty figure carrying only a hint annotation.
        fig = go.Figure()
        fig.add_annotation(text=message, showarrow=False)
        return fig, True

    if not state or not state.selected_line or not state.selected_party:
        return _placeholder("Select your line and a party first")

    pav = data.get("pav")
    if pav is None:
        return _placeholder("PAV data not loaded")

    my_genes = set(pav.index[pav[state.selected_line] == 1])
    party_cols = [c for c in state.selected_party
                  if c in pav.columns and c != state.selected_line]
    if not party_cols:
        return _placeholder("No valid party members")

    # Union of every party member's gene set.
    party_genes = set()
    for member in party_cols:
        party_genes.update(pav.index[pav[member] == 1])

    counts = {
        "Shared": len(my_genes & party_genes),
        f"Only {state.selected_line}": len(my_genes - party_genes),
        "Only Party": len(party_genes - my_genes),
    }
    bar_colors = ["#2E7D32", "#1565C0", "#FFC107"]
    fig = go.Figure(data=[
        go.Bar(name=label, x=["Gene Sets"], y=[value], marker_color=color)
        for (label, value), color in zip(counts.items(), bar_colors)
    ])
    fig.update_layout(
        barmode="group",
        title=f"Gene Comparison: {state.selected_line} vs {len(party_cols)} party members",
        yaxis_title="Number of genes",
    )
    return fig, True
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
# ============================================================
|
| 161 |
+
# Quest 2 Callbacks
|
| 162 |
+
# ============================================================
|
| 163 |
+
|
| 164 |
+
def build_donut_chart(core_thresh: float, cloud_thresh: float, data: dict) -> go.Figure:
    """
    Donut chart of gene counts per pangenome class.

    Classes are derived live from the slider thresholds:
    freq_pct >= core_thresh -> core, freq_pct < cloud_thresh -> cloud,
    everything in between -> shell.
    """
    frequencies = data["gene_freq"]["freq_pct"]
    n_core = int((frequencies >= core_thresh).sum())
    n_cloud = int((frequencies < cloud_thresh).sum())
    n_shell = len(frequencies) - n_core - n_cloud

    fig = go.Figure(data=[go.Pie(
        labels=["Core", "Shell", "Cloud"],
        values=[n_core, n_shell, n_cloud],
        hole=0.5,
        marker_colors=[CORE_COLORS[name] for name in ("core", "shell", "cloud")],
        textinfo="label+value+percent",
    )])
    fig.update_layout(
        title=f"Gene Classification (Core>={core_thresh}%, Cloud<{cloud_thresh}%)",
        showlegend=True,
    )
    return fig
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
def build_frequency_histogram(core_thresh: float, cloud_thresh: float, data: dict) -> go.Figure:
    """
    Overlaid histograms of gene frequency, one trace per precomputed
    core/shell/cloud class, with dashed guides at the current thresholds.
    """
    gene_freq = data["gene_freq"]

    fig = go.Figure()
    # Same iteration order as CORE_COLORS, minus the "unknown" bucket.
    for class_name in ("core", "shell", "cloud"):
        members = gene_freq[gene_freq["core_class"] == class_name]
        fig.add_trace(go.Histogram(
            x=members["freq_pct"],
            name=class_name.capitalize(),
            marker_color=CORE_COLORS[class_name],
            opacity=0.75,
            nbinsx=50,
        ))

    fig.update_layout(
        barmode="overlay",
        title="Gene Frequency Distribution",
        xaxis_title="Frequency (%)",
        yaxis_title="Count",
    )
    # Dashed guides showing where the current core/cloud cut-offs fall.
    fig.add_vline(x=core_thresh, line_dash="dash", line_color="green",
                  annotation_text=f"Core>={core_thresh}%")
    fig.add_vline(x=cloud_thresh, line_dash="dash", line_color="red",
                  annotation_text=f"Cloud<{cloud_thresh}%")
    return fig
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
def build_treasure_table(state: AppState, core_thresh: float, cloud_thresh: float,
                         filter_type: str, data: dict) -> pd.DataFrame:
    """
    Build the filterable "gene treasure" table for Quest 2.

    Genes are reclassified on the fly from the current slider thresholds,
    annotated with presence in the player's line, optionally filtered, then
    sorted rarest-first and capped at 500 rows so the UI stays responsive.

    Returns a DataFrame with columns:
    gene_id, freq_count, freq_pct, core_class, in_my_line.
    """
    gene_freq = data["gene_freq"].copy()

    # Reclassify using the live thresholds rather than the precomputed class.
    gene_freq["core_class"] = gene_freq["freq_pct"].apply(
        lambda x: "core" if x >= core_thresh else ("cloud" if x < cloud_thresh else "shell")
    )

    # Flag presence in the player's selected line (N/A when unavailable).
    pav = data.get("pav")
    if pav is not None and state and state.selected_line and state.selected_line in pav.columns:
        my_presence = pav[state.selected_line]
        gene_freq["in_my_line"] = gene_freq["gene_id"].map(
            lambda g: "Yes" if g in my_presence.index and my_presence.get(g, 0) == 1 else "No"
        )
    else:
        gene_freq["in_my_line"] = "N/A"

    # Apply the chosen filter.
    if filter_type == "Unique to my line":
        if pav is not None and state and state.selected_line:
            unique_mask = (pav.sum(axis=1) == 1) & (pav[state.selected_line] == 1)
            unique_genes = set(pav.index[unique_mask])
            gene_freq = gene_freq[gene_freq["gene_id"].isin(unique_genes)]
    elif filter_type == "Rare (<5 lines)":
        # BUGFIX: the label promises "<5 lines" but the filter used <= 5,
        # which also kept genes present in exactly 5 lines.
        gene_freq = gene_freq[gene_freq["freq_count"] < 5]
    elif filter_type == "Cluster markers":
        markers = data.get("markers")
        if markers is not None:
            gene_freq = gene_freq[gene_freq["gene_id"].isin(set(markers["gene_id"]))]

    # Rarest genes first, capped at 500 rows.
    gene_freq = gene_freq.sort_values("freq_count", ascending=True).head(500)
    return gene_freq[["gene_id", "freq_count", "freq_pct", "core_class", "in_my_line"]]
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
def on_pin_gene(gene_id: str, state: AppState) -> tuple:
    """
    Pin a gene into the player's backpack.

    Returns (backpack_display_text, updated_state); if the gene was already
    pinned the text is prefixed accordingly instead of duplicating it.
    """
    state = AppState() if state is None else state
    if not gene_id or gene_id == "Click a row to select":
        return "Select a gene first", state

    was_added = state.add_to_backpack(gene_id)
    contents = ", ".join(state.backpack_genes) if state.backpack_genes else "Empty"
    if was_added:
        return contents, state
    return f"(already in backpack) {contents}", state
|
| 268 |
+
|
| 269 |
+
|
| 270 |
+
def on_gene_click_table(evt, state: AppState) -> tuple:
    """
    Row-selection handler for the treasure table.

    Returns (selected_gene_text, updated_state); the placeholder prompt is
    returned when no usable select event is provided.
    """
    state = AppState() if state is None else state
    if evt is None or not hasattr(evt, "value"):
        return "Click a row to select", state

    gene_id = str(evt.value)
    state.selected_gene = gene_id
    return gene_id, state
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
# ============================================================
|
| 282 |
+
# Quest 3 Callbacks
|
| 283 |
+
# ============================================================
|
| 284 |
+
|
| 285 |
+
def build_hotspot_heatmap(data: dict, top_n_contigs: int = 20) -> go.Figure:
    """
    Heatmap of per-bin variability scores for the busiest contigs.

    Contigs are ranked by total gene count and only the top ``top_n_contigs``
    are shown; cell colour encodes the max variability score per bin.
    """
    hotspots = data["hotspots"]

    # Rank contigs by total gene content and keep only the busiest ones.
    ranked = hotspots.groupby("contig_id")["total_genes"].sum()
    keep = ranked.nlargest(top_n_contigs).index.tolist()
    selected = hotspots[hotspots["contig_id"].isin(keep)]

    if selected.empty:
        empty_fig = go.Figure()
        empty_fig.add_annotation(text="No hotspot data available", showarrow=False)
        return empty_fig

    grid = selected.pivot_table(
        index="contig_id", columns="bin_start",
        values="variability_score", aggfunc="max"
    ).fillna(0)

    # Contig names can be long pipe-delimited headers; keep them readable.
    display_names = [name.split("|")[-1] if "|" in name else name[:30] for name in grid.index]

    fig = go.Figure(data=go.Heatmap(
        z=grid.values,
        x=[f"{int(pos / 1000)}kb" for pos in grid.columns],
        y=display_names,
        colorscale=[[0, "#E8F5E9"], [0.5, "#FFC107"], [1.0, "#F44336"]],
        colorbar_title="Variability",
        hovertemplate="Contig: %{y}<br>Bin: %{x}<br>Score: %{z:.1f}<extra></extra>",
    ))
    fig.update_layout(
        title=f"Genomic Variability Heatmap (Top {top_n_contigs} contigs)",
        xaxis_title="Genomic position",
        yaxis_title="Contig",
        height=600,
    )
    return fig
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
def on_contig_selected(contig_id: str, data: dict, state: AppState) -> tuple:
    """
    Build the per-contig gene track for Quest 3.

    Genes on the chosen contig are merged with their frequency class and
    drawn as one marker row per class. Returns (figure, table_dataframe);
    both are empty when no contig is chosen or the contig carries no genes.
    """
    if not contig_id:
        return go.Figure(), pd.DataFrame()

    gff = data["gff_index"]
    annotated = gff[gff["contig_id"] == contig_id].merge(
        data["gene_freq"][["gene_id", "core_class", "freq_pct"]], on="gene_id", how="left"
    )
    # Genes with no frequency entry fall into the grey "unknown" row.
    annotated["core_class"] = annotated["core_class"].fillna("unknown")

    if annotated.empty:
        fig = go.Figure()
        fig.add_annotation(text="No genes on this contig", showarrow=False)
        return fig, pd.DataFrame()

    fig = go.Figure()
    for class_name, class_color in CORE_COLORS.items():
        genes = annotated[annotated["core_class"] == class_name]
        if genes.empty:
            continue
        fig.add_trace(go.Scatter(
            x=(genes["start"] + genes["end"]) / 2,  # gene midpoints
            y=[class_name] * len(genes),
            mode="markers",
            marker=dict(
                symbol="line-ew", size=12, color=class_color,
                line=dict(width=2, color=class_color),
            ),
            name=class_name.capitalize(),
            text=genes["gene_id"],
            hovertemplate="Gene: %{text}<br>Position: %{x:,.0f}<extra></extra>",
        ))

    display_name = contig_id.split("|")[-1] if "|" in contig_id else contig_id[:30]
    fig.update_layout(
        title=f"Gene Track: {display_name}",
        xaxis_title="Genomic position (bp)",
        yaxis_title="Gene class",
        showlegend=True,
    )

    table_df = annotated[["gene_id", "start", "end", "strand", "core_class", "freq_pct"]].sort_values("start")
    return fig, table_df
|
| 370 |
+
|
| 371 |
+
|
| 372 |
+
# ============================================================
|
| 373 |
+
# Quest 4 Callbacks
|
| 374 |
+
# ============================================================
|
| 375 |
+
|
| 376 |
+
def get_protein_stats_html(gene_id: str, data: dict) -> str:
    """
    Render a small HTML card with protein length and amino-acid summary.

    Returns placeholder HTML when no gene is chosen or the gene has no
    entry in the protein index.
    """
    if not gene_id:
        return "<p>Select a gene</p>"

    protein = data["protein"]
    matches = protein[protein["gene_id"] == gene_id]
    if matches.empty:
        return "<p><i>No protein data available for this gene.</i></p>"

    entry = matches.iloc[0]
    pieces = [
        "<div class='stat-card'>",
        f"<p><b>Protein Length:</b> {int(entry['protein_length'])} aa</p>",
        f"<p><b>Top Amino Acids:</b> {entry['composition_summary']}</p>",
        "</div>",
    ]
    return "".join(pieces)
|
| 393 |
+
|
| 394 |
+
|
| 395 |
+
def build_backpack_comparison(state: AppState, data: dict) -> go.Figure:
    """
    Bar chart comparing protein lengths across the pinned backpack genes.

    Requires at least two pinned genes; otherwise a hint annotation is shown.
    """
    if not state or len(state.backpack_genes) < 2:
        hint = go.Figure()
        hint.add_annotation(text="Pin at least 2 genes to compare", showarrow=False)
        return hint

    protein = data["protein"]
    pinned = protein[protein["gene_id"].isin(state.backpack_genes)]

    fig = go.Figure(data=[go.Bar(
        x=pinned["gene_id"],
        y=pinned["protein_length"],
        marker_color="#2E7D32",
        text=pinned["protein_length"],
        textposition="auto",
    )])
    fig.update_layout(
        title="Backpack Genes: Protein Length Comparison",
        xaxis_title="Gene",
        yaxis_title="Protein Length (aa)",
    )
    return fig
|
| 418 |
+
|
| 419 |
+
|
| 420 |
+
def build_composition_heatmap(state: AppState, data: dict) -> go.Figure:
    """
    Heatmap of top amino-acid percentages for the pinned backpack genes.

    Percentages are parsed back out of each gene's ``composition_summary``
    string ("A:12.3%, L:9.8%, ..."); amino acids missing from a gene's
    summary are filled with 0.
    """
    if not state or len(state.backpack_genes) < 2:
        hint = go.Figure()
        hint.add_annotation(text="Pin at least 2 genes to compare", showarrow=False)
        return hint

    protein = data["protein"]
    pinned = protein[protein["gene_id"].isin(state.backpack_genes)]

    # {gene_id: {amino_acid: percent}} parsed from the summary strings.
    composition_by_gene = {}
    for _, entry in pinned.iterrows():
        percentages = {}
        for token in entry["composition_summary"].split(", "):
            pieces = token.split(":")
            if len(pieces) == 2:
                percentages[pieces[0].strip()] = float(pieces[1].replace("%", ""))
        composition_by_gene[entry["gene_id"]] = percentages

    if not composition_by_gene:
        empty = go.Figure()
        empty.add_annotation(text="No composition data", showarrow=False)
        return empty

    table = pd.DataFrame(composition_by_gene).fillna(0).T
    fig = go.Figure(data=go.Heatmap(
        z=table.values,
        x=table.columns.tolist(),
        y=table.index.tolist(),
        colorscale="YlGn",
        colorbar_title="%",
    ))
    fig.update_layout(
        title="Amino Acid Composition Heatmap",
        xaxis_title="Amino Acid",
        yaxis_title="Gene",
    )
    return fig
|
| 463 |
+
|
| 464 |
+
|
| 465 |
+
# ============================================================
|
| 466 |
+
# Gene Card Callbacks
|
| 467 |
+
# ============================================================
|
| 468 |
+
|
| 469 |
+
def on_open_gene_card(gene_id: str, state: AppState, data: dict) -> tuple:
    """
    Open the Gene Card side panel for a gene.

    Returns (card_html, panel_visible, updated_state) and awards the
    "Gene Hunter" achievement on a successful open.
    """
    # BUGFIX: guard against a missing session state, matching every other
    # callback in this module; previously `state.selected_gene = ...` raised
    # AttributeError when the state was still None.
    if state is None:
        state = AppState()
    if not gene_id:
        return "", False, state

    state.selected_gene = gene_id
    card = build_gene_card(gene_id, data)
    html = render_gene_card_html(card)
    state.award("Gene Hunter")
    return html, True, state
|
| 479 |
+
|
| 480 |
+
|
| 481 |
+
def on_download_gene_report(state: AppState, data: dict) -> str:
    """Export the currently selected gene's report; None when nothing is selected."""
    if not (state and state.selected_gene):
        return None
    return export_gene_report(state.selected_gene, data)
|
| 486 |
+
|
| 487 |
+
|
| 488 |
+
# ============================================================
|
| 489 |
+
# Final Report Callbacks
|
| 490 |
+
# ============================================================
|
| 491 |
+
|
| 492 |
+
def on_generate_report(state: AppState, data: dict) -> tuple:
    """
    Build the final Field Report plus its JSON/CSV exports.

    Awards the "Cartographer" achievement, then returns
    (report_markdown, json_path, csv_path, show_downloads, show_badges,
    badges_html, updated_state).
    """
    state = AppState() if state is None else state
    state.award("Cartographer")

    report_md = generate_field_report(state, data)
    json_path = export_report_json(state, data)
    csv_path = export_report_csv(state, data)

    # One pill-style badge per earned achievement, alphabetically ordered.
    badges = " ".join(
        f'<span class="achievement-badge">{name}</span>'
        for name in sorted(state.achievements)
    )
    return report_md, json_path, csv_path, True, True, badges, state
|
| 507 |
+
|
| 508 |
+
|
| 509 |
+
# ============================================================
|
| 510 |
+
# Data Health
|
| 511 |
+
# ============================================================
|
| 512 |
+
|
| 513 |
+
def build_data_health_html(validation_report: dict) -> str:
    """
    Render the data-validation report as a two-column HTML table.

    Float values are formatted as percentages (e.g. "97.5%"); everything
    else is shown verbatim.
    """
    cells = []
    for key, value in validation_report.items():
        shown = f"{value:.1f}%" if isinstance(value, float) else value
        cells.append(f"<tr><td><b>{key}</b></td><td>{shown}</td></tr>")
    return f"<table style='width:100%'>{''.join(cells)}</table>"
|
| 521 |
+
|
| 522 |
+
|
| 523 |
+
# gradio is imported at module bottom; it still executes at module load time,
# before any callback fires, so gr.Tabs in on_start_journey resolves fine.
# TODO(review): consider moving this to the top with the other imports.
import gradio as gr
|
src/data_loader.py
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Data parsing and validation for the Pigeon Pea Pangenome Atlas."""
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import numpy as np
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from collections import Counter
|
| 8 |
+
|
| 9 |
+
from src.utils import logger, timer
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@timer
def load_pav(path: str) -> pd.DataFrame:
    """
    Load the 89-line presence/absence (PAV) matrix from a TSV file.

    Returns a DataFrame indexed by gene_id (str) with one column per line
    (str) and integer values in {0, 1}.

    Raises:
        ValueError: if the matrix contains any value other than 0 or 1
        (including NaN from missing cells).
    """
    df = pd.read_csv(path, sep="\t", index_col=0)
    df.index.name = "gene"
    df.index = df.index.astype(str)
    df.columns = df.columns.astype(str)

    # Validate all values are 0 or 1. isin() runs vectorised in C instead of
    # materialising a Python set of every cell; the previous `assert` was
    # also silently stripped under `python -O`, disabling validation.
    if not df.isin([0, 1]).values.all():
        bad = set(df.values.flatten()) - {0, 1}
        raise ValueError(f"PAV contains values other than 0/1: {bad}")
    logger.info(f"PAV matrix loaded: {df.shape[0]} genes x {df.shape[1]} lines")
    return df
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
@timer
def parse_gff_genes(path: str) -> pd.DataFrame:
    """
    Parse a GFF3 file, keeping only rows whose feature type is 'gene'.

    The gene identifier is taken from the ``ID=`` key of the attributes
    column (column 9).

    Returns a DataFrame with columns: gene_id, contig_id, start, end, strand.
    """
    columns = ["gene_id", "contig_id", "start", "end", "strand"]
    records = []
    with open(path, "r") as f:
        for line in f:
            # Skip comment/pragma lines and anything too short to be a record.
            if line.startswith("#"):
                continue
            parts = line.strip().split("\t")
            if len(parts) < 9 or parts[2] != "gene":
                continue

            # Extract the gene id from the attributes column: ID=<value>.
            gene_id = None
            for attr in parts[8].split(";"):
                attr = attr.strip()
                if attr.startswith("ID="):
                    gene_id = attr[3:]
                    break

            if gene_id:
                records.append({
                    "gene_id": gene_id,
                    "contig_id": parts[0],
                    "start": int(parts[3]),
                    "end": int(parts[4]),
                    "strand": parts[6],
                })

    # BUGFIX: pd.DataFrame([]) has no columns, so on a gene-free GFF the log
    # line below (and any caller touching df["contig_id"]) raised KeyError.
    # Passing explicit columns keeps the schema stable even when empty.
    df = pd.DataFrame(records, columns=columns)
    logger.info(f"GFF parsed: {len(df)} genes on {df['contig_id'].nunique()} contigs")
    return df
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
@timer
def parse_protein_fasta(path: str) -> pd.DataFrame:
    """
    Summarise a protein FASTA into a per-gene index.

    gene_id is the header token after '>' up to the first whitespace; stop
    codons ('*') are stripped before measuring length and composition.

    Returns a DataFrame with columns: gene_id, protein_length,
    composition_summary (top-5 amino acids as "A:12.3%, ...").
    """
    def record_for(gene_id, chunks):
        """Build one summary row; None when the entry has no sequence lines."""
        if not gene_id or not chunks:
            return None
        sequence = "".join(chunks).replace("*", "")
        denominator = max(len(sequence), 1)
        # most_common matches sorted(key=-count) order, ties by first seen.
        top_five = Counter(sequence).most_common(5)
        summary = ", ".join(f"{aa}:{n / denominator * 100:.1f}%" for aa, n in top_five)
        return {
            "gene_id": gene_id,
            "protein_length": len(sequence),
            "composition_summary": summary,
        }

    records = []
    header = None
    chunks = []
    with open(path, "r") as f:
        for raw in f:
            raw = raw.strip()
            if raw.startswith(">"):
                row = record_for(header, chunks)
                if row is not None:
                    records.append(row)
                header = raw[1:].split()[0]
                chunks = []
            else:
                chunks.append(raw)
    # Flush the final entry after EOF.
    row = record_for(header, chunks)
    if row is not None:
        records.append(row)

    df = pd.DataFrame(records)
    logger.info(f"Protein FASTA parsed: {len(df)} proteins")
    return df
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
@timer
def build_contig_index(path: str) -> dict:
    """
    Scan a FASTA file sequentially and record each contig's sequence length.

    The contig id is the header token after '>' up to the first whitespace.
    Returns {contig_id: length_in_bp}.
    """
    lengths = {}
    name = None
    running_total = 0

    with open(path, "r") as f:
        for raw in f:
            if raw.startswith(">"):
                # Close out the previous contig before starting a new one.
                if name is not None:
                    lengths[name] = running_total
                name = raw[1:].strip().split()[0]
                running_total = 0
            else:
                running_total += len(raw.strip())
    # Flush the final contig after EOF.
    if name is not None:
        lengths[name] = running_total

    logger.info(f"Contig index built: {len(lengths)} contigs")
    return lengths
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
# Matches RefSeq-style accessions (e.g. NC_033813.1) embedded in contig names.
_ACCESSION_RE = re.compile(r'(N[CWZ]_\d+\.\d+)')


def build_contig_name_mapping(gff_genes: pd.DataFrame, contig_index: dict) -> dict:
    """
    Map GFF contig IDs to FASTA contig IDs.

    Strategy: exact name match first, then fall back to matching RefSeq-style
    accession tokens (e.g. NC_033813.1) embedded in either name.

    Returns {gff_contig_id: fasta_contig_id}; contigs with no match are
    simply absent from the result.
    """
    gff_contigs = set(gff_genes["contig_id"].unique())
    fasta_contigs = set(contig_index.keys())

    # Exact matches first.
    mapping = {gc: gc for gc in gff_contigs & fasta_contigs}

    # For the rest, try accession-based matching. The pattern is compiled
    # once at module level instead of being re-parsed inside both loops.
    unmatched = gff_contigs - set(mapping.keys())
    if unmatched:
        fasta_by_accession = {}
        for fc in fasta_contigs:
            match = _ACCESSION_RE.search(fc)
            if match:
                fasta_by_accession[match.group(1)] = fc

        for gc in unmatched:
            match = _ACCESSION_RE.search(gc)
            if match and match.group(1) in fasta_by_accession:
                mapping[gc] = fasta_by_accession[match.group(1)]

    logger.info(f"Contig mapping: {len(mapping)}/{len(gff_contigs)} GFF contigs matched to FASTA")
    return mapping
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
def validate_joins(pav: pd.DataFrame, gff_genes: pd.DataFrame,
                   protein_index: pd.DataFrame, contig_index: dict) -> dict:
    """
    Cross-check the four data sources and report join coverage.

    Returns a dict of counts and coverage percentages, including the number
    of "orphan" genes present in the PAV matrix but in neither the GFF nor
    the protein index.
    """
    pav_genes = set(pav.index)
    gff_gene_set = set(gff_genes["gene_id"])
    protein_gene_set = set(protein_index["gene_id"])
    gff_contig_set = set(gff_genes["contig_id"])
    fasta_contig_set = set(contig_index.keys())

    def pct(numerator, denominator):
        """Share of numerator in denominator as a percentage (0-safe)."""
        return numerator / max(denominator, 1) * 100

    orphans = pav_genes - (gff_gene_set | protein_gene_set)
    report = {
        "pav_gene_count": len(pav_genes),
        "gff_gene_count": len(gff_gene_set),
        "protein_gene_count": len(protein_gene_set),
        "pav_genes_in_gff_pct": pct(len(pav_genes & gff_gene_set), len(pav_genes)),
        "pav_genes_in_protein_pct": pct(len(pav_genes & protein_gene_set), len(pav_genes)),
        "gff_contigs_in_fasta_pct": pct(len(gff_contig_set & fasta_contig_set), len(gff_contig_set)),
        "orphan_genes_count": len(orphans),
    }

    if orphans:
        logger.warning(f"{len(orphans)} orphan genes (in PAV but missing from both GFF and protein)")
    for key, val in report.items():
        logger.info(f"  {key}: {val}")

    return report
|
src/field_report.py
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Field Report generator for the Pigeon Pea Pangenome Atlas."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import csv
|
| 5 |
+
import os
|
| 6 |
+
import tempfile
|
| 7 |
+
import pandas as pd
|
| 8 |
+
|
| 9 |
+
from src.state import AppState
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def generate_field_report(state: AppState, data: dict) -> str:
    """
    Generate the markdown Field Report for the currently selected line.

    Combines line statistics, cluster assignment, nearest neighbours,
    core/shell/cloud composition, rare genes, backpack contents and
    achievements into one markdown document.

    Returns the markdown string (or a hint when no line is selected).
    """
    if not state.selected_line:
        return "*No line selected. Go back to Quest 0 to choose a line.*"

    line_id = state.selected_line
    gene_freq = data["gene_freq"]
    pav = data.get("pav")

    # Basic per-line statistics (defaults when the line is unknown).
    stats_df = data["line_stats"]
    stats = stats_df[stats_df["line_id"] == line_id]
    if stats.empty:
        country, genes_present, unique_genes = "Unknown", 0, 0
    else:
        top = stats.iloc[0]
        country = top["country"]
        genes_present = int(top["genes_present_count"])
        unique_genes = int(top["unique_genes_count"])

    # Cluster membership from the precomputed embedding.
    emb = data["embedding"]
    emb_hit = emb[emb["line_id"] == line_id]
    cluster_id = -1 if emb_hit.empty else int(emb_hit.iloc[0]["cluster_id"])

    # Three most similar lines by Jaccard score.
    sim = data["similarity"]
    top3 = sim[sim["line_id"] == line_id].nlargest(3, "jaccard_score")
    neighbors_str = ", ".join(
        f"{row['neighbor_line_id']} ({row['jaccard_score']:.3f})"
        for _, row in top3.iterrows()
    )

    # Core/shell/cloud composition of this line's gene set.
    core_shell_cloud = ""
    if pav is not None and line_id in pav.columns:
        carried = set(pav.index[pav[line_id] == 1])
        carried_freq = gene_freq[gene_freq["gene_id"].isin(carried)]
        counts = {
            cls: int((carried_freq["core_class"] == cls).sum())
            for cls in ("core", "shell", "cloud")
        }
        core_shell_cloud = (
            f"- Core genes: **{counts['core']}**\n"
            f"- Shell genes: **{counts['shell']}**\n"
            f"- Cloud genes: **{counts['cloud']}**\n"
        )

    # Up to five rarest genes (carried by <= 5 lines).
    rare_genes_str = ""
    if pav is not None and line_id in pav.columns:
        carried_list = pav.index[pav[line_id] == 1].tolist()
        rare = gene_freq[
            (gene_freq["gene_id"].isin(carried_list)) &
            (gene_freq["freq_count"] <= 5)
        ].nsmallest(5, "freq_count")
        if rare.empty:
            rare_genes_str = "No rare genes (<=5 lines) found in this line."
        else:
            rare_genes_str = "\n".join(
                f"- **{row['gene_id']}** (present in {row['freq_count']} lines)"
                for _, row in rare.iterrows()
            )

    # Pinned (backpack) genes with their frequency classification.
    if state.backpack_genes:
        items = []
        for gene in state.backpack_genes:
            hit = gene_freq[gene_freq["gene_id"] == gene]
            if hit.empty:
                items.append(f"- **{gene}**: no frequency data")
            else:
                items.append(
                    f"- **{gene}**: {hit.iloc[0]['core_class']} "
                    f"({hit.iloc[0]['freq_count']} lines)"
                )
        backpack_str = "\n".join(items)
    else:
        backpack_str = "No genes pinned to backpack."

    achievements_str = ", ".join(
        f"**{a}**" for a in sorted(state.achievements)
    ) if state.achievements else "None yet"

    report = f"""
## Field Report: {line_id}

**Country of origin:** {country}
**Cluster:** {cluster_id}
**Genes present:** {genes_present:,}
**Unique genes:** {unique_genes}

### Nearest Neighbors
{neighbors_str}

### Gene Composition
{core_shell_cloud}

### Top 5 Rare Genes
{rare_genes_str}

### Backpack Collection
{backpack_str}

### Achievements
{achievements_str}

---
*Generated by Pigeon Pea Pangenome Atlas*
"""
    return report.strip()
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def export_report_json(state: AppState, data: dict) -> str:
    """Dump the report payload (selections, backpack, thresholds and,
    when a line is selected, its statistics row) to a temporary JSON file.

    Returns the path of the written file.
    """
    payload = {
        "selected_line": state.selected_line,
        "backpack_genes": state.backpack_genes,
        "achievements": list(state.achievements),
        "core_threshold": state.core_threshold,
        "cloud_threshold": state.cloud_threshold,
    }

    if state.selected_line:
        stats = data["line_stats"]
        match = stats[stats["line_id"] == state.selected_line]
        if not match.empty:
            payload["line_stats"] = match.iloc[0].to_dict()

    # Fresh temp dir per export so concurrent downloads never collide.
    out_path = os.path.join(tempfile.mkdtemp(), "field_report.json")
    with open(out_path, "w") as fh:
        json.dump(payload, fh, indent=2, default=str)
    return out_path
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def export_report_csv(state: AppState, data: dict) -> str:
    """Write the backpack genes (with frequency data) as a CSV file.

    Genes without a frequency row are skipped. When the backpack is empty
    a header-only CSV is still produced. Returns the path of the file.
    """
    out_path = os.path.join(tempfile.mkdtemp(), "field_report.csv")
    columns = ["gene_id", "freq_count", "freq_pct", "core_class"]

    rows = []
    if state.backpack_genes:
        freq_df = data["gene_freq"]
        for gene in state.backpack_genes:
            match = freq_df[freq_df["gene_id"] == gene]
            if not match.empty:
                top = match.iloc[0]
                rows.append({
                    "gene_id": gene,
                    "freq_count": int(top["freq_count"]),
                    "freq_pct": float(top["freq_pct"]),
                    "core_class": top["core_class"],
                })

    if rows:
        with open(out_path, "w", newline="") as fh:
            writer = csv.DictWriter(fh, fieldnames=columns)
            writer.writeheader()
            writer.writerows(rows)
    else:
        # Header-only file keeps downstream consumers happy.
        with open(out_path, "w") as fh:
            fh.write("gene_id,freq_count,freq_pct,core_class\n")

    return out_path
|
src/gene_card.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Gene Card builder for the Pigeon Pea Pangenome Atlas."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
import tempfile
|
| 6 |
+
import pandas as pd
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def build_gene_card(gene_id: str, data: dict) -> dict:
    """
    Collect frequency, presence, coordinate and protein info for one gene.

    Returns a flat dict with every field the Gene Card renderer needs;
    missing data is represented by None (or "unknown" for core_class).
    """
    card = {"gene_id": gene_id}

    # Frequency / core classification.
    gf = data["gene_freq"]
    hits = gf.loc[gf["gene_id"] == gene_id]
    if hits.empty:
        card.update(freq_count=None, freq_pct=None, core_class="unknown")
    else:
        top = hits.iloc[0]
        card.update(
            freq_count=int(top["freq_count"]),
            freq_pct=float(top["freq_pct"]),
            core_class=top["core_class"],
        )

    # Per-line presence vector (the gene's row of the PAV matrix).
    pav = data.get("pav")
    if pav is not None and gene_id in pav.index:
        card["presence_vector"] = pav.loc[gene_id]
    else:
        card["presence_vector"] = None

    # Genomic coordinates from the GFF index.
    gff = data["gff_index"]
    ghits = gff.loc[gff["gene_id"] == gene_id]
    if ghits.empty:
        card.update(contig=None, start=None, end=None, strand=None)
    else:
        g = ghits.iloc[0]
        card.update(
            contig=g["contig_id"],
            start=int(g["start"]),
            end=int(g["end"]),
            strand=g["strand"],
        )

    # Protein summary.
    prot = data["protein"]
    phits = prot.loc[prot["gene_id"] == gene_id]
    if phits.empty:
        card.update(protein_length=None, composition_summary=None)
    else:
        p = phits.iloc[0]
        card.update(
            protein_length=int(p["protein_length"]),
            composition_summary=p["composition_summary"],
        )

    return card
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def render_gene_card_html(card: dict) -> str:
    """Render a Gene Card dict (from build_gene_card) as an HTML fragment."""
    gene_id = card["gene_id"]

    # Core-class badge plus a one-line explanation of the class.
    badge_map = {
        "core": ('<span class="quest-badge badge-core">Core</span>', "Present in >=95% of lines"),
        "shell": ('<span class="quest-badge badge-shell">Shell</span>', "Present in 15-95% of lines"),
        "cloud": ('<span class="quest-badge badge-cloud">Cloud</span>', "Present in <15% of lines"),
    }
    badge_html, badge_desc = badge_map.get(
        card.get("core_class", "unknown"),
        ('<span class="quest-badge">Unknown</span>', ""),
    )

    # Frequency line (omitted entirely when no frequency data exists).
    freq_text = ""
    if card.get("freq_count") is not None:
        freq_text = f"<p>Present in <b>{card['freq_count']}</b> lines ({card['freq_pct']:.1f}%)</p>"

    # Per-line presence barcode.
    vector = card.get("presence_vector")
    barcode_html = render_presence_barcode(vector) if vector is not None else ""

    # Genomic location.
    if card.get("contig"):
        loc_text = (
            f"<p>Contig: <code>{card['contig']}</code></p>"
            f"<p>Position: {card['start']:,} - {card['end']:,} ({card['strand']})</p>"
        )
    else:
        loc_text = "<p><i>No coordinate annotation available</i></p>"

    # Protein summary.
    if card.get("protein_length"):
        prot_text = (
            f"<p>Protein length: <b>{card['protein_length']}</b> aa</p>"
            f"<p>Top AAs: {card['composition_summary']}</p>"
        )
    else:
        prot_text = "<p><i>No protein data available</i></p>"

    return f"""
    <div class="gene-card">
        <h3>Gene: {gene_id}</h3>
        {badge_html}
        <p style="font-size:0.8em;color:#666">{badge_desc}</p>
        {freq_text}
        <h4>Presence Barcode</h4>
        {barcode_html}
        <h4>Genomic Location</h4>
        {loc_text}
        <h4>Protein</h4>
        {prot_text}
    </div>
    """
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def render_presence_barcode(presence_vector: pd.Series) -> str:
    """
    Render the per-line presence barcode as HTML.

    One span per line, with CSS class "present" when the value is 1 and
    "absent" otherwise (colors come from the stylesheet: green / light gray).
    """
    cells = "".join(
        f'<span class="{"present" if value == 1 else "absent"}"></span>'
        for value in presence_vector
    )
    return f'<div class="presence-barcode">{cells}</div>'
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def export_gene_report(gene_id: str, data: dict) -> str:
    """Serialise one gene's card to JSON in a temp dir; return the file path."""
    card = build_gene_card(gene_id, data)

    # pd.Series is not JSON-serialisable; flatten it to {line_id: 0/1}.
    if card.get("presence_vector") is not None:
        card["presence_vector"] = card["presence_vector"].to_dict()

    out_path = os.path.join(tempfile.mkdtemp(), f"{gene_id}_report.json")
    with open(out_path, "w") as fh:
        json.dump(card, fh, indent=2, default=str)
    return out_path
|
src/precompute.py
ADDED
|
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Offline precomputation for the Pigeon Pea Pangenome Atlas."""
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from scipy.spatial.distance import pdist, squareform
|
| 6 |
+
from sklearn.cluster import KMeans
|
| 7 |
+
from sklearn.metrics import silhouette_score
|
| 8 |
+
|
| 9 |
+
from src.utils import logger, timer, parse_country
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@timer
|
| 13 |
+
def compute_gene_frequency(pav: pd.DataFrame) -> pd.DataFrame:
    """
    Compute per-gene presence frequency and core/shell/cloud class.

    Thresholds: core >= 95% of lines, shell >= 15%, cloud below that.
    Output columns: gene_id, freq_count, freq_pct, core_class
    """
    total_lines = pav.shape[1]
    counts = pav.sum(axis=1).astype(int)
    pcts = (counts / total_lines * 100).round(2)

    def _bucket(pct):
        if pct >= 95:
            return "core"
        if pct >= 15:
            return "shell"
        return "cloud"

    df = pd.DataFrame({
        "gene_id": pav.index,
        "freq_count": counts.values,
        "freq_pct": pcts.values,
        "core_class": pcts.map(_bucket).values,
    })
    logger.info(f"Gene frequency: {(df['core_class']=='core').sum()} core, "
                f"{(df['core_class']=='shell').sum()} shell, "
                f"{(df['core_class']=='cloud').sum()} cloud")
    return df
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
@timer
|
| 44 |
+
def compute_line_stats(pav: pd.DataFrame) -> pd.DataFrame:
    """
    Compute per-line statistics from the PAV matrix.

    Args:
        pav: gene x line presence/absence matrix (0/1); columns are line IDs.

    Output columns: line_id, country, genes_present_count, unique_genes_count
    """
    # PERF FIX: pav.sum(axis=1) was recomputed inside the per-line loop,
    # making this O(lines^2 * genes). The singleton mask (genes present in
    # exactly one line) is line-independent, so compute it once.
    singleton_genes = pav.sum(axis=1) == 1

    records = []
    for line_id in pav.columns:
        presence = pav[line_id]
        records.append({
            "line_id": line_id,
            "country": parse_country(line_id),
            "genes_present_count": int(presence.sum()),
            # Genes carried by this line and no other.
            "unique_genes_count": int((singleton_genes & (presence == 1)).sum()),
        })

    df = pd.DataFrame(records)
    logger.info(f"Line stats computed for {len(df)} lines")
    return df
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
@timer
|
| 70 |
+
def compute_line_embedding(pav: pd.DataFrame) -> pd.DataFrame:
    """
    UMAP embedding + KMeans clustering of lines.

    k is chosen in [3, 8] by silhouette score. With fewer than 4 lines the
    k-scan cannot run; all lines then fall back to a single cluster 0.

    Output columns: line_id, umap_x, umap_y, cluster_id
    """
    import umap

    # Transpose: rows = lines, columns = genes.
    X = pav.T.values.astype(np.float32)
    line_ids = list(pav.columns)

    reducer = umap.UMAP(n_components=2, metric="jaccard", n_neighbors=15,
                        min_dist=0.1, random_state=42)
    coords = reducer.fit_transform(X)

    # Scan k by silhouette score.
    # BUG FIX: best_labels was only bound inside the improvement branch of
    # the loop, so with <= 3 lines (empty range) the function raised
    # NameError. Track it alongside best_k/best_score and fall back below.
    best_k, best_score, best_labels = 3, -1.0, None
    for k in range(3, min(9, len(line_ids))):
        km = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = km.fit_predict(coords)
        score = silhouette_score(coords, labels)
        if score > best_score:
            best_k, best_score, best_labels = k, score, labels

    if best_labels is None:
        # Too few lines to scan k in [3, 8]: one cluster for everyone.
        best_k = 1
        best_labels = np.zeros(len(line_ids), dtype=int)

    logger.info(f"UMAP + KMeans: best k={best_k}, silhouette={best_score:.3f}")

    return pd.DataFrame({
        "line_id": line_ids,
        "umap_x": coords[:, 0],
        "umap_y": coords[:, 1],
        "cluster_id": best_labels,
    })
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
@timer
|
| 108 |
+
def compute_similarity_topk(pav: pd.DataFrame, k: int = 15) -> pd.DataFrame:
    """
    Pairwise Jaccard similarity between lines, keeping the top-K neighbours
    for each line.

    Output columns: line_id, neighbor_line_id, jaccard_score
    """
    matrix = pav.T.values.astype(np.float32)
    line_ids = list(pav.columns)

    # Jaccard distance -> similarity (condensed vector to square matrix).
    sim = 1.0 - squareform(pdist(matrix, metric="jaccard"))

    rows = []
    for i, source in enumerate(line_ids):
        scores = sim[i].copy()
        scores[i] = -1  # never report a line as its own neighbour
        for j in np.argsort(scores)[::-1][:k]:
            rows.append({
                "line_id": source,
                "neighbor_line_id": line_ids[j],
                "jaccard_score": round(float(scores[j]), 4),
            })

    result = pd.DataFrame(rows)
    logger.info(f"Similarity top-{k}: {len(result)} pairs")
    return result
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
@timer
|
| 140 |
+
def build_gff_gene_parquet(gff_genes: pd.DataFrame, output_path: str) -> None:
    """Persist the parsed GFF gene table to *output_path* as parquet."""
    gff_genes.to_parquet(output_path, index=False)
    logger.info(f"GFF gene index saved: {output_path}")
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
@timer
|
| 147 |
+
def build_protein_parquet(protein_df: pd.DataFrame, output_path: str) -> None:
    """Persist the protein index table to *output_path* as parquet."""
    protein_df.to_parquet(output_path, index=False)
    logger.info(f"Protein index saved: {output_path}")
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
@timer
|
| 154 |
+
def save_contig_index(contig_index: dict, contig_mapping: dict, output_path: str) -> None:
    """
    Save the contig index as JSON.

    Args:
        contig_index: {fasta_contig_id: length}.
        contig_mapping: {gff_seqid: fasta_contig_id}.
        output_path: destination JSON path.

    Each entry records length, the matching GFF seqid (falling back to the
    FASTA id when no mapping exists) and the FASTA header.
    """
    import json

    # PERF FIX: the original scanned the whole contig_mapping once per
    # contig (O(n*m)). Invert the mapping once instead; setdefault keeps
    # the first match, matching the original's scan-and-break behaviour.
    fasta_to_gff = {}
    for gff_id, fasta_id in contig_mapping.items():
        fasta_to_gff.setdefault(fasta_id, gff_id)

    data = {
        contig_id: {
            "length": length,
            # `or` (not a plain .get default) preserves the original's
            # fallback for falsy mapped values as well as missing ones.
            "gff_seqid": fasta_to_gff.get(contig_id) or contig_id,
            "fasta_header": contig_id,
        }
        for contig_id, length in contig_index.items()
    }
    with open(output_path, "w") as f:
        json.dump(data, f, indent=2)
    logger.info(f"Contig index saved: {output_path}")
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
@timer
|
| 175 |
+
def compute_hotspot_bins(gff_genes: pd.DataFrame, gene_freq: pd.DataFrame,
                         contig_index: dict, bin_size: int = 100_000) -> pd.DataFrame:
    """
    Bin genes along contigs and compute variability scores.

    Genes are assigned to fixed-size bins by their midpoint; each non-empty
    bin gets per-class counts and a variability score (cloud + 0.5 * shell).

    Args:
        gff_genes: gene coordinates (gene_id, contig_id, start, end, ...).
        gene_freq: per-gene frequency table (gene_id, freq_pct, core_class).
        contig_index: contig lengths; currently unused, kept for API stability.
        bin_size: bin width in base pairs.

    Output columns: contig_id, bin_start, bin_end, total_genes, core_genes,
    shell_genes, cloud_genes, mean_freq, variability_score
    """
    # Join coordinates with frequency info; only genes present in both.
    merged = gff_genes.merge(gene_freq, on="gene_id", how="inner")
    merged["midpoint"] = (merged["start"] + merged["end"]) // 2

    records = []
    # sort=False preserves first-appearance contig order (as the original
    # .unique() loop did).
    for contig_id, contig_genes in merged.groupby("contig_id", sort=False):
        max_pos = contig_genes["end"].max()

        for bin_start in range(0, max_pos + bin_size, bin_size):
            bin_end = bin_start + bin_size
            in_bin = contig_genes[
                (contig_genes["midpoint"] >= bin_start) &
                (contig_genes["midpoint"] < bin_end)
            ]
            if in_bin.empty:
                continue

            cloud_count = int((in_bin["core_class"] == "cloud").sum())
            shell_count = int((in_bin["core_class"] == "shell").sum())
            records.append({
                "contig_id": contig_id,
                "bin_start": bin_start,
                "bin_end": bin_end,
                "total_genes": len(in_bin),
                "core_genes": int((in_bin["core_class"] == "core").sum()),
                "shell_genes": shell_count,
                "cloud_genes": cloud_count,
                "mean_freq": round(float(in_bin["freq_pct"].mean()), 2),
                "variability_score": round(cloud_count + 0.5 * shell_count, 2),
            })

    if not records:
        # BUG FIX: pd.DataFrame([]) has no 'contig_id' column, so the log
        # line below raised KeyError when GFF and PAV genes did not overlap.
        # Return an empty but well-formed frame instead.
        logger.warning("Hotspot bins: no genes matched between GFF and gene frequency tables")
        return pd.DataFrame(columns=[
            "contig_id", "bin_start", "bin_end", "total_genes", "core_genes",
            "shell_genes", "cloud_genes", "mean_freq", "variability_score",
        ])

    df = pd.DataFrame(records)
    logger.info(f"Hotspot bins computed: {len(df)} bins across {df['contig_id'].nunique()} contigs")
    return df
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
@timer
|
| 224 |
+
def compute_cluster_markers(pav: pd.DataFrame, embedding: pd.DataFrame,
                            top_n: int = 50) -> pd.DataFrame:
    """
    Find marker genes for each cluster (genes enriched inside vs. outside).

    Args:
        pav: gene x line presence/absence matrix (0/1).
        embedding: table with line_id and cluster_id columns.
        top_n: number of markers kept per cluster.

    Output columns: cluster_id, gene_id, in_cluster_freq, out_cluster_freq,
    marker_score
    """
    clusters = embedding[["line_id", "cluster_id"]].copy()
    records = []

    for cid in sorted(clusters["cluster_id"].unique()):
        in_lines = set(clusters.loc[clusters["cluster_id"] == cid, "line_id"])
        out_lines = set(clusters.loc[clusters["cluster_id"] != cid, "line_id"])

        # Subset while preserving PAV column order.
        in_cols = [c for c in pav.columns if c in in_lines]
        out_cols = [c for c in pav.columns if c in out_lines]
        if not in_cols or not out_cols:
            # A cluster covering all (or none of) the lines has no contrast.
            continue

        in_freq = pav[in_cols].mean(axis=1)
        out_freq = pav[out_cols].mean(axis=1)
        marker_score = in_freq - out_freq

        for gene_id, score in marker_score.nlargest(top_n).items():
            records.append({
                "cluster_id": int(cid),
                "gene_id": gene_id,
                "in_cluster_freq": round(float(in_freq[gene_id]), 4),
                "out_cluster_freq": round(float(out_freq[gene_id]), 4),
                "marker_score": round(float(score), 4),
            })

    if not records:
        # BUG FIX: pd.DataFrame([]) has no 'cluster_id' column, so the log
        # line below raised KeyError when no cluster had both an in-group
        # and an out-group. Return a well-formed empty frame instead.
        logger.warning("Cluster markers: no clusters with both in- and out-groups")
        return pd.DataFrame(columns=[
            "cluster_id", "gene_id", "in_cluster_freq", "out_cluster_freq",
            "marker_score",
        ])

    df = pd.DataFrame(records)
    logger.info(f"Cluster markers: {len(df)} total across {df['cluster_id'].nunique()} clusters")
    return df
|
src/state.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Application state for the Pigeon Pea Pangenome Atlas."""
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass, field
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
@dataclass
|
| 7 |
+
class AppState:
|
| 8 |
+
selected_line: str | None = None
|
| 9 |
+
selected_party: list[str] = field(default_factory=list)
|
| 10 |
+
selected_gene: str | None = None
|
| 11 |
+
backpack_genes: list[str] = field(default_factory=list)
|
| 12 |
+
core_threshold: float = 95.0
|
| 13 |
+
cloud_threshold: float = 15.0
|
| 14 |
+
selected_contig: str | None = None
|
| 15 |
+
achievements: set[str] = field(default_factory=set)
|
| 16 |
+
|
| 17 |
+
def add_to_backpack(self, gene_id: str) -> bool:
|
| 18 |
+
"""Add gene to backpack. Returns True if added, False if already present or full."""
|
| 19 |
+
if gene_id in self.backpack_genes:
|
| 20 |
+
return False
|
| 21 |
+
if len(self.backpack_genes) >= 50:
|
| 22 |
+
return False
|
| 23 |
+
self.backpack_genes.append(gene_id)
|
| 24 |
+
if len(self.backpack_genes) >= 5:
|
| 25 |
+
self.achievements.add("Collector")
|
| 26 |
+
return True
|
| 27 |
+
|
| 28 |
+
def remove_from_backpack(self, gene_id: str) -> bool:
|
| 29 |
+
"""Remove gene from backpack."""
|
| 30 |
+
if gene_id in self.backpack_genes:
|
| 31 |
+
self.backpack_genes.remove(gene_id)
|
| 32 |
+
return True
|
| 33 |
+
return False
|
| 34 |
+
|
| 35 |
+
def award(self, badge: str):
|
| 36 |
+
"""Award an achievement badge."""
|
| 37 |
+
self.achievements.add(badge)
|
src/utils.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Shared helpers for the Pigeon Pea Pangenome Atlas."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import logging
|
| 5 |
+
import time
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from functools import wraps
|
| 8 |
+
|
| 9 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
| 10 |
+
logger = logging.getLogger("pangenome")
|
| 11 |
+
|
| 12 |
+
PROJECT_ROOT = Path(__file__).resolve().parent.parent
|
| 13 |
+
DATA_DIR = PROJECT_ROOT / "data"
|
| 14 |
+
PRECOMPUTED_DIR = PROJECT_ROOT / "precomputed"
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def find_file(directory: Path, pattern: str) -> Path:
    """Return the first file in *directory* matching the glob *pattern*.

    Raises FileNotFoundError when nothing matches.
    """
    for candidate in directory.glob(pattern):
        return candidate
    raise FileNotFoundError(f"No file matching '{pattern}' in {directory}")
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def timer(func):
    """Decorator: log how long the wrapped call took, then return its result."""
    @wraps(func)
    def timed(*args, **kwargs):
        started = time.time()
        result = func(*args, **kwargs)
        logger.info(f"{func.__name__} completed in {time.time() - started:.2f}s")
        return result
    return timed
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# Countries that appear as suffixes of line IDs. Two-word names use an
# underscore (e.g. "South_Africa"), matching the raw line-ID format.
KNOWN_COUNTRIES = {
    "India", "Myanmar", "Unknown", "Zaire", "Uganda", "Indonesia", "Jamaica",
    "South_Africa", "Puerto_Rico", "Philippines", "Sierra_Leone", "Nigeria",
    "Malawi", "Italy", "Kenya", "Sri_Lanka", "Thailand", "Nepal",
}


def parse_country(line_id: str) -> str:
    """Extract the country suffix from a line ID.

    Tries the last underscore-separated token first, then the last two
    tokens joined (for two-word countries); returns "Unknown" otherwise.
    """
    # Single-token country, e.g. "..._India".
    _, sep, tail = line_id.rpartition("_")
    if sep and tail in KNOWN_COUNTRIES:
        return tail
    # Two-token country, e.g. "..._South_Africa".
    tokens = line_id.rsplit("_", 2)
    if len(tokens) >= 3:
        candidate = f"{tokens[-2]}_{tokens[-1]}"
        if candidate in KNOWN_COUNTRIES:
            return candidate
    return "Unknown"
|
tests/__init__.py
ADDED
|
File without changes
|
tests/conftest.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Shared test fixtures for the Pigeon Pea Pangenome Atlas."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
import pytest
|
| 6 |
+
import numpy as np
|
| 7 |
+
import pandas as pd
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
# Ensure project root is in path
|
| 11 |
+
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
| 12 |
+
|
| 13 |
+
from src.state import AppState
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@pytest.fixture
def synthetic_pav():
    """Random binary PAV matrix: 20 genes (rows) x 10 lines (columns)."""
    np.random.seed(42)
    gene_ids = [f"g{i:05d}" for i in range(20)]
    line_ids = [f"Line_{i}_India" for i in range(10)]
    matrix = np.random.randint(0, 2, size=(20, 10))
    return pd.DataFrame(matrix, index=gene_ids, columns=line_ids)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@pytest.fixture
def synthetic_gff():
    """Annotation table: 20 genes spread round-robin over 3 contigs."""
    return pd.DataFrame([
        {
            "gene_id": f"g{i:05d}",
            "contig_id": f"contig_{i % 3}",
            "start": i * 1000,
            "end": i * 1000 + 500,
            "strand": "+" if i % 2 == 0 else "-",
        }
        for i in range(20)
    ])
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
@pytest.fixture
def synthetic_protein():
    """Protein index: 20 genes with random lengths and a fixed composition."""
    return pd.DataFrame([
        {
            "gene_id": f"g{i:05d}",
            "protein_length": np.random.randint(50, 1000),
            "composition_summary": "L:9.0%, A:8.0%, G:7.0%, S:6.0%, V:5.0%",
        }
        for i in range(20)
    ])
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
@pytest.fixture
def synthetic_contig_index():
    """Lengths for the 3 synthetic contigs."""
    return {"contig_0": 50000, "contig_1": 80000, "contig_2": 30000}
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
@pytest.fixture
def empty_state():
    """A brand-new AppState with no selections."""
    return AppState()
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
@pytest.fixture
def populated_state():
    """An AppState pre-populated with a line, two pinned genes, and a badge."""
    app_state = AppState()
    app_state.selected_line = "Line_0_India"
    app_state.backpack_genes = ["g00001", "g00002"]
    app_state.achievements = {"Explorer"}
    return app_state
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
@pytest.fixture
def synthetic_data(synthetic_pav, synthetic_gff, synthetic_protein, synthetic_contig_index):
    """Complete synthetic data dict matching app DATA format."""
    # Imported lazily so merely collecting conftest does not require the
    # precompute module's heavy dependencies unless a test uses this fixture.
    # NOTE(review): compute_line_embedding is imported but never called below
    # (a handmade embedding is used instead) — candidate for removal.
    from src.precompute import (
        compute_gene_frequency, compute_line_stats,
        compute_line_embedding, compute_similarity_topk,
        compute_hotspot_bins, compute_cluster_markers,
    )

    gene_freq = compute_gene_frequency(synthetic_pav)
    line_stats = compute_line_stats(synthetic_pav)

    # Simple embedding without UMAP (too small for meaningful UMAP)
    # NOTE(review): coordinates come from the unseeded global RNG, so they
    # vary between runs — fine as long as tests never assert exact values.
    line_ids = list(synthetic_pav.columns)
    embedding = pd.DataFrame({
        "line_id": line_ids,
        "umap_x": np.random.randn(len(line_ids)),
        "umap_y": np.random.randn(len(line_ids)),
        "cluster_id": [i % 3 for i in range(len(line_ids))],
    })

    similarity = compute_similarity_topk(synthetic_pav, k=5)

    hotspots = compute_hotspot_bins(synthetic_gff, gene_freq, synthetic_contig_index)
    markers = compute_cluster_markers(synthetic_pav, embedding)

    # Keys mirror the DATA dict the application builds at startup.
    return {
        "pav": synthetic_pav,
        "gene_freq": gene_freq,
        "line_stats": line_stats,
        "embedding": embedding,
        "similarity": similarity,
        "gff_index": synthetic_gff,
        "protein": synthetic_protein,
        "hotspots": hotspots,
        "markers": markers,
    }
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
@pytest.fixture
def precomputed_dir():
    """Return the repo's precomputed/ directory, or None when absent."""
    candidate = Path(__file__).resolve().parent.parent / "precomputed"
    return candidate if candidate.exists() else None
|
tests/test_callbacks.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for src/callbacks.py."""
|
| 2 |
+
|
| 3 |
+
import pytest
|
| 4 |
+
import plotly.graph_objects as go
|
| 5 |
+
|
| 6 |
+
from src.state import AppState
|
| 7 |
+
from src.callbacks import (
|
| 8 |
+
on_line_selected, build_umap_plot,
|
| 9 |
+
build_donut_chart, build_frequency_histogram, build_treasure_table,
|
| 10 |
+
on_pin_gene, build_hotspot_heatmap, get_protein_stats_html,
|
| 11 |
+
build_backpack_comparison,
|
| 12 |
+
)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class TestQuest0:
    """Quest 0: line-selection callback."""

    def test_on_line_selected_returns(self, synthetic_data):
        first_line = synthetic_data["line_stats"].iloc[0]["line_id"]
        total, unique, neighbor, new_state = on_line_selected(
            first_line, AppState(), synthetic_data
        )
        # The total-genes readout must be a real positive number, not the
        # "--" placeholder shown before any selection.
        assert total != "--"
        assert int(total) > 0

    def test_on_line_selected_updates_state(self, synthetic_data):
        first_line = synthetic_data["line_stats"].iloc[0]["line_id"]
        _, _, _, new_state = on_line_selected(first_line, AppState(), synthetic_data)
        assert new_state.selected_line == first_line
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class TestQuest1:
    """Quest 1: UMAP world map."""

    def test_umap_plot_returns_figure(self, synthetic_data):
        app_state = AppState()
        app_state.selected_line = synthetic_data["line_stats"].iloc[0]["line_id"]
        figure = build_umap_plot("Country", app_state, synthetic_data)
        assert isinstance(figure, go.Figure)
        assert len(figure.data) >= 1  # at least one trace rendered
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class TestQuest2:
    """Quest 2: core/shell/cloud charts, treasure table, and the backpack."""

    def test_donut_three_segments(self, synthetic_data):
        chart = build_donut_chart(95, 15, synthetic_data)
        assert isinstance(chart, go.Figure)
        assert len(chart.data) == 1
        # One pie trace with exactly core / shell / cloud segments.
        assert len(chart.data[0].values) == 3

    def test_histogram_returns_figure(self, synthetic_data):
        assert isinstance(build_frequency_histogram(95, 15, synthetic_data), go.Figure)

    def test_treasure_table_columns(self, synthetic_data):
        app_state = AppState()
        app_state.selected_line = synthetic_data["line_stats"].iloc[0]["line_id"]
        table = build_treasure_table(app_state, 95, 15, "All", synthetic_data)
        for col in ("gene_id", "freq_count", "core_class"):
            assert col in table.columns

    def test_pin_gene(self):
        _, updated = on_pin_gene("g00001", AppState())
        assert "g00001" in updated.backpack_genes

    def test_pin_gene_idempotent(self):
        app_state = AppState()
        on_pin_gene("g00001", app_state)
        on_pin_gene("g00001", app_state)
        # Pinning the same gene twice must not create a duplicate entry.
        assert app_state.backpack_genes.count("g00001") == 1

    def test_backpack_limit(self):
        app_state = AppState()
        for i in range(55):
            app_state.add_to_backpack(f"g{i:05d}")
        # Backpack is capped at 50 genes.
        assert len(app_state.backpack_genes) <= 50
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
class TestQuest3:
    """Quest 3: genome hotspot heatmap."""

    def test_hotspot_heatmap(self, synthetic_data):
        assert isinstance(build_hotspot_heatmap(synthetic_data), go.Figure)
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
class TestQuest4:
    """Quest 4: protein stats panel and backpack comparison."""

    def test_protein_stats_html(self, synthetic_data):
        first_gene = synthetic_data["protein"].iloc[0]["gene_id"]
        rendered = get_protein_stats_html(first_gene, synthetic_data)
        assert "aa" in rendered or "protein" in rendered.lower()

    def test_backpack_comparison_needs_2(self, synthetic_data):
        app_state = AppState()
        app_state.backpack_genes = ["g00001"]
        # With fewer than two pinned genes a figure is still returned
        # (presumably a placeholder — the callback must not raise).
        assert isinstance(build_backpack_comparison(app_state, synthetic_data), go.Figure)
|
tests/test_data_loader.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for src/data_loader.py."""
|
| 2 |
+
|
| 3 |
+
import pytest
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
from src.data_loader import load_pav, parse_gff_genes, parse_protein_fasta, build_contig_index, validate_joins
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
DATA_DIR = Path(__file__).resolve().parent.parent / "data"
|
| 11 |
+
HAS_REAL_DATA = (DATA_DIR / "89_line_PAV.txt").exists()
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@pytest.mark.skipif(not HAS_REAL_DATA, reason="Real data not available")
class TestRealData:
    """Smoke tests against the real shipped data files (skipped when absent)."""

    def test_load_pav_shape(self):
        # Loader must produce a non-degenerate genes x lines matrix.
        df = load_pav(str(DATA_DIR / "89_line_PAV.txt"))
        assert df.shape[0] > 0
        assert df.shape[1] > 0

    def test_load_pav_values(self):
        # PAV calls are strictly binary presence/absence.
        df = load_pav(str(DATA_DIR / "89_line_PAV.txt"))
        unique_vals = set(df.values.flatten())
        assert unique_vals.issubset({0, 1})

    def test_load_pav_gene_ids(self):
        df = load_pav(str(DATA_DIR / "89_line_PAV.txt"))
        assert df.index.dtype == object  # string type
        assert df.index.is_unique

    def test_load_pav_line_count(self):
        df = load_pav(str(DATA_DIR / "89_line_PAV.txt"))
        # 89 lines + reference = 90 columns
        assert df.shape[1] >= 89, f"Expected >=89 lines, got {df.shape[1]}"

    def test_parse_gff_columns(self):
        gff_files = list(DATA_DIR.glob("*.gff"))
        assert len(gff_files) > 0
        df = parse_gff_genes(str(gff_files[0]))
        assert set(df.columns) == {"gene_id", "contig_id", "start", "end", "strand"}

    def test_parse_gff_no_empty_ids(self):
        gff_files = list(DATA_DIR.glob("*.gff"))
        df = parse_gff_genes(str(gff_files[0]))
        # Every record must carry a usable gene id.
        assert not df["gene_id"].isna().any()
        assert not (df["gene_id"] == "").any()

    def test_protein_fasta_lengths(self):
        fasta_files = list(DATA_DIR.glob("*protein*.fasta"))
        assert len(fasta_files) > 0
        df = parse_protein_fasta(str(fasta_files[0]))
        assert (df["protein_length"] > 0).all()

    def test_contig_index_nonempty(self):
        # The genome FASTA is any .fasta that is not the protein FASTA.
        fasta_files = [f for f in DATA_DIR.glob("*.fasta") if "protein" not in f.name]
        if fasta_files:
            ci = build_contig_index(str(fasta_files[0]))
            assert len(ci) > 0

    def test_validate_joins_coverage(self):
        # Cross-check that PAV gene ids mostly resolve in the GFF index.
        pav = load_pav(str(DATA_DIR / "89_line_PAV.txt"))
        gff_files = list(DATA_DIR.glob("*.gff"))
        gff = parse_gff_genes(str(gff_files[0]))
        fasta_files = list(DATA_DIR.glob("*protein*.fasta"))
        protein = parse_protein_fasta(str(fasta_files[0]))
        genome_files = [f for f in DATA_DIR.glob("*.fasta") if "protein" not in f.name]
        ci = build_contig_index(str(genome_files[0])) if genome_files else {}

        report = validate_joins(pav, gff, protein, ci)
        assert report["pav_genes_in_gff_pct"] > 50
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
class TestSynthetic:
    """Sanity checks on the synthetic fixtures themselves."""

    def test_pav_shape(self, synthetic_pav):
        assert synthetic_pav.shape == (20, 10)

    def test_pav_values(self, synthetic_pav):
        observed = set(synthetic_pav.values.flatten())
        assert observed.issubset({0, 1})

    def test_gff_columns(self, synthetic_gff):
        expected = {"gene_id", "contig_id", "start", "end", "strand"}
        assert set(synthetic_gff.columns) == expected

    def test_gff_no_empty_ids(self, synthetic_gff):
        assert not synthetic_gff["gene_id"].isna().any()

    def test_protein_lengths(self, synthetic_protein):
        assert (synthetic_protein["protein_length"] > 0).all()
|
tests/test_field_report.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for src/field_report.py."""
|
| 2 |
+
|
| 3 |
+
import pytest
|
| 4 |
+
import json
|
| 5 |
+
|
| 6 |
+
from src.state import AppState
|
| 7 |
+
from src.field_report import generate_field_report, export_report_json, export_report_csv
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class TestFieldReport:
    """Field-report generation and export round-trips."""

    @staticmethod
    def _state_for(synthetic_data):
        # Fresh state pointing at the first available line.
        state = AppState()
        state.selected_line = synthetic_data["line_stats"].iloc[0]["line_id"]
        return state

    def test_report_has_sections(self, synthetic_data):
        state = self._state_for(synthetic_data)
        report = generate_field_report(state, synthetic_data)
        assert state.selected_line in report
        assert "Cluster" in report
        assert "Rare" in report or "rare" in report.lower()

    def test_report_not_empty(self, synthetic_data):
        report = generate_field_report(self._state_for(synthetic_data), synthetic_data)
        assert len(report) > 100

    def test_report_with_empty_backpack(self, synthetic_data):
        state = self._state_for(synthetic_data)
        state.backpack_genes = []
        report = generate_field_report(state, synthetic_data)
        assert "No genes pinned" in report or "backpack" in report.lower()

    def test_report_no_line(self, synthetic_data):
        # Without a selected line the report degrades gracefully.
        report = generate_field_report(AppState(), synthetic_data)
        assert "No line selected" in report

    def test_export_json_valid(self, synthetic_data):
        path = export_report_json(self._state_for(synthetic_data), synthetic_data)
        with open(path) as fh:
            payload = json.load(fh)
        assert "selected_line" in payload

    def test_export_json_schema(self, synthetic_data):
        state = self._state_for(synthetic_data)
        state.backpack_genes = ["g00001"]
        path = export_report_json(state, synthetic_data)
        with open(path) as fh:
            payload = json.load(fh)
        assert "backpack_genes" in payload
        assert "achievements" in payload

    def test_export_csv(self, synthetic_data):
        state = self._state_for(synthetic_data)
        state.backpack_genes = [synthetic_data["gene_freq"].iloc[0]["gene_id"]]
        path = export_report_csv(state, synthetic_data)
        with open(path) as fh:
            assert "gene_id" in fh.read()
|
tests/test_gene_card.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for src/gene_card.py."""
|
| 2 |
+
|
| 3 |
+
import pytest
|
| 4 |
+
import json
|
| 5 |
+
|
| 6 |
+
from src.gene_card import build_gene_card, render_gene_card_html, render_presence_barcode, export_gene_report
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class TestGeneCard:
    """Gene card build/render/export behavior."""

    def test_gene_card_build(self, synthetic_data):
        gid = synthetic_data["gene_freq"].iloc[0]["gene_id"]
        card = build_gene_card(gid, synthetic_data)
        assert card["gene_id"] == gid
        for key in ("freq_count", "core_class", "contig", "protein_length"):
            assert key in card

    def test_gene_card_missing_gff(self, synthetic_data):
        # An id absent from the GFF index yields a card with no location.
        assert build_gene_card("nonexistent_gene", synthetic_data)["contig"] is None

    def test_gene_card_missing_protein(self, synthetic_data):
        assert build_gene_card("nonexistent_gene", synthetic_data)["protein_length"] is None

    def test_gene_card_html(self, synthetic_data):
        gid = synthetic_data["gene_freq"].iloc[0]["gene_id"]
        rendered = render_gene_card_html(build_gene_card(gid, synthetic_data))
        assert gid in rendered
        assert "gene-card" in rendered

    def test_presence_barcode_length(self, synthetic_pav):
        vector = synthetic_pav.loc[synthetic_pav.index[0]]
        barcode = render_presence_barcode(vector)
        # One span per line in the PAV matrix.
        assert barcode.count("<span") == len(vector)

    def test_presence_barcode_html_valid(self, synthetic_pav):
        vector = synthetic_pav.loc[synthetic_pav.index[0]]
        barcode = render_presence_barcode(vector)
        assert 'class="present"' in barcode or 'class="absent"' in barcode

    def test_export_gene_report(self, synthetic_data):
        gid = synthetic_data["gene_freq"].iloc[0]["gene_id"]
        with open(export_gene_report(gid, synthetic_data)) as fh:
            assert json.load(fh)["gene_id"] == gid
|
tests/test_integration.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Integration tests for the Pigeon Pea Pangenome Atlas."""
|
| 2 |
+
|
| 3 |
+
import pytest
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
from src.state import AppState
|
| 7 |
+
from src.callbacks import (
|
| 8 |
+
on_line_selected, build_umap_plot, build_donut_chart,
|
| 9 |
+
build_treasure_table, on_pin_gene, build_hotspot_heatmap,
|
| 10 |
+
get_protein_stats_html, on_generate_report,
|
| 11 |
+
)
|
| 12 |
+
from src.gene_card import build_gene_card, render_gene_card_html
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class TestFullFlow:
    """End-to-end simulation of a full user session through all quests."""

    def test_full_flow(self, synthetic_data):
        """Simulate: select line -> UMAP -> thresholds -> gene card -> pin -> report."""
        state = AppState()

        # Step 1: Select line
        line_id = synthetic_data["line_stats"].iloc[0]["line_id"]
        total, unique, neighbor, state = on_line_selected(line_id, state, synthetic_data)
        assert state.selected_line == line_id

        # Step 2: View UMAP
        fig = build_umap_plot("Country", state, synthetic_data)
        assert fig is not None

        # Step 3: Change thresholds
        donut = build_donut_chart(90, 10, synthetic_data)
        assert donut is not None

        # Step 4: View treasure table
        table = build_treasure_table(state, 90, 10, "All", synthetic_data)
        assert len(table) > 0

        # Step 5: Click gene -> gene card
        gene_id = table.iloc[0]["gene_id"]
        card = build_gene_card(gene_id, synthetic_data)
        html = render_gene_card_html(card)
        assert gene_id in html

        # Step 6: Pin gene
        state.selected_gene = gene_id
        _, state = on_pin_gene(gene_id, state)
        assert gene_id in state.backpack_genes

        # Step 7: Generate report — completing the flow must also award
        # the "Cartographer" achievement.
        report_md, json_path, csv_path, _, _, badges, state = on_generate_report(state, synthetic_data)
        assert len(report_md) > 100
        assert "Cartographer" in state.achievements
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
class TestPrecomputedConsistency:
    """Cross-artifact id consistency between precomputed tables."""

    def test_embedding_line_ids(self, synthetic_data):
        """All line_ids in embedding exist in line_stats."""
        known_lines = set(synthetic_data["line_stats"]["line_id"])
        assert set(synthetic_data["embedding"]["line_id"]) <= known_lines

    def test_marker_gene_ids(self, synthetic_data):
        """All gene_ids in markers exist in gene_freq."""
        if len(synthetic_data["markers"]) == 0:
            pytest.skip("No markers computed")
        known_genes = set(synthetic_data["gene_freq"]["gene_id"])
        assert set(synthetic_data["markers"]["gene_id"]) <= known_genes
|
tests/test_precompute.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for src/precompute.py."""
|
| 2 |
+
|
| 3 |
+
import pytest
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pandas as pd
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
from src.precompute import (
|
| 9 |
+
compute_gene_frequency, compute_line_stats,
|
| 10 |
+
compute_similarity_topk, compute_hotspot_bins, compute_cluster_markers,
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class TestGeneFrequency:
    """Gene-frequency table invariants."""

    def test_freq_count_range(self, synthetic_pav):
        # FIX: the original asserted
        #     (df["freq_count"] >= 1).all() or (df["freq_count"] >= 0).all()
        # — the `or` arm (>= 0) is always true for counts, so the assertion
        # could never fail. Assert the real bounds instead: counts are
        # non-negative and cannot exceed the number of lines.
        df = compute_gene_frequency(synthetic_pav)
        assert (df["freq_count"] >= 0).all()
        assert (df["freq_count"] <= synthetic_pav.shape[1]).all()

    def test_freq_classes(self, synthetic_pav):
        # Only the three canonical pangenome classes may appear.
        df = compute_gene_frequency(synthetic_pav)
        assert set(df["core_class"].unique()).issubset({"core", "shell", "cloud"})

    def test_freq_pct_range(self, synthetic_pav):
        # Percentages are bounded to [0, 100].
        df = compute_gene_frequency(synthetic_pav)
        assert (df["freq_pct"] >= 0).all()
        assert (df["freq_pct"] <= 100).all()

    def test_gene_id_column(self, synthetic_pav):
        # One output row per PAV gene, keyed by gene_id.
        df = compute_gene_frequency(synthetic_pav)
        assert "gene_id" in df.columns
        assert len(df) == len(synthetic_pav)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class TestLineStats:
    """Per-line statistics table invariants."""

    def test_line_count(self, synthetic_pav):
        stats = compute_line_stats(synthetic_pav)
        # One row per PAV column (line).
        assert len(stats) == synthetic_pav.shape[1]

    def test_columns(self, synthetic_pav):
        stats = compute_line_stats(synthetic_pav)
        for col in ("line_id", "country", "genes_present_count", "unique_genes_count"):
            assert col in stats.columns
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class TestSimilarity:
    """Top-k Jaccard similarity invariants."""

    def test_jaccard_range(self, synthetic_pav):
        sim = compute_similarity_topk(synthetic_pav, k=5)
        # Jaccard scores live in [0, 1].
        assert (sim["jaccard_score"] >= 0).all()
        assert (sim["jaccard_score"] <= 1).all()

    def test_jaccard_symmetry(self, synthetic_pav):
        sim = compute_similarity_topk(synthetic_pav, k=5)
        # Spot-check symmetry: J(a, b) == J(b, a) whenever both directions
        # survived the top-k cut.
        for _, row in sim.head(5).iterrows():
            mirrored = sim[
                (sim["line_id"] == row["neighbor_line_id"])
                & (sim["neighbor_line_id"] == row["line_id"])
            ]
            if len(mirrored) > 0:
                assert abs(mirrored.iloc[0]["jaccard_score"] - row["jaccard_score"]) < 0.001
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
class TestHotspots:
    """Hotspot binning invariants."""

    def test_bins_valid(self, synthetic_gff, synthetic_pav, synthetic_contig_index):
        freq = compute_gene_frequency(synthetic_pav)
        bins = compute_hotspot_bins(synthetic_gff, freq, synthetic_contig_index)
        if len(bins) > 0:
            # Every bin spans a positive interval and holds a sane count.
            assert (bins["bin_start"] < bins["bin_end"]).all()
            assert (bins["total_genes"] >= 0).all()
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
class TestClusterMarkers:
    """Cluster-marker selection invariants."""

    def test_marker_count(self, synthetic_pav):
        # FIX: the cap was passed as top_n=5 but asserted against a second,
        # independent magic 5 — name the constant once so the two cannot drift.
        top_n = 5
        line_ids = list(synthetic_pav.columns)
        # Minimal handmade embedding with three round-robin clusters.
        embedding = pd.DataFrame({
            "line_id": line_ids,
            "umap_x": np.random.randn(len(line_ids)),
            "umap_y": np.random.randn(len(line_ids)),
            "cluster_id": [i % 3 for i in range(len(line_ids))],
        })
        df = compute_cluster_markers(synthetic_pav, embedding, top_n=top_n)
        # Each cluster should have at most top_n markers.
        for cid in df["cluster_id"].unique():
            assert len(df[df["cluster_id"] == cid]) <= top_n
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
class TestPrecomputedFiles:
    """Presence of the shipped precomputed artifacts."""

    def test_files_exist(self, precomputed_dir):
        if precomputed_dir is None:
            pytest.skip("Precomputed directory not available")
        expected = (
            "pav_gene_frequency.parquet",
            "line_stats.parquet",
            "line_embedding.parquet",
            "line_similarity_topk.parquet",
            "gff_gene_index.parquet",
            "protein_index.parquet",
            "genome_contig_index.json",
            "hotspot_bins.parquet",
            "cluster_markers.parquet",
        )
        for name in expected:
            assert (precomputed_dir / name).exists(), f"Missing: {name}"
|
ui/__init__.py
ADDED
|
File without changes
|
ui/final.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Final tab: Field Report generation and export."""
|
| 2 |
+
|
| 3 |
+
import gradio as gr
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def build_final_tab():
    """Build Final Report tab components. Returns dict of components."""
    with gr.Tab("Field Report", id="final") as tab:
        gr.Markdown("## Your Exploration Report")
        gr.Markdown(
            "Generate a summary of your pangenome exploration journey, "
            "including your selected line, findings, and backpack collection."
        )

        generate_btn = gr.Button("Generate Report", variant="primary")

        # Placeholder until the user clicks the button; a callback replaces it.
        report_md = gr.Markdown(value="*Click 'Generate Report' to create your field report.*")

        with gr.Row():
            # Hidden until a report has actually been exported.
            download_json = gr.File(label="Download JSON", visible=False)
            download_csv = gr.File(label="Download CSV", visible=False)

        gr.Markdown("### Achievements Earned")
        achievements_html = gr.HTML(value="<p>Complete quests to earn badges!</p>")

    # The returned mapping is consumed by the callback-wiring layer.
    return {
        "tab": tab,
        "generate_btn": generate_btn,
        "report_md": report_md,
        "download_json": download_json,
        "download_csv": download_csv,
        "achievements_html": achievements_html,
    }
|
ui/gene_card_ui.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Gene Card side panel UI."""
|
| 2 |
+
|
| 3 |
+
import gradio as gr
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def build_gene_card_panel():
    """Build Gene Card side panel. Returns dict of components."""
    # Hidden by default; callbacks toggle visibility when a gene is selected.
    with gr.Column(visible=False, scale=1) as gene_card_col:
        gr.Markdown("### Gene Card")
        gene_card_html = gr.HTML(value="")

        with gr.Row():
            show_genome_btn = gr.Button("Show on Genome", size="sm")
            show_protein_btn = gr.Button("Show Protein", size="sm")

        with gr.Row():
            pin_card_btn = gr.Button("Pin to Backpack", size="sm", variant="secondary")
            download_gene_btn = gr.Button("Download Report", size="sm")

        # File component for the exported gene report; revealed after export.
        gene_report_file = gr.File(label="Gene Report", visible=False)

    return {
        "column": gene_card_col,
        "gene_card_html": gene_card_html,
        "show_genome_btn": show_genome_btn,
        "show_protein_btn": show_protein_btn,
        "pin_card_btn": pin_card_btn,
        "download_gene_btn": download_gene_btn,
        "gene_report_file": gene_report_file,
    }
|
ui/layout.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Master Gradio layout for the Pigeon Pea Pangenome Atlas."""
|
| 2 |
+
|
| 3 |
+
import gradio as gr
|
| 4 |
+
|
| 5 |
+
from ui.theme import build_theme, CUSTOM_CSS
|
| 6 |
+
from ui.quest0 import build_quest0
|
| 7 |
+
from ui.quest1 import build_quest1
|
| 8 |
+
from ui.quest2 import build_quest2
|
| 9 |
+
from ui.quest3 import build_quest3
|
| 10 |
+
from ui.quest4 import build_quest4
|
| 11 |
+
from ui.final import build_final_tab
|
| 12 |
+
from ui.gene_card_ui import build_gene_card_panel
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def build_app(line_choices: list[str], contig_choices: list[str],
              gene_choices: list[str]) -> tuple:
    """
    Build the full Gradio Blocks app.
    Returns (demo, components_dict) where components_dict maps all UI elements.
    """
    theme = build_theme()

    with gr.Blocks(theme=theme, css=CUSTOM_CSS, title="Pigeon Pea Pangenome Atlas") as demo:
        # State
        # Per-session AppState; initialized lazily by the first callback.
        state = gr.State(value=None)

        # Header
        gr.Markdown("# Pigeon Pea Pangenome Atlas")
        gr.Markdown("*An interactive exploration of 89 pigeon pea lines and their pangenome*")

        # Progress tracker
        progress_html = gr.HTML(
            value=_build_progress_html(0),
            elem_classes=["progress-tracker"],
        )

        # Global filters
        with gr.Accordion("Global Filters", open=False):
            with gr.Row():
                # Country choices are filled in at wiring time once data loads.
                country_filter = gr.CheckboxGroup(
                    choices=[],
                    label="Filter by country",
                    info="Leave empty to show all",
                )
                annotated_toggle = gr.Checkbox(
                    label="Only annotated genes (with GFF entry)",
                    value=False,
                )
        with gr.Accordion("What is PAV?", open=False):
            gr.Markdown(
                "**Presence-Absence Variation (PAV)** describes genes that are present "
                "in some lines but absent from others. The pangenome captures all genes "
                "across the species, not just those in a single reference genome."
            )

        # Main content area
        with gr.Row():
            # Main tabs (left ~75%)
            with gr.Column(scale=3):
                with gr.Tabs() as tabs:
                    q0 = build_quest0(line_choices)
                    q1 = build_quest1()
                    q2 = build_quest2()
                    q3 = build_quest3(contig_choices)
                    q4 = build_quest4(gene_choices)
                    final = build_final_tab()

            # Gene Card side panel (right ~25%)
            gc = build_gene_card_panel()

        # Data Health accordion
        with gr.Accordion("Data Health", open=False, visible=True):
            data_health_html = gr.HTML(value="<p>Loading data health report...</p>")

        # Flatten every sub-builder's component dict into one namespace,
        # prefixing keys per section so callback wiring can address any widget.
        components = {
            "state": state,
            "progress_html": progress_html,
            "tabs": tabs,
            "country_filter": country_filter,
            "annotated_toggle": annotated_toggle,
            "data_health_html": data_health_html,
            **{f"q0_{k}": v for k, v in q0.items()},
            **{f"q1_{k}": v for k, v in q1.items()},
            **{f"q2_{k}": v for k, v in q2.items()},
            **{f"q3_{k}": v for k, v in q3.items()},
            **{f"q4_{k}": v for k, v in q4.items()},
            **{f"final_{k}": v for k, v in final.items()},
            **{f"gc_{k}": v for k, v in gc.items()},
        }

    return demo, components
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def _build_progress_html(active_quest: int) -> str:
|
| 95 |
+
"""Build progress tracker HTML."""
|
| 96 |
+
steps = [
|
| 97 |
+
("Explorer", 0),
|
| 98 |
+
("Map the World", 1),
|
| 99 |
+
("Core vs Accessory", 2),
|
| 100 |
+
("Genome Landmarks", 3),
|
| 101 |
+
("Protein Relics", 4),
|
| 102 |
+
("Field Report", 5),
|
| 103 |
+
]
|
| 104 |
+
parts = []
|
| 105 |
+
for label, idx in steps:
|
| 106 |
+
cls = "progress-step active" if idx == active_quest else "progress-step"
|
| 107 |
+
parts.append(f'<span class="{cls}">{label}</span>')
|
| 108 |
+
return '<div class="progress-tracker">' + " ".join(parts) + '</div>'
|
ui/quest0.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Quest 0: Choose Your Explorer — line selection and overview."""
|
| 2 |
+
|
| 3 |
+
import gradio as gr
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def build_quest0(line_choices: list[str]):
    """Build the Quest 0 ("Choose Your Explorer") tab.

    Creates the line-selection dropdown, three read-only stat boxes and the
    start button, and returns them in a dict keyed by their role so the
    callback layer can wire them up.
    """
    with gr.Tab("Choose Your Explorer", id="quest0") as quest_tab:
        gr.Markdown("## Choose your line to explore")
        gr.Markdown(
            "Select one of the 89 pigeon pea lines to begin your pangenome journey. "
            "Each line has a unique gene repertoire shaped by geography and breeding history."
        )

        dropdown = gr.Dropdown(
            label="Select a pigeon pea line",
            choices=line_choices,
            info="89 lines from across the world",
            interactive=True,
        )

        with gr.Row():
            box_total = gr.Textbox(
                value="--",
                label="Total Genes Present",
                interactive=False,
            )
            box_unique = gr.Textbox(
                value="--",
                label="Unique Genes",
                info="Genes found only in this line",
                interactive=False,
            )
            box_neighbor = gr.Textbox(
                value="--",
                label="Nearest Neighbor",
                info="Most similar line by gene content",
                interactive=False,
            )

        journey_btn = gr.Button(
            "Start the Journey",
            size="lg",
            variant="primary",
        )

    return {
        "tab": quest_tab,
        "line_dropdown": dropdown,
        "total_genes": box_total,
        "unique_genes": box_unique,
        "nearest_neighbor": box_neighbor,
        "start_btn": journey_btn,
    }
|
ui/quest1.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Quest 1: Map the World — UMAP scatter of lines."""
|
| 2 |
+
|
| 3 |
+
import gradio as gr
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def build_quest1():
    """Build the Quest 1 ("Map the World") tab.

    Creates the UMAP scatter plot, its color-mode radio, the party-selection
    display/compare controls and an explanatory accordion, returned as a
    dict of components keyed by role.
    """
    with gr.Tab("Map the World", id="quest1") as quest_tab:
        gr.Markdown("## How do 89 lines relate by gene content?")
        gr.Markdown(
            "This UMAP projection arranges lines by their gene presence/absence profiles. "
            "Lines closer together share more genes."
        )

        color_mode = gr.Radio(
            label="Color by",
            choices=["Country", "Cluster"],
            value="Country",
            interactive=True,
        )

        scatter = gr.Plot(label="UMAP of 89 pigeon pea lines")

        with gr.Row():
            party_box = gr.Textbox(
                value="None selected",
                label="Selected party (lasso/click to select)",
                lines=2,
                interactive=False,
            )
            compare_button = gr.Button("Compare my line to party", variant="secondary")

        # Hidden until a comparison is actually produced by the callbacks.
        compare_plot = gr.Plot(label="Comparison", visible=False)

        with gr.Accordion("What does this mean?", open=False):
            gr.Markdown(
                "**UMAP** reduces the high-dimensional PAV matrix to 2D.\n\n"
                "- **Country coloring** shows geographic origins.\n"
                "- **Cluster coloring** shows groups identified by KMeans.\n"
                "- **Click** a point to see its stats.\n"
                "- **Lasso select** multiple points to compare with your chosen line."
            )

    return {
        "tab": quest_tab,
        "color_radio": color_mode,
        "umap_plot": scatter,
        "party_display": party_box,
        "compare_btn": compare_button,
        "comparison_plot": compare_plot,
    }
|
ui/quest2.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Quest 2: Core vs Accessory — gene classification explorer."""
|
| 2 |
+
|
| 3 |
+
import gradio as gr
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def build_quest2():
    """Build the Quest 2 ("Core vs Accessory") tab.

    Creates the core/cloud threshold sliders, the distribution plots, the
    filterable gene table and the backpack (pinned genes) controls, and
    returns them in a dict keyed by role.
    """
    with gr.Tab("Core vs Accessory", id="quest2") as quest_tab:
        gr.Markdown("## Explore the Core, Shell, and Cloud genome")
        gr.Markdown(
            "Genes are classified by how many of the 89 lines carry them. "
            "Adjust the thresholds to explore different definitions."
        )

        with gr.Row():
            slider_core = gr.Slider(
                minimum=50, maximum=100, value=95, step=1,
                label="Core threshold (>=X%)",
                info="Genes present in at least this % of lines",
            )
            slider_cloud = gr.Slider(
                minimum=1, maximum=50, value=15, step=1,
                label="Cloud threshold (<X%)",
                info="Genes present in fewer than this % of lines",
            )

        with gr.Row():
            donut = gr.Plot(label="Core / Shell / Cloud distribution")
            histogram = gr.Plot(label="Gene frequency distribution")

        gr.Markdown("### Gene Treasure List")
        gene_filter = gr.Radio(
            value="All",
            choices=["All", "Unique to my line", "Rare (<5 lines)", "Cluster markers"],
            label="Filter genes",
        )
        table = gr.Dataframe(
            label="Genes",
            headers=["gene_id", "freq_count", "freq_pct", "core_class", "in_my_line"],
            wrap=True,
            interactive=False,
        )

        with gr.Row():
            gene_box = gr.Textbox(
                value="Click a row to select",
                label="Selected gene",
                interactive=False,
            )
            pin_button = gr.Button("Pin to Backpack", variant="secondary")

        backpack_box = gr.Textbox(
            value="Empty",
            label="Backpack (pinned genes)",
            lines=2,
            interactive=False,
        )

    return {
        "tab": quest_tab,
        "core_slider": slider_core,
        "cloud_slider": slider_cloud,
        "donut_plot": donut,
        "histogram_plot": histogram,
        "filter_radio": gene_filter,
        "treasure_table": table,
        "selected_gene_text": gene_box,
        "pin_btn": pin_button,
        "backpack_display": backpack_box,
    }
|
ui/quest3.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Quest 3: Genome Landmarks — hotspot exploration."""
|
| 2 |
+
|
| 3 |
+
import gradio as gr
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def build_quest3(contig_choices: list[str]):
    """Build the Quest 3 ("Genome Landmarks") tab.

    Creates the contig picker, the variability heatmap, the per-contig gene
    track and region table, and returns them in a dict keyed by role.
    """
    with gr.Tab("Genome Landmarks", id="quest3") as quest_tab:
        gr.Markdown("## Explore genomic hotspots of variation")
        gr.Markdown(
            "The genome is divided into 100 kb bins. Hotter bins contain more "
            "variable (shell/cloud) genes — potential regions of adaptation."
        )

        contig_picker = gr.Dropdown(
            label="Select contig (top contigs by gene count)",
            choices=contig_choices,
            interactive=True,
        )

        heatmap = gr.Plot(label="Variability heatmap (contigs x bins)")

        gr.Markdown("### Contig Detail")
        # Revealed by the callbacks once a contig has been chosen.
        track = gr.Plot(label="Gene track (colored by class)", visible=False)

        genes_table = gr.Dataframe(
            label="Genes in selected region",
            headers=["gene_id", "start", "end", "strand", "core_class", "freq_pct"],
            interactive=False,
        )

        gene_box = gr.Textbox(
            label="Selected gene from region",
            visible=False,
            interactive=False,
        )

    return {
        "tab": quest_tab,
        "contig_dropdown": contig_picker,
        "heatmap_plot": heatmap,
        "track_plot": track,
        "region_table": genes_table,
        "region_gene_text": gene_box,
    }
|
ui/quest4.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Quest 4: Protein Relics — protein analysis."""
|
| 2 |
+
|
| 3 |
+
import gradio as gr
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def build_quest4(gene_choices: list[str]):
    """Build the Quest 4 ("Protein Relics") tab.

    Creates the gene picker, the protein-stats HTML panel and the two
    backpack-comparison plots, returned as a dict of components keyed by
    role.
    """
    with gr.Tab("Protein Relics", id="quest4") as quest_tab:
        gr.Markdown("## Examine protein properties")
        gr.Markdown(
            "Explore the protein products of pangenome genes. Compare amino acid "
            "compositions and lengths across your backpack collection."
        )

        picker = gr.Dropdown(
            label="Select a gene (or pick from backpack)",
            choices=gene_choices,
            allow_custom_value=True,
            interactive=True,
        )

        stats_html = gr.HTML(
            label="Protein Statistics",
            value="<p>Select a gene to see protein stats</p>",
        )

        gr.Markdown("### Backpack Comparison")
        gr.Markdown("Pin at least 2 genes to your backpack to see comparisons.")

        length_plot = gr.Plot(label="Protein lengths comparison")
        aa_heatmap = gr.Plot(label="Amino acid composition heatmap")

    return {
        "tab": quest_tab,
        "gene_dropdown": picker,
        "protein_stats_html": stats_html,
        "comparison_bar_plot": length_plot,
        "composition_heatmap": aa_heatmap,
    }
|
ui/theme.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Custom Gradio theme for the Pigeon Pea Pangenome Atlas."""
|
| 2 |
+
|
| 3 |
+
import gradio as gr
|
| 4 |
+
|
| 5 |
+
CUSTOM_CSS = """
|
| 6 |
+
.quest-badge {
|
| 7 |
+
display: inline-block;
|
| 8 |
+
padding: 4px 12px;
|
| 9 |
+
border-radius: 16px;
|
| 10 |
+
font-size: 0.85em;
|
| 11 |
+
font-weight: 600;
|
| 12 |
+
margin: 2px 4px;
|
| 13 |
+
}
|
| 14 |
+
.badge-core { background: #2E7D32; color: white; }
|
| 15 |
+
.badge-shell { background: #FFC107; color: #333; }
|
| 16 |
+
.badge-cloud { background: #F44336; color: white; }
|
| 17 |
+
|
| 18 |
+
.gene-card {
|
| 19 |
+
border: 2px solid #2E7D32;
|
| 20 |
+
border-radius: 8px;
|
| 21 |
+
padding: 16px;
|
| 22 |
+
background: #F1F8E9;
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
.presence-barcode span {
|
| 26 |
+
display: inline-block;
|
| 27 |
+
width: 3px;
|
| 28 |
+
height: 20px;
|
| 29 |
+
margin: 0;
|
| 30 |
+
}
|
| 31 |
+
.presence-barcode .present { background: #4CAF50; }
|
| 32 |
+
.presence-barcode .absent { background: #E0E0E0; }
|
| 33 |
+
|
| 34 |
+
.progress-tracker {
|
| 35 |
+
display: flex;
|
| 36 |
+
justify-content: center;
|
| 37 |
+
gap: 8px;
|
| 38 |
+
padding: 12px 0;
|
| 39 |
+
}
|
| 40 |
+
.progress-step {
|
| 41 |
+
display: inline-flex;
|
| 42 |
+
align-items: center;
|
| 43 |
+
gap: 4px;
|
| 44 |
+
padding: 4px 10px;
|
| 45 |
+
border-radius: 12px;
|
| 46 |
+
font-size: 0.8em;
|
| 47 |
+
background: #E8F5E9;
|
| 48 |
+
color: #2E7D32;
|
| 49 |
+
}
|
| 50 |
+
.progress-step.active {
|
| 51 |
+
background: #2E7D32;
|
| 52 |
+
color: white;
|
| 53 |
+
font-weight: 600;
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
.stat-card {
|
| 57 |
+
text-align: center;
|
| 58 |
+
padding: 16px;
|
| 59 |
+
border-radius: 8px;
|
| 60 |
+
background: #F1F8E9;
|
| 61 |
+
border: 1px solid #C8E6C9;
|
| 62 |
+
}
|
| 63 |
+
.stat-card .stat-value {
|
| 64 |
+
font-size: 1.8em;
|
| 65 |
+
font-weight: 700;
|
| 66 |
+
color: #2E7D32;
|
| 67 |
+
}
|
| 68 |
+
.stat-card .stat-label {
|
| 69 |
+
font-size: 0.85em;
|
| 70 |
+
color: #666;
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
.achievement-badge {
|
| 74 |
+
display: inline-block;
|
| 75 |
+
padding: 6px 14px;
|
| 76 |
+
border-radius: 20px;
|
| 77 |
+
background: linear-gradient(135deg, #FFC107, #FF9800);
|
| 78 |
+
color: #333;
|
| 79 |
+
font-weight: 600;
|
| 80 |
+
margin: 4px;
|
| 81 |
+
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
|
| 82 |
+
}
|
| 83 |
+
"""
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def build_theme():
    """Build the custom Gradio theme (green/amber palette on Soft)."""
    base = gr.themes.Soft(
        font=gr.themes.GoogleFont("Inter"),
        primary_hue=gr.themes.colors.green,
        secondary_hue=gr.themes.colors.amber,
        neutral_hue=gr.themes.colors.gray,
    )
    # Override a handful of design tokens to match the atlas branding.
    return base.set(
        body_background_fill="#FAFAF5",
        block_border_width="1px",
        block_border_color="#C8E6C9",
        block_radius="8px",
        button_primary_background_fill="#2E7D32",
        button_primary_text_color="white",
    )
|