Ashkan Taghipour (The University of Western Australia) committed on
Commit
16e4ad5
·
1 Parent(s): ad78656

Initial deploy: Pigeon Pea Pangenome Atlas

Browse files
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.fasta filter=lfs diff=lfs merge=lfs -text
37
+ *.fasta.gz filter=lfs diff=lfs merge=lfs -text
38
+ *.gff filter=lfs diff=lfs merge=lfs -text
39
+ 89_line_PAV.txt filter=lfs diff=lfs merge=lfs -text
40
+ genome_contig_index.json filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ CREDENTIALS_AND_CONFIG.txt
2
+ __pycache__/
3
+ *.pyc
4
+ .env
5
+ *.egg-info/
6
+ .conda/
7
+ .pytest_cache/
README.md CHANGED
@@ -1,13 +1,11 @@
1
  ---
2
- title: PanGenomeWatchAI
3
- emoji: 🐢
4
- colorFrom: yellow
5
- colorTo: purple
6
  sdk: gradio
7
- sdk_version: 6.5.1
8
  app_file: app.py
9
  pinned: false
10
- short_description: PanGenome analysis
11
  ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Pigeon Pea Pangenome Atlas
3
+ emoji: "\U0001F331"
4
+ colorFrom: green
5
+ colorTo: yellow
6
  sdk: gradio
7
+ sdk_version: 5.9.1
8
  app_file: app.py
9
  pinned: false
10
+ short_description: Interactive pangenome exploration of 89 pigeon pea lines
11
  ---
 
 
app.py ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Pigeon Pea Pangenome Atlas — Main entry point.
3
+ Gradio app for exploring the pangenome of 89 pigeon pea lines.
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ import json
9
+ import pandas as pd
10
+ import gradio as gr
11
+
12
+ from src.state import AppState
13
+ from src.utils import logger, PRECOMPUTED_DIR
14
+ from src.data_loader import validate_joins
15
+ from src.callbacks import (
16
+ on_line_selected, on_start_journey,
17
+ build_umap_plot, on_umap_select, on_compare_party,
18
+ build_donut_chart, build_frequency_histogram, build_treasure_table,
19
+ on_pin_gene, on_gene_click_table,
20
+ build_hotspot_heatmap, on_contig_selected,
21
+ get_protein_stats_html, build_backpack_comparison, build_composition_heatmap,
22
+ on_open_gene_card, on_download_gene_report,
23
+ on_generate_report, build_data_health_html,
24
+ )
25
+ from ui.layout import build_app
26
+
27
+ # ===========================================================
28
+ # Load precomputed data
29
+ # ===========================================================
30
+ logger.info("Loading precomputed data...")
31
+
32
+ DATA = {}
33
+
34
+
35
def load_data():
    """Populate the module-level DATA dict from PRECOMPUTED_DIR.

    Eight parquet artefacts are mandatory; the PAV matrix and the contig
    index are loaded only when present on disk.
    """
    base = PRECOMPUTED_DIR

    # Required artefacts: DATA key -> parquet filename.
    required = {
        "gene_freq": "pav_gene_frequency.parquet",
        "line_stats": "line_stats.parquet",
        "embedding": "line_embedding.parquet",
        "similarity": "line_similarity_topk.parquet",
        "gff_index": "gff_gene_index.parquet",
        "protein": "protein_index.parquet",
        "hotspots": "hotspot_bins.parquet",
        "markers": "cluster_markers.parquet",
    }
    for key, filename in required.items():
        DATA[key] = pd.read_parquet(base / filename)

    # Optional: full PAV matrix for live presence/absence queries.
    pav_file = base / "pav_matrix.parquet"
    if pav_file.exists():
        DATA["pav"] = pd.read_parquet(pav_file)
        logger.info(f"PAV matrix loaded: {DATA['pav'].shape}")

    # Optional: contig index (JSON) for the genome-browser views.
    index_file = base / "genome_contig_index.json"
    if index_file.exists():
        with open(index_file) as fh:
            DATA["contig_index"] = json.load(fh)

    logger.info("All data loaded successfully")
load_data()

# ===========================================================
# Derive UI choices
# ===========================================================
# All dropdown/choice lists are derived once, at import time, from DATA.
line_choices = sorted(DATA["line_stats"]["line_id"].tolist())

# Top contigs by gene count
contig_gene_counts = DATA["gff_index"]["contig_id"].value_counts()
contig_choices = contig_gene_counts.head(30).index.tolist()

# Gene choices (all genes with protein data)
gene_choices = sorted(DATA["protein"]["gene_id"].tolist())

# Country list for filters
country_list = sorted(DATA["line_stats"]["country"].unique().tolist())

# ===========================================================
# Build UI
# ===========================================================
# build_app returns the Blocks object plus a dict C of named components.
demo, C = build_app(line_choices, contig_choices, gene_choices)

# ===========================================================
# Wire callbacks
# ===========================================================
with demo:

    # Update country filter choices
    # NOTE(review): mutating .choices after the component is built may not
    # propagate to the rendered UI in recent Gradio versions — confirm.
    C["country_filter"].choices = country_list

    # -- Data Health on load --
    try:
        report = {
            "Total genes in PAV": len(DATA["gene_freq"]),
            "Lines": len(DATA["line_stats"]),
            "Genes with GFF annotation": len(DATA["gff_index"]),
            "Genes with protein data": len(DATA["protein"]),
            "Clusters found": DATA["embedding"]["cluster_id"].nunique(),
        }
        C["data_health_html"].value = build_data_health_html(report)
    except Exception as e:
        # Best-effort: the health panel must never prevent app startup.
        C["data_health_html"].value = f"<p>Error: {e}</p>"

    # -- Quest 0 --
    C["q0_line_dropdown"].change(
        fn=lambda line_id, state: on_line_selected(line_id, state, DATA),
        inputs=[C["q0_line_dropdown"], C["state"]],
        outputs=[C["q0_total_genes"], C["q0_unique_genes"],
                 C["q0_nearest_neighbor"], C["state"]],
    )

    C["q0_start_btn"].click(
        fn=on_start_journey,
        inputs=[C["state"]],
        outputs=[C["tabs"], C["state"]],
    )

    # -- Quest 1 --
    C["q1_color_radio"].change(
        fn=lambda color_by, state: build_umap_plot(color_by, state, DATA),
        inputs=[C["q1_color_radio"], C["state"]],
        outputs=[C["q1_umap_plot"]],
    )

    # Render UMAP on tab load
    C["q1_tab"].select(
        fn=lambda state: build_umap_plot("Country", state, DATA),
        inputs=[C["state"]],
        outputs=[C["q1_umap_plot"]],
    )

    C["q1_compare_btn"].click(
        fn=lambda state: on_compare_party(state, DATA),
        inputs=[C["state"]],
        # NOTE(review): the same component is listed twice; on_compare_party
        # returns (figure, True) — the True looks like a visibility flag meant
        # for a different output component. Confirm the intended second target.
        outputs=[C["q1_comparison_plot"], C["q1_comparison_plot"]],
    )

    # -- Quest 2 --
    def update_quest2(core_thresh, cloud_thresh, filter_type, state):
        """Recompute all three Quest-2 views from the current slider/filter values."""
        donut = build_donut_chart(core_thresh, cloud_thresh, DATA)
        hist = build_frequency_histogram(core_thresh, cloud_thresh, DATA)
        table = build_treasure_table(state, core_thresh, cloud_thresh, filter_type, DATA)
        return donut, hist, table

    # Both threshold sliders trigger the same full refresh.
    for trigger in [C["q2_core_slider"], C["q2_cloud_slider"]]:
        trigger.change(
            fn=update_quest2,
            inputs=[C["q2_core_slider"], C["q2_cloud_slider"],
                    C["q2_filter_radio"], C["state"]],
            outputs=[C["q2_donut_plot"], C["q2_histogram_plot"],
                     C["q2_treasure_table"]],
        )

    C["q2_filter_radio"].change(
        fn=update_quest2,
        inputs=[C["q2_core_slider"], C["q2_cloud_slider"],
                C["q2_filter_radio"], C["state"]],
        outputs=[C["q2_donut_plot"], C["q2_histogram_plot"],
                 C["q2_treasure_table"]],
    )

    # Render Quest 2 on tab select
    C["q2_tab"].select(
        fn=update_quest2,
        inputs=[C["q2_core_slider"], C["q2_cloud_slider"],
                C["q2_filter_radio"], C["state"]],
        outputs=[C["q2_donut_plot"], C["q2_histogram_plot"],
                 C["q2_treasure_table"]],
    )

    C["q2_pin_btn"].click(
        fn=lambda gene_id, state: on_pin_gene(gene_id, state),
        inputs=[C["q2_selected_gene_text"], C["state"]],
        outputs=[C["q2_backpack_display"], C["state"]],
    )

    # Table row click → select gene
    # NOTE(review): Gradio injects the select event only into a parameter
    # annotated with gr.SelectData, which a lambda cannot declare — as written
    # `evt` will be bound to the state input and `state` will be missing.
    # Consider wiring src.callbacks.on_gene_click_table instead; confirm.
    C["q2_treasure_table"].select(
        fn=lambda evt, state: (str(evt.value) if evt else "", state),
        inputs=[C["state"]],
        outputs=[C["q2_selected_gene_text"], C["state"]],
    )

    # -- Quest 3 --
    C["q3_tab"].select(
        fn=lambda: build_hotspot_heatmap(DATA),
        inputs=[],
        outputs=[C["q3_heatmap_plot"]],
    )

    C["q3_contig_dropdown"].change(
        fn=lambda contig_id, state: on_contig_selected(contig_id, DATA, state),
        inputs=[C["q3_contig_dropdown"], C["state"]],
        outputs=[C["q3_track_plot"], C["q3_region_table"]],
    )

    # -- Quest 4 --
    C["q4_gene_dropdown"].change(
        fn=lambda gene_id: get_protein_stats_html(gene_id, DATA),
        inputs=[C["q4_gene_dropdown"]],
        outputs=[C["q4_protein_stats_html"]],
    )

    C["q4_tab"].select(
        fn=lambda state: (
            build_backpack_comparison(state, DATA),
            build_composition_heatmap(state, DATA),
        ),
        inputs=[C["state"]],
        outputs=[C["q4_comparison_bar_plot"], C["q4_composition_heatmap"]],
    )

    # -- Gene Card --
    # NOTE(review): this feeds the selected *gene* id into the contig
    # dropdown — presumably the gene's contig was intended; verify against
    # ui.layout / on_contig_selected.
    C["gc_show_genome_btn"].click(
        fn=lambda state: (
            gr.Tabs(selected="quest3"),
            state.selected_gene if state and state.selected_gene else None,
        ),
        inputs=[C["state"]],
        outputs=[C["tabs"], C["q3_contig_dropdown"]],
    )

    C["gc_show_protein_btn"].click(
        fn=lambda state: (
            gr.Tabs(selected="quest4"),
            state.selected_gene if state and state.selected_gene else None,
        ),
        inputs=[C["state"]],
        outputs=[C["tabs"], C["q4_gene_dropdown"]],
    )

    C["gc_pin_card_btn"].click(
        fn=lambda state: on_pin_gene(state.selected_gene if state else "", state),
        inputs=[C["state"]],
        outputs=[C["q2_backpack_display"], C["state"]],
    )

    C["gc_download_gene_btn"].click(
        fn=lambda state: on_download_gene_report(state, DATA),
        inputs=[C["state"]],
        outputs=[C["gc_gene_report_file"]],
    )

    # -- Final Report --
    # NOTE(review): two components appear twice (value + visibility);
    # on_generate_report must return exactly 7 values in this order — confirm.
    C["final_generate_btn"].click(
        fn=lambda state: on_generate_report(state, DATA),
        inputs=[C["state"]],
        outputs=[
            C["final_report_md"],
            C["final_download_json"],
            C["final_download_csv"],
            C["final_download_json"],  # visibility
            C["final_download_csv"],  # visibility
            C["final_achievements_html"],
            C["state"],
        ],
    )


# ===========================================================
# Launch
# ===========================================================
if __name__ == "__main__":
    demo.launch()
data/89_line_PAV.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4eaac454bd3103e8738cf483fc45ac821c8ae0dc99d28c5abd383b0005ef38c
3
+ size 10437803
data/Pigenopea_89lines_pangenome_clean.all_leAED1_sort_byPerl_clean.gff ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a288ef682398b642d4024a09f55727d6391fb164b8b6d72603b13796b6b95eba
3
+ size 64791788
data/Pigenopea_89lines_pangenome_clean.fasta ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a282a51d87323f1effd70ee43a9ab7b95dbc6584f4df0e6b3c0d095a0b992f7
3
+ size 637456599
data/Pigenopea_89lines_pangenome_clean_bg1k_protein.fasta ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be9efd3242fffb67a569937fb4842027238c86a245f303b5521d48ec84c50389
3
+ size 18085493
precomputed/cluster_markers.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b195070da682210273a73da98b3aa3bcf8146d1c9dd6aa1094ff0e7b17ca0485
3
+ size 6411
precomputed/genome_contig_index.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7184ff7812f745877dd62bda4310f7336906e3aa3658908fee9a767dbf53d04d
3
+ size 11022538
precomputed/gff_gene_index.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d48537f3aae0c63e4afb65831be8762f805a5215b7ff0fff64de44a7385d50e
3
+ size 1247901
precomputed/hotspot_bins.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbf2c937e61d39a60a1d51e20bcd6b0c5dd681429a763b43467d5d9e95fcfa59
3
+ size 132552
precomputed/line_embedding.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97492cd82529decb539371fde3696da357b670ec44ed1a4f32015fc6ad621ef8
3
+ size 4680
precomputed/line_similarity_topk.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8339f144d99301b979159e14e8a413364727a8f93dcc4801945aee7671ef9e10
3
+ size 8063
precomputed/line_stats.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:645aba02382a23a3cdb476bb3e24f9111fb806909d076732bead9662d8eba4b6
3
+ size 4672
precomputed/pav_gene_frequency.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ecd91ec6198f2d8489177a03d81e9c62b6edcc4575994eb534c9c4507c38071d
3
+ size 376243
precomputed/pav_matrix.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24abc505527d02d5d1c465f0b93c1902833e95c224ad00f3ccc9ee881173729c
3
+ size 490920
precomputed/protein_index.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c67ced53bb82255c9027d4d492acf8945bcc0c28ec233a73ced14054a482649
3
+ size 1623053
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ plotly>=5.24
2
+ pandas>=2.2
3
+ pyarrow>=17.0
4
+ numpy>=1.26
5
+ scipy>=1.14
6
+ scikit-learn>=1.5
7
+ umap-learn>=0.5.7
8
+ biopython>=1.84
9
+ pyfaidx>=0.8
10
+ Pillow
11
+ reportlab
scripts/run_precompute.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ CLI script to generate all precomputed data.
4
+ Usage: python scripts/run_precompute.py --data-dir data/ --output-dir precomputed/
5
+ """
6
+
7
+ import argparse
8
+ import sys
9
+ import os
10
+ import time
11
+
12
+ # Add project root to path
13
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
14
+
15
+ from src.data_loader import (
16
+ load_pav, parse_gff_genes, parse_protein_fasta,
17
+ build_contig_index, build_contig_name_mapping, validate_joins,
18
+ )
19
+ from src.precompute import (
20
+ compute_gene_frequency, compute_line_stats, compute_line_embedding,
21
+ compute_similarity_topk, build_gff_gene_parquet, build_protein_parquet,
22
+ save_contig_index, compute_hotspot_bins, compute_cluster_markers,
23
+ )
24
+ from src.utils import logger, find_file
25
+
26
+
27
def main():
    """Run the full precompute pipeline.

    Loads the raw PAV/GFF/FASTA inputs from ``--data-dir``, validates the
    joins between them, and writes every derived parquet/JSON artefact into
    ``--output-dir``. Exits with status 1 when a required input is missing.
    """
    # Hoisted from mid-function (was imported between two uses of data_dir).
    from pathlib import Path

    parser = argparse.ArgumentParser(description="Precompute pangenome data")
    parser.add_argument("--data-dir", default="data/", help="Input data directory")
    parser.add_argument("--output-dir", default="precomputed/", help="Output directory")
    args = parser.parse_args()

    data_dir = os.path.abspath(args.data_dir)
    output_dir = os.path.abspath(args.output_dir)
    os.makedirs(output_dir, exist_ok=True)

    t_total = time.time()

    # 1. Load raw data
    logger.info("=== Phase 1: Loading raw data ===")
    pav_path = os.path.join(data_dir, "89_line_PAV.txt")
    data_p = Path(data_dir)

    gff_files = list(data_p.glob("*.gff"))
    protein_files = list(data_p.glob("*protein*.fasta"))
    genome_files = [f for f in data_p.glob("*.fasta") if "protein" not in f.name]

    # Fail fast with a clear message for every required input. (The original
    # checked GFF/protein but let a missing PAV file surface as a raw I/O error.)
    if not os.path.exists(pav_path):
        logger.error("PAV file 89_line_PAV.txt not found in data directory")
        sys.exit(1)
    if not gff_files:
        logger.error("No GFF file found in data directory")
        sys.exit(1)
    if not protein_files:
        logger.error("No protein FASTA file found in data directory")
        sys.exit(1)

    pav = load_pav(pav_path)
    gff_genes = parse_gff_genes(str(gff_files[0]))
    protein_index = parse_protein_fasta(str(protein_files[0]))

    # Genome FASTA is optional: without it the contig index stays empty.
    contig_index = {}
    if genome_files:
        contig_index = build_contig_index(str(genome_files[0]))
    else:
        logger.warning("No genome FASTA found; contig index will be empty")

    # Validation
    logger.info("=== Validation ===")
    contig_mapping = build_contig_name_mapping(gff_genes, contig_index)
    report = validate_joins(pav, gff_genes, protein_index, contig_index)
    for k, v in report.items():
        logger.info(f"  {k}: {v}")

    # 2. Compute derived data
    logger.info("=== Phase 2: Computing derived data ===")

    gene_freq = compute_gene_frequency(pav)
    gene_freq.to_parquet(os.path.join(output_dir, "pav_gene_frequency.parquet"), index=False)

    line_stats = compute_line_stats(pav)
    line_stats.to_parquet(os.path.join(output_dir, "line_stats.parquet"), index=False)

    embedding = compute_line_embedding(pav)
    embedding.to_parquet(os.path.join(output_dir, "line_embedding.parquet"), index=False)

    similarity = compute_similarity_topk(pav, k=15)
    similarity.to_parquet(os.path.join(output_dir, "line_similarity_topk.parquet"), index=False)

    build_gff_gene_parquet(gff_genes, os.path.join(output_dir, "gff_gene_index.parquet"))
    build_protein_parquet(protein_index, os.path.join(output_dir, "protein_index.parquet"))
    save_contig_index(contig_index, contig_mapping, os.path.join(output_dir, "genome_contig_index.json"))

    hotspots = compute_hotspot_bins(gff_genes, gene_freq, contig_index)
    hotspots.to_parquet(os.path.join(output_dir, "hotspot_bins.parquet"), index=False)

    markers = compute_cluster_markers(pav, embedding)
    markers.to_parquet(os.path.join(output_dir, "cluster_markers.parquet"), index=False)

    # Also save the PAV matrix as parquet for efficient loading
    # (index kept: gene ids are the row labels).
    pav.to_parquet(os.path.join(output_dir, "pav_matrix.parquet"))

    dt = time.time() - t_total
    logger.info(f"=== All precomputation done in {dt:.1f}s ===")

    # List output files with their sizes as a final sanity check.
    for f in sorted(Path(output_dir).glob("*")):
        size_mb = f.stat().st_size / 1024 / 1024
        logger.info(f"  {f.name}: {size_mb:.2f} MB")


if __name__ == "__main__":
    main()
src/__init__.py ADDED
File without changes
src/callbacks.py ADDED
@@ -0,0 +1,524 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """All Gradio callbacks for the Pigeon Pea Pangenome Atlas."""
2
+
3
import gradio as gr
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from src.state import AppState
from src.gene_card import build_gene_card, render_gene_card_html, export_gene_report
from src.field_report import generate_field_report, export_report_json, export_report_csv
11
+
12
# Color palettes
# Fixed colors for the core/shell/cloud gene classes (green/amber/red, grey
# for unclassified genes); Set3 qualitative palette for country coloring.
CORE_COLORS = {"core": "#2E7D32", "shell": "#FFC107", "cloud": "#F44336", "unknown": "#9E9E9E"}
COUNTRY_COLORS = px.colors.qualitative.Set3
15
+
16
+
17
+ # ============================================================
18
+ # Quest 0 Callbacks
19
+ # ============================================================
20
+
21
def on_line_selected(line_id: str, state: AppState, data: dict) -> tuple:
    """Handle the Quest-0 line dropdown change.

    Looks up the per-line stats row and the most similar neighbour
    (highest Jaccard score in the precomputed top-k table) for ``line_id``.

    Returns:
        (total_genes, unique_genes, nearest_neighbor, state) — display
        strings, "--" when the line is unknown or nothing is selected.
    """
    if state is None:
        state = AppState()
    if not line_id:
        # Bug fix: previously any falsy line_id replaced the whole AppState,
        # silently discarding the session (backpack, achievements, party).
        return "--", "--", "--", state
    state.selected_line = line_id

    line_stats = data["line_stats"]
    similarity = data["similarity"]

    row = line_stats[line_stats["line_id"] == line_id]
    if len(row) == 0:
        return "--", "--", "--", state

    total_genes = str(int(row.iloc[0]["genes_present_count"]))
    unique_genes = str(int(row.iloc[0]["unique_genes_count"]))

    # Nearest neighbour: best Jaccard score among this line's top-k rows.
    sim_rows = similarity[similarity["line_id"] == line_id]
    if len(sim_rows) > 0:
        top = sim_rows.nlargest(1, "jaccard_score").iloc[0]
        nearest = f"{top['neighbor_line_id']} ({top['jaccard_score']:.3f})"
    else:
        nearest = "--"

    return total_genes, unique_genes, nearest, state
49
+
50
+
51
def on_start_journey(state: AppState) -> tuple:
    """Award the Explorer achievement and navigate to the Quest 1 tab.

    Fix: this function uses ``gr.Tabs`` but the module never imported
    gradio, so the first click raised NameError; ``import gradio as gr``
    is added to the module's import block.
    """
    if state is None:
        state = AppState()
    state.award("Explorer")
    return gr.Tabs(selected="quest1"), state
57
+
58
+
59
+ # ============================================================
60
+ # Quest 1 Callbacks
61
+ # ============================================================
62
+
63
def build_umap_plot(color_by: str, state: AppState, data: dict) -> go.Figure:
    """Build the UMAP scatter of all lines, colored by country or cluster.

    Args:
        color_by: "Country" for country colors, anything else for cluster ids.
        state: session state; if a line is selected it gets a star marker.
        data: loaded DATA dict (uses "embedding" and "line_stats").
    """
    embedding = data["embedding"]
    line_stats = data["line_stats"]

    # Left join so every embedded line keeps a row even without metadata.
    df = embedding.merge(line_stats[["line_id", "country"]], on="line_id", how="left")
    df["country"] = df["country"].fillna("Unknown")

    color_col = "country" if color_by == "Country" else "cluster_id"
    if color_col == "cluster_id":
        # Cast to str so Plotly treats clusters as discrete categories,
        # not a continuous color scale.
        df["cluster_id"] = df["cluster_id"].astype(str)

    fig = px.scatter(
        df, x="umap_x", y="umap_y", color=color_col,
        hover_data=["line_id", "country"],
        title="UMAP of 89 Pigeon Pea Lines",
        labels={"umap_x": "UMAP 1", "umap_y": "UMAP 2"},
        color_discrete_sequence=COUNTRY_COLORS if color_by == "Country" else px.colors.qualitative.Bold,
    )

    # Highlight selected line
    if state and state.selected_line:
        sel = df[df["line_id"] == state.selected_line]
        if len(sel) > 0:
            # Overlay a single red star on top of the scatter; hovertext is
            # the line id so lasso/box selections can recover it.
            fig.add_trace(go.Scatter(
                x=sel["umap_x"], y=sel["umap_y"],
                mode="markers",
                marker=dict(symbol="star", size=18, color="red", line=dict(width=2, color="black")),
                name=f"Your line: {state.selected_line}",
                hovertext=state.selected_line,
            ))

    fig.update_layout(
        plot_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=-0.3),
    )
    return fig
100
+
101
+
102
def on_umap_select(selected_data, state: AppState) -> tuple:
    """Turn a UMAP lasso/box selection event into the session's party list.

    Returns (summary_text, state); an empty or missing selection clears
    the party.
    """
    if state is None:
        state = AppState()

    if selected_data and "points" in selected_data:
        # Point labels carry the line id (hovertext preferred, text fallback).
        labels = [pt.get("hovertext", pt.get("text", "")) for pt in selected_data["points"]]
        labels = [name for name in labels if name]
        state.selected_party = labels
        summary = f"Selected {len(labels)} lines: " + ", ".join(labels[:10])
        if len(labels) > 10:
            summary += f" ... +{len(labels) - 10} more"
    else:
        state.selected_party = []
        summary = "None selected"

    return summary, state
117
+
118
+
119
def on_compare_party(state: AppState, data: dict) -> tuple:
    """Compare the user's selected line against the selected party.

    Returns (figure, True). NOTE(review): the trailing True looks like a
    visibility flag — the app wires both outputs to the same plot component;
    confirm the intended second target.
    """
    if not state or not state.selected_line or not state.selected_party:
        fig = go.Figure()
        fig.add_annotation(text="Select your line and a party first", showarrow=False)
        return fig, True

    pav = data.get("pav")
    if pav is None:
        fig = go.Figure()
        fig.add_annotation(text="PAV data not loaded", showarrow=False)
        return fig, True

    # PAV matrix: rows indexed by gene id, one presence column per line
    # (presence encoded as 1 — see the == 1 tests below).
    my_genes = set(pav.index[pav[state.selected_line] == 1])
    party_cols = [c for c in state.selected_party if c in pav.columns and c != state.selected_line]
    if not party_cols:
        fig = go.Figure()
        fig.add_annotation(text="No valid party members", showarrow=False)
        return fig, True

    # Union of genes present in at least one party member.
    party_genes = set()
    for col in party_cols:
        party_genes |= set(pav.index[pav[col] == 1])

    shared = len(my_genes & party_genes)
    only_mine = len(my_genes - party_genes)
    only_party = len(party_genes - my_genes)

    fig = go.Figure(data=[
        go.Bar(name="Shared", x=["Gene Sets"], y=[shared], marker_color="#2E7D32"),
        go.Bar(name=f"Only {state.selected_line}", x=["Gene Sets"], y=[only_mine], marker_color="#1565C0"),
        go.Bar(name="Only Party", x=["Gene Sets"], y=[only_party], marker_color="#FFC107"),
    ])
    fig.update_layout(
        barmode="group",
        title=f"Gene Comparison: {state.selected_line} vs {len(party_cols)} party members",
        yaxis_title="Number of genes",
    )
    return fig, True
158
+
159
+
160
+ # ============================================================
161
+ # Quest 2 Callbacks
162
+ # ============================================================
163
+
164
def build_donut_chart(core_thresh: float, cloud_thresh: float, data: dict) -> go.Figure:
    """Donut chart of core/shell/cloud gene counts at the given thresholds.

    Core: freq_pct >= core_thresh; Cloud: freq_pct < cloud_thresh;
    Shell: everything in between.
    """
    freqs = data["gene_freq"]["freq_pct"]

    n_core = int((freqs >= core_thresh).sum())
    n_cloud = int((freqs < cloud_thresh).sum())
    n_shell = len(freqs) - n_core - n_cloud

    donut = go.Pie(
        labels=["Core", "Shell", "Cloud"],
        values=[n_core, n_shell, n_cloud],
        hole=0.5,
        marker_colors=[CORE_COLORS[name] for name in ("core", "shell", "cloud")],
        textinfo="label+value+percent",
    )
    fig = go.Figure(data=[donut])
    fig.update_layout(
        title=f"Gene Classification (Core>={core_thresh}%, Cloud<{cloud_thresh}%)",
        showlegend=True,
    )
    return fig
184
+
185
+
186
def build_frequency_histogram(core_thresh: float, cloud_thresh: float, data: dict) -> go.Figure:
    """Overlayed per-class histograms of gene frequency with threshold guides."""
    freq_df = data["gene_freq"]

    fig = go.Figure()
    # One semi-transparent trace per class, using the precomputed core_class
    # column; "unknown" is deliberately skipped.
    for klass, shade in CORE_COLORS.items():
        if klass == "unknown":
            continue
        rows = freq_df[freq_df["core_class"] == klass]
        fig.add_trace(go.Histogram(
            x=rows["freq_pct"],
            name=klass.capitalize(),
            marker_color=shade,
            opacity=0.75,
            nbinsx=50,
        ))

    fig.update_layout(
        barmode="overlay",
        title="Gene Frequency Distribution",
        xaxis_title="Frequency (%)",
        yaxis_title="Count",
    )
    # Dashed vertical guides at the user-chosen core/cloud cutoffs.
    fig.add_vline(x=core_thresh, line_dash="dash", line_color="green",
                  annotation_text=f"Core>={core_thresh}%")
    fig.add_vline(x=cloud_thresh, line_dash="dash", line_color="red",
                  annotation_text=f"Cloud<{cloud_thresh}%")
    return fig
215
+
216
+
217
def build_treasure_table(state: AppState, core_thresh: float, cloud_thresh: float,
                         filter_type: str, data: dict) -> pd.DataFrame:
    """Build the filterable gene 'treasure' table.

    Reclassifies genes with the current thresholds, marks presence in the
    user's line, applies the chosen filter, and returns at most 500 rows
    sorted rarest-first with columns
    [gene_id, freq_count, freq_pct, core_class, in_my_line].
    """
    gene_freq = data["gene_freq"].copy()  # copy: columns are added/overwritten below

    # Reclassify based on current thresholds (overrides the precomputed class).
    gene_freq["core_class"] = gene_freq["freq_pct"].apply(
        lambda x: "core" if x >= core_thresh else ("cloud" if x < cloud_thresh else "shell")
    )

    # Add in_my_line column
    pav = data.get("pav")
    if pav is not None and state and state.selected_line and state.selected_line in pav.columns:
        my_presence = pav[state.selected_line]
        gene_freq["in_my_line"] = gene_freq["gene_id"].map(
            lambda g: "Yes" if g in my_presence.index and my_presence.get(g, 0) == 1 else "No"
        )
    else:
        # No PAV matrix or no selected line: presence is unknowable.
        gene_freq["in_my_line"] = "N/A"

    # Filter
    if filter_type == "Unique to my line":
        if pav is not None and state and state.selected_line:
            # Present in exactly one line, and that line is ours
            # (assumes a strict 0/1 matrix — TODO confirm).
            unique_mask = (pav.sum(axis=1) == 1) & (pav[state.selected_line] == 1)
            unique_genes = set(pav.index[unique_mask])
            gene_freq = gene_freq[gene_freq["gene_id"].isin(unique_genes)]
    elif filter_type == "Rare (<5 lines)":
        # NOTE(review): label says "<5" but this keeps freq_count == 5 too —
        # confirm whether the cutoff should be strict.
        gene_freq = gene_freq[gene_freq["freq_count"] <= 5]
    elif filter_type == "Cluster markers":
        markers = data.get("markers")
        if markers is not None:
            marker_genes = set(markers["gene_id"])
            gene_freq = gene_freq[gene_freq["gene_id"].isin(marker_genes)]

    # Sort and limit: rarest genes first, capped at 500 rows for UI speed.
    gene_freq = gene_freq.sort_values("freq_count", ascending=True).head(500)
    return gene_freq[["gene_id", "freq_count", "freq_pct", "core_class", "in_my_line"]]
254
+
255
+
256
def on_pin_gene(gene_id: str, state: AppState) -> tuple:
    """Pin a gene into the session backpack.

    Returns (backpack_display_text, state); duplicate pins are reported
    but not added twice.
    """
    if state is None:
        state = AppState()
    if not gene_id or gene_id == "Click a row to select":
        return "Select a gene first", state

    was_added = state.add_to_backpack(gene_id)
    contents = ", ".join(state.backpack_genes) if state.backpack_genes else "Empty"
    if was_added:
        return contents, state
    return f"(already in backpack) {contents}", state
268
+
269
+
270
def on_gene_click_table(evt, state: AppState) -> tuple:
    """Record the gene chosen from a treasure-table row click.

    Returns (display_text, state); a click event without a value leaves
    the selection unchanged and shows the placeholder prompt.
    """
    if state is None:
        state = AppState()

    # Guard clause: nothing usable in the event payload.
    if evt is None or not hasattr(evt, 'value'):
        return "Click a row to select", state

    chosen = str(evt.value)
    state.selected_gene = chosen
    return chosen, state
279
+
280
+
281
+ # ============================================================
282
+ # Quest 3 Callbacks
283
+ # ============================================================
284
+
285
def build_hotspot_heatmap(data: dict, top_n_contigs: int = 20) -> go.Figure:
    """Heatmap of variability scores per genomic bin for the busiest contigs.

    Args:
        data: loaded DATA dict (uses "hotspots": one row per contig/bin with
              total_genes and variability_score columns).
        top_n_contigs: number of contigs (ranked by summed gene count) to show.
    """
    hotspots = data["hotspots"]

    # Top N contigs by total genes
    contig_counts = hotspots.groupby("contig_id")["total_genes"].sum()
    top_contigs = contig_counts.nlargest(top_n_contigs).index.tolist()
    subset = hotspots[hotspots["contig_id"].isin(top_contigs)]

    if len(subset) == 0:
        fig = go.Figure()
        fig.add_annotation(text="No hotspot data available", showarrow=False)
        return fig

    # Contig x bin grid; max aggregates duplicate bins, 0 fills gaps.
    pivot = subset.pivot_table(
        index="contig_id", columns="bin_start",
        values="variability_score", aggfunc="max"
    ).fillna(0)

    # Shorten contig names for display
    short_names = [c.split("|")[-1] if "|" in c else c[:30] for c in pivot.index]

    fig = go.Figure(data=go.Heatmap(
        z=pivot.values,
        x=[f"{int(c/1000)}kb" for c in pivot.columns],  # bin starts as kb labels
        y=short_names,
        # Green (stable) -> amber -> red (highly variable).
        colorscale=[[0, "#E8F5E9"], [0.5, "#FFC107"], [1.0, "#F44336"]],
        colorbar_title="Variability",
        hovertemplate="Contig: %{y}<br>Bin: %{x}<br>Score: %{z:.1f}<extra></extra>",
    ))
    fig.update_layout(
        title=f"Genomic Variability Heatmap (Top {top_n_contigs} contigs)",
        xaxis_title="Genomic position",
        yaxis_title="Contig",
        height=600,
    )
    return fig
322
+
323
+
324
def on_contig_selected(contig_id: str, data: dict, state: AppState) -> tuple:
    """Build the per-contig gene track plot and its companion gene table.

    Returns (plotly figure, DataFrame of the contig's genes sorted by start).
    """
    if not contig_id:
        return go.Figure(), pd.DataFrame()

    gff = data["gff_index"]
    gene_freq = data["gene_freq"]

    # Attach frequency/class info to every gene annotated on this contig.
    genes = gff[gff["contig_id"] == contig_id].merge(
        gene_freq[["gene_id", "core_class", "freq_pct"]], on="gene_id", how="left"
    )
    genes["core_class"] = genes["core_class"].fillna("unknown")

    if len(genes) == 0:
        fig = go.Figure()
        fig.add_annotation(text="No genes on this contig", showarrow=False)
        return fig, pd.DataFrame()

    fig = go.Figure()
    # One horizontal "tick" trace per core class, placed at gene midpoints.
    for cls, color in CORE_COLORS.items():
        cls_genes = genes[genes["core_class"] == cls]
        if len(cls_genes) == 0:
            continue
        fig.add_trace(go.Scatter(
            x=(cls_genes["start"] + cls_genes["end"]) / 2,
            y=[cls] * len(cls_genes),
            mode="markers",
            marker=dict(
                symbol="line-ew", size=12, color=color,
                line=dict(width=2, color=color),
            ),
            name=cls.capitalize(),
            text=cls_genes["gene_id"],
            hovertemplate="Gene: %{text}<br>Position: %{x:,.0f}<extra></extra>",
        ))

    short_name = contig_id.split("|")[-1] if "|" in contig_id else contig_id[:30]
    fig.update_layout(
        title=f"Gene Track: {short_name}",
        xaxis_title="Genomic position (bp)",
        yaxis_title="Gene class",
        showlegend=True,
    )

    table_df = genes[["gene_id", "start", "end", "strand", "core_class", "freq_pct"]].sort_values("start")
    return fig, table_df
370
+
371
+
372
+ # ============================================================
373
+ # Quest 4 Callbacks
374
+ # ============================================================
375
+
376
def get_protein_stats_html(gene_id: str, data: dict) -> str:
    """Render a small HTML card with the gene's protein length and top amino acids."""
    if not gene_id:
        return "<p>Select a gene</p>"

    proteins = data["protein"]
    hit = proteins[proteins["gene_id"] == gene_id]
    if hit.empty:
        # Not every PAV gene has a translated protein entry.
        return "<p><i>No protein data available for this gene.</i></p>"

    rec = hit.iloc[0]
    return (
        f"<div class='stat-card'>"
        f"<p><b>Protein Length:</b> {int(rec['protein_length'])} aa</p>"
        f"<p><b>Top Amino Acids:</b> {rec['composition_summary']}</p>"
        f"</div>"
    )
393
+
394
+
395
def build_backpack_comparison(state: AppState, data: dict) -> go.Figure:
    """Bar chart comparing protein lengths of all pinned (backpack) genes."""
    # A comparison needs at least two pinned genes.
    if not state or len(state.backpack_genes) < 2:
        placeholder = go.Figure()
        placeholder.add_annotation(text="Pin at least 2 genes to compare", showarrow=False)
        return placeholder

    proteins = data["protein"]
    pinned = proteins[proteins["gene_id"].isin(state.backpack_genes)]

    fig = go.Figure(data=[go.Bar(
        x=pinned["gene_id"],
        y=pinned["protein_length"],
        marker_color="#2E7D32",
        text=pinned["protein_length"],
        textposition="auto",
    )])
    fig.update_layout(
        title="Backpack Genes: Protein Length Comparison",
        xaxis_title="Gene",
        yaxis_title="Protein Length (aa)",
    )
    return fig
418
+
419
+
420
def build_composition_heatmap(state: AppState, data: dict) -> go.Figure:
    """Heatmap of amino-acid composition across the pinned (backpack) genes.

    Percentages are recovered from each protein's ``composition_summary``
    string (format: "A:12.3%, L:9.8%, ..."). Malformed tokens are skipped
    instead of raising ValueError and crashing the UI callback.
    """
    if not state or len(state.backpack_genes) < 2:
        fig = go.Figure()
        fig.add_annotation(text="Pin at least 2 genes to compare", showarrow=False)
        return fig

    # Parse composition from summary strings
    protein = data["protein"]
    bp_prot = protein[protein["gene_id"].isin(state.backpack_genes)]

    aa_data = {}  # gene_id -> {amino acid: percentage}
    for _, row in bp_prot.iterrows():
        gene_id = row["gene_id"]
        aa_dict = {}
        for item in row["composition_summary"].split(", "):
            parts = item.split(":")
            if len(parts) != 2:
                continue
            try:
                pct = float(parts[1].replace("%", ""))
            except ValueError:
                # Tolerate a malformed percentage token rather than failing.
                continue
            aa_dict[parts[0].strip()] = pct
        aa_data[gene_id] = aa_dict

    if not aa_data:
        fig = go.Figure()
        fig.add_annotation(text="No composition data", showarrow=False)
        return fig

    # Rows = genes, columns = amino acids; absent AAs filled with 0.
    df = pd.DataFrame(aa_data).fillna(0).T
    fig = go.Figure(data=go.Heatmap(
        z=df.values,
        x=df.columns.tolist(),
        y=df.index.tolist(),
        colorscale="YlGn",
        colorbar_title="%",
    ))
    fig.update_layout(
        title="Amino Acid Composition Heatmap",
        xaxis_title="Amino Acid",
        yaxis_title="Gene",
    )
    return fig
463
+
464
+
465
+ # ============================================================
466
+ # Gene Card Callbacks
467
+ # ============================================================
468
+
469
def on_open_gene_card(gene_id: str, state: AppState, data: dict) -> tuple:
    """Open the Gene Card side panel for `gene_id`.

    Returns (card HTML, panel-visible flag, updated state).
    """
    if not gene_id:
        return "", False, state

    state.selected_gene = gene_id
    rendered = render_gene_card_html(build_gene_card(gene_id, data))
    # Viewing a gene card earns the badge.
    state.award("Gene Hunter")
    return rendered, True, state
479
+
480
+
481
def on_download_gene_report(state: AppState, data: dict) -> str:
    """Export the currently selected gene's report; None when nothing is selected."""
    if not (state and state.selected_gene):
        return None
    return export_gene_report(state.selected_gene, data)
486
+
487
+
488
+ # ============================================================
489
+ # Final Report Callbacks
490
+ # ============================================================
491
+
492
def on_generate_report(state: AppState, data: dict) -> tuple:
    """Generate the final field report plus downloadable JSON/CSV exports.

    Returns (markdown, json path, csv path, show-json, show-csv, badges HTML, state).
    """
    state = state if state is not None else AppState()
    # Completing the report earns this badge.
    state.award("Cartographer")

    report_md = generate_field_report(state, data)
    json_path = export_report_json(state, data)
    csv_path = export_report_csv(state, data)

    # One badge span per achievement, in sorted order for stable display.
    badge_html = " ".join(
        f'<span class="achievement-badge">{name}</span>'
        for name in sorted(state.achievements)
    )
    return report_md, json_path, csv_path, True, True, badge_html, state
507
+
508
+
509
+ # ============================================================
510
+ # Data Health
511
+ # ============================================================
512
+
513
def build_data_health_html(validation_report: dict) -> str:
    """Render the validation report as a two-column HTML table.

    Float values are coverage percentages and are shown to one decimal place.
    """
    cells = []
    for key, value in validation_report.items():
        shown = f"{value:.1f}%" if isinstance(value, float) else value
        cells.append(f"<tr><td><b>{key}</b></td><td>{shown}</td></tr>")
    return f"<table style='width:100%'>{''.join(cells)}</table>"
521
+
522
+
523
+ # Need gr import for Tabs update
524
+ import gradio as gr
src/data_loader.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Data parsing and validation for the Pigeon Pea Pangenome Atlas."""
2
+
3
+ import re
4
+ import pandas as pd
5
+ import numpy as np
6
+ from pathlib import Path
7
+ from collections import Counter
8
+
9
+ from src.utils import logger, timer
10
+
11
+
12
@timer
def load_pav(path: str) -> pd.DataFrame:
    """
    Load 89_line_PAV.txt.

    Returns DataFrame: index=gene_id (str), columns=line_ids (str),
    values=int {0,1}.

    Raises ValueError when the matrix contains anything other than 0/1.
    (A plain raise, unlike the previous `assert`, survives `python -O`.)
    """
    df = pd.read_csv(path, sep="\t", index_col=0)
    df.index.name = "gene"
    df.index = df.index.astype(str)
    df.columns = df.columns.astype(str)
    # Validate all values are 0 or 1
    unique_vals = set(df.values.flatten())
    bad = unique_vals - {0, 1}
    if bad:
        raise ValueError(f"PAV contains values other than 0/1: {bad}")
    logger.info(f"PAV matrix loaded: {df.shape[0]} genes x {df.shape[1]} lines")
    return df
27
+
28
+
29
@timer
def parse_gff_genes(path: str) -> pd.DataFrame:
    """
    Parse GFF3; keep only feature == 'gene' rows.
    Returns DataFrame: gene_id, contig_id, start, end, strand.
    """
    rows = []
    with open(path, "r") as handle:
        for raw in handle:
            # Skip comment/pragma lines and anything short of 9 GFF columns.
            if raw.startswith("#"):
                continue
            fields = raw.strip().split("\t")
            if len(fields) < 9 or fields[2] != "gene":
                continue

            contig_id = fields[0]
            start = int(fields[3])
            end = int(fields[4])
            strand = fields[6]

            # Column 9 attributes: the ID=<value> token names the gene.
            gene_id = None
            for token in fields[8].split(";"):
                token = token.strip()
                if token.startswith("ID="):
                    gene_id = token[3:]
                    break

            if gene_id:
                rows.append({
                    "gene_id": gene_id,
                    "contig_id": contig_id,
                    "start": start,
                    "end": end,
                    "strand": strand,
                })

    df = pd.DataFrame(rows)
    logger.info(f"GFF parsed: {len(df)} genes on {df['contig_id'].nunique()} contigs")
    return df
72
+
73
+
74
@timer
def parse_protein_fasta(path: str) -> pd.DataFrame:
    """
    Returns DataFrame: gene_id, protein_length, composition_summary.
    gene_id = header token after '>' up to first whitespace.
    """
    records = []
    header = None
    chunks = []

    def emit():
        # Finalize the record accumulated so far (if any sequence was read).
        if not header or not chunks:
            return
        seq = "".join(chunks).replace("*", "")  # drop stop-codon symbols
        n = len(seq)
        denom = max(n, 1)  # guard the zero-length edge case
        top5 = sorted(Counter(seq).items(), key=lambda kv: -kv[1])[:5]
        summary = ", ".join(f"{aa}:{cnt/denom*100:.1f}%" for aa, cnt in top5)
        records.append({
            "gene_id": header,
            "protein_length": n,
            "composition_summary": summary,
        })

    with open(path, "r") as fh:
        for raw in fh:
            raw = raw.strip()
            if raw.startswith(">"):
                emit()
                header = raw[1:].split()[0]
                chunks = []
            else:
                chunks.append(raw)
    emit()  # flush the final record

    df = pd.DataFrame(records)
    logger.info(f"Protein FASTA parsed: {len(df)} proteins")
    return df
112
+
113
+
114
@timer
def build_contig_index(path: str) -> dict:
    """
    Returns dict: {contig_id: length}.
    Single sequential pass over the FASTA file.
    """
    lengths = {}
    name = None
    size = 0

    with open(path, "r") as fh:
        for raw in fh:
            if raw.startswith(">"):
                # Close out the previous record before starting a new one.
                if name is not None:
                    lengths[name] = size
                name = raw[1:].strip().split()[0]
                size = 0
            else:
                size += len(raw.strip())
    # Flush the final record.
    if name is not None:
        lengths[name] = size

    logger.info(f"Contig index built: {len(lengths)} contigs")
    return lengths
138
+
139
+
140
def build_contig_name_mapping(gff_genes: pd.DataFrame, contig_index: dict) -> dict:
    """
    Build mapping from GFF contig IDs to FASTA contig IDs.

    Strategy: exact match first, then fall back to matching RefSeq-style
    accessions (e.g. NC_033813.1) embedded in either identifier.
    Returns dict: {gff_contig_id: fasta_contig_id}
    """
    # Compile once: the same accession pattern is applied to both ID sets.
    accession_re = re.compile(r'(N[CWZ]_\d+\.\d+)')

    gff_contigs = set(gff_genes["contig_id"].unique())
    fasta_contigs = set(contig_index.keys())
    mapping = {}

    # Pass 1: exact identifier match.
    for gc in gff_contigs:
        if gc in fasta_contigs:
            mapping[gc] = gc

    # Pass 2: accession-based matching for whatever is still unmatched.
    unmatched = gff_contigs - set(mapping.keys())
    if unmatched:
        fasta_accession_map = {}
        for fc in fasta_contigs:
            m = accession_re.search(fc)
            if m:
                fasta_accession_map[m.group(1)] = fc

        for gc in unmatched:
            m = accession_re.search(gc)
            if m and m.group(1) in fasta_accession_map:
                mapping[gc] = fasta_accession_map[m.group(1)]

    logger.info(f"Contig mapping: {len(mapping)}/{len(gff_contigs)} GFF contigs matched to FASTA")
    return mapping
173
+
174
+
175
def validate_joins(pav: pd.DataFrame, gff_genes: pd.DataFrame,
                   protein_index: pd.DataFrame, contig_index: dict) -> dict:
    """
    Cross-check gene/contig identifiers across the four data sources.
    Returns a report dict of counts, coverage percentages, and orphan count.
    """
    pav_genes = set(pav.index)
    gff_gene_set = set(gff_genes["gene_id"])
    protein_gene_set = set(protein_index["gene_id"])
    fasta_contig_set = set(contig_index.keys())
    gff_contig_set = set(gff_genes["contig_id"])

    def pct(part, whole):
        # max(whole, 1) guards against division by zero on empty inputs.
        return part / max(whole, 1) * 100

    # Orphans: PAV genes absent from both annotation sources.
    orphans = pav_genes - (gff_gene_set | protein_gene_set)

    report = {
        "pav_gene_count": len(pav_genes),
        "gff_gene_count": len(gff_gene_set),
        "protein_gene_count": len(protein_gene_set),
        "pav_genes_in_gff_pct": pct(len(pav_genes & gff_gene_set), len(pav_genes)),
        "pav_genes_in_protein_pct": pct(len(pav_genes & protein_gene_set), len(pav_genes)),
        "gff_contigs_in_fasta_pct": pct(len(gff_contig_set & fasta_contig_set), len(gff_contig_set)),
        "orphan_genes_count": len(orphans),
    }

    if orphans:
        logger.warning(f"{len(orphans)} orphan genes (in PAV but missing from both GFF and protein)")
    for key, val in report.items():
        logger.info(f"  {key}: {val}")

    return report
src/field_report.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Field Report generator for the Pigeon Pea Pangenome Atlas."""
2
+
3
+ import json
4
+ import csv
5
+ import os
6
+ import tempfile
7
+ import pandas as pd
8
+
9
+ from src.state import AppState
10
+
11
+
12
def generate_field_report(state: AppState, data: dict) -> str:
    """
    Generate the markdown field report for the currently selected line.

    Pulls line stats, cluster assignment, nearest neighbors, core/shell/cloud
    composition, rare genes, backpack contents, and achievements from `data`
    and `state`, then fills a markdown template.

    Returns markdown string.
    """
    # Without a chosen line there is nothing to report on.
    if not state.selected_line:
        return "*No line selected. Go back to Quest 0 to choose a line.*"

    line_id = state.selected_line
    line_stats = data["line_stats"]
    embedding = data["embedding"]
    similarity = data["similarity"]
    gene_freq = data["gene_freq"]
    # PAV may be absent; PAV-derived sections then render empty.
    pav = data.get("pav")

    # Line info (defaults cover a line missing from the stats table)
    ls_row = line_stats[line_stats["line_id"] == line_id]
    country = ls_row.iloc[0]["country"] if len(ls_row) > 0 else "Unknown"
    genes_present = int(ls_row.iloc[0]["genes_present_count"]) if len(ls_row) > 0 else 0
    unique_genes = int(ls_row.iloc[0]["unique_genes_count"]) if len(ls_row) > 0 else 0

    # Cluster (-1 = not embedded)
    emb_row = embedding[embedding["line_id"] == line_id]
    cluster_id = int(emb_row.iloc[0]["cluster_id"]) if len(emb_row) > 0 else -1

    # Nearest neighbors: top 3 by Jaccard similarity
    sim_rows = similarity[similarity["line_id"] == line_id].nlargest(3, "jaccard_score")
    neighbors_str = ", ".join(
        f"{r['neighbor_line_id']} ({r['jaccard_score']:.3f})"
        for _, r in sim_rows.iterrows()
    )

    # Core/shell/cloud breakdown restricted to genes present in this line
    core_shell_cloud = ""
    if pav is not None and line_id in pav.columns:
        my_genes = set(pav.index[pav[line_id] == 1])
        my_freq = gene_freq[gene_freq["gene_id"].isin(my_genes)]
        core_count = int((my_freq["core_class"] == "core").sum())
        shell_count = int((my_freq["core_class"] == "shell").sum())
        cloud_count = int((my_freq["core_class"] == "cloud").sum())
        core_shell_cloud = (
            f"- Core genes: **{core_count}**\n"
            f"- Shell genes: **{shell_count}**\n"
            f"- Cloud genes: **{cloud_count}**\n"
        )

    # Top rare genes: present in this line and in <=5 lines overall, rarest first
    rare_genes_str = ""
    if pav is not None and line_id in pav.columns:
        my_genes_list = pav.index[pav[line_id] == 1].tolist()
        rare = gene_freq[
            (gene_freq["gene_id"].isin(my_genes_list)) &
            (gene_freq["freq_count"] <= 5)
        ].nsmallest(5, "freq_count")
        if len(rare) > 0:
            rare_genes_str = "\n".join(
                f"- **{r['gene_id']}** (present in {r['freq_count']} lines)"
                for _, r in rare.iterrows()
            )
        else:
            rare_genes_str = "No rare genes (<=5 lines) found in this line."

    # Backpack: genes the user pinned, annotated with frequency where available
    backpack_str = ""
    if state.backpack_genes:
        backpack_items = []
        for g in state.backpack_genes:
            gf = gene_freq[gene_freq["gene_id"] == g]
            if len(gf) > 0:
                backpack_items.append(
                    f"- **{g}**: {gf.iloc[0]['core_class']} "
                    f"({gf.iloc[0]['freq_count']} lines)"
                )
            else:
                backpack_items.append(f"- **{g}**: no frequency data")
        backpack_str = "\n".join(backpack_items)
    else:
        backpack_str = "No genes pinned to backpack."

    # Achievements (sorted for stable output)
    achievements_str = ", ".join(
        f"**{a}**" for a in sorted(state.achievements)
    ) if state.achievements else "None yet"

    # NOTE(review): template lines appear left-aligned in the source so the
    # markdown renders without accidental code-block indentation — confirm
    # against the deployed file, as the diff view flattens indentation.
    report = f"""
## Field Report: {line_id}

**Country of origin:** {country}
**Cluster:** {cluster_id}
**Genes present:** {genes_present:,}
**Unique genes:** {unique_genes}

### Nearest Neighbors
{neighbors_str}

### Gene Composition
{core_shell_cloud}

### Top 5 Rare Genes
{rare_genes_str}

### Backpack Collection
{backpack_str}

### Achievements
{achievements_str}

---
*Generated by Pigeon Pea Pangenome Atlas*
"""
    return report.strip()
123
+
124
+
125
def export_report_json(state: AppState, data: dict) -> str:
    """Export full report data as JSON. Returns the written file path.

    Achievements are sorted so repeated exports of the same state produce
    byte-identical files (set iteration order is otherwise arbitrary).
    """
    report_data = {
        "selected_line": state.selected_line,
        "backpack_genes": state.backpack_genes,
        "achievements": sorted(state.achievements),
        "core_threshold": state.core_threshold,
        "cloud_threshold": state.cloud_threshold,
    }

    if state.selected_line:
        ls = data["line_stats"]
        row = ls[ls["line_id"] == state.selected_line]
        if len(row) > 0:
            report_data["line_stats"] = row.iloc[0].to_dict()

    # Fresh temp dir per export; Gradio serves the file from this path.
    tmpdir = tempfile.mkdtemp()
    path = os.path.join(tmpdir, "field_report.json")
    with open(path, "w") as f:
        # default=str handles numpy scalars coming out of pandas rows.
        json.dump(report_data, f, indent=2, default=str)
    return path
146
+
147
+
148
def export_report_csv(state: AppState, data: dict) -> str:
    """Export the backpack gene table as CSV. Returns the written file path."""
    tmpdir = tempfile.mkdtemp()
    path = os.path.join(tmpdir, "field_report.csv")

    gene_rows = []
    if state.backpack_genes:
        gene_freq = data["gene_freq"]
        for gene in state.backpack_genes:
            match = gene_freq[gene_freq["gene_id"] == gene]
            if len(match) > 0:
                top = match.iloc[0]
                gene_rows.append({
                    "gene_id": gene,
                    "freq_count": int(top["freq_count"]),
                    "freq_pct": float(top["freq_pct"]),
                    "core_class": top["core_class"],
                })

    if gene_rows:
        with open(path, "w", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=["gene_id", "freq_count", "freq_pct", "core_class"])
            writer.writeheader()
            writer.writerows(gene_rows)
    else:
        # Still emit a header-only file so the download is never empty.
        with open(path, "w") as f:
            f.write("gene_id,freq_count,freq_pct,core_class\n")

    return path
src/gene_card.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Gene Card builder for the Pigeon Pea Pangenome Atlas."""
2
+
3
+ import json
4
+ import os
5
+ import tempfile
6
+ import pandas as pd
7
+
8
+
9
def build_gene_card(gene_id: str, data: dict) -> dict:
    """
    Assemble everything the Gene Card needs for one gene.
    Sources missing the gene yield None fields ("unknown" for core_class).
    """
    card = {"gene_id": gene_id}

    # Frequency / core classification
    gene_freq = data["gene_freq"]
    hit = gene_freq[gene_freq["gene_id"] == gene_id]
    if hit.empty:
        card["freq_count"] = None
        card["freq_pct"] = None
        card["core_class"] = "unknown"
    else:
        first = hit.iloc[0]
        card["freq_count"] = int(first["freq_count"])
        card["freq_pct"] = float(first["freq_pct"])
        card["core_class"] = first["core_class"]

    # Presence/absence across all lines (one row of the PAV matrix)
    pav = data.get("pav")
    if pav is not None and gene_id in pav.index:
        card["presence_vector"] = pav.loc[gene_id]
    else:
        card["presence_vector"] = None

    # Genomic coordinates from the GFF index
    gff = data["gff_index"]
    hit = gff[gff["gene_id"] == gene_id]
    if hit.empty:
        card["contig"] = None
        card["start"] = None
        card["end"] = None
        card["strand"] = None
    else:
        first = hit.iloc[0]
        card["contig"] = first["contig_id"]
        card["start"] = int(first["start"])
        card["end"] = int(first["end"])
        card["strand"] = first["strand"]

    # Protein length and composition
    protein = data["protein"]
    hit = protein[protein["gene_id"] == gene_id]
    if hit.empty:
        card["protein_length"] = None
        card["composition_summary"] = None
    else:
        first = hit.iloc[0]
        card["protein_length"] = int(first["protein_length"])
        card["composition_summary"] = first["composition_summary"]

    return card
63
+
64
+
65
def render_gene_card_html(card: dict) -> str:
    """Render a Gene Card dict (from build_gene_card) as an HTML panel."""
    gene_id = card["gene_id"]

    # Core/shell/cloud badge plus its frequency-band caption.
    badge_map = {
        "core": ('<span class="quest-badge badge-core">Core</span>', "Present in >=95% of lines"),
        "shell": ('<span class="quest-badge badge-shell">Shell</span>', "Present in 15-95% of lines"),
        "cloud": ('<span class="quest-badge badge-cloud">Cloud</span>', "Present in <15% of lines"),
    }
    badge_html, badge_desc = badge_map.get(
        card.get("core_class", "unknown"),
        ('<span class="quest-badge">Unknown</span>', ""),
    )

    # Frequency line (omitted when no frequency data exists).
    freq_text = ""
    if card.get("freq_count") is not None:
        freq_text = f"<p>Present in <b>{card['freq_count']}</b> lines ({card['freq_pct']:.1f}%)</p>"

    # Presence barcode (omitted when the PAV row is unavailable).
    barcode_html = ""
    if card.get("presence_vector") is not None:
        barcode_html = render_presence_barcode(card["presence_vector"])

    # Genomic location
    if card.get("contig"):
        loc_text = (
            f"<p>Contig: <code>{card['contig']}</code></p>"
            f"<p>Position: {card['start']:,} - {card['end']:,} ({card['strand']})</p>"
        )
    else:
        loc_text = "<p><i>No coordinate annotation available</i></p>"

    # Protein summary
    if card.get("protein_length"):
        prot_text = (
            f"<p>Protein length: <b>{card['protein_length']}</b> aa</p>"
            f"<p>Top AAs: {card['composition_summary']}</p>"
        )
    else:
        prot_text = "<p><i>No protein data available</i></p>"

    html = f"""
    <div class="gene-card">
    <h3>Gene: {gene_id}</h3>
    {badge_html}
    <p style="font-size:0.8em;color:#666">{badge_desc}</p>
    {freq_text}
    <h4>Presence Barcode</h4>
    {barcode_html}
    <h4>Genomic Location</h4>
    {loc_text}
    <h4>Protein</h4>
    {prot_text}
    </div>
    """
    return html
123
+
124
+
125
def render_presence_barcode(presence_vector: pd.Series) -> str:
    """
    Render one span per line: class "present" when the value is 1,
    "absent" otherwise (CSS colors them green / light gray).
    """
    cells = "".join(
        f'<span class="{"present" if value == 1 else "absent"}"></span>'
        for value in presence_vector
    )
    return f'<div class="presence-barcode">{cells}</div>'
135
+
136
+
137
def export_gene_report(gene_id: str, data: dict) -> str:
    """Serialize the Gene Card for `gene_id` to a JSON file; return its path."""
    card = build_gene_card(gene_id, data)
    # A pandas Series is not JSON-serializable; flatten to {line_id: 0/1}.
    vector = card.get("presence_vector")
    if vector is not None:
        card["presence_vector"] = vector.to_dict()

    out_path = os.path.join(tempfile.mkdtemp(), f"{gene_id}_report.json")
    with open(out_path, "w") as fh:
        json.dump(card, fh, indent=2, default=str)
    return out_path
src/precompute.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Offline precomputation for the Pigeon Pea Pangenome Atlas."""
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ from scipy.spatial.distance import pdist, squareform
6
+ from sklearn.cluster import KMeans
7
+ from sklearn.metrics import silhouette_score
8
+
9
+ from src.utils import logger, timer, parse_country
10
+
11
+
12
@timer
def compute_gene_frequency(pav: pd.DataFrame) -> pd.DataFrame:
    """
    Per-gene presence frequency and core classification.
    Output columns: gene_id, freq_count, freq_pct, core_class
    (core >=95%, shell 15-95%, cloud <15%).
    """
    total_lines = pav.shape[1]
    counts = pav.sum(axis=1).astype(int)
    pcts = (counts / total_lines * 100).round(2)

    def bucket(p):
        # Thresholds follow the pangenome convention used across the app.
        return "core" if p >= 95 else ("shell" if p >= 15 else "cloud")

    classes = pcts.map(bucket)

    df = pd.DataFrame({
        "gene_id": pav.index,
        "freq_count": counts.values,
        "freq_pct": pcts.values,
        "core_class": classes.values,
    })
    logger.info(f"Gene frequency: {(df['core_class']=='core').sum()} core, "
                f"{(df['core_class']=='shell').sum()} shell, "
                f"{(df['core_class']=='cloud').sum()} cloud")
    return df
41
+
42
+
43
@timer
def compute_line_stats(pav: pd.DataFrame) -> pd.DataFrame:
    """
    Compute per-line statistics.
    Output columns: line_id, country, genes_present_count, unique_genes_count

    Perf fix: the per-gene row sums are computed once up front; the previous
    version recomputed pav.sum(axis=1) inside the per-line loop, an
    O(lines^2 x genes) cost on an 89-line matrix.
    """
    # Genes carried by exactly one line are "unique" to that line.
    row_sums = pav.sum(axis=1)
    singleton = row_sums == 1

    records = []
    for line_id in pav.columns:
        presence = pav[line_id]
        records.append({
            "line_id": line_id,
            "country": parse_country(line_id),
            "genes_present_count": int(presence.sum()),
            "unique_genes_count": int((singleton & (presence == 1)).sum()),
        })
    df = pd.DataFrame(records)
    logger.info(f"Line stats computed for {len(df)} lines")
    return df
67
+
68
+
69
@timer
def compute_line_embedding(pav: pd.DataFrame) -> pd.DataFrame:
    """
    UMAP embedding + KMeans clustering of lines.
    Output columns: line_id, umap_x, umap_y, cluster_id

    Bug fix: with <=3 lines the k-scan loop below never runs and
    `best_labels` was previously unbound (NameError); such tiny inputs
    now fall back to a single cluster.
    """
    import umap

    # Transpose: rows = lines, columns = genes (UMAP embeds the rows).
    X = pav.T.values.astype(np.float32)
    line_ids = list(pav.columns)

    # UMAP
    reducer = umap.UMAP(n_components=2, metric="jaccard", n_neighbors=15,
                        min_dist=0.1, random_state=42)
    embedding = reducer.fit_transform(X)

    # KMeans clustering — scan k in [3, 8] (capped below the number of
    # lines) and keep the clustering with the best silhouette score.
    best_k, best_score = 3, -1
    best_labels = np.zeros(len(line_ids), dtype=int)  # fallback: one cluster
    for k in range(3, min(9, len(line_ids))):
        km = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = km.fit_predict(embedding)
        score = silhouette_score(embedding, labels)
        if score > best_score:
            best_k, best_score = k, score
            best_labels = labels

    logger.info(f"UMAP + KMeans: best k={best_k}, silhouette={best_score:.3f}")

    df = pd.DataFrame({
        "line_id": line_ids,
        "umap_x": embedding[:, 0],
        "umap_y": embedding[:, 1],
        "cluster_id": best_labels,
    })
    return df
105
+
106
+
107
@timer
def compute_similarity_topk(pav: pd.DataFrame, k: int = 15) -> pd.DataFrame:
    """
    Pairwise Jaccard similarity between lines, keeping each line's top-K
    neighbors. Output columns: line_id, neighbor_line_id, jaccard_score
    """
    matrix = pav.T.values.astype(np.float32)
    line_ids = list(pav.columns)

    # pdist yields Jaccard *distance*; similarity = 1 - distance.
    sim_mat = 1.0 - squareform(pdist(matrix, metric="jaccard"))

    records = []
    for i, src in enumerate(line_ids):
        scores = sim_mat[i].copy()
        scores[i] = -1  # a line is never its own neighbor
        for j in np.argsort(scores)[::-1][:k]:
            records.append({
                "line_id": src,
                "neighbor_line_id": line_ids[j],
                "jaccard_score": round(float(scores[j]), 4),
            })

    df = pd.DataFrame(records)
    logger.info(f"Similarity top-{k}: {len(df)} pairs")
    return df
137
+
138
+
139
@timer
def build_gff_gene_parquet(gff_genes: pd.DataFrame, output_path: str) -> None:
    """Save the parsed GFF gene table to `output_path` as parquet (index dropped)."""
    gff_genes.to_parquet(output_path, index=False)
    logger.info(f"GFF gene index saved: {output_path}")
144
+
145
+
146
@timer
def build_protein_parquet(protein_df: pd.DataFrame, output_path: str) -> None:
    """Save the protein length/composition index to `output_path` as parquet (index dropped)."""
    protein_df.to_parquet(output_path, index=False)
    logger.info(f"Protein index saved: {output_path}")
151
+
152
+
153
@timer
def save_contig_index(contig_index: dict, contig_mapping: dict, output_path: str) -> None:
    """Save contig index as JSON.

    Each entry records the contig length, its GFF seqid (falling back to the
    FASTA id when unmapped), and the original FASTA header.

    Perf fix: the GFF-id lookup is inverted once up front; the previous
    version rescanned the whole mapping per contig (O(contigs x mappings)).
    """
    import json

    # fasta_id -> first gff_id mapping to it (setdefault preserves first-wins).
    reverse_mapping = {}
    for gff_id, fasta_id in contig_mapping.items():
        reverse_mapping.setdefault(fasta_id, gff_id)

    data = {}
    for contig_id, length in contig_index.items():
        data[contig_id] = {
            "length": length,
            "gff_seqid": reverse_mapping.get(contig_id) or contig_id,
            "fasta_header": contig_id,
        }
    with open(output_path, "w") as f:
        json.dump(data, f, indent=2)
    logger.info(f"Contig index saved: {output_path}")
172
+
173
+
174
@timer
def compute_hotspot_bins(gff_genes: pd.DataFrame, gene_freq: pd.DataFrame,
                         contig_index: dict, bin_size: int = 100_000) -> pd.DataFrame:
    """
    Bin genes along contigs and score each bin's variability.
    Output columns: contig_id, bin_start, bin_end, total_genes, core_genes,
    shell_genes, cloud_genes, mean_freq, variability_score
    (score = cloud + 0.5 * shell — rare genes weigh most).
    """
    # Each gene is assigned to the bin containing its midpoint.
    annotated = gff_genes.merge(gene_freq, on="gene_id", how="inner")
    annotated["midpoint"] = (annotated["start"] + annotated["end"]) // 2

    rows = []
    for contig_id in annotated["contig_id"].unique():
        on_contig = annotated[annotated["contig_id"] == contig_id]
        last_pos = on_contig["end"].max()

        for bin_start in range(0, last_pos + bin_size, bin_size):
            bin_end = bin_start + bin_size
            members = on_contig[
                (on_contig["midpoint"] >= bin_start) &
                (on_contig["midpoint"] < bin_end)
            ]
            if len(members) == 0:
                continue  # only occupied bins are emitted

            n_core = int((members["core_class"] == "core").sum())
            n_shell = int((members["core_class"] == "shell").sum())
            n_cloud = int((members["core_class"] == "cloud").sum())

            rows.append({
                "contig_id": contig_id,
                "bin_start": bin_start,
                "bin_end": bin_end,
                "total_genes": len(members),
                "core_genes": n_core,
                "shell_genes": n_shell,
                "cloud_genes": n_cloud,
                "mean_freq": round(float(members["freq_pct"].mean()), 2),
                "variability_score": round(n_cloud + 0.5 * n_shell, 2),
            })

    df = pd.DataFrame(rows)
    logger.info(f"Hotspot bins computed: {len(df)} bins across {df['contig_id'].nunique()} contigs")
    return df
221
+
222
+
223
@timer
def compute_cluster_markers(pav: pd.DataFrame, embedding: pd.DataFrame,
                            top_n: int = 50) -> pd.DataFrame:
    """
    Find genes most enriched in each cluster relative to all other lines.
    Output columns: cluster_id, gene_id, in_cluster_freq, out_cluster_freq,
    marker_score (in-cluster frequency minus out-of-cluster frequency).
    """
    assignments = embedding[["line_id", "cluster_id"]].copy()
    rows = []

    for cid in sorted(assignments["cluster_id"].unique()):
        members = set(assignments[assignments["cluster_id"] == cid]["line_id"])
        others = set(assignments[assignments["cluster_id"] != cid]["line_id"])

        # Preserve PAV column order while restricting to each group.
        member_cols = [c for c in pav.columns if c in members]
        other_cols = [c for c in pav.columns if c in others]
        if not member_cols or not other_cols:
            continue  # a marker needs both an inside and an outside

        freq_in = pav[member_cols].mean(axis=1)
        freq_out = pav[other_cols].mean(axis=1)
        score = freq_in - freq_out

        for gene_id, s in score.nlargest(top_n).items():
            rows.append({
                "cluster_id": int(cid),
                "gene_id": gene_id,
                "in_cluster_freq": round(float(freq_in[gene_id]), 4),
                "out_cluster_freq": round(float(freq_out[gene_id]), 4),
                "marker_score": round(float(s), 4),
            })

    df = pd.DataFrame(rows)
    logger.info(f"Cluster markers: {len(df)} total across {df['cluster_id'].nunique()} clusters")
    return df
src/state.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Application state for the Pigeon Pea Pangenome Atlas."""
2
+
3
+ from dataclasses import dataclass, field
4
+
5
+
6
+ @dataclass
7
+ class AppState:
8
+ selected_line: str | None = None
9
+ selected_party: list[str] = field(default_factory=list)
10
+ selected_gene: str | None = None
11
+ backpack_genes: list[str] = field(default_factory=list)
12
+ core_threshold: float = 95.0
13
+ cloud_threshold: float = 15.0
14
+ selected_contig: str | None = None
15
+ achievements: set[str] = field(default_factory=set)
16
+
17
+ def add_to_backpack(self, gene_id: str) -> bool:
18
+ """Add gene to backpack. Returns True if added, False if already present or full."""
19
+ if gene_id in self.backpack_genes:
20
+ return False
21
+ if len(self.backpack_genes) >= 50:
22
+ return False
23
+ self.backpack_genes.append(gene_id)
24
+ if len(self.backpack_genes) >= 5:
25
+ self.achievements.add("Collector")
26
+ return True
27
+
28
+ def remove_from_backpack(self, gene_id: str) -> bool:
29
+ """Remove gene from backpack."""
30
+ if gene_id in self.backpack_genes:
31
+ self.backpack_genes.remove(gene_id)
32
+ return True
33
+ return False
34
+
35
+ def award(self, badge: str):
36
+ """Award an achievement badge."""
37
+ self.achievements.add(badge)
src/utils.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Shared helpers for the Pigeon Pea Pangenome Atlas."""
2
+
3
+ import os
4
+ import logging
5
+ import time
6
+ from pathlib import Path
7
+ from functools import wraps
8
+
9
# Configure process-wide logging once at import time so every module that
# imports utils shares one format.  NOTE(review): basicConfig at import time
# affects the whole process; confirm this is intended if utils is reused as
# a library.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger("pangenome")  # shared app-wide logger

# Repository layout anchors, resolved relative to this file (src/utils.py).
PROJECT_ROOT = Path(__file__).resolve().parent.parent
DATA_DIR = PROJECT_ROOT / "data"                # raw inputs (PAV table, GFF, FASTA)
PRECOMPUTED_DIR = PROJECT_ROOT / "precomputed"  # cached parquet/json artifacts
15
+
16
+
17
def find_file(directory: Path, pattern: str) -> Path:
    """Return the first file in *directory* whose name matches the glob *pattern*.

    Raises:
        FileNotFoundError: if nothing in *directory* matches *pattern*.
    """
    for candidate in directory.glob(pattern):
        # glob yields matches in the same order list(...)[0] would see them.
        return candidate
    raise FileNotFoundError(f"No file matching '{pattern}' in {directory}")
23
+
24
+
25
def timer(func):
    """Decorator that logs how long each call to *func* takes."""
    @wraps(func)
    def timed(*args, **kwargs):
        started = time.time()
        out = func(*args, **kwargs)
        # Logged only on the success path, matching a plain sequential body.
        logger.info(f"{func.__name__} completed in {time.time() - started:.2f}s")
        return out
    return timed
35
+
36
+
37
# Closed set of origin-country tokens that may appear at the end of a line ID.
# Multi-word countries keep their underscore (e.g. "South_Africa").
KNOWN_COUNTRIES = {
    "India", "Myanmar", "Unknown", "Zaire", "Uganda", "Indonesia", "Jamaica",
    "South_Africa", "Puerto_Rico", "Philippines", "Sierra_Leone", "Nigeria",
    "Malawi", "Italy", "Kenya", "Sri_Lanka", "Thailand", "Nepal",
}


def parse_country(line_id: str) -> str:
    """Extract the country of origin encoded in a line ID.

    Line IDs embed the country as the trailing one or two underscore-separated
    tokens (e.g. "Line_3_India", "Line_9_South_Africa"). Falls back to
    "Unknown" when no trailing token matches KNOWN_COUNTRIES.
    """
    tokens = line_id.split("_")
    # Single-token suffix takes precedence, matching the original lookup order.
    if len(tokens) >= 2 and tokens[-1] in KNOWN_COUNTRIES:
        return tokens[-1]
    # Otherwise try the last two tokens joined (two-word countries).
    if len(tokens) >= 3:
        candidate = f"{tokens[-2]}_{tokens[-1]}"
        if candidate in KNOWN_COUNTRIES:
            return candidate
    return "Unknown"
tests/__init__.py ADDED
File without changes
tests/conftest.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Shared test fixtures for the Pigeon Pea Pangenome Atlas."""
2
+
3
+ import os
4
+ import sys
5
+ import pytest
6
+ import numpy as np
7
+ import pandas as pd
8
+ from pathlib import Path
9
+
10
+ # Ensure project root is in path
11
+ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
12
+
13
+ from src.state import AppState
14
+
15
+
16
@pytest.fixture
def synthetic_pav():
    """PAV matrix fixture: 20 genes x 10 lines of seeded random 0/1 values.

    Index is gene IDs g00000..g00019; columns are line IDs Line_0_India..
    Line_9_India. Seeded so every run sees the same matrix.
    """
    np.random.seed(42)
    genes = [f"g{i:05d}" for i in range(20)]
    lines = [f"Line_{i}_India" for i in range(10)]
    data = np.random.randint(0, 2, size=(20, 10))
    return pd.DataFrame(data, index=genes, columns=lines)


@pytest.fixture
def synthetic_gff():
    """GFF fixture: the 20 PAV genes placed round-robin across 3 contigs."""
    records = []
    for i in range(20):
        contig = f"contig_{i % 3}"
        start = i * 1000
        end = start + 500  # fixed 500 bp span per gene
        records.append({
            "gene_id": f"g{i:05d}",
            "contig_id": contig,
            "start": start,
            "end": end,
            "strand": "+" if i % 2 == 0 else "-",  # alternate strands
        })
    return pd.DataFrame(records)


@pytest.fixture
def synthetic_protein():
    """Protein-index fixture: one row per gene with a random length.

    NOTE(review): lengths draw from the global NumPy RNG without re-seeding
    here, so exact values depend on fixture evaluation order — tests should
    not assert specific lengths.
    """
    records = []
    for i in range(20):
        records.append({
            "gene_id": f"g{i:05d}",
            "protein_length": np.random.randint(50, 1000),
            "composition_summary": "L:9.0%, A:8.0%, G:7.0%, S:6.0%, V:5.0%",
        })
    return pd.DataFrame(records)


@pytest.fixture
def synthetic_contig_index():
    """Contig-length map covering the 3 contigs used by synthetic_gff."""
    return {"contig_0": 50000, "contig_1": 80000, "contig_2": 30000}
61
+
62
+
63
@pytest.fixture
def empty_state():
    """A fresh AppState with no selections, pins, or badges."""
    return AppState()


@pytest.fixture
def populated_state():
    """AppState mid-session: a selected line, two pinned genes, one badge."""
    state = AppState()
    state.selected_line = "Line_0_India"
    state.backpack_genes = ["g00001", "g00002"]
    state.achievements = {"Explorer"}
    return state
77
+
78
+
79
@pytest.fixture
def synthetic_data(synthetic_pav, synthetic_gff, synthetic_protein, synthetic_contig_index):
    """Complete synthetic data dict matching the app's DATA format.

    Mirrors what the app loads from precomputed/, derived entirely from the
    small synthetic fixtures above. The embedding is faked with random UMAP
    coordinates and a trivial i % 3 cluster assignment because 10 lines are
    too few for a meaningful UMAP.
    """
    # Imported lazily so merely collecting conftest never pulls in the
    # precompute stack unless a test actually needs it.
    # NOTE(review): compute_line_embedding is imported but unused below.
    from src.precompute import (
        compute_gene_frequency, compute_line_stats,
        compute_line_embedding, compute_similarity_topk,
        compute_hotspot_bins, compute_cluster_markers,
    )

    gene_freq = compute_gene_frequency(synthetic_pav)
    line_stats = compute_line_stats(synthetic_pav)

    # Simple embedding without UMAP (too small for meaningful UMAP)
    line_ids = list(synthetic_pav.columns)
    embedding = pd.DataFrame({
        "line_id": line_ids,
        "umap_x": np.random.randn(len(line_ids)),
        "umap_y": np.random.randn(len(line_ids)),
        "cluster_id": [i % 3 for i in range(len(line_ids))],
    })

    similarity = compute_similarity_topk(synthetic_pav, k=5)

    hotspots = compute_hotspot_bins(synthetic_gff, gene_freq, synthetic_contig_index)
    markers = compute_cluster_markers(synthetic_pav, embedding)

    return {
        "pav": synthetic_pav,
        "gene_freq": gene_freq,
        "line_stats": line_stats,
        "embedding": embedding,
        "similarity": similarity,
        "gff_index": synthetic_gff,
        "protein": synthetic_protein,
        "hotspots": hotspots,
        "markers": markers,
    }


@pytest.fixture
def precomputed_dir():
    """Path to the real precomputed/ directory, or None when it is absent."""
    p = Path(__file__).resolve().parent.parent / "precomputed"
    if p.exists():
        return p
    return None
tests/test_callbacks.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for src/callbacks.py."""
2
+
3
+ import pytest
4
+ import plotly.graph_objects as go
5
+
6
+ from src.state import AppState
7
+ from src.callbacks import (
8
+ on_line_selected, build_umap_plot,
9
+ build_donut_chart, build_frequency_histogram, build_treasure_table,
10
+ on_pin_gene, build_hotspot_heatmap, get_protein_stats_html,
11
+ build_backpack_comparison,
12
+ )
13
+
14
+
15
class TestQuest0:
    """Quest 0: line-selection callback."""

    def test_on_line_selected_returns(self, synthetic_data):
        """Selecting a line yields a non-placeholder, positive gene count."""
        state = AppState()
        line_id = synthetic_data["line_stats"].iloc[0]["line_id"]
        total, unique, neighbor, new_state = on_line_selected(line_id, state, synthetic_data)
        assert total != "--"
        assert int(total) > 0

    def test_on_line_selected_updates_state(self, synthetic_data):
        """The returned state records the chosen line."""
        state = AppState()
        line_id = synthetic_data["line_stats"].iloc[0]["line_id"]
        _, _, _, new_state = on_line_selected(line_id, state, synthetic_data)
        assert new_state.selected_line == line_id


class TestQuest1:
    """Quest 1: UMAP map of lines."""

    def test_umap_plot_returns_figure(self, synthetic_data):
        """Color-by-country UMAP renders as a Plotly figure with >=1 trace."""
        state = AppState()
        state.selected_line = synthetic_data["line_stats"].iloc[0]["line_id"]
        fig = build_umap_plot("Country", state, synthetic_data)
        assert isinstance(fig, go.Figure)
        assert len(fig.data) >= 1


class TestQuest2:
    """Quest 2: core/shell/cloud thresholds, treasure table, backpack."""

    def test_donut_three_segments(self, synthetic_data):
        """Donut has a single pie trace with exactly core/shell/cloud slices."""
        fig = build_donut_chart(95, 15, synthetic_data)
        assert isinstance(fig, go.Figure)
        assert len(fig.data) == 1
        assert len(fig.data[0].values) == 3

    def test_histogram_returns_figure(self, synthetic_data):
        """Frequency histogram renders without error."""
        fig = build_frequency_histogram(95, 15, synthetic_data)
        assert isinstance(fig, go.Figure)

    def test_treasure_table_columns(self, synthetic_data):
        """Treasure table exposes the expected gene columns."""
        state = AppState()
        state.selected_line = synthetic_data["line_stats"].iloc[0]["line_id"]
        df = build_treasure_table(state, 95, 15, "All", synthetic_data)
        assert "gene_id" in df.columns
        assert "freq_count" in df.columns
        assert "core_class" in df.columns

    def test_pin_gene(self):
        """Pinning a gene adds it to the returned state's backpack."""
        state = AppState()
        backpack_text, new_state = on_pin_gene("g00001", state)
        assert "g00001" in new_state.backpack_genes

    def test_pin_gene_idempotent(self):
        """Pinning the same gene twice stores it once.

        NOTE(review): asserts on the original `state` object — relies on
        on_pin_gene mutating the passed-in state in place.
        """
        state = AppState()
        on_pin_gene("g00001", state)
        backpack_text, new_state = on_pin_gene("g00001", state)
        assert state.backpack_genes.count("g00001") == 1

    def test_backpack_limit(self):
        """Backpack never exceeds the 50-gene cap."""
        state = AppState()
        for i in range(55):
            state.add_to_backpack(f"g{i:05d}")
        assert len(state.backpack_genes) <= 50


class TestQuest3:
    """Quest 3: genome hotspot heatmap."""

    def test_hotspot_heatmap(self, synthetic_data):
        """Hotspot heatmap renders as a Plotly figure."""
        fig = build_hotspot_heatmap(synthetic_data)
        assert isinstance(fig, go.Figure)


class TestQuest4:
    """Quest 4: protein stats and backpack comparison."""

    def test_protein_stats_html(self, synthetic_data):
        """Protein stats HTML mentions the protein / its aa length."""
        gene_id = synthetic_data["protein"].iloc[0]["gene_id"]
        html = get_protein_stats_html(gene_id, synthetic_data)
        assert "aa" in html or "protein" in html.lower()

    def test_backpack_comparison_needs_2(self, synthetic_data):
        """With fewer than 2 pinned genes a (placeholder) figure is still returned."""
        state = AppState()
        state.backpack_genes = ["g00001"]
        fig = build_backpack_comparison(state, synthetic_data)
        assert isinstance(fig, go.Figure)
tests/test_data_loader.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for src/data_loader.py."""
2
+
3
+ import pytest
4
+ import pandas as pd
5
+ from pathlib import Path
6
+
7
+ from src.data_loader import load_pav, parse_gff_genes, parse_protein_fasta, build_contig_index, validate_joins
8
+
9
+
10
# Location of the real (LFS-tracked) inputs; tests against them are skipped
# when the data has not been pulled.
DATA_DIR = Path(__file__).resolve().parent.parent / "data"
HAS_REAL_DATA = (DATA_DIR / "89_line_PAV.txt").exists()


@pytest.mark.skipif(not HAS_REAL_DATA, reason="Real data not available")
class TestRealData:
    """Loader tests against the actual 89-line dataset in data/."""

    def test_load_pav_shape(self):
        """PAV table has at least one gene row and one line column."""
        df = load_pav(str(DATA_DIR / "89_line_PAV.txt"))
        assert df.shape[0] > 0
        assert df.shape[1] > 0

    def test_load_pav_values(self):
        """PAV cells are strictly 0/1."""
        df = load_pav(str(DATA_DIR / "89_line_PAV.txt"))
        unique_vals = set(df.values.flatten())
        assert unique_vals.issubset({0, 1})

    def test_load_pav_gene_ids(self):
        """Gene IDs are unique strings used as the index."""
        df = load_pav(str(DATA_DIR / "89_line_PAV.txt"))
        assert df.index.dtype == object  # string type
        assert df.index.is_unique

    def test_load_pav_line_count(self):
        """At least 89 line columns are present."""
        df = load_pav(str(DATA_DIR / "89_line_PAV.txt"))
        # 89 lines + reference = 90 columns
        assert df.shape[1] >= 89, f"Expected >=89 lines, got {df.shape[1]}"

    def test_parse_gff_columns(self):
        """GFF parser yields exactly the five expected columns."""
        gff_files = list(DATA_DIR.glob("*.gff"))
        assert len(gff_files) > 0
        df = parse_gff_genes(str(gff_files[0]))
        assert set(df.columns) == {"gene_id", "contig_id", "start", "end", "strand"}

    def test_parse_gff_no_empty_ids(self):
        """No missing or empty gene IDs come out of the GFF parser."""
        gff_files = list(DATA_DIR.glob("*.gff"))
        df = parse_gff_genes(str(gff_files[0]))
        assert not df["gene_id"].isna().any()
        assert not (df["gene_id"] == "").any()

    def test_protein_fasta_lengths(self):
        """All parsed protein lengths are positive."""
        fasta_files = list(DATA_DIR.glob("*protein*.fasta"))
        assert len(fasta_files) > 0
        df = parse_protein_fasta(str(fasta_files[0]))
        assert (df["protein_length"] > 0).all()

    def test_contig_index_nonempty(self):
        """Genome FASTA (non-protein) yields a non-empty contig index."""
        fasta_files = [f for f in DATA_DIR.glob("*.fasta") if "protein" not in f.name]
        if fasta_files:
            ci = build_contig_index(str(fasta_files[0]))
            assert len(ci) > 0

    def test_validate_joins_coverage(self):
        """Most PAV genes should be joinable to GFF entries (>50% coverage)."""
        pav = load_pav(str(DATA_DIR / "89_line_PAV.txt"))
        gff_files = list(DATA_DIR.glob("*.gff"))
        gff = parse_gff_genes(str(gff_files[0]))
        fasta_files = list(DATA_DIR.glob("*protein*.fasta"))
        protein = parse_protein_fasta(str(fasta_files[0]))
        genome_files = [f for f in DATA_DIR.glob("*.fasta") if "protein" not in f.name]
        ci = build_contig_index(str(genome_files[0])) if genome_files else {}

        report = validate_joins(pav, gff, protein, ci)
        assert report["pav_genes_in_gff_pct"] > 50


class TestSynthetic:
    """Sanity checks on the synthetic fixtures themselves."""

    def test_pav_shape(self, synthetic_pav):
        """Synthetic PAV is 20 genes x 10 lines."""
        assert synthetic_pav.shape == (20, 10)

    def test_pav_values(self, synthetic_pav):
        """Synthetic PAV cells are 0/1."""
        unique_vals = set(synthetic_pav.values.flatten())
        assert unique_vals.issubset({0, 1})

    def test_gff_columns(self, synthetic_gff):
        """Synthetic GFF has exactly the five expected columns."""
        assert set(synthetic_gff.columns) == {"gene_id", "contig_id", "start", "end", "strand"}

    def test_gff_no_empty_ids(self, synthetic_gff):
        """Synthetic GFF has no missing gene IDs."""
        assert not synthetic_gff["gene_id"].isna().any()

    def test_protein_lengths(self, synthetic_protein):
        """Synthetic protein lengths are positive."""
        assert (synthetic_protein["protein_length"] > 0).all()
tests/test_field_report.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for src/field_report.py."""
2
+
3
+ import pytest
4
+ import json
5
+
6
+ from src.state import AppState
7
+ from src.field_report import generate_field_report, export_report_json, export_report_csv
8
+
9
+
10
class TestFieldReport:
    """Field-report generation and JSON/CSV export."""

    def test_report_has_sections(self, synthetic_data):
        """Report mentions the selected line, cluster info, and rare genes."""
        state = AppState()
        state.selected_line = synthetic_data["line_stats"].iloc[0]["line_id"]
        report = generate_field_report(state, synthetic_data)
        assert state.selected_line in report
        assert "Cluster" in report
        assert "Rare" in report or "rare" in report.lower()

    def test_report_not_empty(self, synthetic_data):
        """Report body has real content (>100 chars)."""
        state = AppState()
        state.selected_line = synthetic_data["line_stats"].iloc[0]["line_id"]
        report = generate_field_report(state, synthetic_data)
        assert len(report) > 100

    def test_report_with_empty_backpack(self, synthetic_data):
        """An empty backpack is reported explicitly, not omitted."""
        state = AppState()
        state.selected_line = synthetic_data["line_stats"].iloc[0]["line_id"]
        state.backpack_genes = []
        report = generate_field_report(state, synthetic_data)
        assert "No genes pinned" in report or "backpack" in report.lower()

    def test_report_no_line(self, synthetic_data):
        """Without a selected line the report says so instead of failing."""
        state = AppState()
        report = generate_field_report(state, synthetic_data)
        assert "No line selected" in report

    def test_export_json_valid(self, synthetic_data):
        """JSON export parses and carries the selected line."""
        state = AppState()
        state.selected_line = synthetic_data["line_stats"].iloc[0]["line_id"]
        path = export_report_json(state, synthetic_data)
        with open(path) as f:
            data = json.load(f)
        assert "selected_line" in data

    def test_export_json_schema(self, synthetic_data):
        """JSON export includes backpack and achievements keys."""
        state = AppState()
        state.selected_line = synthetic_data["line_stats"].iloc[0]["line_id"]
        state.backpack_genes = ["g00001"]
        path = export_report_json(state, synthetic_data)
        with open(path) as f:
            data = json.load(f)
        assert "backpack_genes" in data
        assert "achievements" in data

    def test_export_csv(self, synthetic_data):
        """CSV export contains a gene_id header/column."""
        state = AppState()
        state.selected_line = synthetic_data["line_stats"].iloc[0]["line_id"]
        state.backpack_genes = [synthetic_data["gene_freq"].iloc[0]["gene_id"]]
        path = export_report_csv(state, synthetic_data)
        with open(path) as f:
            content = f.read()
        assert "gene_id" in content
tests/test_gene_card.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for src/gene_card.py."""
2
+
3
+ import pytest
4
+ import json
5
+
6
+ from src.gene_card import build_gene_card, render_gene_card_html, render_presence_barcode, export_gene_report
7
+
8
+
9
class TestGeneCard:
    """Gene card building, HTML rendering, barcode, and export."""

    def test_gene_card_build(self, synthetic_data):
        """Card dict carries frequency, class, location, and protein fields."""
        gene_id = synthetic_data["gene_freq"].iloc[0]["gene_id"]
        card = build_gene_card(gene_id, synthetic_data)
        assert card["gene_id"] == gene_id
        assert "freq_count" in card
        assert "core_class" in card
        assert "contig" in card
        assert "protein_length" in card

    def test_gene_card_missing_gff(self, synthetic_data):
        """A gene absent from the GFF gets contig=None rather than an error."""
        # Use a gene_id not in GFF
        card = build_gene_card("nonexistent_gene", synthetic_data)
        assert card["contig"] is None

    def test_gene_card_missing_protein(self, synthetic_data):
        """A gene absent from the protein index gets protein_length=None."""
        card = build_gene_card("nonexistent_gene", synthetic_data)
        assert card["protein_length"] is None

    def test_gene_card_html(self, synthetic_data):
        """Rendered HTML contains the gene ID and the gene-card CSS hook."""
        gene_id = synthetic_data["gene_freq"].iloc[0]["gene_id"]
        card = build_gene_card(gene_id, synthetic_data)
        html = render_gene_card_html(card)
        assert gene_id in html
        assert "gene-card" in html

    def test_presence_barcode_length(self, synthetic_pav):
        """Barcode emits one span per line in the presence vector."""
        gene_id = synthetic_pav.index[0]
        vector = synthetic_pav.loc[gene_id]
        html = render_presence_barcode(vector)
        # Should have as many spans as lines
        assert html.count("<span") == len(vector)

    def test_presence_barcode_html_valid(self, synthetic_pav):
        """Barcode spans carry the present/absent CSS classes."""
        gene_id = synthetic_pav.index[0]
        vector = synthetic_pav.loc[gene_id]
        html = render_presence_barcode(vector)
        assert 'class="present"' in html or 'class="absent"' in html

    def test_export_gene_report(self, synthetic_data):
        """Exported JSON round-trips and matches the requested gene."""
        gene_id = synthetic_data["gene_freq"].iloc[0]["gene_id"]
        path = export_gene_report(gene_id, synthetic_data)
        with open(path) as f:
            data = json.load(f)
        assert data["gene_id"] == gene_id
tests/test_integration.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Integration tests for the Pigeon Pea Pangenome Atlas."""
2
+
3
+ import pytest
4
+ from pathlib import Path
5
+
6
+ from src.state import AppState
7
+ from src.callbacks import (
8
+ on_line_selected, build_umap_plot, build_donut_chart,
9
+ build_treasure_table, on_pin_gene, build_hotspot_heatmap,
10
+ get_protein_stats_html, on_generate_report,
11
+ )
12
+ from src.gene_card import build_gene_card, render_gene_card_html
13
+
14
+
15
class TestFullFlow:
    """End-to-end walkthrough of the quest flow on synthetic data."""

    def test_full_flow(self, synthetic_data):
        """Simulate: select line -> UMAP -> thresholds -> gene card -> pin -> report."""
        state = AppState()

        # Step 1: Select line
        line_id = synthetic_data["line_stats"].iloc[0]["line_id"]
        total, unique, neighbor, state = on_line_selected(line_id, state, synthetic_data)
        assert state.selected_line == line_id

        # Step 2: View UMAP
        fig = build_umap_plot("Country", state, synthetic_data)
        assert fig is not None

        # Step 3: Change thresholds
        donut = build_donut_chart(90, 10, synthetic_data)
        assert donut is not None

        # Step 4: View treasure table
        table = build_treasure_table(state, 90, 10, "All", synthetic_data)
        assert len(table) > 0

        # Step 5: Click gene -> gene card
        gene_id = table.iloc[0]["gene_id"]
        card = build_gene_card(gene_id, synthetic_data)
        html = render_gene_card_html(card)
        assert gene_id in html

        # Step 6: Pin gene
        state.selected_gene = gene_id
        _, state = on_pin_gene(gene_id, state)
        assert gene_id in state.backpack_genes

        # Step 7: Generate report (also awards the Cartographer badge)
        report_md, json_path, csv_path, _, _, badges, state = on_generate_report(state, synthetic_data)
        assert len(report_md) > 100
        assert "Cartographer" in state.achievements


class TestPrecomputedConsistency:
    """Cross-artifact referential integrity of the precomputed tables."""

    def test_embedding_line_ids(self, synthetic_data):
        """All line_ids in embedding exist in line_stats."""
        emb_lines = set(synthetic_data["embedding"]["line_id"])
        stat_lines = set(synthetic_data["line_stats"]["line_id"])
        assert emb_lines.issubset(stat_lines)

    def test_marker_gene_ids(self, synthetic_data):
        """All gene_ids in markers exist in gene_freq."""
        if len(synthetic_data["markers"]) == 0:
            pytest.skip("No markers computed")
        marker_genes = set(synthetic_data["markers"]["gene_id"])
        freq_genes = set(synthetic_data["gene_freq"]["gene_id"])
        assert marker_genes.issubset(freq_genes)
tests/test_precompute.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for src/precompute.py."""
2
+
3
+ import pytest
4
+ import numpy as np
5
+ import pandas as pd
6
+ from pathlib import Path
7
+
8
+ from src.precompute import (
9
+ compute_gene_frequency, compute_line_stats,
10
+ compute_similarity_topk, compute_hotspot_bins, compute_cluster_markers,
11
+ )
12
+
13
+
14
class TestGeneFrequency:
    """compute_gene_frequency output contract."""

    def test_freq_count_range(self, synthetic_pav):
        """Counts are bounded by the number of lines.

        NOTE(review): the `or` makes the first condition vacuous — counts are
        always >= 0, so `(>=1).all() or (>=0).all()` can never fail on the
        lower bound. Consider tightening to a single intended bound.
        """
        df = compute_gene_frequency(synthetic_pav)
        assert (df["freq_count"] >= 1).all() or (df["freq_count"] >= 0).all()
        assert (df["freq_count"] <= synthetic_pav.shape[1]).all()

    def test_freq_classes(self, synthetic_pav):
        """Classes are limited to core/shell/cloud."""
        df = compute_gene_frequency(synthetic_pav)
        assert set(df["core_class"].unique()).issubset({"core", "shell", "cloud"})

    def test_freq_pct_range(self, synthetic_pav):
        """Percent frequency stays within [0, 100]."""
        df = compute_gene_frequency(synthetic_pav)
        assert (df["freq_pct"] >= 0).all()
        assert (df["freq_pct"] <= 100).all()

    def test_gene_id_column(self, synthetic_pav):
        """One output row per PAV gene, keyed by gene_id."""
        df = compute_gene_frequency(synthetic_pav)
        assert "gene_id" in df.columns
        assert len(df) == len(synthetic_pav)


class TestLineStats:
    """compute_line_stats output contract."""

    def test_line_count(self, synthetic_pav):
        """One stats row per PAV line column."""
        df = compute_line_stats(synthetic_pav)
        assert len(df) == synthetic_pav.shape[1]

    def test_columns(self, synthetic_pav):
        """Expected stat columns are present."""
        df = compute_line_stats(synthetic_pav)
        assert "line_id" in df.columns
        assert "country" in df.columns
        assert "genes_present_count" in df.columns
        assert "unique_genes_count" in df.columns


class TestSimilarity:
    """compute_similarity_topk output contract."""

    def test_jaccard_range(self, synthetic_pav):
        """Jaccard scores lie in [0, 1]."""
        df = compute_similarity_topk(synthetic_pav, k=5)
        assert (df["jaccard_score"] >= 0).all()
        assert (df["jaccard_score"] <= 1).all()

    def test_jaccard_symmetry(self, synthetic_pav):
        """Where a reverse pair exists, its score matches (top-k may drop some)."""
        df = compute_similarity_topk(synthetic_pav, k=5)
        # Check a few pairs
        for _, row in df.head(5).iterrows():
            reverse = df[
                (df["line_id"] == row["neighbor_line_id"]) &
                (df["neighbor_line_id"] == row["line_id"])
            ]
            if len(reverse) > 0:
                assert abs(reverse.iloc[0]["jaccard_score"] - row["jaccard_score"]) < 0.001


class TestHotspots:
    """compute_hotspot_bins output contract."""

    def test_bins_valid(self, synthetic_gff, synthetic_pav, synthetic_contig_index):
        """Bins are non-degenerate and counts non-negative (when any exist)."""
        gene_freq = compute_gene_frequency(synthetic_pav)
        df = compute_hotspot_bins(synthetic_gff, gene_freq, synthetic_contig_index)
        if len(df) > 0:
            assert (df["bin_start"] < df["bin_end"]).all()
            assert (df["total_genes"] >= 0).all()


class TestClusterMarkers:
    """compute_cluster_markers output contract."""

    def test_marker_count(self, synthetic_pav):
        """No cluster exceeds the requested top_n markers."""
        line_ids = list(synthetic_pav.columns)
        embedding = pd.DataFrame({
            "line_id": line_ids,
            "umap_x": np.random.randn(len(line_ids)),
            "umap_y": np.random.randn(len(line_ids)),
            "cluster_id": [i % 3 for i in range(len(line_ids))],
        })
        df = compute_cluster_markers(synthetic_pav, embedding, top_n=5)
        # Each cluster should have at most top_n markers
        for cid in df["cluster_id"].unique():
            assert len(df[df["cluster_id"] == cid]) <= 5


class TestPrecomputedFiles:
    """Presence of every expected artifact in precomputed/ (if built)."""

    def test_files_exist(self, precomputed_dir):
        """All parquet/json artifacts the app loads must be present."""
        if precomputed_dir is None:
            pytest.skip("Precomputed directory not available")
        expected = [
            "pav_gene_frequency.parquet",
            "line_stats.parquet",
            "line_embedding.parquet",
            "line_similarity_topk.parquet",
            "gff_gene_index.parquet",
            "protein_index.parquet",
            "genome_contig_index.json",
            "hotspot_bins.parquet",
            "cluster_markers.parquet",
        ]
        for f in expected:
            assert (precomputed_dir / f).exists(), f"Missing: {f}"
ui/__init__.py ADDED
File without changes
ui/final.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Final tab: Field Report generation and export."""
2
+
3
+ import gradio as gr
4
+
5
+
6
def build_final_tab():
    """Build Final Report tab components. Returns dict of components.

    Declares UI only — event wiring happens elsewhere (app/layout level).
    The download File components start hidden and are revealed once a
    report has been generated.
    """
    with gr.Tab("Field Report", id="final") as tab:
        gr.Markdown("## Your Exploration Report")
        gr.Markdown(
            "Generate a summary of your pangenome exploration journey, "
            "including your selected line, findings, and backpack collection."
        )

        generate_btn = gr.Button("Generate Report", variant="primary")

        # Placeholder replaced with the markdown report after generation.
        report_md = gr.Markdown(value="*Click 'Generate Report' to create your field report.*")

        with gr.Row():
            download_json = gr.File(label="Download JSON", visible=False)
            download_csv = gr.File(label="Download CSV", visible=False)

        gr.Markdown("### Achievements Earned")
        achievements_html = gr.HTML(value="<p>Complete quests to earn badges!</p>")

    # Keys here are referenced by the layout as "final_<key>".
    return {
        "tab": tab,
        "generate_btn": generate_btn,
        "report_md": report_md,
        "download_json": download_json,
        "download_csv": download_csv,
        "achievements_html": achievements_html,
    }
ui/gene_card_ui.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Gene Card side panel UI."""
2
+
3
+ import gradio as gr
4
+
5
+
6
def build_gene_card_panel():
    """Build Gene Card side panel. Returns dict of components.

    The whole column starts hidden; it is shown when a gene is selected.
    Declares UI only — event wiring happens elsewhere.
    """
    with gr.Column(visible=False, scale=1) as gene_card_col:
        gr.Markdown("### Gene Card")
        gene_card_html = gr.HTML(value="")

        with gr.Row():
            show_genome_btn = gr.Button("Show on Genome", size="sm")
            show_protein_btn = gr.Button("Show Protein", size="sm")

        with gr.Row():
            pin_card_btn = gr.Button("Pin to Backpack", size="sm", variant="secondary")
            download_gene_btn = gr.Button("Download Report", size="sm")

        # Hidden until a per-gene report file has been written.
        gene_report_file = gr.File(label="Gene Report", visible=False)

    # Keys here are referenced by the layout as "gc_<key>".
    return {
        "column": gene_card_col,
        "gene_card_html": gene_card_html,
        "show_genome_btn": show_genome_btn,
        "show_protein_btn": show_protein_btn,
        "pin_card_btn": pin_card_btn,
        "download_gene_btn": download_gene_btn,
        "gene_report_file": gene_report_file,
    }
ui/layout.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Master Gradio layout for the Pigeon Pea Pangenome Atlas."""
2
+
3
+ import gradio as gr
4
+
5
+ from ui.theme import build_theme, CUSTOM_CSS
6
+ from ui.quest0 import build_quest0
7
+ from ui.quest1 import build_quest1
8
+ from ui.quest2 import build_quest2
9
+ from ui.quest3 import build_quest3
10
+ from ui.quest4 import build_quest4
11
+ from ui.final import build_final_tab
12
+ from ui.gene_card_ui import build_gene_card_panel
13
+
14
+
15
def build_app(line_choices: list[str], contig_choices: list[str],
              gene_choices: list[str]) -> tuple:
    """
    Build the full Gradio Blocks app.

    Args:
        line_choices: dropdown options for Quest 0's line selector.
        contig_choices: dropdown options for Quest 3.
        gene_choices: dropdown options for Quest 4.

    Returns (demo, components_dict) where components_dict maps all UI elements.
    Quest/panel components are flattened into it with "q0_"/"q1_"/.../"final_"/
    "gc_" prefixes so callers can wire events by key.
    """
    theme = build_theme()

    with gr.Blocks(theme=theme, css=CUSTOM_CSS, title="Pigeon Pea Pangenome Atlas") as demo:
        # Per-session state holder; populated by callbacks (starts as None).
        state = gr.State(value=None)

        # Header
        gr.Markdown("# Pigeon Pea Pangenome Atlas")
        gr.Markdown("*An interactive exploration of 89 pigeon pea lines and their pangenome*")

        # Progress tracker (quest 0 highlighted initially)
        progress_html = gr.HTML(
            value=_build_progress_html(0),
            elem_classes=["progress-tracker"],
        )

        # Global filters
        with gr.Accordion("Global Filters", open=False):
            with gr.Row():
                # Choices filled in at runtime; empty selection means "all".
                country_filter = gr.CheckboxGroup(
                    choices=[],
                    label="Filter by country",
                    info="Leave empty to show all",
                )
                annotated_toggle = gr.Checkbox(
                    label="Only annotated genes (with GFF entry)",
                    value=False,
                )
            with gr.Accordion("What is PAV?", open=False):
                gr.Markdown(
                    "**Presence-Absence Variation (PAV)** describes genes that are present "
                    "in some lines but absent from others. The pangenome captures all genes "
                    "across the species, not just those in a single reference genome."
                )

        # Main content area
        with gr.Row():
            # Main tabs (left ~75%)
            with gr.Column(scale=3):
                with gr.Tabs() as tabs:
                    q0 = build_quest0(line_choices)
                    q1 = build_quest1()
                    q2 = build_quest2()
                    q3 = build_quest3(contig_choices)
                    q4 = build_quest4(gene_choices)
                    final = build_final_tab()

            # Gene Card side panel (right ~25%)
            gc = build_gene_card_panel()

        # Data Health accordion
        with gr.Accordion("Data Health", open=False, visible=True):
            data_health_html = gr.HTML(value="<p>Loading data health report...</p>")

        # Flatten all sub-builder dicts into one namespace, prefixed per tab.
        components = {
            "state": state,
            "progress_html": progress_html,
            "tabs": tabs,
            "country_filter": country_filter,
            "annotated_toggle": annotated_toggle,
            "data_health_html": data_health_html,
            **{f"q0_{k}": v for k, v in q0.items()},
            **{f"q1_{k}": v for k, v in q1.items()},
            **{f"q2_{k}": v for k, v in q2.items()},
            **{f"q3_{k}": v for k, v in q3.items()},
            **{f"q4_{k}": v for k, v in q4.items()},
            **{f"final_{k}": v for k, v in final.items()},
            **{f"gc_{k}": v for k, v in gc.items()},
        }

    return demo, components
92
+
93
+
94
def _build_progress_html(active_quest: int) -> str:
    """Build progress tracker HTML: one span per quest step, with the
    step matching *active_quest* carrying the extra 'active' class."""
    step_labels = (
        "Explorer",
        "Map the World",
        "Core vs Accessory",
        "Genome Landmarks",
        "Protein Relics",
        "Field Report",
    )
    spans = []
    for idx, label in enumerate(step_labels):
        css = "progress-step active" if idx == active_quest else "progress-step"
        spans.append(f'<span class="{css}">{label}</span>')
    return '<div class="progress-tracker">' + " ".join(spans) + '</div>'
ui/quest0.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Quest 0: Choose Your Explorer — line selection and overview."""
2
+
3
+ import gradio as gr
4
+
5
+
6
def build_quest0(line_choices: list[str]):
    """Assemble the Quest 0 ("Choose Your Explorer") tab.

    Creates the line-selection dropdown, three read-only summary boxes and
    the journey-start button, and returns them keyed by role so callers can
    wire up event handlers.
    """
    with gr.Tab("Choose Your Explorer", id="quest0") as tab:
        gr.Markdown("## Choose your line to explore")
        gr.Markdown(
            "Select one of the 89 pigeon pea lines to begin your pangenome journey. "
            "Each line has a unique gene repertoire shaped by geography and breeding history."
        )

        selector = gr.Dropdown(
            choices=line_choices,
            label="Select a pigeon pea line",
            info="89 lines from across the world",
            interactive=True,
        )

        with gr.Row():
            total_box = gr.Textbox(
                label="Total Genes Present",
                interactive=False,
                value="--",
            )
            unique_box = gr.Textbox(
                label="Unique Genes",
                interactive=False,
                value="--",
                info="Genes found only in this line",
            )
            neighbor_box = gr.Textbox(
                label="Nearest Neighbor",
                interactive=False,
                value="--",
                info="Most similar line by gene content",
            )

        journey_btn = gr.Button(
            "Start the Journey",
            variant="primary",
            size="lg",
        )

    return {
        "tab": tab,
        "line_dropdown": selector,
        "total_genes": total_box,
        "unique_genes": unique_box,
        "nearest_neighbor": neighbor_box,
        "start_btn": journey_btn,
    }
ui/quest1.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Quest 1: Map the World — UMAP scatter of lines."""
2
+
3
+ import gradio as gr
4
+
5
+
6
def build_quest1():
    """Assemble the Quest 1 ("Map the World") tab.

    Lays out the color-mode radio, the UMAP scatter plot, the party-selection
    readout with its compare button, and an explanatory accordion. Returns the
    interactive components keyed by role.
    """
    with gr.Tab("Map the World", id="quest1") as tab:
        gr.Markdown("## How do 89 lines relate by gene content?")
        gr.Markdown(
            "This UMAP projection arranges lines by their gene presence/absence profiles. "
            "Lines closer together share more genes."
        )

        color_mode = gr.Radio(
            choices=["Country", "Cluster"],
            value="Country",
            label="Color by",
            interactive=True,
        )

        scatter = gr.Plot(label="UMAP of 89 pigeon pea lines")

        with gr.Row():
            party_box = gr.Textbox(
                label="Selected party (lasso/click to select)",
                interactive=False,
                value="None selected",
                lines=2,
            )
            compare_button = gr.Button("Compare my line to party", variant="secondary")

        compare_plot = gr.Plot(label="Comparison", visible=False)

        with gr.Accordion("What does this mean?", open=False):
            gr.Markdown(
                "**UMAP** reduces the high-dimensional PAV matrix to 2D.\n\n"
                "- **Country coloring** shows geographic origins.\n"
                "- **Cluster coloring** shows groups identified by KMeans.\n"
                "- **Click** a point to see its stats.\n"
                "- **Lasso select** multiple points to compare with your chosen line."
            )

    return {
        "tab": tab,
        "color_radio": color_mode,
        "umap_plot": scatter,
        "party_display": party_box,
        "compare_btn": compare_button,
        "comparison_plot": compare_plot,
    }
ui/quest2.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Quest 2: Core vs Accessory — gene classification explorer."""
2
+
3
+ import gradio as gr
4
+
5
+
6
def build_quest2():
    """Assemble the Quest 2 ("Core vs Accessory") tab.

    Threshold sliders drive the donut/histogram plots; the gene table plus
    pin button let the user stash genes in the backpack. Returns the
    interactive components keyed by role.
    """
    with gr.Tab("Core vs Accessory", id="quest2") as tab:
        gr.Markdown("## Explore the Core, Shell, and Cloud genome")
        gr.Markdown(
            "Genes are classified by how many of the 89 lines carry them. "
            "Adjust the thresholds to explore different definitions."
        )

        with gr.Row():
            core_pct = gr.Slider(
                minimum=50,
                maximum=100,
                value=95,
                step=1,
                label="Core threshold (>=X%)",
                info="Genes present in at least this % of lines",
            )
            cloud_pct = gr.Slider(
                minimum=1,
                maximum=50,
                value=15,
                step=1,
                label="Cloud threshold (<X%)",
                info="Genes present in fewer than this % of lines",
            )

        with gr.Row():
            donut = gr.Plot(label="Core / Shell / Cloud distribution")
            freq_hist = gr.Plot(label="Gene frequency distribution")

        gr.Markdown("### Gene Treasure List")
        gene_filter = gr.Radio(
            choices=["All", "Unique to my line", "Rare (<5 lines)", "Cluster markers"],
            value="All",
            label="Filter genes",
        )
        gene_table = gr.Dataframe(
            headers=["gene_id", "freq_count", "freq_pct", "core_class", "in_my_line"],
            label="Genes",
            interactive=False,
            wrap=True,
        )

        with gr.Row():
            picked_gene_box = gr.Textbox(
                label="Selected gene",
                interactive=False,
                value="Click a row to select",
            )
            pin_button = gr.Button("Pin to Backpack", variant="secondary")

        backpack_box = gr.Textbox(
            label="Backpack (pinned genes)",
            interactive=False,
            value="Empty",
            lines=2,
        )

    return {
        "tab": tab,
        "core_slider": core_pct,
        "cloud_slider": cloud_pct,
        "donut_plot": donut,
        "histogram_plot": freq_hist,
        "filter_radio": gene_filter,
        "treasure_table": gene_table,
        "selected_gene_text": picked_gene_box,
        "pin_btn": pin_button,
        "backpack_display": backpack_box,
    }
ui/quest3.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Quest 3: Genome Landmarks — hotspot exploration."""
2
+
3
+ import gradio as gr
4
+
5
+
6
def build_quest3(contig_choices: list[str]):
    """Assemble the Quest 3 ("Genome Landmarks") tab.

    Provides a contig selector, the variability heatmap, a per-contig gene
    track (hidden until a contig is chosen), and a region gene table.
    Returns the interactive components keyed by role.
    """
    with gr.Tab("Genome Landmarks", id="quest3") as tab:
        gr.Markdown("## Explore genomic hotspots of variation")
        gr.Markdown(
            "The genome is divided into 100 kb bins. Hotter bins contain more "
            "variable (shell/cloud) genes — potential regions of adaptation."
        )

        contig_picker = gr.Dropdown(
            choices=contig_choices,
            label="Select contig (top contigs by gene count)",
            interactive=True,
        )

        heatmap = gr.Plot(label="Variability heatmap (contigs x bins)")

        gr.Markdown("### Contig Detail")
        gene_track = gr.Plot(label="Gene track (colored by class)", visible=False)

        genes_in_region = gr.Dataframe(
            headers=["gene_id", "start", "end", "strand", "core_class", "freq_pct"],
            label="Genes in selected region",
            interactive=False,
        )

        region_pick_box = gr.Textbox(
            label="Selected gene from region",
            interactive=False,
            visible=False,
        )

    return {
        "tab": tab,
        "contig_dropdown": contig_picker,
        "heatmap_plot": heatmap,
        "track_plot": gene_track,
        "region_table": genes_in_region,
        "region_gene_text": region_pick_box,
    }
ui/quest4.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Quest 4: Protein Relics — protein analysis."""
2
+
3
+ import gradio as gr
4
+
5
+
6
def build_quest4(gene_choices: list[str]):
    """Assemble the Quest 4 ("Protein Relics") tab.

    Offers a gene dropdown (custom values allowed, so backpack genes can be
    typed in), an HTML stats panel, and two comparison plots for the
    backpack collection. Returns the interactive components keyed by role.
    """
    with gr.Tab("Protein Relics", id="quest4") as tab:
        gr.Markdown("## Examine protein properties")
        gr.Markdown(
            "Explore the protein products of pangenome genes. Compare amino acid "
            "compositions and lengths across your backpack collection."
        )

        gene_picker = gr.Dropdown(
            choices=gene_choices,
            label="Select a gene (or pick from backpack)",
            interactive=True,
            allow_custom_value=True,
        )

        stats_panel = gr.HTML(
            value="<p>Select a gene to see protein stats</p>",
            label="Protein Statistics",
        )

        gr.Markdown("### Backpack Comparison")
        gr.Markdown("Pin at least 2 genes to your backpack to see comparisons.")

        length_bars = gr.Plot(label="Protein lengths comparison")
        aa_heatmap = gr.Plot(label="Amino acid composition heatmap")

    return {
        "tab": tab,
        "gene_dropdown": gene_picker,
        "protein_stats_html": stats_panel,
        "comparison_bar_plot": length_bars,
        "composition_heatmap": aa_heatmap,
    }
ui/theme.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Custom Gradio theme for the Pigeon Pea Pangenome Atlas."""
2
+
3
+ import gradio as gr
4
+
5
# Hand-written CSS injected into the app. Selectors here must stay in sync
# with the class names emitted by the HTML-building helpers (e.g. the
# .progress-tracker / .progress-step rules are used by the progress HTML,
# and .badge-core/.badge-shell/.badge-cloud mirror the gene classes).
# NOTE(review): presumably passed as the `css=` argument when the Blocks
# app is created — confirm at the call site.
CUSTOM_CSS = """
.quest-badge {
    display: inline-block;
    padding: 4px 12px;
    border-radius: 16px;
    font-size: 0.85em;
    font-weight: 600;
    margin: 2px 4px;
}
.badge-core { background: #2E7D32; color: white; }
.badge-shell { background: #FFC107; color: #333; }
.badge-cloud { background: #F44336; color: white; }

.gene-card {
    border: 2px solid #2E7D32;
    border-radius: 8px;
    padding: 16px;
    background: #F1F8E9;
}

.presence-barcode span {
    display: inline-block;
    width: 3px;
    height: 20px;
    margin: 0;
}
.presence-barcode .present { background: #4CAF50; }
.presence-barcode .absent { background: #E0E0E0; }

.progress-tracker {
    display: flex;
    justify-content: center;
    gap: 8px;
    padding: 12px 0;
}
.progress-step {
    display: inline-flex;
    align-items: center;
    gap: 4px;
    padding: 4px 10px;
    border-radius: 12px;
    font-size: 0.8em;
    background: #E8F5E9;
    color: #2E7D32;
}
.progress-step.active {
    background: #2E7D32;
    color: white;
    font-weight: 600;
}

.stat-card {
    text-align: center;
    padding: 16px;
    border-radius: 8px;
    background: #F1F8E9;
    border: 1px solid #C8E6C9;
}
.stat-card .stat-value {
    font-size: 1.8em;
    font-weight: 700;
    color: #2E7D32;
}
.stat-card .stat-label {
    font-size: 0.85em;
    color: #666;
}

.achievement-badge {
    display: inline-block;
    padding: 6px 14px;
    border-radius: 20px;
    background: linear-gradient(135deg, #FFC107, #FF9800);
    color: #333;
    font-weight: 600;
    margin: 4px;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
"""
84
+
85
+
86
def build_theme():
    """Construct the green/amber Soft-based Gradio theme used by the Atlas.

    Starts from ``gr.themes.Soft`` with green primary and amber secondary
    hues, then overrides background, block border and primary-button colors
    to match the custom CSS palette.
    """
    base = gr.themes.Soft(
        primary_hue=gr.themes.colors.green,
        secondary_hue=gr.themes.colors.amber,
        neutral_hue=gr.themes.colors.gray,
        font=gr.themes.GoogleFont("Inter"),
    )
    return base.set(
        body_background_fill="#FAFAF5",
        block_border_width="1px",
        block_border_color="#C8E6C9",
        block_radius="8px",
        button_primary_background_fill="#2E7D32",
        button_primary_text_color="white",
    )
+ return theme