Ashkan Taghipour (The University of Western Australia) committed on
Commit
16e4ad5
·
1 Parent(s): ad78656

Initial deploy: Pigeon Pea Pangenome Atlas

Browse files
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.fasta filter=lfs diff=lfs merge=lfs -text
37
+ *.fasta.gz filter=lfs diff=lfs merge=lfs -text
38
+ *.gff filter=lfs diff=lfs merge=lfs -text
39
+ 89_line_PAV.txt filter=lfs diff=lfs merge=lfs -text
40
+ genome_contig_index.json filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ CREDENTIALS_AND_CONFIG.txt
2
+ __pycache__/
3
+ *.pyc
4
+ .env
5
+ *.egg-info/
6
+ .conda/
7
+ .pytest_cache/
README.md CHANGED
@@ -1,13 +1,11 @@
1
  ---
2
- title: PanGenomeWatchAI
3
- emoji: 🐢
4
- colorFrom: yellow
5
- colorTo: purple
6
  sdk: gradio
7
- sdk_version: 6.5.1
8
  app_file: app.py
9
  pinned: false
10
- short_description: PanGenome analysis
11
  ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Pigeon Pea Pangenome Atlas
3
+ emoji: "\U0001F331"
4
+ colorFrom: green
5
+ colorTo: yellow
6
  sdk: gradio
7
+ sdk_version: 5.9.1
8
  app_file: app.py
9
  pinned: false
10
+ short_description: Interactive pangenome exploration of 89 pigeon pea lines
11
  ---
 
 
app.py ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Pigeon Pea Pangenome Atlas — Main entry point.
3
+ Gradio app for exploring the pangenome of 89 pigeon pea lines.
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ import json
9
+ import pandas as pd
10
+ import gradio as gr
11
+
12
+ from src.state import AppState
13
+ from src.utils import logger, PRECOMPUTED_DIR
14
+ from src.data_loader import validate_joins
15
+ from src.callbacks import (
16
+ on_line_selected, on_start_journey,
17
+ build_umap_plot, on_umap_select, on_compare_party,
18
+ build_donut_chart, build_frequency_histogram, build_treasure_table,
19
+ on_pin_gene, on_gene_click_table,
20
+ build_hotspot_heatmap, on_contig_selected,
21
+ get_protein_stats_html, build_backpack_comparison, build_composition_heatmap,
22
+ on_open_gene_card, on_download_gene_report,
23
+ on_generate_report, build_data_health_html,
24
+ )
25
+ from ui.layout import build_app
26
+
27
+ # ===========================================================
28
+ # Load precomputed data
29
+ # ===========================================================
30
+ logger.info("Loading precomputed data...")
31
+
32
+ DATA = {}
33
+
34
+
35
def load_data():
    """Populate the module-level DATA dict from PRECOMPUTED_DIR.

    Eight parquet artefacts are mandatory; the PAV matrix and the contig
    index are loaded only when present on disk.
    """
    base = PRECOMPUTED_DIR

    # Required artefacts: DATA key -> parquet filename.
    required = {
        "gene_freq": "pav_gene_frequency.parquet",
        "line_stats": "line_stats.parquet",
        "embedding": "line_embedding.parquet",
        "similarity": "line_similarity_topk.parquet",
        "gff_index": "gff_gene_index.parquet",
        "protein": "protein_index.parquet",
        "hotspots": "hotspot_bins.parquet",
        "markers": "cluster_markers.parquet",
    }
    for key, filename in required.items():
        DATA[key] = pd.read_parquet(base / filename)

    # Optional: full PAV matrix for live presence/absence queries.
    pav_file = base / "pav_matrix.parquet"
    if pav_file.exists():
        DATA["pav"] = pd.read_parquet(pav_file)
        logger.info(f"PAV matrix loaded: {DATA['pav'].shape}")

    # Optional: contig index (JSON) for the genome-browser views.
    index_file = base / "genome_contig_index.json"
    if index_file.exists():
        with open(index_file) as fh:
            DATA["contig_index"] = json.load(fh)

    logger.info("All data loaded successfully")
load_data()

# ===========================================================
# Derive UI choices
# ===========================================================
# All dropdown/choice lists are derived once, at import time, from DATA.
line_choices = sorted(DATA["line_stats"]["line_id"].tolist())

# Top contigs by gene count
contig_gene_counts = DATA["gff_index"]["contig_id"].value_counts()
contig_choices = contig_gene_counts.head(30).index.tolist()

# Gene choices (all genes with protein data)
gene_choices = sorted(DATA["protein"]["gene_id"].tolist())

# Country list for filters
country_list = sorted(DATA["line_stats"]["country"].unique().tolist())

# ===========================================================
# Build UI
# ===========================================================
# build_app returns the Blocks object plus a dict C of named components.
demo, C = build_app(line_choices, contig_choices, gene_choices)

# ===========================================================
# Wire callbacks
# ===========================================================
with demo:

    # Update country filter choices
    # NOTE(review): mutating .choices after the component is built may not
    # propagate to the rendered UI in recent Gradio versions — confirm.
    C["country_filter"].choices = country_list

    # -- Data Health on load --
    try:
        report = {
            "Total genes in PAV": len(DATA["gene_freq"]),
            "Lines": len(DATA["line_stats"]),
            "Genes with GFF annotation": len(DATA["gff_index"]),
            "Genes with protein data": len(DATA["protein"]),
            "Clusters found": DATA["embedding"]["cluster_id"].nunique(),
        }
        C["data_health_html"].value = build_data_health_html(report)
    except Exception as e:
        # Best-effort: the health panel must never prevent app startup.
        C["data_health_html"].value = f"<p>Error: {e}</p>"

    # -- Quest 0 --
    C["q0_line_dropdown"].change(
        fn=lambda line_id, state: on_line_selected(line_id, state, DATA),
        inputs=[C["q0_line_dropdown"], C["state"]],
        outputs=[C["q0_total_genes"], C["q0_unique_genes"],
                 C["q0_nearest_neighbor"], C["state"]],
    )

    C["q0_start_btn"].click(
        fn=on_start_journey,
        inputs=[C["state"]],
        outputs=[C["tabs"], C["state"]],
    )

    # -- Quest 1 --
    C["q1_color_radio"].change(
        fn=lambda color_by, state: build_umap_plot(color_by, state, DATA),
        inputs=[C["q1_color_radio"], C["state"]],
        outputs=[C["q1_umap_plot"]],
    )

    # Render UMAP on tab load
    C["q1_tab"].select(
        fn=lambda state: build_umap_plot("Country", state, DATA),
        inputs=[C["state"]],
        outputs=[C["q1_umap_plot"]],
    )

    C["q1_compare_btn"].click(
        fn=lambda state: on_compare_party(state, DATA),
        inputs=[C["state"]],
        # NOTE(review): the same component is listed twice; on_compare_party
        # returns (figure, True) — the True looks like a visibility flag meant
        # for a different output component. Confirm the intended second target.
        outputs=[C["q1_comparison_plot"], C["q1_comparison_plot"]],
    )

    # -- Quest 2 --
    def update_quest2(core_thresh, cloud_thresh, filter_type, state):
        """Recompute all three Quest-2 views from the current slider/filter values."""
        donut = build_donut_chart(core_thresh, cloud_thresh, DATA)
        hist = build_frequency_histogram(core_thresh, cloud_thresh, DATA)
        table = build_treasure_table(state, core_thresh, cloud_thresh, filter_type, DATA)
        return donut, hist, table

    # Both threshold sliders trigger the same full refresh.
    for trigger in [C["q2_core_slider"], C["q2_cloud_slider"]]:
        trigger.change(
            fn=update_quest2,
            inputs=[C["q2_core_slider"], C["q2_cloud_slider"],
                    C["q2_filter_radio"], C["state"]],
            outputs=[C["q2_donut_plot"], C["q2_histogram_plot"],
                     C["q2_treasure_table"]],
        )

    C["q2_filter_radio"].change(
        fn=update_quest2,
        inputs=[C["q2_core_slider"], C["q2_cloud_slider"],
                C["q2_filter_radio"], C["state"]],
        outputs=[C["q2_donut_plot"], C["q2_histogram_plot"],
                 C["q2_treasure_table"]],
    )

    # Render Quest 2 on tab select
    C["q2_tab"].select(
        fn=update_quest2,
        inputs=[C["q2_core_slider"], C["q2_cloud_slider"],
                C["q2_filter_radio"], C["state"]],
        outputs=[C["q2_donut_plot"], C["q2_histogram_plot"],
                 C["q2_treasure_table"]],
    )

    C["q2_pin_btn"].click(
        fn=lambda gene_id, state: on_pin_gene(gene_id, state),
        inputs=[C["q2_selected_gene_text"], C["state"]],
        outputs=[C["q2_backpack_display"], C["state"]],
    )

    # Table row click → select gene
    # NOTE(review): Gradio injects the select event only into a parameter
    # annotated with gr.SelectData, which a lambda cannot declare — as written
    # `evt` will be bound to the state input and `state` will be missing.
    # Consider wiring src.callbacks.on_gene_click_table instead; confirm.
    C["q2_treasure_table"].select(
        fn=lambda evt, state: (str(evt.value) if evt else "", state),
        inputs=[C["state"]],
        outputs=[C["q2_selected_gene_text"], C["state"]],
    )

    # -- Quest 3 --
    C["q3_tab"].select(
        fn=lambda: build_hotspot_heatmap(DATA),
        inputs=[],
        outputs=[C["q3_heatmap_plot"]],
    )

    C["q3_contig_dropdown"].change(
        fn=lambda contig_id, state: on_contig_selected(contig_id, DATA, state),
        inputs=[C["q3_contig_dropdown"], C["state"]],
        outputs=[C["q3_track_plot"], C["q3_region_table"]],
    )

    # -- Quest 4 --
    C["q4_gene_dropdown"].change(
        fn=lambda gene_id: get_protein_stats_html(gene_id, DATA),
        inputs=[C["q4_gene_dropdown"]],
        outputs=[C["q4_protein_stats_html"]],
    )

    C["q4_tab"].select(
        fn=lambda state: (
            build_backpack_comparison(state, DATA),
            build_composition_heatmap(state, DATA),
        ),
        inputs=[C["state"]],
        outputs=[C["q4_comparison_bar_plot"], C["q4_composition_heatmap"]],
    )

    # -- Gene Card --
    # NOTE(review): this feeds the selected *gene* id into the contig
    # dropdown — presumably the gene's contig was intended; verify against
    # ui.layout / on_contig_selected.
    C["gc_show_genome_btn"].click(
        fn=lambda state: (
            gr.Tabs(selected="quest3"),
            state.selected_gene if state and state.selected_gene else None,
        ),
        inputs=[C["state"]],
        outputs=[C["tabs"], C["q3_contig_dropdown"]],
    )

    C["gc_show_protein_btn"].click(
        fn=lambda state: (
            gr.Tabs(selected="quest4"),
            state.selected_gene if state and state.selected_gene else None,
        ),
        inputs=[C["state"]],
        outputs=[C["tabs"], C["q4_gene_dropdown"]],
    )

    C["gc_pin_card_btn"].click(
        fn=lambda state: on_pin_gene(state.selected_gene if state else "", state),
        inputs=[C["state"]],
        outputs=[C["q2_backpack_display"], C["state"]],
    )

    C["gc_download_gene_btn"].click(
        fn=lambda state: on_download_gene_report(state, DATA),
        inputs=[C["state"]],
        outputs=[C["gc_gene_report_file"]],
    )

    # -- Final Report --
    # NOTE(review): two components appear twice (value + visibility);
    # on_generate_report must return exactly 7 values in this order — confirm.
    C["final_generate_btn"].click(
        fn=lambda state: on_generate_report(state, DATA),
        inputs=[C["state"]],
        outputs=[
            C["final_report_md"],
            C["final_download_json"],
            C["final_download_csv"],
            C["final_download_json"],  # visibility
            C["final_download_csv"],  # visibility
            C["final_achievements_html"],
            C["state"],
        ],
    )


# ===========================================================
# Launch
# ===========================================================
if __name__ == "__main__":
    demo.launch()
data/89_line_PAV.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4eaac454bd3103e8738cf483fc45ac821c8ae0dc99d28c5abd383b0005ef38c
3
+ size 10437803
data/Pigenopea_89lines_pangenome_clean.all_leAED1_sort_byPerl_clean.gff ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a288ef682398b642d4024a09f55727d6391fb164b8b6d72603b13796b6b95eba
3
+ size 64791788
data/Pigenopea_89lines_pangenome_clean.fasta ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a282a51d87323f1effd70ee43a9ab7b95dbc6584f4df0e6b3c0d095a0b992f7
3
+ size 637456599
data/Pigenopea_89lines_pangenome_clean_bg1k_protein.fasta ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be9efd3242fffb67a569937fb4842027238c86a245f303b5521d48ec84c50389
3
+ size 18085493
precomputed/cluster_markers.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b195070da682210273a73da98b3aa3bcf8146d1c9dd6aa1094ff0e7b17ca0485
3
+ size 6411
precomputed/genome_contig_index.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7184ff7812f745877dd62bda4310f7336906e3aa3658908fee9a767dbf53d04d
3
+ size 11022538
precomputed/gff_gene_index.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d48537f3aae0c63e4afb65831be8762f805a5215b7ff0fff64de44a7385d50e
3
+ size 1247901
precomputed/hotspot_bins.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbf2c937e61d39a60a1d51e20bcd6b0c5dd681429a763b43467d5d9e95fcfa59
3
+ size 132552
precomputed/line_embedding.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97492cd82529decb539371fde3696da357b670ec44ed1a4f32015fc6ad621ef8
3
+ size 4680
precomputed/line_similarity_topk.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8339f144d99301b979159e14e8a413364727a8f93dcc4801945aee7671ef9e10
3
+ size 8063
precomputed/line_stats.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:645aba02382a23a3cdb476bb3e24f9111fb806909d076732bead9662d8eba4b6
3
+ size 4672
precomputed/pav_gene_frequency.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ecd91ec6198f2d8489177a03d81e9c62b6edcc4575994eb534c9c4507c38071d
3
+ size 376243
precomputed/pav_matrix.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24abc505527d02d5d1c465f0b93c1902833e95c224ad00f3ccc9ee881173729c
3
+ size 490920
precomputed/protein_index.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c67ced53bb82255c9027d4d492acf8945bcc0c28ec233a73ced14054a482649
3
+ size 1623053
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ plotly>=5.24
2
+ pandas>=2.2
3
+ pyarrow>=17.0
4
+ numpy>=1.26
5
+ scipy>=1.14
6
+ scikit-learn>=1.5
7
+ umap-learn>=0.5.7
8
+ biopython>=1.84
9
+ pyfaidx>=0.8
10
+ Pillow
11
+ reportlab
scripts/run_precompute.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ CLI script to generate all precomputed data.
4
+ Usage: python scripts/run_precompute.py --data-dir data/ --output-dir precomputed/
5
+ """
6
+
7
+ import argparse
8
+ import sys
9
+ import os
10
+ import time
11
+
12
+ # Add project root to path
13
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
14
+
15
+ from src.data_loader import (
16
+ load_pav, parse_gff_genes, parse_protein_fasta,
17
+ build_contig_index, build_contig_name_mapping, validate_joins,
18
+ )
19
+ from src.precompute import (
20
+ compute_gene_frequency, compute_line_stats, compute_line_embedding,
21
+ compute_similarity_topk, build_gff_gene_parquet, build_protein_parquet,
22
+ save_contig_index, compute_hotspot_bins, compute_cluster_markers,
23
+ )
24
+ from src.utils import logger, find_file
25
+
26
+
27
def main():
    """Run the full precompute pipeline.

    Loads the raw PAV/GFF/FASTA inputs from ``--data-dir``, validates the
    joins between them, and writes every derived parquet/JSON artefact into
    ``--output-dir``. Exits with status 1 when a required input is missing.
    """
    # Hoisted from mid-function (was imported between two uses of data_dir).
    from pathlib import Path

    parser = argparse.ArgumentParser(description="Precompute pangenome data")
    parser.add_argument("--data-dir", default="data/", help="Input data directory")
    parser.add_argument("--output-dir", default="precomputed/", help="Output directory")
    args = parser.parse_args()

    data_dir = os.path.abspath(args.data_dir)
    output_dir = os.path.abspath(args.output_dir)
    os.makedirs(output_dir, exist_ok=True)

    t_total = time.time()

    # 1. Load raw data
    logger.info("=== Phase 1: Loading raw data ===")
    pav_path = os.path.join(data_dir, "89_line_PAV.txt")
    data_p = Path(data_dir)

    gff_files = list(data_p.glob("*.gff"))
    protein_files = list(data_p.glob("*protein*.fasta"))
    genome_files = [f for f in data_p.glob("*.fasta") if "protein" not in f.name]

    # Fail fast with a clear message for every required input. (The original
    # checked GFF/protein but let a missing PAV file surface as a raw I/O error.)
    if not os.path.exists(pav_path):
        logger.error("PAV file 89_line_PAV.txt not found in data directory")
        sys.exit(1)
    if not gff_files:
        logger.error("No GFF file found in data directory")
        sys.exit(1)
    if not protein_files:
        logger.error("No protein FASTA file found in data directory")
        sys.exit(1)

    pav = load_pav(pav_path)
    gff_genes = parse_gff_genes(str(gff_files[0]))
    protein_index = parse_protein_fasta(str(protein_files[0]))

    # Genome FASTA is optional: without it the contig index stays empty.
    contig_index = {}
    if genome_files:
        contig_index = build_contig_index(str(genome_files[0]))
    else:
        logger.warning("No genome FASTA found; contig index will be empty")

    # Validation
    logger.info("=== Validation ===")
    contig_mapping = build_contig_name_mapping(gff_genes, contig_index)
    report = validate_joins(pav, gff_genes, protein_index, contig_index)
    for k, v in report.items():
        logger.info(f"  {k}: {v}")

    # 2. Compute derived data
    logger.info("=== Phase 2: Computing derived data ===")

    gene_freq = compute_gene_frequency(pav)
    gene_freq.to_parquet(os.path.join(output_dir, "pav_gene_frequency.parquet"), index=False)

    line_stats = compute_line_stats(pav)
    line_stats.to_parquet(os.path.join(output_dir, "line_stats.parquet"), index=False)

    embedding = compute_line_embedding(pav)
    embedding.to_parquet(os.path.join(output_dir, "line_embedding.parquet"), index=False)

    similarity = compute_similarity_topk(pav, k=15)
    similarity.to_parquet(os.path.join(output_dir, "line_similarity_topk.parquet"), index=False)

    build_gff_gene_parquet(gff_genes, os.path.join(output_dir, "gff_gene_index.parquet"))
    build_protein_parquet(protein_index, os.path.join(output_dir, "protein_index.parquet"))
    save_contig_index(contig_index, contig_mapping, os.path.join(output_dir, "genome_contig_index.json"))

    hotspots = compute_hotspot_bins(gff_genes, gene_freq, contig_index)
    hotspots.to_parquet(os.path.join(output_dir, "hotspot_bins.parquet"), index=False)

    markers = compute_cluster_markers(pav, embedding)
    markers.to_parquet(os.path.join(output_dir, "cluster_markers.parquet"), index=False)

    # Also save the PAV matrix as parquet for efficient loading
    # (index kept: gene ids are the row labels).
    pav.to_parquet(os.path.join(output_dir, "pav_matrix.parquet"))

    dt = time.time() - t_total
    logger.info(f"=== All precomputation done in {dt:.1f}s ===")

    # List output files with their sizes as a final sanity check.
    for f in sorted(Path(output_dir).glob("*")):
        size_mb = f.stat().st_size / 1024 / 1024
        logger.info(f"  {f.name}: {size_mb:.2f} MB")


if __name__ == "__main__":
    main()
src/__init__.py ADDED
File without changes
src/callbacks.py ADDED
@@ -0,0 +1,524 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """All Gradio callbacks for the Pigeon Pea Pangenome Atlas."""
2
+
3
import gradio as gr
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from src.state import AppState
from src.gene_card import build_gene_card, render_gene_card_html, export_gene_report
from src.field_report import generate_field_report, export_report_json, export_report_csv
11
+
12
# Color palettes
# Fixed colors for the core/shell/cloud gene classes (green/amber/red, grey
# for unclassified genes); Set3 qualitative palette for country coloring.
CORE_COLORS = {"core": "#2E7D32", "shell": "#FFC107", "cloud": "#F44336", "unknown": "#9E9E9E"}
COUNTRY_COLORS = px.colors.qualitative.Set3
15
+
16
+
17
+ # ============================================================
18
+ # Quest 0 Callbacks
19
+ # ============================================================
20
+
21
def on_line_selected(line_id: str, state: AppState, data: dict) -> tuple:
    """Handle the Quest-0 line dropdown change.

    Looks up the per-line stats row and the most similar neighbour
    (highest Jaccard score in the precomputed top-k table) for ``line_id``.

    Returns:
        (total_genes, unique_genes, nearest_neighbor, state) — display
        strings, "--" when the line is unknown or nothing is selected.
    """
    if state is None:
        state = AppState()
    if not line_id:
        # Bug fix: previously any falsy line_id replaced the whole AppState,
        # silently discarding the session (backpack, achievements, party).
        return "--", "--", "--", state
    state.selected_line = line_id

    line_stats = data["line_stats"]
    similarity = data["similarity"]

    row = line_stats[line_stats["line_id"] == line_id]
    if len(row) == 0:
        return "--", "--", "--", state

    total_genes = str(int(row.iloc[0]["genes_present_count"]))
    unique_genes = str(int(row.iloc[0]["unique_genes_count"]))

    # Nearest neighbour: best Jaccard score among this line's top-k rows.
    sim_rows = similarity[similarity["line_id"] == line_id]
    if len(sim_rows) > 0:
        top = sim_rows.nlargest(1, "jaccard_score").iloc[0]
        nearest = f"{top['neighbor_line_id']} ({top['jaccard_score']:.3f})"
    else:
        nearest = "--"

    return total_genes, unique_genes, nearest, state
49
+
50
+
51
def on_start_journey(state: AppState) -> tuple:
    """Award the Explorer achievement and navigate to the Quest 1 tab.

    Fix: this function uses ``gr.Tabs`` but the module never imported
    gradio, so the first click raised NameError; ``import gradio as gr``
    is added to the module's import block.
    """
    if state is None:
        state = AppState()
    state.award("Explorer")
    return gr.Tabs(selected="quest1"), state
57
+
58
+
59
+ # ============================================================
60
+ # Quest 1 Callbacks
61
+ # ============================================================
62
+
63
def build_umap_plot(color_by: str, state: AppState, data: dict) -> go.Figure:
    """Build the UMAP scatter of all lines, colored by country or cluster.

    Args:
        color_by: "Country" for country colors, anything else for cluster ids.
        state: session state; if a line is selected it gets a star marker.
        data: loaded DATA dict (uses "embedding" and "line_stats").
    """
    embedding = data["embedding"]
    line_stats = data["line_stats"]

    # Left join so every embedded line keeps a row even without metadata.
    df = embedding.merge(line_stats[["line_id", "country"]], on="line_id", how="left")
    df["country"] = df["country"].fillna("Unknown")

    color_col = "country" if color_by == "Country" else "cluster_id"
    if color_col == "cluster_id":
        # Cast to str so Plotly treats clusters as discrete categories,
        # not a continuous color scale.
        df["cluster_id"] = df["cluster_id"].astype(str)

    fig = px.scatter(
        df, x="umap_x", y="umap_y", color=color_col,
        hover_data=["line_id", "country"],
        title="UMAP of 89 Pigeon Pea Lines",
        labels={"umap_x": "UMAP 1", "umap_y": "UMAP 2"},
        color_discrete_sequence=COUNTRY_COLORS if color_by == "Country" else px.colors.qualitative.Bold,
    )

    # Highlight selected line
    if state and state.selected_line:
        sel = df[df["line_id"] == state.selected_line]
        if len(sel) > 0:
            # Overlay a single red star on top of the scatter; hovertext is
            # the line id so lasso/box selections can recover it.
            fig.add_trace(go.Scatter(
                x=sel["umap_x"], y=sel["umap_y"],
                mode="markers",
                marker=dict(symbol="star", size=18, color="red", line=dict(width=2, color="black")),
                name=f"Your line: {state.selected_line}",
                hovertext=state.selected_line,
            ))

    fig.update_layout(
        plot_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=-0.3),
    )
    return fig
100
+
101
+
102
def on_umap_select(selected_data, state: AppState) -> tuple:
    """Turn a UMAP lasso/box selection event into the session's party list.

    Returns (summary_text, state); an empty or missing selection clears
    the party.
    """
    if state is None:
        state = AppState()

    if selected_data and "points" in selected_data:
        # Point labels carry the line id (hovertext preferred, text fallback).
        labels = [pt.get("hovertext", pt.get("text", "")) for pt in selected_data["points"]]
        labels = [name for name in labels if name]
        state.selected_party = labels
        summary = f"Selected {len(labels)} lines: " + ", ".join(labels[:10])
        if len(labels) > 10:
            summary += f" ... +{len(labels) - 10} more"
    else:
        state.selected_party = []
        summary = "None selected"

    return summary, state
117
+
118
+
119
def on_compare_party(state: AppState, data: dict) -> tuple:
    """Compare the user's selected line against the selected party.

    Returns (figure, True). NOTE(review): the trailing True looks like a
    visibility flag — the app wires both outputs to the same plot component;
    confirm the intended second target.
    """
    if not state or not state.selected_line or not state.selected_party:
        fig = go.Figure()
        fig.add_annotation(text="Select your line and a party first", showarrow=False)
        return fig, True

    pav = data.get("pav")
    if pav is None:
        fig = go.Figure()
        fig.add_annotation(text="PAV data not loaded", showarrow=False)
        return fig, True

    # PAV matrix: rows indexed by gene id, one presence column per line
    # (presence encoded as 1 — see the == 1 tests below).
    my_genes = set(pav.index[pav[state.selected_line] == 1])
    party_cols = [c for c in state.selected_party if c in pav.columns and c != state.selected_line]
    if not party_cols:
        fig = go.Figure()
        fig.add_annotation(text="No valid party members", showarrow=False)
        return fig, True

    # Union of genes present in at least one party member.
    party_genes = set()
    for col in party_cols:
        party_genes |= set(pav.index[pav[col] == 1])

    shared = len(my_genes & party_genes)
    only_mine = len(my_genes - party_genes)
    only_party = len(party_genes - my_genes)

    fig = go.Figure(data=[
        go.Bar(name="Shared", x=["Gene Sets"], y=[shared], marker_color="#2E7D32"),
        go.Bar(name=f"Only {state.selected_line}", x=["Gene Sets"], y=[only_mine], marker_color="#1565C0"),
        go.Bar(name="Only Party", x=["Gene Sets"], y=[only_party], marker_color="#FFC107"),
    ])
    fig.update_layout(
        barmode="group",
        title=f"Gene Comparison: {state.selected_line} vs {len(party_cols)} party members",
        yaxis_title="Number of genes",
    )
    return fig, True
158
+
159
+
160
+ # ============================================================
161
+ # Quest 2 Callbacks
162
+ # ============================================================
163
+
164
def build_donut_chart(core_thresh: float, cloud_thresh: float, data: dict) -> go.Figure:
    """Donut chart of core/shell/cloud gene counts at the given thresholds.

    Core: freq_pct >= core_thresh; Cloud: freq_pct < cloud_thresh;
    Shell: everything in between.
    """
    freqs = data["gene_freq"]["freq_pct"]

    n_core = int((freqs >= core_thresh).sum())
    n_cloud = int((freqs < cloud_thresh).sum())
    n_shell = len(freqs) - n_core - n_cloud

    donut = go.Pie(
        labels=["Core", "Shell", "Cloud"],
        values=[n_core, n_shell, n_cloud],
        hole=0.5,
        marker_colors=[CORE_COLORS[name] for name in ("core", "shell", "cloud")],
        textinfo="label+value+percent",
    )
    fig = go.Figure(data=[donut])
    fig.update_layout(
        title=f"Gene Classification (Core>={core_thresh}%, Cloud<{cloud_thresh}%)",
        showlegend=True,
    )
    return fig
184
+
185
+
186
def build_frequency_histogram(core_thresh: float, cloud_thresh: float, data: dict) -> go.Figure:
    """Overlayed per-class histograms of gene frequency with threshold guides."""
    freq_df = data["gene_freq"]

    fig = go.Figure()
    # One semi-transparent trace per class, using the precomputed core_class
    # column; "unknown" is deliberately skipped.
    for klass, shade in CORE_COLORS.items():
        if klass == "unknown":
            continue
        rows = freq_df[freq_df["core_class"] == klass]
        fig.add_trace(go.Histogram(
            x=rows["freq_pct"],
            name=klass.capitalize(),
            marker_color=shade,
            opacity=0.75,
            nbinsx=50,
        ))

    fig.update_layout(
        barmode="overlay",
        title="Gene Frequency Distribution",
        xaxis_title="Frequency (%)",
        yaxis_title="Count",
    )
    # Dashed vertical guides at the user-chosen core/cloud cutoffs.
    fig.add_vline(x=core_thresh, line_dash="dash", line_color="green",
                  annotation_text=f"Core>={core_thresh}%")
    fig.add_vline(x=cloud_thresh, line_dash="dash", line_color="red",
                  annotation_text=f"Cloud<{cloud_thresh}%")
    return fig
215
+
216
+
217
def build_treasure_table(state: AppState, core_thresh: float, cloud_thresh: float,
                         filter_type: str, data: dict) -> pd.DataFrame:
    """Build the filterable gene 'treasure' table.

    Reclassifies genes with the current thresholds, marks presence in the
    user's line, applies the chosen filter, and returns at most 500 rows
    sorted rarest-first with columns
    [gene_id, freq_count, freq_pct, core_class, in_my_line].
    """
    gene_freq = data["gene_freq"].copy()  # copy: columns are added/overwritten below

    # Reclassify based on current thresholds (overrides the precomputed class).
    gene_freq["core_class"] = gene_freq["freq_pct"].apply(
        lambda x: "core" if x >= core_thresh else ("cloud" if x < cloud_thresh else "shell")
    )

    # Add in_my_line column
    pav = data.get("pav")
    if pav is not None and state and state.selected_line and state.selected_line in pav.columns:
        my_presence = pav[state.selected_line]
        gene_freq["in_my_line"] = gene_freq["gene_id"].map(
            lambda g: "Yes" if g in my_presence.index and my_presence.get(g, 0) == 1 else "No"
        )
    else:
        # No PAV matrix or no selected line: presence is unknowable.
        gene_freq["in_my_line"] = "N/A"

    # Filter
    if filter_type == "Unique to my line":
        if pav is not None and state and state.selected_line:
            # Present in exactly one line, and that line is ours
            # (assumes a strict 0/1 matrix — TODO confirm).
            unique_mask = (pav.sum(axis=1) == 1) & (pav[state.selected_line] == 1)
            unique_genes = set(pav.index[unique_mask])
            gene_freq = gene_freq[gene_freq["gene_id"].isin(unique_genes)]
    elif filter_type == "Rare (<5 lines)":
        # NOTE(review): label says "<5" but this keeps freq_count == 5 too —
        # confirm whether the cutoff should be strict.
        gene_freq = gene_freq[gene_freq["freq_count"] <= 5]
    elif filter_type == "Cluster markers":
        markers = data.get("markers")
        if markers is not None:
            marker_genes = set(markers["gene_id"])
            gene_freq = gene_freq[gene_freq["gene_id"].isin(marker_genes)]

    # Sort and limit: rarest genes first, capped at 500 rows for UI speed.
    gene_freq = gene_freq.sort_values("freq_count", ascending=True).head(500)
    return gene_freq[["gene_id", "freq_count", "freq_pct", "core_class", "in_my_line"]]
254
+
255
+
256
def on_pin_gene(gene_id: str, state: AppState) -> tuple:
    """Pin a gene into the session backpack.

    Returns (backpack_display_text, state); duplicate pins are reported
    but not added twice.
    """
    if state is None:
        state = AppState()
    if not gene_id or gene_id == "Click a row to select":
        return "Select a gene first", state

    was_added = state.add_to_backpack(gene_id)
    contents = ", ".join(state.backpack_genes) if state.backpack_genes else "Empty"
    if was_added:
        return contents, state
    return f"(already in backpack) {contents}", state
268
+
269
+
270
def on_gene_click_table(evt, state: AppState) -> tuple:
    """Record the gene chosen from a treasure-table row click.

    Returns (display_text, state); a click event without a value leaves
    the selection unchanged and shows the placeholder prompt.
    """
    if state is None:
        state = AppState()

    # Guard clause: nothing usable in the event payload.
    if evt is None or not hasattr(evt, 'value'):
        return "Click a row to select", state

    chosen = str(evt.value)
    state.selected_gene = chosen
    return chosen, state
279
+
280
+
281
+ # ============================================================
282
+ # Quest 3 Callbacks
283
+ # ============================================================
284
+
285
def build_hotspot_heatmap(data: dict, top_n_contigs: int = 20) -> go.Figure:
    """Heatmap of variability scores per genomic bin for the busiest contigs.

    Args:
        data: loaded DATA dict (uses "hotspots": one row per contig/bin with
              total_genes and variability_score columns).
        top_n_contigs: number of contigs (ranked by summed gene count) to show.
    """
    hotspots = data["hotspots"]

    # Top N contigs by total genes
    contig_counts = hotspots.groupby("contig_id")["total_genes"].sum()
    top_contigs = contig_counts.nlargest(top_n_contigs).index.tolist()
    subset = hotspots[hotspots["contig_id"].isin(top_contigs)]

    if len(subset) == 0:
        fig = go.Figure()
        fig.add_annotation(text="No hotspot data available", showarrow=False)
        return fig

    # Contig x bin grid; max aggregates duplicate bins, 0 fills gaps.
    pivot = subset.pivot_table(
        index="contig_id", columns="bin_start",
        values="variability_score", aggfunc="max"
    ).fillna(0)

    # Shorten contig names for display
    short_names = [c.split("|")[-1] if "|" in c else c[:30] for c in pivot.index]

    fig = go.Figure(data=go.Heatmap(
        z=pivot.values,
        x=[f"{int(c/1000)}kb" for c in pivot.columns],  # bin starts as kb labels
        y=short_names,
        # Green (stable) -> amber -> red (highly variable).
        colorscale=[[0, "#E8F5E9"], [0.5, "#FFC107"], [1.0, "#F44336"]],
        colorbar_title="Variability",
        hovertemplate="Contig: %{y}<br>Bin: %{x}<br>Score: %{z:.1f}<extra></extra>",
    ))
    fig.update_layout(
        title=f"Genomic Variability Heatmap (Top {top_n_contigs} contigs)",
        xaxis_title="Genomic position",
        yaxis_title="Contig",
        height=600,
    )
    return fig
322
+
323
+
324
def on_contig_selected(contig_id: str, data: dict, state: AppState) -> tuple:
    """Build the per-contig gene track plot and its companion gene table.

    Returns (plotly figure, DataFrame of the contig's genes sorted by start).
    """
    if not contig_id:
        return go.Figure(), pd.DataFrame()

    gff = data["gff_index"]
    gene_freq = data["gene_freq"]

    # Attach frequency/class info to every gene annotated on this contig.
    genes = gff[gff["contig_id"] == contig_id].merge(
        gene_freq[["gene_id", "core_class", "freq_pct"]], on="gene_id", how="left"
    )
    genes["core_class"] = genes["core_class"].fillna("unknown")

    if len(genes) == 0:
        fig = go.Figure()
        fig.add_annotation(text="No genes on this contig", showarrow=False)
        return fig, pd.DataFrame()

    fig = go.Figure()
    # One horizontal "tick" trace per core class, placed at gene midpoints.
    for cls, color in CORE_COLORS.items():
        cls_genes = genes[genes["core_class"] == cls]
        if len(cls_genes) == 0:
            continue
        fig.add_trace(go.Scatter(
            x=(cls_genes["start"] + cls_genes["end"]) / 2,
            y=[cls] * len(cls_genes),
            mode="markers",
            marker=dict(
                symbol="line-ew", size=12, color=color,
                line=dict(width=2, color=color),
            ),
            name=cls.capitalize(),
            text=cls_genes["gene_id"],
            hovertemplate="Gene: %{text}<br>Position: %{x:,.0f}<extra></extra>",
        ))

    short_name = contig_id.split("|")[-1] if "|" in contig_id else contig_id[:30]
    fig.update_layout(
        title=f"Gene Track: {short_name}",
        xaxis_title="Genomic position (bp)",
        yaxis_title="Gene class",
        showlegend=True,
    )

    table_df = genes[["gene_id", "start", "end", "strand", "core_class", "freq_pct"]].sort_values("start")
    return fig, table_df
370
+
371
+
372
+ # ============================================================
373
+ # Quest 4 Callbacks
374
+ # ============================================================
375
+
376
def get_protein_stats_html(gene_id: str, data: dict) -> str:
    """Render a small HTML card with the gene's protein length and top amino acids."""
    if not gene_id:
        return "<p>Select a gene</p>"

    proteins = data["protein"]
    hit = proteins[proteins["gene_id"] == gene_id]
    if hit.empty:
        # Not every PAV gene has a translated protein entry.
        return "<p><i>No protein data available for this gene.</i></p>"

    rec = hit.iloc[0]
    return (
        f"<div class='stat-card'>"
        f"<p><b>Protein Length:</b> {int(rec['protein_length'])} aa</p>"
        f"<p><b>Top Amino Acids:</b> {rec['composition_summary']}</p>"
        f"</div>"
    )
393
+
394
+
395
def build_backpack_comparison(state: AppState, data: dict) -> go.Figure:
    """Bar chart comparing protein lengths of all pinned (backpack) genes."""
    # A comparison needs at least two pinned genes.
    if not state or len(state.backpack_genes) < 2:
        placeholder = go.Figure()
        placeholder.add_annotation(text="Pin at least 2 genes to compare", showarrow=False)
        return placeholder

    proteins = data["protein"]
    pinned = proteins[proteins["gene_id"].isin(state.backpack_genes)]

    fig = go.Figure(data=[go.Bar(
        x=pinned["gene_id"],
        y=pinned["protein_length"],
        marker_color="#2E7D32",
        text=pinned["protein_length"],
        textposition="auto",
    )])
    fig.update_layout(
        title="Backpack Genes: Protein Length Comparison",
        xaxis_title="Gene",
        yaxis_title="Protein Length (aa)",
    )
    return fig
418
+
419
+
420
def build_composition_heatmap(state: AppState, data: dict) -> go.Figure:
    """Heatmap of amino-acid composition across the pinned (backpack) genes.

    Percentages are recovered from each protein's ``composition_summary``
    string (format: "A:12.3%, L:9.8%, ..."). Malformed tokens are skipped
    instead of raising ValueError and crashing the UI callback.
    """
    if not state or len(state.backpack_genes) < 2:
        fig = go.Figure()
        fig.add_annotation(text="Pin at least 2 genes to compare", showarrow=False)
        return fig

    # Parse composition from summary strings
    protein = data["protein"]
    bp_prot = protein[protein["gene_id"].isin(state.backpack_genes)]

    aa_data = {}  # gene_id -> {amino acid: percentage}
    for _, row in bp_prot.iterrows():
        gene_id = row["gene_id"]
        aa_dict = {}
        for item in row["composition_summary"].split(", "):
            parts = item.split(":")
            if len(parts) != 2:
                continue
            try:
                pct = float(parts[1].replace("%", ""))
            except ValueError:
                # Tolerate a malformed percentage token rather than failing.
                continue
            aa_dict[parts[0].strip()] = pct
        aa_data[gene_id] = aa_dict

    if not aa_data:
        fig = go.Figure()
        fig.add_annotation(text="No composition data", showarrow=False)
        return fig

    # Rows = genes, columns = amino acids; absent AAs filled with 0.
    df = pd.DataFrame(aa_data).fillna(0).T
    fig = go.Figure(data=go.Heatmap(
        z=df.values,
        x=df.columns.tolist(),
        y=df.index.tolist(),
        colorscale="YlGn",
        colorbar_title="%",
    ))
    fig.update_layout(
        title="Amino Acid Composition Heatmap",
        xaxis_title="Amino Acid",
        yaxis_title="Gene",
    )
    return fig
463
+
464
+
465
+ # ============================================================
466
+ # Gene Card Callbacks
467
+ # ============================================================
468
+
469
def on_open_gene_card(gene_id: str, state: AppState, data: dict) -> tuple:
    """Open the Gene Card side panel for `gene_id`.

    Returns (card HTML, panel-visible flag, updated state).
    """
    if not gene_id:
        return "", False, state

    state.selected_gene = gene_id
    rendered = render_gene_card_html(build_gene_card(gene_id, data))
    # Viewing a gene card earns the badge.
    state.award("Gene Hunter")
    return rendered, True, state
479
+
480
+
481
def on_download_gene_report(state: AppState, data: dict) -> str:
    """Export the currently selected gene's report; None when nothing is selected."""
    if not (state and state.selected_gene):
        return None
    return export_gene_report(state.selected_gene, data)
486
+
487
+
488
+ # ============================================================
489
+ # Final Report Callbacks
490
+ # ============================================================
491
+
492
def on_generate_report(state: AppState, data: dict) -> tuple:
    """Generate the final field report plus downloadable JSON/CSV exports.

    Returns (markdown, json path, csv path, show-json, show-csv, badges HTML, state).
    """
    state = state if state is not None else AppState()
    # Completing the report earns this badge.
    state.award("Cartographer")

    report_md = generate_field_report(state, data)
    json_path = export_report_json(state, data)
    csv_path = export_report_csv(state, data)

    # One badge span per achievement, in sorted order for stable display.
    badge_html = " ".join(
        f'<span class="achievement-badge">{name}</span>'
        for name in sorted(state.achievements)
    )
    return report_md, json_path, csv_path, True, True, badge_html, state
507
+
508
+
509
+ # ============================================================
510
+ # Data Health
511
+ # ============================================================
512
+
513
def build_data_health_html(validation_report: dict) -> str:
    """Render the validation report as a two-column HTML table.

    Float values are coverage percentages and are shown to one decimal place.
    """
    cells = []
    for key, value in validation_report.items():
        shown = f"{value:.1f}%" if isinstance(value, float) else value
        cells.append(f"<tr><td><b>{key}</b></td><td>{shown}</td></tr>")
    return f"<table style='width:100%'>{''.join(cells)}</table>"
521
+
522
+
523
+ # Need gr import for Tabs update
524
+ import gradio as gr
src/data_loader.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Data parsing and validation for the Pigeon Pea Pangenome Atlas."""
2
+
3
+ import re
4
+ import pandas as pd
5
+ import numpy as np
6
+ from pathlib import Path
7
+ from collections import Counter
8
+
9
+ from src.utils import logger, timer
10
+
11
+
12
@timer
def load_pav(path: str) -> pd.DataFrame:
    """
    Load 89_line_PAV.txt.

    Returns DataFrame: index=gene_id (str), columns=line_ids (str),
    values=int {0,1}.

    Raises ValueError when the matrix contains anything other than 0/1.
    (A plain raise, unlike the previous `assert`, survives `python -O`.)
    """
    df = pd.read_csv(path, sep="\t", index_col=0)
    df.index.name = "gene"
    df.index = df.index.astype(str)
    df.columns = df.columns.astype(str)
    # Validate all values are 0 or 1
    unique_vals = set(df.values.flatten())
    bad = unique_vals - {0, 1}
    if bad:
        raise ValueError(f"PAV contains values other than 0/1: {bad}")
    logger.info(f"PAV matrix loaded: {df.shape[0]} genes x {df.shape[1]} lines")
    return df
27
+
28
+
29
@timer
def parse_gff_genes(path: str) -> pd.DataFrame:
    """
    Parse GFF3; keep only feature == 'gene' rows.
    Returns DataFrame: gene_id, contig_id, start, end, strand.
    """
    rows = []
    with open(path, "r") as handle:
        for raw in handle:
            # Skip comment/pragma lines and anything short of 9 GFF columns.
            if raw.startswith("#"):
                continue
            fields = raw.strip().split("\t")
            if len(fields) < 9 or fields[2] != "gene":
                continue

            contig_id = fields[0]
            start = int(fields[3])
            end = int(fields[4])
            strand = fields[6]

            # Column 9 attributes: the ID=<value> token names the gene.
            gene_id = None
            for token in fields[8].split(";"):
                token = token.strip()
                if token.startswith("ID="):
                    gene_id = token[3:]
                    break

            if gene_id:
                rows.append({
                    "gene_id": gene_id,
                    "contig_id": contig_id,
                    "start": start,
                    "end": end,
                    "strand": strand,
                })

    df = pd.DataFrame(rows)
    logger.info(f"GFF parsed: {len(df)} genes on {df['contig_id'].nunique()} contigs")
    return df
72
+
73
+
74
@timer
def parse_protein_fasta(path: str) -> pd.DataFrame:
    """
    Returns DataFrame: gene_id, protein_length, composition_summary.
    gene_id = header token after '>' up to first whitespace.
    """
    records = []
    header = None
    chunks = []

    def emit():
        # Finalize the record accumulated so far (if any sequence was read).
        if not header or not chunks:
            return
        seq = "".join(chunks).replace("*", "")  # drop stop-codon symbols
        n = len(seq)
        denom = max(n, 1)  # guard the zero-length edge case
        top5 = sorted(Counter(seq).items(), key=lambda kv: -kv[1])[:5]
        summary = ", ".join(f"{aa}:{cnt/denom*100:.1f}%" for aa, cnt in top5)
        records.append({
            "gene_id": header,
            "protein_length": n,
            "composition_summary": summary,
        })

    with open(path, "r") as fh:
        for raw in fh:
            raw = raw.strip()
            if raw.startswith(">"):
                emit()
                header = raw[1:].split()[0]
                chunks = []
            else:
                chunks.append(raw)
    emit()  # flush the final record

    df = pd.DataFrame(records)
    logger.info(f"Protein FASTA parsed: {len(df)} proteins")
    return df
112
+
113
+
114
@timer
def build_contig_index(path: str) -> dict:
    """
    Returns dict: {contig_id: length}.
    Single sequential pass over the FASTA file.
    """
    lengths = {}
    name = None
    size = 0

    with open(path, "r") as fh:
        for raw in fh:
            if raw.startswith(">"):
                # Close out the previous record before starting a new one.
                if name is not None:
                    lengths[name] = size
                name = raw[1:].strip().split()[0]
                size = 0
            else:
                size += len(raw.strip())
    # Flush the final record.
    if name is not None:
        lengths[name] = size

    logger.info(f"Contig index built: {len(lengths)} contigs")
    return lengths
138
+
139
+
140
def build_contig_name_mapping(gff_genes: pd.DataFrame, contig_index: dict) -> dict:
    """
    Build mapping from GFF contig IDs to FASTA contig IDs.

    Strategy: exact match first, then fall back to matching RefSeq-style
    accessions (e.g. NC_033813.1) embedded in either identifier.
    Returns dict: {gff_contig_id: fasta_contig_id}
    """
    # Compile once: the same accession pattern is applied to both ID sets.
    accession_re = re.compile(r'(N[CWZ]_\d+\.\d+)')

    gff_contigs = set(gff_genes["contig_id"].unique())
    fasta_contigs = set(contig_index.keys())
    mapping = {}

    # Pass 1: exact identifier match.
    for gc in gff_contigs:
        if gc in fasta_contigs:
            mapping[gc] = gc

    # Pass 2: accession-based matching for whatever is still unmatched.
    unmatched = gff_contigs - set(mapping.keys())
    if unmatched:
        fasta_accession_map = {}
        for fc in fasta_contigs:
            m = accession_re.search(fc)
            if m:
                fasta_accession_map[m.group(1)] = fc

        for gc in unmatched:
            m = accession_re.search(gc)
            if m and m.group(1) in fasta_accession_map:
                mapping[gc] = fasta_accession_map[m.group(1)]

    logger.info(f"Contig mapping: {len(mapping)}/{len(gff_contigs)} GFF contigs matched to FASTA")
    return mapping
173
+
174
+
175
def validate_joins(pav: pd.DataFrame, gff_genes: pd.DataFrame,
                   protein_index: pd.DataFrame, contig_index: dict) -> dict:
    """
    Cross-check gene/contig identifiers across the four data sources.
    Returns a report dict of counts, coverage percentages, and orphan count.
    """
    pav_genes = set(pav.index)
    gff_gene_set = set(gff_genes["gene_id"])
    protein_gene_set = set(protein_index["gene_id"])
    fasta_contig_set = set(contig_index.keys())
    gff_contig_set = set(gff_genes["contig_id"])

    def pct(part, whole):
        # max(whole, 1) guards against division by zero on empty inputs.
        return part / max(whole, 1) * 100

    # Orphans: PAV genes absent from both annotation sources.
    orphans = pav_genes - (gff_gene_set | protein_gene_set)

    report = {
        "pav_gene_count": len(pav_genes),
        "gff_gene_count": len(gff_gene_set),
        "protein_gene_count": len(protein_gene_set),
        "pav_genes_in_gff_pct": pct(len(pav_genes & gff_gene_set), len(pav_genes)),
        "pav_genes_in_protein_pct": pct(len(pav_genes & protein_gene_set), len(pav_genes)),
        "gff_contigs_in_fasta_pct": pct(len(gff_contig_set & fasta_contig_set), len(gff_contig_set)),
        "orphan_genes_count": len(orphans),
    }

    if orphans:
        logger.warning(f"{len(orphans)} orphan genes (in PAV but missing from both GFF and protein)")
    for key, val in report.items():
        logger.info(f"  {key}: {val}")

    return report
src/field_report.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Field Report generator for the Pigeon Pea Pangenome Atlas."""
2
+
3
+ import json
4
+ import csv
5
+ import os
6
+ import tempfile
7
+ import pandas as pd
8
+
9
+ from src.state import AppState
10
+
11
+
12
def generate_field_report(state: AppState, data: dict) -> str:
    """
    Generate the markdown field report for the currently selected line.

    Pulls line stats, cluster assignment, nearest neighbors, core/shell/cloud
    composition, rare genes, backpack contents, and achievements from `data`
    and `state`, then fills a markdown template.

    Returns markdown string.
    """
    # Without a chosen line there is nothing to report on.
    if not state.selected_line:
        return "*No line selected. Go back to Quest 0 to choose a line.*"

    line_id = state.selected_line
    line_stats = data["line_stats"]
    embedding = data["embedding"]
    similarity = data["similarity"]
    gene_freq = data["gene_freq"]
    # PAV may be absent; PAV-derived sections then render empty.
    pav = data.get("pav")

    # Line info (defaults cover a line missing from the stats table)
    ls_row = line_stats[line_stats["line_id"] == line_id]
    country = ls_row.iloc[0]["country"] if len(ls_row) > 0 else "Unknown"
    genes_present = int(ls_row.iloc[0]["genes_present_count"]) if len(ls_row) > 0 else 0
    unique_genes = int(ls_row.iloc[0]["unique_genes_count"]) if len(ls_row) > 0 else 0

    # Cluster (-1 = not embedded)
    emb_row = embedding[embedding["line_id"] == line_id]
    cluster_id = int(emb_row.iloc[0]["cluster_id"]) if len(emb_row) > 0 else -1

    # Nearest neighbors: top 3 by Jaccard similarity
    sim_rows = similarity[similarity["line_id"] == line_id].nlargest(3, "jaccard_score")
    neighbors_str = ", ".join(
        f"{r['neighbor_line_id']} ({r['jaccard_score']:.3f})"
        for _, r in sim_rows.iterrows()
    )

    # Core/shell/cloud breakdown restricted to genes present in this line
    core_shell_cloud = ""
    if pav is not None and line_id in pav.columns:
        my_genes = set(pav.index[pav[line_id] == 1])
        my_freq = gene_freq[gene_freq["gene_id"].isin(my_genes)]
        core_count = int((my_freq["core_class"] == "core").sum())
        shell_count = int((my_freq["core_class"] == "shell").sum())
        cloud_count = int((my_freq["core_class"] == "cloud").sum())
        core_shell_cloud = (
            f"- Core genes: **{core_count}**\n"
            f"- Shell genes: **{shell_count}**\n"
            f"- Cloud genes: **{cloud_count}**\n"
        )

    # Top rare genes: present in this line and in <=5 lines overall, rarest first
    rare_genes_str = ""
    if pav is not None and line_id in pav.columns:
        my_genes_list = pav.index[pav[line_id] == 1].tolist()
        rare = gene_freq[
            (gene_freq["gene_id"].isin(my_genes_list)) &
            (gene_freq["freq_count"] <= 5)
        ].nsmallest(5, "freq_count")
        if len(rare) > 0:
            rare_genes_str = "\n".join(
                f"- **{r['gene_id']}** (present in {r['freq_count']} lines)"
                for _, r in rare.iterrows()
            )
        else:
            rare_genes_str = "No rare genes (<=5 lines) found in this line."

    # Backpack: genes the user pinned, annotated with frequency where available
    backpack_str = ""
    if state.backpack_genes:
        backpack_items = []
        for g in state.backpack_genes:
            gf = gene_freq[gene_freq["gene_id"] == g]
            if len(gf) > 0:
                backpack_items.append(
                    f"- **{g}**: {gf.iloc[0]['core_class']} "
                    f"({gf.iloc[0]['freq_count']} lines)"
                )
            else:
                backpack_items.append(f"- **{g}**: no frequency data")
        backpack_str = "\n".join(backpack_items)
    else:
        backpack_str = "No genes pinned to backpack."

    # Achievements (sorted for stable output)
    achievements_str = ", ".join(
        f"**{a}**" for a in sorted(state.achievements)
    ) if state.achievements else "None yet"

    # NOTE(review): template lines appear left-aligned in the source so the
    # markdown renders without accidental code-block indentation — confirm
    # against the deployed file, as the diff view flattens indentation.
    report = f"""
## Field Report: {line_id}

**Country of origin:** {country}
**Cluster:** {cluster_id}
**Genes present:** {genes_present:,}
**Unique genes:** {unique_genes}

### Nearest Neighbors
{neighbors_str}

### Gene Composition
{core_shell_cloud}

### Top 5 Rare Genes
{rare_genes_str}

### Backpack Collection
{backpack_str}

### Achievements
{achievements_str}

---
*Generated by Pigeon Pea Pangenome Atlas*
"""
    return report.strip()
123
+
124
+
125
def export_report_json(state: AppState, data: dict) -> str:
    """Export full report data as JSON. Returns the written file path.

    Achievements are sorted so repeated exports of the same state produce
    byte-identical files (set iteration order is otherwise arbitrary).
    """
    report_data = {
        "selected_line": state.selected_line,
        "backpack_genes": state.backpack_genes,
        "achievements": sorted(state.achievements),
        "core_threshold": state.core_threshold,
        "cloud_threshold": state.cloud_threshold,
    }

    if state.selected_line:
        ls = data["line_stats"]
        row = ls[ls["line_id"] == state.selected_line]
        if len(row) > 0:
            report_data["line_stats"] = row.iloc[0].to_dict()

    # Fresh temp dir per export; Gradio serves the file from this path.
    tmpdir = tempfile.mkdtemp()
    path = os.path.join(tmpdir, "field_report.json")
    with open(path, "w") as f:
        # default=str handles numpy scalars coming out of pandas rows.
        json.dump(report_data, f, indent=2, default=str)
    return path
146
+
147
+
148
def export_report_csv(state: AppState, data: dict) -> str:
    """Export the backpack gene table as CSV. Returns the written file path."""
    tmpdir = tempfile.mkdtemp()
    path = os.path.join(tmpdir, "field_report.csv")

    gene_rows = []
    if state.backpack_genes:
        gene_freq = data["gene_freq"]
        for gene in state.backpack_genes:
            match = gene_freq[gene_freq["gene_id"] == gene]
            if len(match) > 0:
                top = match.iloc[0]
                gene_rows.append({
                    "gene_id": gene,
                    "freq_count": int(top["freq_count"]),
                    "freq_pct": float(top["freq_pct"]),
                    "core_class": top["core_class"],
                })

    if gene_rows:
        with open(path, "w", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=["gene_id", "freq_count", "freq_pct", "core_class"])
            writer.writeheader()
            writer.writerows(gene_rows)
    else:
        # Still emit a header-only file so the download is never empty.
        with open(path, "w") as f:
            f.write("gene_id,freq_count,freq_pct,core_class\n")

    return path
src/gene_card.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Gene Card builder for the Pigeon Pea Pangenome Atlas."""
2
+
3
+ import json
4
+ import os
5
+ import tempfile
6
+ import pandas as pd
7
+
8
+
9
def build_gene_card(gene_id: str, data: dict) -> dict:
    """
    Assemble everything the Gene Card needs for one gene.
    Sources missing the gene yield None fields ("unknown" for core_class).
    """
    card = {"gene_id": gene_id}

    # Frequency / core classification
    gene_freq = data["gene_freq"]
    hit = gene_freq[gene_freq["gene_id"] == gene_id]
    if hit.empty:
        card["freq_count"] = None
        card["freq_pct"] = None
        card["core_class"] = "unknown"
    else:
        first = hit.iloc[0]
        card["freq_count"] = int(first["freq_count"])
        card["freq_pct"] = float(first["freq_pct"])
        card["core_class"] = first["core_class"]

    # Presence/absence across all lines (one row of the PAV matrix)
    pav = data.get("pav")
    if pav is not None and gene_id in pav.index:
        card["presence_vector"] = pav.loc[gene_id]
    else:
        card["presence_vector"] = None

    # Genomic coordinates from the GFF index
    gff = data["gff_index"]
    hit = gff[gff["gene_id"] == gene_id]
    if hit.empty:
        card["contig"] = None
        card["start"] = None
        card["end"] = None
        card["strand"] = None
    else:
        first = hit.iloc[0]
        card["contig"] = first["contig_id"]
        card["start"] = int(first["start"])
        card["end"] = int(first["end"])
        card["strand"] = first["strand"]

    # Protein length and composition
    protein = data["protein"]
    hit = protein[protein["gene_id"] == gene_id]
    if hit.empty:
        card["protein_length"] = None
        card["composition_summary"] = None
    else:
        first = hit.iloc[0]
        card["protein_length"] = int(first["protein_length"])
        card["composition_summary"] = first["composition_summary"]

    return card
63
+
64
+
65
def render_gene_card_html(card: dict) -> str:
    """Render a Gene Card dict (from build_gene_card) as an HTML panel."""
    gene_id = card["gene_id"]

    # Core/shell/cloud badge plus its frequency-band caption.
    badge_map = {
        "core": ('<span class="quest-badge badge-core">Core</span>', "Present in >=95% of lines"),
        "shell": ('<span class="quest-badge badge-shell">Shell</span>', "Present in 15-95% of lines"),
        "cloud": ('<span class="quest-badge badge-cloud">Cloud</span>', "Present in <15% of lines"),
    }
    badge_html, badge_desc = badge_map.get(
        card.get("core_class", "unknown"),
        ('<span class="quest-badge">Unknown</span>', ""),
    )

    # Frequency line (omitted when no frequency data exists).
    freq_text = ""
    if card.get("freq_count") is not None:
        freq_text = f"<p>Present in <b>{card['freq_count']}</b> lines ({card['freq_pct']:.1f}%)</p>"

    # Presence barcode (omitted when the PAV row is unavailable).
    barcode_html = ""
    if card.get("presence_vector") is not None:
        barcode_html = render_presence_barcode(card["presence_vector"])

    # Genomic location
    if card.get("contig"):
        loc_text = (
            f"<p>Contig: <code>{card['contig']}</code></p>"
            f"<p>Position: {card['start']:,} - {card['end']:,} ({card['strand']})</p>"
        )
    else:
        loc_text = "<p><i>No coordinate annotation available</i></p>"

    # Protein summary
    if card.get("protein_length"):
        prot_text = (
            f"<p>Protein length: <b>{card['protein_length']}</b> aa</p>"
            f"<p>Top AAs: {card['composition_summary']}</p>"
        )
    else:
        prot_text = "<p><i>No protein data available</i></p>"

    html = f"""
    <div class="gene-card">
    <h3>Gene: {gene_id}</h3>
    {badge_html}
    <p style="font-size:0.8em;color:#666">{badge_desc}</p>
    {freq_text}
    <h4>Presence Barcode</h4>
    {barcode_html}
    <h4>Genomic Location</h4>
    {loc_text}
    <h4>Protein</h4>
    {prot_text}
    </div>
    """
    return html
123
+
124
+
125
def render_presence_barcode(presence_vector: pd.Series) -> str:
    """
    Render one span per line: class "present" when the value is 1,
    "absent" otherwise (CSS colors them green / light gray).
    """
    cells = "".join(
        f'<span class="{"present" if value == 1 else "absent"}"></span>'
        for value in presence_vector
    )
    return f'<div class="presence-barcode">{cells}</div>'
135
+
136
+
137
def export_gene_report(gene_id: str, data: dict) -> str:
    """Serialize the Gene Card for `gene_id` to a JSON file; return its path."""
    card = build_gene_card(gene_id, data)
    # A pandas Series is not JSON-serializable; flatten to {line_id: 0/1}.
    vector = card.get("presence_vector")
    if vector is not None:
        card["presence_vector"] = vector.to_dict()

    out_path = os.path.join(tempfile.mkdtemp(), f"{gene_id}_report.json")
    with open(out_path, "w") as fh:
        json.dump(card, fh, indent=2, default=str)
    return out_path
src/precompute.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Offline precomputation for the Pigeon Pea Pangenome Atlas."""
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ from scipy.spatial.distance import pdist, squareform
6
+ from sklearn.cluster import KMeans
7
+ from sklearn.metrics import silhouette_score
8
+
9
+ from src.utils import logger, timer, parse_country
10
+
11
+
12
@timer
def compute_gene_frequency(pav: pd.DataFrame) -> pd.DataFrame:
    """
    Per-gene presence frequency and core classification.
    Output columns: gene_id, freq_count, freq_pct, core_class
    (core >=95%, shell 15-95%, cloud <15%).
    """
    total_lines = pav.shape[1]
    counts = pav.sum(axis=1).astype(int)
    pcts = (counts / total_lines * 100).round(2)

    def bucket(p):
        # Thresholds follow the pangenome convention used across the app.
        return "core" if p >= 95 else ("shell" if p >= 15 else "cloud")

    classes = pcts.map(bucket)

    df = pd.DataFrame({
        "gene_id": pav.index,
        "freq_count": counts.values,
        "freq_pct": pcts.values,
        "core_class": classes.values,
    })
    logger.info(f"Gene frequency: {(df['core_class']=='core').sum()} core, "
                f"{(df['core_class']=='shell').sum()} shell, "
                f"{(df['core_class']=='cloud').sum()} cloud")
    return df
41
+
42
+
43
@timer
def compute_line_stats(pav: pd.DataFrame) -> pd.DataFrame:
    """
    Compute per-line statistics.
    Output columns: line_id, country, genes_present_count, unique_genes_count

    Perf fix: the per-gene row sums are computed once up front; the previous
    version recomputed pav.sum(axis=1) inside the per-line loop, an
    O(lines^2 x genes) cost on an 89-line matrix.
    """
    # Genes carried by exactly one line are "unique" to that line.
    row_sums = pav.sum(axis=1)
    singleton = row_sums == 1

    records = []
    for line_id in pav.columns:
        presence = pav[line_id]
        records.append({
            "line_id": line_id,
            "country": parse_country(line_id),
            "genes_present_count": int(presence.sum()),
            "unique_genes_count": int((singleton & (presence == 1)).sum()),
        })
    df = pd.DataFrame(records)
    logger.info(f"Line stats computed for {len(df)} lines")
    return df
67
+
68
+
69
@timer
def compute_line_embedding(pav: pd.DataFrame) -> pd.DataFrame:
    """
    UMAP embedding + KMeans clustering of lines.
    Output columns: line_id, umap_x, umap_y, cluster_id

    Bug fix: with <=3 lines the k-scan loop below never runs and
    `best_labels` was previously unbound (NameError); such tiny inputs
    now fall back to a single cluster.
    """
    import umap

    # Transpose: rows = lines, columns = genes (UMAP embeds the rows).
    X = pav.T.values.astype(np.float32)
    line_ids = list(pav.columns)

    # UMAP
    reducer = umap.UMAP(n_components=2, metric="jaccard", n_neighbors=15,
                        min_dist=0.1, random_state=42)
    embedding = reducer.fit_transform(X)

    # KMeans clustering — scan k in [3, 8] (capped below the number of
    # lines) and keep the clustering with the best silhouette score.
    best_k, best_score = 3, -1
    best_labels = np.zeros(len(line_ids), dtype=int)  # fallback: one cluster
    for k in range(3, min(9, len(line_ids))):
        km = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = km.fit_predict(embedding)
        score = silhouette_score(embedding, labels)
        if score > best_score:
            best_k, best_score = k, score
            best_labels = labels

    logger.info(f"UMAP + KMeans: best k={best_k}, silhouette={best_score:.3f}")

    df = pd.DataFrame({
        "line_id": line_ids,
        "umap_x": embedding[:, 0],
        "umap_y": embedding[:, 1],
        "cluster_id": best_labels,
    })
    return df
105
+
106
+
107
@timer
def compute_similarity_topk(pav: pd.DataFrame, k: int = 15) -> pd.DataFrame:
    """
    Pairwise Jaccard similarity between lines, keeping each line's top-K
    neighbors. Output columns: line_id, neighbor_line_id, jaccard_score
    """
    matrix = pav.T.values.astype(np.float32)
    line_ids = list(pav.columns)

    # pdist yields Jaccard *distance*; similarity = 1 - distance.
    sim_mat = 1.0 - squareform(pdist(matrix, metric="jaccard"))

    records = []
    for i, src in enumerate(line_ids):
        scores = sim_mat[i].copy()
        scores[i] = -1  # a line is never its own neighbor
        for j in np.argsort(scores)[::-1][:k]:
            records.append({
                "line_id": src,
                "neighbor_line_id": line_ids[j],
                "jaccard_score": round(float(scores[j]), 4),
            })

    df = pd.DataFrame(records)
    logger.info(f"Similarity top-{k}: {len(df)} pairs")
    return df
137
+
138
+
139
@timer
def build_gff_gene_parquet(gff_genes: pd.DataFrame, output_path: str) -> None:
    """Save the parsed GFF gene table to `output_path` as parquet (index dropped)."""
    gff_genes.to_parquet(output_path, index=False)
    logger.info(f"GFF gene index saved: {output_path}")
144
+
145
+
146
@timer
def build_protein_parquet(protein_df: pd.DataFrame, output_path: str) -> None:
    """Save the protein length/composition index to `output_path` as parquet (index dropped)."""
    protein_df.to_parquet(output_path, index=False)
    logger.info(f"Protein index saved: {output_path}")
151
+
152
+
153
@timer
def save_contig_index(contig_index: dict, contig_mapping: dict, output_path: str) -> None:
    """Save contig index as JSON.

    Each entry records the contig length, its GFF seqid (falling back to the
    FASTA id when unmapped), and the original FASTA header.

    Perf fix: the GFF-id lookup is inverted once up front; the previous
    version rescanned the whole mapping per contig (O(contigs x mappings)).
    """
    import json

    # fasta_id -> first gff_id mapping to it (setdefault preserves first-wins).
    reverse_mapping = {}
    for gff_id, fasta_id in contig_mapping.items():
        reverse_mapping.setdefault(fasta_id, gff_id)

    data = {}
    for contig_id, length in contig_index.items():
        data[contig_id] = {
            "length": length,
            "gff_seqid": reverse_mapping.get(contig_id) or contig_id,
            "fasta_header": contig_id,
        }
    with open(output_path, "w") as f:
        json.dump(data, f, indent=2)
    logger.info(f"Contig index saved: {output_path}")
172
+
173
+
174
@timer
def compute_hotspot_bins(gff_genes: pd.DataFrame, gene_freq: pd.DataFrame,
                         contig_index: dict, bin_size: int = 100_000) -> pd.DataFrame:
    """
    Bin genes along contigs and score each bin's variability.
    Output columns: contig_id, bin_start, bin_end, total_genes, core_genes,
    shell_genes, cloud_genes, mean_freq, variability_score
    (score = cloud + 0.5 * shell — rare genes weigh most).
    """
    # Each gene is assigned to the bin containing its midpoint.
    annotated = gff_genes.merge(gene_freq, on="gene_id", how="inner")
    annotated["midpoint"] = (annotated["start"] + annotated["end"]) // 2

    rows = []
    for contig_id in annotated["contig_id"].unique():
        on_contig = annotated[annotated["contig_id"] == contig_id]
        last_pos = on_contig["end"].max()

        for bin_start in range(0, last_pos + bin_size, bin_size):
            bin_end = bin_start + bin_size
            members = on_contig[
                (on_contig["midpoint"] >= bin_start) &
                (on_contig["midpoint"] < bin_end)
            ]
            if len(members) == 0:
                continue  # only occupied bins are emitted

            n_core = int((members["core_class"] == "core").sum())
            n_shell = int((members["core_class"] == "shell").sum())
            n_cloud = int((members["core_class"] == "cloud").sum())

            rows.append({
                "contig_id": contig_id,
                "bin_start": bin_start,
                "bin_end": bin_end,
                "total_genes": len(members),
                "core_genes": n_core,
                "shell_genes": n_shell,
                "cloud_genes": n_cloud,
                "mean_freq": round(float(members["freq_pct"].mean()), 2),
                "variability_score": round(n_cloud + 0.5 * n_shell, 2),
            })

    df = pd.DataFrame(rows)
    logger.info(f"Hotspot bins computed: {len(df)} bins across {df['contig_id'].nunique()} contigs")
    return df
221
+
222
+
223
@timer
def compute_cluster_markers(pav: pd.DataFrame, embedding: pd.DataFrame,
                            top_n: int = 50) -> pd.DataFrame:
    """
    Find genes most enriched in each cluster relative to all other lines.
    Output columns: cluster_id, gene_id, in_cluster_freq, out_cluster_freq,
    marker_score (in-cluster frequency minus out-of-cluster frequency).
    """
    assignments = embedding[["line_id", "cluster_id"]].copy()
    rows = []

    for cid in sorted(assignments["cluster_id"].unique()):
        members = set(assignments[assignments["cluster_id"] == cid]["line_id"])
        others = set(assignments[assignments["cluster_id"] != cid]["line_id"])

        # Preserve PAV column order while restricting to each group.
        member_cols = [c for c in pav.columns if c in members]
        other_cols = [c for c in pav.columns if c in others]
        if not member_cols or not other_cols:
            continue  # a marker needs both an inside and an outside

        freq_in = pav[member_cols].mean(axis=1)
        freq_out = pav[other_cols].mean(axis=1)
        score = freq_in - freq_out

        for gene_id, s in score.nlargest(top_n).items():
            rows.append({
                "cluster_id": int(cid),
                "gene_id": gene_id,
                "in_cluster_freq": round(float(freq_in[gene_id]), 4),
                "out_cluster_freq": round(float(freq_out[gene_id]), 4),
                "marker_score": round(float(s), 4),
            })

    df = pd.DataFrame(rows)
    logger.info(f"Cluster markers: {len(df)} total across {df['cluster_id'].nunique()} clusters")
    return df
src/state.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Application state for the Pigeon Pea Pangenome Atlas."""
2
+
3
+ from dataclasses import dataclass, field
4
+
5
+
6
+ @dataclass
7
+ class AppState:
8
+ selected_line: str | None = None
9
+ selected_party: list[str] = field(default_factory=list)
10
+ selected_gene: str | None = None
11
+ backpack_genes: list[str] = field(default_factory=list)
12
+ core_threshold: float = 95.0
13
+ cloud_threshold: float = 15.0
14
+ selected_contig: str | None = None
15
+ achievements: set[str] = field(default_factory=set)
16
+
17
+ def add_to_backpack(self, gene_id: str) -> bool:
18
+ """Add gene to backpack. Returns True if added, False if already present or full."""
19
+ if gene_id in self.backpack_genes:
20
+ return False
21
+ if len(self.backpack_genes) >= 50:
22
+ return False
23
+ self.backpack_genes.append(gene_id)
24
+ if len(self.backpack_genes) >= 5:
25
+ self.achievements.add("Collector")
26
+ return True
27
+
28
+ def remove_from_backpack(self, gene_id: str) -> bool:
29
+ """Remove gene from backpack."""
30
+ if gene_id in self.backpack_genes:
31
+ self.backpack_genes.remove(gene_id)
32
+ return True
33
+ return False
34
+
35
+ def award(self, badge: str):
36
+ """Award an achievement badge."""
37
+ self.achievements.add(badge)
src/utils.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Shared helpers for the Pigeon Pea Pangenome Atlas."""
2
+
3
+ import os
4
+ import logging
5
+ import time
6
+ from pathlib import Path
7
+ from functools import wraps
8
+
9
# Configure process-wide logging once at import time so every module that
# imports utils shares one format.  NOTE(review): basicConfig at import time
# affects the whole process; confirm this is intended if utils is reused as
# a library.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger("pangenome")  # shared app-wide logger

# Repository layout anchors, resolved relative to this file (src/utils.py).
PROJECT_ROOT = Path(__file__).resolve().parent.parent
DATA_DIR = PROJECT_ROOT / "data"                # raw inputs (PAV table, GFF, FASTA)
PRECOMPUTED_DIR = PROJECT_ROOT / "precomputed"  # cached parquet/json artifacts
15
+
16
+
17
def find_file(directory: Path, pattern: str) -> Path:
    """Return the first file in *directory* whose name matches the glob *pattern*.

    Raises:
        FileNotFoundError: if nothing in *directory* matches *pattern*.
    """
    for candidate in directory.glob(pattern):
        # glob yields matches in the same order list(...)[0] would see them.
        return candidate
    raise FileNotFoundError(f"No file matching '{pattern}' in {directory}")
23
+
24
+
25
def timer(func):
    """Decorator that logs how long each call to *func* takes."""
    @wraps(func)
    def timed(*args, **kwargs):
        started = time.time()
        out = func(*args, **kwargs)
        # Logged only on the success path, matching a plain sequential body.
        logger.info(f"{func.__name__} completed in {time.time() - started:.2f}s")
        return out
    return timed
35
+
36
+
37
# Closed set of origin-country tokens that may appear at the end of a line ID.
# Multi-word countries keep their underscore (e.g. "South_Africa").
KNOWN_COUNTRIES = {
    "India", "Myanmar", "Unknown", "Zaire", "Uganda", "Indonesia", "Jamaica",
    "South_Africa", "Puerto_Rico", "Philippines", "Sierra_Leone", "Nigeria",
    "Malawi", "Italy", "Kenya", "Sri_Lanka", "Thailand", "Nepal",
}


def parse_country(line_id: str) -> str:
    """Extract the country of origin encoded in a line ID.

    Line IDs embed the country as the trailing one or two underscore-separated
    tokens (e.g. "Line_3_India", "Line_9_South_Africa"). Falls back to
    "Unknown" when no trailing token matches KNOWN_COUNTRIES.
    """
    tokens = line_id.split("_")
    # Single-token suffix takes precedence, matching the original lookup order.
    if len(tokens) >= 2 and tokens[-1] in KNOWN_COUNTRIES:
        return tokens[-1]
    # Otherwise try the last two tokens joined (two-word countries).
    if len(tokens) >= 3:
        candidate = f"{tokens[-2]}_{tokens[-1]}"
        if candidate in KNOWN_COUNTRIES:
            return candidate
    return "Unknown"
tests/__init__.py ADDED
File without changes
tests/conftest.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Shared test fixtures for the Pigeon Pea Pangenome Atlas."""
2
+
3
+ import os
4
+ import sys
5
+ import pytest
6
+ import numpy as np
7
+ import pandas as pd
8
+ from pathlib import Path
9
+
10
+ # Ensure project root is in path
11
+ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
12
+
13
+ from src.state import AppState
14
+
15
+
16
@pytest.fixture
def synthetic_pav():
    """PAV matrix fixture: 20 genes x 10 lines of seeded random 0/1 values.

    Index is gene IDs g00000..g00019; columns are line IDs Line_0_India..
    Line_9_India. Seeded so every run sees the same matrix.
    """
    np.random.seed(42)
    genes = [f"g{i:05d}" for i in range(20)]
    lines = [f"Line_{i}_India" for i in range(10)]
    data = np.random.randint(0, 2, size=(20, 10))
    return pd.DataFrame(data, index=genes, columns=lines)


@pytest.fixture
def synthetic_gff():
    """GFF fixture: the 20 PAV genes placed round-robin across 3 contigs."""
    records = []
    for i in range(20):
        contig = f"contig_{i % 3}"
        start = i * 1000
        end = start + 500  # fixed 500 bp span per gene
        records.append({
            "gene_id": f"g{i:05d}",
            "contig_id": contig,
            "start": start,
            "end": end,
            "strand": "+" if i % 2 == 0 else "-",  # alternate strands
        })
    return pd.DataFrame(records)


@pytest.fixture
def synthetic_protein():
    """Protein-index fixture: one row per gene with a random length.

    NOTE(review): lengths draw from the global NumPy RNG without re-seeding
    here, so exact values depend on fixture evaluation order — tests should
    not assert specific lengths.
    """
    records = []
    for i in range(20):
        records.append({
            "gene_id": f"g{i:05d}",
            "protein_length": np.random.randint(50, 1000),
            "composition_summary": "L:9.0%, A:8.0%, G:7.0%, S:6.0%, V:5.0%",
        })
    return pd.DataFrame(records)


@pytest.fixture
def synthetic_contig_index():
    """Contig-length map covering the 3 contigs used by synthetic_gff."""
    return {"contig_0": 50000, "contig_1": 80000, "contig_2": 30000}
61
+
62
+
63
@pytest.fixture
def empty_state():
    """A fresh AppState with no selections, pins, or badges."""
    return AppState()


@pytest.fixture
def populated_state():
    """AppState mid-session: a selected line, two pinned genes, one badge."""
    state = AppState()
    state.selected_line = "Line_0_India"
    state.backpack_genes = ["g00001", "g00002"]
    state.achievements = {"Explorer"}
    return state
77
+
78
+
79
@pytest.fixture
def synthetic_data(synthetic_pav, synthetic_gff, synthetic_protein, synthetic_contig_index):
    """Complete synthetic data dict matching the app's DATA format.

    Mirrors what the app loads from precomputed/, derived entirely from the
    small synthetic fixtures above. The embedding is faked with random UMAP
    coordinates and a trivial i % 3 cluster assignment because 10 lines are
    too few for a meaningful UMAP.
    """
    # Imported lazily so merely collecting conftest never pulls in the
    # precompute stack unless a test actually needs it.
    # NOTE(review): compute_line_embedding is imported but unused below.
    from src.precompute import (
        compute_gene_frequency, compute_line_stats,
        compute_line_embedding, compute_similarity_topk,
        compute_hotspot_bins, compute_cluster_markers,
    )

    gene_freq = compute_gene_frequency(synthetic_pav)
    line_stats = compute_line_stats(synthetic_pav)

    # Simple embedding without UMAP (too small for meaningful UMAP)
    line_ids = list(synthetic_pav.columns)
    embedding = pd.DataFrame({
        "line_id": line_ids,
        "umap_x": np.random.randn(len(line_ids)),
        "umap_y": np.random.randn(len(line_ids)),
        "cluster_id": [i % 3 for i in range(len(line_ids))],
    })

    similarity = compute_similarity_topk(synthetic_pav, k=5)

    hotspots = compute_hotspot_bins(synthetic_gff, gene_freq, synthetic_contig_index)
    markers = compute_cluster_markers(synthetic_pav, embedding)

    return {
        "pav": synthetic_pav,
        "gene_freq": gene_freq,
        "line_stats": line_stats,
        "embedding": embedding,
        "similarity": similarity,
        "gff_index": synthetic_gff,
        "protein": synthetic_protein,
        "hotspots": hotspots,
        "markers": markers,
    }


@pytest.fixture
def precomputed_dir():
    """Path to the real precomputed/ directory, or None when it is absent."""
    p = Path(__file__).resolve().parent.parent / "precomputed"
    if p.exists():
        return p
    return None
tests/test_callbacks.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for src/callbacks.py."""
2
+
3
+ import pytest
4
+ import plotly.graph_objects as go
5
+
6
+ from src.state import AppState
7
+ from src.callbacks import (
8
+ on_line_selected, build_umap_plot,
9
+ build_donut_chart, build_frequency_histogram, build_treasure_table,
10
+ on_pin_gene, build_hotspot_heatmap, get_protein_stats_html,
11
+ build_backpack_comparison,
12
+ )
13
+
14
+
15
class TestQuest0:
    """Quest 0: line-selection callback."""

    def test_on_line_selected_returns(self, synthetic_data):
        """Selecting a line yields a non-placeholder, positive gene count."""
        state = AppState()
        line_id = synthetic_data["line_stats"].iloc[0]["line_id"]
        total, unique, neighbor, new_state = on_line_selected(line_id, state, synthetic_data)
        assert total != "--"
        assert int(total) > 0

    def test_on_line_selected_updates_state(self, synthetic_data):
        """The returned state records the chosen line."""
        state = AppState()
        line_id = synthetic_data["line_stats"].iloc[0]["line_id"]
        _, _, _, new_state = on_line_selected(line_id, state, synthetic_data)
        assert new_state.selected_line == line_id


class TestQuest1:
    """Quest 1: UMAP map of lines."""

    def test_umap_plot_returns_figure(self, synthetic_data):
        """Color-by-country UMAP renders as a Plotly figure with >=1 trace."""
        state = AppState()
        state.selected_line = synthetic_data["line_stats"].iloc[0]["line_id"]
        fig = build_umap_plot("Country", state, synthetic_data)
        assert isinstance(fig, go.Figure)
        assert len(fig.data) >= 1


class TestQuest2:
    """Quest 2: core/shell/cloud thresholds, treasure table, backpack."""

    def test_donut_three_segments(self, synthetic_data):
        """Donut has a single pie trace with exactly core/shell/cloud slices."""
        fig = build_donut_chart(95, 15, synthetic_data)
        assert isinstance(fig, go.Figure)
        assert len(fig.data) == 1
        assert len(fig.data[0].values) == 3

    def test_histogram_returns_figure(self, synthetic_data):
        """Frequency histogram renders without error."""
        fig = build_frequency_histogram(95, 15, synthetic_data)
        assert isinstance(fig, go.Figure)

    def test_treasure_table_columns(self, synthetic_data):
        """Treasure table exposes the expected gene columns."""
        state = AppState()
        state.selected_line = synthetic_data["line_stats"].iloc[0]["line_id"]
        df = build_treasure_table(state, 95, 15, "All", synthetic_data)
        assert "gene_id" in df.columns
        assert "freq_count" in df.columns
        assert "core_class" in df.columns

    def test_pin_gene(self):
        """Pinning a gene adds it to the returned state's backpack."""
        state = AppState()
        backpack_text, new_state = on_pin_gene("g00001", state)
        assert "g00001" in new_state.backpack_genes

    def test_pin_gene_idempotent(self):
        """Pinning the same gene twice stores it once.

        NOTE(review): asserts on the original `state` object — relies on
        on_pin_gene mutating the passed-in state in place.
        """
        state = AppState()
        on_pin_gene("g00001", state)
        backpack_text, new_state = on_pin_gene("g00001", state)
        assert state.backpack_genes.count("g00001") == 1

    def test_backpack_limit(self):
        """Backpack never exceeds the 50-gene cap."""
        state = AppState()
        for i in range(55):
            state.add_to_backpack(f"g{i:05d}")
        assert len(state.backpack_genes) <= 50


class TestQuest3:
    """Quest 3: genome hotspot heatmap."""

    def test_hotspot_heatmap(self, synthetic_data):
        """Hotspot heatmap renders as a Plotly figure."""
        fig = build_hotspot_heatmap(synthetic_data)
        assert isinstance(fig, go.Figure)


class TestQuest4:
    """Quest 4: protein stats and backpack comparison."""

    def test_protein_stats_html(self, synthetic_data):
        """Protein stats HTML mentions the protein / its aa length."""
        gene_id = synthetic_data["protein"].iloc[0]["gene_id"]
        html = get_protein_stats_html(gene_id, synthetic_data)
        assert "aa" in html or "protein" in html.lower()

    def test_backpack_comparison_needs_2(self, synthetic_data):
        """With fewer than 2 pinned genes a (placeholder) figure is still returned."""
        state = AppState()
        state.backpack_genes = ["g00001"]
        fig = build_backpack_comparison(state, synthetic_data)
        assert isinstance(fig, go.Figure)
tests/test_data_loader.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for src/data_loader.py."""
2
+
3
+ import pytest
4
+ import pandas as pd
5
+ from pathlib import Path
6
+
7
+ from src.data_loader import load_pav, parse_gff_genes, parse_protein_fasta, build_contig_index, validate_joins
8
+
9
+
10
# Location of the real (LFS-tracked) inputs; tests against them are skipped
# when the data has not been pulled.
DATA_DIR = Path(__file__).resolve().parent.parent / "data"
HAS_REAL_DATA = (DATA_DIR / "89_line_PAV.txt").exists()


@pytest.mark.skipif(not HAS_REAL_DATA, reason="Real data not available")
class TestRealData:
    """Loader tests against the actual 89-line dataset in data/."""

    def test_load_pav_shape(self):
        """PAV table has at least one gene row and one line column."""
        df = load_pav(str(DATA_DIR / "89_line_PAV.txt"))
        assert df.shape[0] > 0
        assert df.shape[1] > 0

    def test_load_pav_values(self):
        """PAV cells are strictly 0/1."""
        df = load_pav(str(DATA_DIR / "89_line_PAV.txt"))
        unique_vals = set(df.values.flatten())
        assert unique_vals.issubset({0, 1})

    def test_load_pav_gene_ids(self):
        """Gene IDs are unique strings used as the index."""
        df = load_pav(str(DATA_DIR / "89_line_PAV.txt"))
        assert df.index.dtype == object  # string type
        assert df.index.is_unique

    def test_load_pav_line_count(self):
        """At least 89 line columns are present."""
        df = load_pav(str(DATA_DIR / "89_line_PAV.txt"))
        # 89 lines + reference = 90 columns
        assert df.shape[1] >= 89, f"Expected >=89 lines, got {df.shape[1]}"

    def test_parse_gff_columns(self):
        """GFF parser yields exactly the five expected columns."""
        gff_files = list(DATA_DIR.glob("*.gff"))
        assert len(gff_files) > 0
        df = parse_gff_genes(str(gff_files[0]))
        assert set(df.columns) == {"gene_id", "contig_id", "start", "end", "strand"}

    def test_parse_gff_no_empty_ids(self):
        """No missing or empty gene IDs come out of the GFF parser."""
        gff_files = list(DATA_DIR.glob("*.gff"))
        df = parse_gff_genes(str(gff_files[0]))
        assert not df["gene_id"].isna().any()
        assert not (df["gene_id"] == "").any()

    def test_protein_fasta_lengths(self):
        """All parsed protein lengths are positive."""
        fasta_files = list(DATA_DIR.glob("*protein*.fasta"))
        assert len(fasta_files) > 0
        df = parse_protein_fasta(str(fasta_files[0]))
        assert (df["protein_length"] > 0).all()

    def test_contig_index_nonempty(self):
        """Genome FASTA (non-protein) yields a non-empty contig index."""
        fasta_files = [f for f in DATA_DIR.glob("*.fasta") if "protein" not in f.name]
        if fasta_files:
            ci = build_contig_index(str(fasta_files[0]))
            assert len(ci) > 0

    def test_validate_joins_coverage(self):
        """Most PAV genes should be joinable to GFF entries (>50% coverage)."""
        pav = load_pav(str(DATA_DIR / "89_line_PAV.txt"))
        gff_files = list(DATA_DIR.glob("*.gff"))
        gff = parse_gff_genes(str(gff_files[0]))
        fasta_files = list(DATA_DIR.glob("*protein*.fasta"))
        protein = parse_protein_fasta(str(fasta_files[0]))
        genome_files = [f for f in DATA_DIR.glob("*.fasta") if "protein" not in f.name]
        ci = build_contig_index(str(genome_files[0])) if genome_files else {}

        report = validate_joins(pav, gff, protein, ci)
        assert report["pav_genes_in_gff_pct"] > 50


class TestSynthetic:
    """Sanity checks on the synthetic fixtures themselves."""

    def test_pav_shape(self, synthetic_pav):
        """Synthetic PAV is 20 genes x 10 lines."""
        assert synthetic_pav.shape == (20, 10)

    def test_pav_values(self, synthetic_pav):
        """Synthetic PAV cells are 0/1."""
        unique_vals = set(synthetic_pav.values.flatten())
        assert unique_vals.issubset({0, 1})

    def test_gff_columns(self, synthetic_gff):
        """Synthetic GFF has exactly the five expected columns."""
        assert set(synthetic_gff.columns) == {"gene_id", "contig_id", "start", "end", "strand"}

    def test_gff_no_empty_ids(self, synthetic_gff):
        """Synthetic GFF has no missing gene IDs."""
        assert not synthetic_gff["gene_id"].isna().any()

    def test_protein_lengths(self, synthetic_protein):
        """Synthetic protein lengths are positive."""
        assert (synthetic_protein["protein_length"] > 0).all()
tests/test_field_report.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for src/field_report.py."""
2
+
3
+ import pytest
4
+ import json
5
+
6
+ from src.state import AppState
7
+ from src.field_report import generate_field_report, export_report_json, export_report_csv
8
+
9
+
10
class TestFieldReport:
    """Field-report generation and JSON/CSV export."""

    def test_report_has_sections(self, synthetic_data):
        """Report mentions the selected line, cluster info, and rare genes."""
        state = AppState()
        state.selected_line = synthetic_data["line_stats"].iloc[0]["line_id"]
        report = generate_field_report(state, synthetic_data)
        assert state.selected_line in report
        assert "Cluster" in report
        assert "Rare" in report or "rare" in report.lower()

    def test_report_not_empty(self, synthetic_data):
        """Report body has real content (>100 chars)."""
        state = AppState()
        state.selected_line = synthetic_data["line_stats"].iloc[0]["line_id"]
        report = generate_field_report(state, synthetic_data)
        assert len(report) > 100

    def test_report_with_empty_backpack(self, synthetic_data):
        """An empty backpack is reported explicitly, not omitted."""
        state = AppState()
        state.selected_line = synthetic_data["line_stats"].iloc[0]["line_id"]
        state.backpack_genes = []
        report = generate_field_report(state, synthetic_data)
        assert "No genes pinned" in report or "backpack" in report.lower()

    def test_report_no_line(self, synthetic_data):
        """Without a selected line the report says so instead of failing."""
        state = AppState()
        report = generate_field_report(state, synthetic_data)
        assert "No line selected" in report

    def test_export_json_valid(self, synthetic_data):
        """JSON export parses and carries the selected line."""
        state = AppState()
        state.selected_line = synthetic_data["line_stats"].iloc[0]["line_id"]
        path = export_report_json(state, synthetic_data)
        with open(path) as f:
            data = json.load(f)
        assert "selected_line" in data

    def test_export_json_schema(self, synthetic_data):
        """JSON export includes backpack and achievements keys."""
        state = AppState()
        state.selected_line = synthetic_data["line_stats"].iloc[0]["line_id"]
        state.backpack_genes = ["g00001"]
        path = export_report_json(state, synthetic_data)
        with open(path) as f:
            data = json.load(f)
        assert "backpack_genes" in data
        assert "achievements" in data

    def test_export_csv(self, synthetic_data):
        """CSV export contains a gene_id header/column."""
        state = AppState()
        state.selected_line = synthetic_data["line_stats"].iloc[0]["line_id"]
        state.backpack_genes = [synthetic_data["gene_freq"].iloc[0]["gene_id"]]
        path = export_report_csv(state, synthetic_data)
        with open(path) as f:
            content = f.read()
        assert "gene_id" in content
tests/test_gene_card.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for src/gene_card.py."""
2
+
3
+ import pytest
4
+ import json
5
+
6
+ from src.gene_card import build_gene_card, render_gene_card_html, render_presence_barcode, export_gene_report
7
+
8
+
9
class TestGeneCard:
    """Gene card building, HTML rendering, barcode, and export."""

    def test_gene_card_build(self, synthetic_data):
        """Card dict carries frequency, class, location, and protein fields."""
        gene_id = synthetic_data["gene_freq"].iloc[0]["gene_id"]
        card = build_gene_card(gene_id, synthetic_data)
        assert card["gene_id"] == gene_id
        assert "freq_count" in card
        assert "core_class" in card
        assert "contig" in card
        assert "protein_length" in card

    def test_gene_card_missing_gff(self, synthetic_data):
        """A gene absent from the GFF gets contig=None rather than an error."""
        # Use a gene_id not in GFF
        card = build_gene_card("nonexistent_gene", synthetic_data)
        assert card["contig"] is None

    def test_gene_card_missing_protein(self, synthetic_data):
        """A gene absent from the protein index gets protein_length=None."""
        card = build_gene_card("nonexistent_gene", synthetic_data)
        assert card["protein_length"] is None

    def test_gene_card_html(self, synthetic_data):
        """Rendered HTML contains the gene ID and the gene-card CSS hook."""
        gene_id = synthetic_data["gene_freq"].iloc[0]["gene_id"]
        card = build_gene_card(gene_id, synthetic_data)
        html = render_gene_card_html(card)
        assert gene_id in html
        assert "gene-card" in html

    def test_presence_barcode_length(self, synthetic_pav):
        """Barcode emits one span per line in the presence vector."""
        gene_id = synthetic_pav.index[0]
        vector = synthetic_pav.loc[gene_id]
        html = render_presence_barcode(vector)
        # Should have as many spans as lines
        assert html.count("<span") == len(vector)

    def test_presence_barcode_html_valid(self, synthetic_pav):
        """Barcode spans carry the present/absent CSS classes."""
        gene_id = synthetic_pav.index[0]
        vector = synthetic_pav.loc[gene_id]
        html = render_presence_barcode(vector)
        assert 'class="present"' in html or 'class="absent"' in html

    def test_export_gene_report(self, synthetic_data):
        """Exported JSON round-trips and matches the requested gene."""
        gene_id = synthetic_data["gene_freq"].iloc[0]["gene_id"]
        path = export_gene_report(gene_id, synthetic_data)
        with open(path) as f:
            data = json.load(f)
        assert data["gene_id"] == gene_id
tests/test_integration.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Integration tests for the Pigeon Pea Pangenome Atlas."""
2
+
3
+ import pytest
4
+ from pathlib import Path
5
+
6
+ from src.state import AppState
7
+ from src.callbacks import (
8
+ on_line_selected, build_umap_plot, build_donut_chart,
9
+ build_treasure_table, on_pin_gene, build_hotspot_heatmap,
10
+ get_protein_stats_html, on_generate_report,
11
+ )
12
+ from src.gene_card import build_gene_card, render_gene_card_html
13
+
14
+
15
class TestFullFlow:
    """End-to-end walkthrough of the quest flow on synthetic data."""

    def test_full_flow(self, synthetic_data):
        """Simulate: select line -> UMAP -> thresholds -> gene card -> pin -> report."""
        state = AppState()

        # Step 1: Select line
        line_id = synthetic_data["line_stats"].iloc[0]["line_id"]
        total, unique, neighbor, state = on_line_selected(line_id, state, synthetic_data)
        assert state.selected_line == line_id

        # Step 2: View UMAP
        fig = build_umap_plot("Country", state, synthetic_data)
        assert fig is not None

        # Step 3: Change thresholds
        donut = build_donut_chart(90, 10, synthetic_data)
        assert donut is not None

        # Step 4: View treasure table
        table = build_treasure_table(state, 90, 10, "All", synthetic_data)
        assert len(table) > 0

        # Step 5: Click gene -> gene card
        gene_id = table.iloc[0]["gene_id"]
        card = build_gene_card(gene_id, synthetic_data)
        html = render_gene_card_html(card)
        assert gene_id in html

        # Step 6: Pin gene
        state.selected_gene = gene_id
        _, state = on_pin_gene(gene_id, state)
        assert gene_id in state.backpack_genes

        # Step 7: Generate report (also awards the Cartographer badge)
        report_md, json_path, csv_path, _, _, badges, state = on_generate_report(state, synthetic_data)
        assert len(report_md) > 100
        assert "Cartographer" in state.achievements


class TestPrecomputedConsistency:
    """Cross-artifact referential integrity of the precomputed tables."""

    def test_embedding_line_ids(self, synthetic_data):
        """All line_ids in embedding exist in line_stats."""
        emb_lines = set(synthetic_data["embedding"]["line_id"])
        stat_lines = set(synthetic_data["line_stats"]["line_id"])
        assert emb_lines.issubset(stat_lines)

    def test_marker_gene_ids(self, synthetic_data):
        """All gene_ids in markers exist in gene_freq."""
        if len(synthetic_data["markers"]) == 0:
            pytest.skip("No markers computed")
        marker_genes = set(synthetic_data["markers"]["gene_id"])
        freq_genes = set(synthetic_data["gene_freq"]["gene_id"])
        assert marker_genes.issubset(freq_genes)
tests/test_precompute.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for src/precompute.py."""
2
+
3
+ import pytest
4
+ import numpy as np
5
+ import pandas as pd
6
+ from pathlib import Path
7
+
8
+ from src.precompute import (
9
+ compute_gene_frequency, compute_line_stats,
10
+ compute_similarity_topk, compute_hotspot_bins, compute_cluster_markers,
11
+ )
12
+
13
+
14
class TestGeneFrequency:
    """compute_gene_frequency output contract."""

    def test_freq_count_range(self, synthetic_pav):
        """Counts are bounded by the number of lines.

        NOTE(review): the `or` makes the first condition vacuous — counts are
        always >= 0, so `(>=1).all() or (>=0).all()` can never fail on the
        lower bound. Consider tightening to a single intended bound.
        """
        df = compute_gene_frequency(synthetic_pav)
        assert (df["freq_count"] >= 1).all() or (df["freq_count"] >= 0).all()
        assert (df["freq_count"] <= synthetic_pav.shape[1]).all()

    def test_freq_classes(self, synthetic_pav):
        """Classes are limited to core/shell/cloud."""
        df = compute_gene_frequency(synthetic_pav)
        assert set(df["core_class"].unique()).issubset({"core", "shell", "cloud"})

    def test_freq_pct_range(self, synthetic_pav):
        """Percent frequency stays within [0, 100]."""
        df = compute_gene_frequency(synthetic_pav)
        assert (df["freq_pct"] >= 0).all()
        assert (df["freq_pct"] <= 100).all()

    def test_gene_id_column(self, synthetic_pav):
        """One output row per PAV gene, keyed by gene_id."""
        df = compute_gene_frequency(synthetic_pav)
        assert "gene_id" in df.columns
        assert len(df) == len(synthetic_pav)


class TestLineStats:
    """compute_line_stats output contract."""

    def test_line_count(self, synthetic_pav):
        """One stats row per PAV line column."""
        df = compute_line_stats(synthetic_pav)
        assert len(df) == synthetic_pav.shape[1]

    def test_columns(self, synthetic_pav):
        """Expected stat columns are present."""
        df = compute_line_stats(synthetic_pav)
        assert "line_id" in df.columns
        assert "country" in df.columns
        assert "genes_present_count" in df.columns
        assert "unique_genes_count" in df.columns


class TestSimilarity:
    """compute_similarity_topk output contract."""

    def test_jaccard_range(self, synthetic_pav):
        """Jaccard scores lie in [0, 1]."""
        df = compute_similarity_topk(synthetic_pav, k=5)
        assert (df["jaccard_score"] >= 0).all()
        assert (df["jaccard_score"] <= 1).all()

    def test_jaccard_symmetry(self, synthetic_pav):
        """Where a reverse pair exists, its score matches (top-k may drop some)."""
        df = compute_similarity_topk(synthetic_pav, k=5)
        # Check a few pairs
        for _, row in df.head(5).iterrows():
            reverse = df[
                (df["line_id"] == row["neighbor_line_id"]) &
                (df["neighbor_line_id"] == row["line_id"])
            ]
            if len(reverse) > 0:
                assert abs(reverse.iloc[0]["jaccard_score"] - row["jaccard_score"]) < 0.001


class TestHotspots:
    """compute_hotspot_bins output contract."""

    def test_bins_valid(self, synthetic_gff, synthetic_pav, synthetic_contig_index):
        """Bins are non-degenerate and counts non-negative (when any exist)."""
        gene_freq = compute_gene_frequency(synthetic_pav)
        df = compute_hotspot_bins(synthetic_gff, gene_freq, synthetic_contig_index)
        if len(df) > 0:
            assert (df["bin_start"] < df["bin_end"]).all()
            assert (df["total_genes"] >= 0).all()


class TestClusterMarkers:
    """compute_cluster_markers output contract."""

    def test_marker_count(self, synthetic_pav):
        """No cluster exceeds the requested top_n markers."""
        line_ids = list(synthetic_pav.columns)
        embedding = pd.DataFrame({
            "line_id": line_ids,
            "umap_x": np.random.randn(len(line_ids)),
            "umap_y": np.random.randn(len(line_ids)),
            "cluster_id": [i % 3 for i in range(len(line_ids))],
        })
        df = compute_cluster_markers(synthetic_pav, embedding, top_n=5)
        # Each cluster should have at most top_n markers
        for cid in df["cluster_id"].unique():
            assert len(df[df["cluster_id"] == cid]) <= 5


class TestPrecomputedFiles:
    """Presence of every expected artifact in precomputed/ (if built)."""

    def test_files_exist(self, precomputed_dir):
        """All parquet/json artifacts the app loads must be present."""
        if precomputed_dir is None:
            pytest.skip("Precomputed directory not available")
        expected = [
            "pav_gene_frequency.parquet",
            "line_stats.parquet",
            "line_embedding.parquet",
            "line_similarity_topk.parquet",
            "gff_gene_index.parquet",
            "protein_index.parquet",
            "genome_contig_index.json",
            "hotspot_bins.parquet",
            "cluster_markers.parquet",
        ]
        for f in expected:
            assert (precomputed_dir / f).exists(), f"Missing: {f}"
ui/__init__.py ADDED
File without changes
ui/final.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Final tab: Field Report generation and export."""
2
+
3
+ import gradio as gr
4
+
5
+
6
def build_final_tab():
    """Build Final Report tab components. Returns dict of components.

    Declares UI only — event wiring happens elsewhere (app/layout level).
    The download File components start hidden and are revealed once a
    report has been generated.
    """
    with gr.Tab("Field Report", id="final") as tab:
        gr.Markdown("## Your Exploration Report")
        gr.Markdown(
            "Generate a summary of your pangenome exploration journey, "
            "including your selected line, findings, and backpack collection."
        )

        generate_btn = gr.Button("Generate Report", variant="primary")

        # Placeholder replaced with the markdown report after generation.
        report_md = gr.Markdown(value="*Click 'Generate Report' to create your field report.*")

        with gr.Row():
            download_json = gr.File(label="Download JSON", visible=False)
            download_csv = gr.File(label="Download CSV", visible=False)

        gr.Markdown("### Achievements Earned")
        achievements_html = gr.HTML(value="<p>Complete quests to earn badges!</p>")

    # Keys here are referenced by the layout as "final_<key>".
    return {
        "tab": tab,
        "generate_btn": generate_btn,
        "report_md": report_md,
        "download_json": download_json,
        "download_csv": download_csv,
        "achievements_html": achievements_html,
    }
ui/gene_card_ui.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Gene Card side panel UI."""
2
+
3
+ import gradio as gr
4
+
5
+
6
def build_gene_card_panel():
    """Build Gene Card side panel. Returns dict of components.

    The whole column starts hidden; it is shown when a gene is selected.
    Declares UI only — event wiring happens elsewhere.
    """
    with gr.Column(visible=False, scale=1) as gene_card_col:
        gr.Markdown("### Gene Card")
        gene_card_html = gr.HTML(value="")

        with gr.Row():
            show_genome_btn = gr.Button("Show on Genome", size="sm")
            show_protein_btn = gr.Button("Show Protein", size="sm")

        with gr.Row():
            pin_card_btn = gr.Button("Pin to Backpack", size="sm", variant="secondary")
            download_gene_btn = gr.Button("Download Report", size="sm")

        # Hidden until a per-gene report file has been written.
        gene_report_file = gr.File(label="Gene Report", visible=False)

    # Keys here are referenced by the layout as "gc_<key>".
    return {
        "column": gene_card_col,
        "gene_card_html": gene_card_html,
        "show_genome_btn": show_genome_btn,
        "show_protein_btn": show_protein_btn,
        "pin_card_btn": pin_card_btn,
        "download_gene_btn": download_gene_btn,
        "gene_report_file": gene_report_file,
    }
ui/layout.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Master Gradio layout for the Pigeon Pea Pangenome Atlas."""
2
+
3
+ import gradio as gr
4
+
5
+ from ui.theme import build_theme, CUSTOM_CSS
6
+ from ui.quest0 import build_quest0
7
+ from ui.quest1 import build_quest1
8
+ from ui.quest2 import build_quest2
9
+ from ui.quest3 import build_quest3
10
+ from ui.quest4 import build_quest4
11
+ from ui.final import build_final_tab
12
+ from ui.gene_card_ui import build_gene_card_panel
13
+
14
+
15
def build_app(line_choices: list[str], contig_choices: list[str],
              gene_choices: list[str]) -> tuple:
    """
    Build the full Gradio Blocks app.

    Args:
        line_choices: dropdown options for Quest 0's line selector.
        contig_choices: dropdown options for Quest 3.
        gene_choices: dropdown options for Quest 4.

    Returns (demo, components_dict) where components_dict maps all UI elements.
    Quest/panel components are flattened into it with "q0_"/"q1_"/.../"final_"/
    "gc_" prefixes so callers can wire events by key.
    """
    theme = build_theme()

    with gr.Blocks(theme=theme, css=CUSTOM_CSS, title="Pigeon Pea Pangenome Atlas") as demo:
        # Per-session state holder; populated by callbacks (starts as None).
        state = gr.State(value=None)

        # Header
        gr.Markdown("# Pigeon Pea Pangenome Atlas")
        gr.Markdown("*An interactive exploration of 89 pigeon pea lines and their pangenome*")

        # Progress tracker (quest 0 highlighted initially)
        progress_html = gr.HTML(
            value=_build_progress_html(0),
            elem_classes=["progress-tracker"],
        )

        # Global filters
        with gr.Accordion("Global Filters", open=False):
            with gr.Row():
                # Choices filled in at runtime; empty selection means "all".
                country_filter = gr.CheckboxGroup(
                    choices=[],
                    label="Filter by country",
                    info="Leave empty to show all",
                )
                annotated_toggle = gr.Checkbox(
                    label="Only annotated genes (with GFF entry)",
                    value=False,
                )
            with gr.Accordion("What is PAV?", open=False):
                gr.Markdown(
                    "**Presence-Absence Variation (PAV)** describes genes that are present "
                    "in some lines but absent from others. The pangenome captures all genes "
                    "across the species, not just those in a single reference genome."
                )

        # Main content area
        with gr.Row():
            # Main tabs (left ~75%)
            with gr.Column(scale=3):
                with gr.Tabs() as tabs:
                    q0 = build_quest0(line_choices)
                    q1 = build_quest1()
                    q2 = build_quest2()
                    q3 = build_quest3(contig_choices)
                    q4 = build_quest4(gene_choices)
                    final = build_final_tab()

            # Gene Card side panel (right ~25%)
            gc = build_gene_card_panel()

        # Data Health accordion
        with gr.Accordion("Data Health", open=False, visible=True):
            data_health_html = gr.HTML(value="<p>Loading data health report...</p>")

        # Flatten all sub-builder dicts into one namespace, prefixed per tab.
        components = {
            "state": state,
            "progress_html": progress_html,
            "tabs": tabs,
            "country_filter": country_filter,
            "annotated_toggle": annotated_toggle,
            "data_health_html": data_health_html,
            **{f"q0_{k}": v for k, v in q0.items()},
            **{f"q1_{k}": v for k, v in q1.items()},
            **{f"q2_{k}": v for k, v in q2.items()},
            **{f"q3_{k}": v for k, v in q3.items()},
            **{f"q4_{k}": v for k, v in q4.items()},
            **{f"final_{k}": v for k, v in final.items()},
            **{f"gc_{k}": v for k, v in gc.items()},
        }

    return demo, components
92
+
93
+
94
def _build_progress_html(active_quest: int) -> str:
    """Build progress tracker HTML: one span per quest step, with the
    step matching *active_quest* carrying the extra 'active' class."""
    step_labels = (
        "Explorer",
        "Map the World",
        "Core vs Accessory",
        "Genome Landmarks",
        "Protein Relics",
        "Field Report",
    )
    spans = []
    for idx, label in enumerate(step_labels):
        css = "progress-step active" if idx == active_quest else "progress-step"
        spans.append(f'<span class="{css}">{label}</span>')
    return '<div class="progress-tracker">' + " ".join(spans) + '</div>'
ui/quest0.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Quest 0: Choose Your Explorer — line selection and overview."""
2
+
3
+ import gradio as gr
4
+
5
+
6
def build_quest0(line_choices: list[str]):
    """Assemble the Quest 0 ("Choose Your Explorer") tab.

    Creates the line-selection dropdown, three read-only summary boxes and
    the journey-start button, and returns them keyed by role so callers can
    wire up event handlers.
    """
    with gr.Tab("Choose Your Explorer", id="quest0") as tab:
        gr.Markdown("## Choose your line to explore")
        gr.Markdown(
            "Select one of the 89 pigeon pea lines to begin your pangenome journey. "
            "Each line has a unique gene repertoire shaped by geography and breeding history."
        )

        selector = gr.Dropdown(
            choices=line_choices,
            label="Select a pigeon pea line",
            info="89 lines from across the world",
            interactive=True,
        )

        with gr.Row():
            total_box = gr.Textbox(
                label="Total Genes Present",
                interactive=False,
                value="--",
            )
            unique_box = gr.Textbox(
                label="Unique Genes",
                interactive=False,
                value="--",
                info="Genes found only in this line",
            )
            neighbor_box = gr.Textbox(
                label="Nearest Neighbor",
                interactive=False,
                value="--",
                info="Most similar line by gene content",
            )

        journey_btn = gr.Button(
            "Start the Journey",
            variant="primary",
            size="lg",
        )

    return {
        "tab": tab,
        "line_dropdown": selector,
        "total_genes": total_box,
        "unique_genes": unique_box,
        "nearest_neighbor": neighbor_box,
        "start_btn": journey_btn,
    }
ui/quest1.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Quest 1: Map the World — UMAP scatter of lines."""
2
+
3
+ import gradio as gr
4
+
5
+
6
def build_quest1():
    """Assemble the Quest 1 ("Map the World") tab.

    Lays out the color-mode radio, the UMAP scatter plot, the party-selection
    readout with its compare button, and an explanatory accordion. Returns the
    interactive components keyed by role.
    """
    with gr.Tab("Map the World", id="quest1") as tab:
        gr.Markdown("## How do 89 lines relate by gene content?")
        gr.Markdown(
            "This UMAP projection arranges lines by their gene presence/absence profiles. "
            "Lines closer together share more genes."
        )

        color_mode = gr.Radio(
            choices=["Country", "Cluster"],
            value="Country",
            label="Color by",
            interactive=True,
        )

        scatter = gr.Plot(label="UMAP of 89 pigeon pea lines")

        with gr.Row():
            party_box = gr.Textbox(
                label="Selected party (lasso/click to select)",
                interactive=False,
                value="None selected",
                lines=2,
            )
            compare_button = gr.Button("Compare my line to party", variant="secondary")

        compare_plot = gr.Plot(label="Comparison", visible=False)

        with gr.Accordion("What does this mean?", open=False):
            gr.Markdown(
                "**UMAP** reduces the high-dimensional PAV matrix to 2D.\n\n"
                "- **Country coloring** shows geographic origins.\n"
                "- **Cluster coloring** shows groups identified by KMeans.\n"
                "- **Click** a point to see its stats.\n"
                "- **Lasso select** multiple points to compare with your chosen line."
            )

    return {
        "tab": tab,
        "color_radio": color_mode,
        "umap_plot": scatter,
        "party_display": party_box,
        "compare_btn": compare_button,
        "comparison_plot": compare_plot,
    }
ui/quest2.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Quest 2: Core vs Accessory — gene classification explorer."""
2
+
3
+ import gradio as gr
4
+
5
+
6
def build_quest2():
    """Assemble the Quest 2 ("Core vs Accessory") tab.

    Threshold sliders drive the donut/histogram plots; the gene table plus
    pin button let the user stash genes in the backpack. Returns the
    interactive components keyed by role.
    """
    with gr.Tab("Core vs Accessory", id="quest2") as tab:
        gr.Markdown("## Explore the Core, Shell, and Cloud genome")
        gr.Markdown(
            "Genes are classified by how many of the 89 lines carry them. "
            "Adjust the thresholds to explore different definitions."
        )

        with gr.Row():
            core_pct = gr.Slider(
                minimum=50,
                maximum=100,
                value=95,
                step=1,
                label="Core threshold (>=X%)",
                info="Genes present in at least this % of lines",
            )
            cloud_pct = gr.Slider(
                minimum=1,
                maximum=50,
                value=15,
                step=1,
                label="Cloud threshold (<X%)",
                info="Genes present in fewer than this % of lines",
            )

        with gr.Row():
            donut = gr.Plot(label="Core / Shell / Cloud distribution")
            freq_hist = gr.Plot(label="Gene frequency distribution")

        gr.Markdown("### Gene Treasure List")
        gene_filter = gr.Radio(
            choices=["All", "Unique to my line", "Rare (<5 lines)", "Cluster markers"],
            value="All",
            label="Filter genes",
        )
        gene_table = gr.Dataframe(
            headers=["gene_id", "freq_count", "freq_pct", "core_class", "in_my_line"],
            label="Genes",
            interactive=False,
            wrap=True,
        )

        with gr.Row():
            picked_gene_box = gr.Textbox(
                label="Selected gene",
                interactive=False,
                value="Click a row to select",
            )
            pin_button = gr.Button("Pin to Backpack", variant="secondary")

        backpack_box = gr.Textbox(
            label="Backpack (pinned genes)",
            interactive=False,
            value="Empty",
            lines=2,
        )

    return {
        "tab": tab,
        "core_slider": core_pct,
        "cloud_slider": cloud_pct,
        "donut_plot": donut,
        "histogram_plot": freq_hist,
        "filter_radio": gene_filter,
        "treasure_table": gene_table,
        "selected_gene_text": picked_gene_box,
        "pin_btn": pin_button,
        "backpack_display": backpack_box,
    }
ui/quest3.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Quest 3: Genome Landmarks — hotspot exploration."""
2
+
3
+ import gradio as gr
4
+
5
+
6
def build_quest3(contig_choices: list[str]):
    """Assemble the Quest 3 ("Genome Landmarks") tab.

    Provides a contig selector, the variability heatmap, a per-contig gene
    track (hidden until a contig is chosen), and a region gene table.
    Returns the interactive components keyed by role.
    """
    with gr.Tab("Genome Landmarks", id="quest3") as tab:
        gr.Markdown("## Explore genomic hotspots of variation")
        gr.Markdown(
            "The genome is divided into 100 kb bins. Hotter bins contain more "
            "variable (shell/cloud) genes — potential regions of adaptation."
        )

        contig_picker = gr.Dropdown(
            choices=contig_choices,
            label="Select contig (top contigs by gene count)",
            interactive=True,
        )

        heatmap = gr.Plot(label="Variability heatmap (contigs x bins)")

        gr.Markdown("### Contig Detail")
        gene_track = gr.Plot(label="Gene track (colored by class)", visible=False)

        genes_in_region = gr.Dataframe(
            headers=["gene_id", "start", "end", "strand", "core_class", "freq_pct"],
            label="Genes in selected region",
            interactive=False,
        )

        region_pick_box = gr.Textbox(
            label="Selected gene from region",
            interactive=False,
            visible=False,
        )

    return {
        "tab": tab,
        "contig_dropdown": contig_picker,
        "heatmap_plot": heatmap,
        "track_plot": gene_track,
        "region_table": genes_in_region,
        "region_gene_text": region_pick_box,
    }
ui/quest4.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Quest 4: Protein Relics — protein analysis."""
2
+
3
+ import gradio as gr
4
+
5
+
6
def build_quest4(gene_choices: list[str]):
    """Assemble the Quest 4 ("Protein Relics") tab.

    Offers a gene dropdown (custom values allowed, so backpack genes can be
    typed in), an HTML stats panel, and two comparison plots for the
    backpack collection. Returns the interactive components keyed by role.
    """
    with gr.Tab("Protein Relics", id="quest4") as tab:
        gr.Markdown("## Examine protein properties")
        gr.Markdown(
            "Explore the protein products of pangenome genes. Compare amino acid "
            "compositions and lengths across your backpack collection."
        )

        gene_picker = gr.Dropdown(
            choices=gene_choices,
            label="Select a gene (or pick from backpack)",
            interactive=True,
            allow_custom_value=True,
        )

        stats_panel = gr.HTML(
            value="<p>Select a gene to see protein stats</p>",
            label="Protein Statistics",
        )

        gr.Markdown("### Backpack Comparison")
        gr.Markdown("Pin at least 2 genes to your backpack to see comparisons.")

        length_bars = gr.Plot(label="Protein lengths comparison")
        aa_heatmap = gr.Plot(label="Amino acid composition heatmap")

    return {
        "tab": tab,
        "gene_dropdown": gene_picker,
        "protein_stats_html": stats_panel,
        "comparison_bar_plot": length_bars,
        "composition_heatmap": aa_heatmap,
    }
ui/theme.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Custom Gradio theme for the Pigeon Pea Pangenome Atlas."""
2
+
3
+ import gradio as gr
4
+
5
# Hand-written CSS injected into the app. Selectors here must stay in sync
# with the class names emitted by the HTML-building helpers (e.g. the
# .progress-tracker / .progress-step rules are used by the progress HTML,
# and .badge-core/.badge-shell/.badge-cloud mirror the gene classes).
# NOTE(review): presumably passed as the `css=` argument when the Blocks
# app is created — confirm at the call site.
CUSTOM_CSS = """
.quest-badge {
    display: inline-block;
    padding: 4px 12px;
    border-radius: 16px;
    font-size: 0.85em;
    font-weight: 600;
    margin: 2px 4px;
}
.badge-core { background: #2E7D32; color: white; }
.badge-shell { background: #FFC107; color: #333; }
.badge-cloud { background: #F44336; color: white; }

.gene-card {
    border: 2px solid #2E7D32;
    border-radius: 8px;
    padding: 16px;
    background: #F1F8E9;
}

.presence-barcode span {
    display: inline-block;
    width: 3px;
    height: 20px;
    margin: 0;
}
.presence-barcode .present { background: #4CAF50; }
.presence-barcode .absent { background: #E0E0E0; }

.progress-tracker {
    display: flex;
    justify-content: center;
    gap: 8px;
    padding: 12px 0;
}
.progress-step {
    display: inline-flex;
    align-items: center;
    gap: 4px;
    padding: 4px 10px;
    border-radius: 12px;
    font-size: 0.8em;
    background: #E8F5E9;
    color: #2E7D32;
}
.progress-step.active {
    background: #2E7D32;
    color: white;
    font-weight: 600;
}

.stat-card {
    text-align: center;
    padding: 16px;
    border-radius: 8px;
    background: #F1F8E9;
    border: 1px solid #C8E6C9;
}
.stat-card .stat-value {
    font-size: 1.8em;
    font-weight: 700;
    color: #2E7D32;
}
.stat-card .stat-label {
    font-size: 0.85em;
    color: #666;
}

.achievement-badge {
    display: inline-block;
    padding: 6px 14px;
    border-radius: 20px;
    background: linear-gradient(135deg, #FFC107, #FF9800);
    color: #333;
    font-weight: 600;
    margin: 4px;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
"""
84
+
85
+
86
def build_theme():
    """Construct the green/amber Soft-based Gradio theme used by the Atlas.

    Starts from ``gr.themes.Soft`` with green primary and amber secondary
    hues, then overrides background, block border and primary-button colors
    to match the custom CSS palette.
    """
    base = gr.themes.Soft(
        primary_hue=gr.themes.colors.green,
        secondary_hue=gr.themes.colors.amber,
        neutral_hue=gr.themes.colors.gray,
        font=gr.themes.GoogleFont("Inter"),
    )
    return base.set(
        body_background_fill="#FAFAF5",
        block_border_width="1px",
        block_border_color="#C8E6C9",
        block_radius="8px",
        button_primary_background_fill="#2E7D32",
        button_primary_text_color="white",
    )
+ return theme