Spaces:
Build error
Build error
Ashkan Taghipour (The University of Western Australia)
Initial deploy: Pigeon Pea Pangenome Atlas
16e4ad5 | """Shared test fixtures for the Pigeon Pea Pangenome Atlas.""" | |
| import os | |
| import sys | |
| import pytest | |
| import numpy as np | |
| import pandas as pd | |
| from pathlib import Path | |
| # Ensure project root is in path | |
| sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) | |
| from src.state import AppState | |
def synthetic_pav():
    """Build a 20-gene by 10-line table of random 0/1 values.

    Rows are labelled ``g00000`` .. ``g00019`` and columns
    ``Line_0_India`` .. ``Line_9_India``. NumPy's global random generator is
    reseeded with 42 before drawing, so the returned values are the same on
    every call (and later global draws continue from that seeded state).
    """
    n_genes, n_lines = 20, 10
    # Reseed the process-wide generator so the matrix below is reproducible.
    np.random.seed(42)
    gene_ids = [f"g{idx:05d}" for idx in range(n_genes)]
    line_names = [f"Line_{idx}_India" for idx in range(n_lines)]
    matrix = np.random.randint(0, 2, size=(n_genes, n_lines))
    return pd.DataFrame(matrix, index=gene_ids, columns=line_names)
def synthetic_gff():
    """Return an annotation table of 20 genes spread over 3 contigs.

    Gene ``i`` is placed on ``contig_{i % 3}``, starts at ``i * 1000`` and
    ends 500 positions later. Even-numbered genes get strand "+", odd-numbered
    genes get strand "-".
    """
    def _row(idx):
        # One table row for gene number ``idx``.
        begin = idx * 1000
        return {
            "gene_id": f"g{idx:05d}",
            "contig_id": f"contig_{idx % 3}",
            "start": begin,
            "end": begin + 500,
            "strand": "-" if idx % 2 else "+",
        }

    return pd.DataFrame([_row(idx) for idx in range(20)])
def synthetic_protein():
    """Return a protein table for 20 genes with pseudo-random lengths.

    Each row holds a ``gene_id`` (``g00000`` .. ``g00019``), a
    ``protein_length`` drawn from the half-open range [50, 1000), and the same
    fixed ``composition_summary`` string.

    The lengths come from a locally seeded generator rather than NumPy's
    global one, so the table is identical on every call, does not depend on
    what ran earlier, and leaves the global random state untouched.
    """
    # Local generator: reproducible output with no global side effects.
    rng = np.random.default_rng(42)
    lengths = rng.integers(50, 1000, size=20)
    records = [
        {
            "gene_id": f"g{i:05d}",
            # Plain int keeps the row values the same type as before.
            "protein_length": int(length),
            "composition_summary": "L:9.0%, A:8.0%, G:7.0%, S:6.0%, V:5.0%",
        }
        for i, length in enumerate(lengths)
    ]
    return pd.DataFrame(records)
def synthetic_contig_index():
    """Return a fresh dict mapping the three synthetic contig IDs to integers.

    NOTE(review): the integers are presumably contig lengths; confirm against
    the consumer (``compute_hotspot_bins``).
    """
    contig_ids = ("contig_0", "contig_1", "contig_2")
    values = (50000, 80000, 30000)
    return dict(zip(contig_ids, values))
def empty_state():
    """Create and return a new ``AppState`` constructed with no arguments."""
    fresh_state = AppState()
    return fresh_state
def populated_state():
    """Return a new ``AppState`` with a few selections already made.

    The state gets ``selected_line`` set to "Line_0_India", two gene IDs in
    ``backpack_genes`` and a single "Explorer" entry in ``achievements``.
    """
    app_state = AppState()
    presets = {
        "selected_line": "Line_0_India",
        "backpack_genes": ["g00001", "g00002"],
        "achievements": {"Explorer"},
    }
    # Assign in the listed order; dicts preserve insertion order.
    for attr_name, attr_value in presets.items():
        setattr(app_state, attr_name, attr_value)
    return app_state
def synthetic_data(synthetic_pav, synthetic_gff, synthetic_protein, synthetic_contig_index):
    """Assemble the complete synthetic data dict matching the app DATA format.

    NOTE(review): the parameter names mirror sibling builder functions in this
    module, so this is presumably meant to be filled in by pytest fixture
    injection; confirm those functions are registered as fixtures.

    Args:
        synthetic_pav: DataFrame whose columns are used as the line IDs; also
            fed to the frequency, line-stats, similarity and marker helpers.
        synthetic_gff: Annotation table; passed to ``compute_hotspot_bins``
            and stored under ``"gff_index"``.
        synthetic_protein: Protein table; stored unchanged under ``"protein"``.
        synthetic_contig_index: Contig mapping passed to
            ``compute_hotspot_bins``.

    Returns:
        dict with keys ``pav``, ``gene_freq``, ``line_stats``, ``embedding``,
        ``similarity``, ``gff_index``, ``protein``, ``hotspots`` and
        ``markers``.
    """
    # Only the helpers actually used are imported; the embedding is built by
    # hand below, so compute_line_embedding is not needed.
    from src.precompute import (
        compute_gene_frequency, compute_line_stats,
        compute_similarity_topk,
        compute_hotspot_bins, compute_cluster_markers,
    )
    gene_freq = compute_gene_frequency(synthetic_pav)
    line_stats = compute_line_stats(synthetic_pav)
    # Simple embedding without UMAP (too small for meaningful UMAP):
    # random coordinates plus a round-robin assignment to three clusters.
    line_ids = list(synthetic_pav.columns)
    embedding = pd.DataFrame({
        "line_id": line_ids,
        "umap_x": np.random.randn(len(line_ids)),
        "umap_y": np.random.randn(len(line_ids)),
        "cluster_id": [i % 3 for i in range(len(line_ids))],
    })
    similarity = compute_similarity_topk(synthetic_pav, k=5)
    hotspots = compute_hotspot_bins(synthetic_gff, gene_freq, synthetic_contig_index)
    markers = compute_cluster_markers(synthetic_pav, embedding)
    return {
        "pav": synthetic_pav,
        "gene_freq": gene_freq,
        "line_stats": line_stats,
        "embedding": embedding,
        "similarity": similarity,
        "gff_index": synthetic_gff,
        "protein": synthetic_protein,
        "hotspots": hotspots,
        "markers": markers,
    }
def precomputed_dir():
    """Locate the ``precomputed`` entry beside this file's parent directory.

    Returns the resolved path when it exists on disk, otherwise ``None``.
    """
    candidate = Path(__file__).resolve().parent.parent / "precomputed"
    return candidate if candidate.exists() else None