# PanGenomeWatchAI / tests/conftest.py
# Author: Ashkan Taghipour (The University of Western Australia)
# Initial deploy: Pigeon Pea Pangenome Atlas
# Commit: 16e4ad5
"""Shared test fixtures for the Pigeon Pea Pangenome Atlas."""
import os
import sys
import pytest
import numpy as np
import pandas as pd
from pathlib import Path
# Ensure project root is in path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from src.state import AppState
@pytest.fixture
def synthetic_pav():
    """Deterministic 20-gene x 10-line presence/absence matrix of random 0/1.

    Seeds the global NumPy RNG with 42 so every test session sees the
    same matrix.
    """
    np.random.seed(42)
    matrix = np.random.randint(0, 2, size=(20, 10))
    gene_ids = ["g%05d" % idx for idx in range(20)]
    line_ids = ["Line_%d_India" % idx for idx in range(10)]
    return pd.DataFrame(matrix, index=gene_ids, columns=line_ids)
@pytest.fixture
def synthetic_gff():
    """Gene-model table: 20 genes laid out round-robin across 3 contigs.

    Each gene occupies a 500 bp span starting at index*1000, with strand
    alternating +/- by gene index.
    """
    rows = [
        {
            "gene_id": f"g{idx:05d}",
            "contig_id": f"contig_{idx % 3}",
            "start": idx * 1000,
            "end": idx * 1000 + 500,
            "strand": "-" if idx % 2 else "+",
        }
        for idx in range(20)
    ]
    return pd.DataFrame(rows)
@pytest.fixture
def synthetic_protein():
    """Protein annotation table for the 20 synthetic genes.

    Returns a DataFrame with columns ``gene_id``, ``protein_length``
    (integer in [50, 1000)) and a fixed ``composition_summary`` string.

    Fix: the original called the *unseeded* global ``np.random.randint``,
    so the lengths changed on every run and depended on whether another
    fixture had already seeded the global RNG. A locally seeded Generator
    makes the fixture reproducible and order-independent.
    """
    rng = np.random.default_rng(42)
    records = [
        {
            "gene_id": f"g{i:05d}",
            # int() keeps the column a plain integer dtype, as before.
            "protein_length": int(rng.integers(50, 1000)),
            "composition_summary": "L:9.0%, A:8.0%, G:7.0%, S:6.0%, V:5.0%",
        }
        for i in range(20)
    ]
    return pd.DataFrame(records)
@pytest.fixture
def synthetic_contig_index():
    """Mapping of the 3 synthetic contig ids to their lengths in bp."""
    sizes = (50000, 80000, 30000)
    return {f"contig_{idx}": size for idx, size in enumerate(sizes)}
@pytest.fixture
def empty_state():
    """A brand-new AppState with nothing selected."""
    state = AppState()
    return state
@pytest.fixture
def populated_state():
    """AppState pre-loaded with a selected line, two backpack genes and one achievement."""
    app_state = AppState()
    app_state.backpack_genes = ["g00001", "g00002"]
    app_state.achievements = {"Explorer"}
    app_state.selected_line = "Line_0_India"
    return app_state
@pytest.fixture
def synthetic_data(synthetic_pav, synthetic_gff, synthetic_protein, synthetic_contig_index):
    """Complete synthetic data dict matching the app's DATA format.

    Runs the real precompute pipeline over the toy fixtures, except for
    the embedding, which is hand-rolled (10 lines are too few for a
    meaningful UMAP).

    Fixes: removed the never-used ``compute_line_embedding`` import, and
    replaced the unseeded global ``np.random.randn`` with a locally
    seeded Generator so embedding coordinates are reproducible and do
    not perturb NumPy's global RNG state.
    """
    from src.precompute import (
        compute_gene_frequency, compute_line_stats,
        compute_similarity_topk, compute_hotspot_bins,
        compute_cluster_markers,
    )
    gene_freq = compute_gene_frequency(synthetic_pav)
    line_stats = compute_line_stats(synthetic_pav)
    # Simple deterministic embedding without UMAP (dataset too small).
    rng = np.random.default_rng(0)
    line_ids = list(synthetic_pav.columns)
    embedding = pd.DataFrame({
        "line_id": line_ids,
        "umap_x": rng.standard_normal(len(line_ids)),
        "umap_y": rng.standard_normal(len(line_ids)),
        "cluster_id": [i % 3 for i in range(len(line_ids))],
    })
    similarity = compute_similarity_topk(synthetic_pav, k=5)
    hotspots = compute_hotspot_bins(synthetic_gff, gene_freq, synthetic_contig_index)
    markers = compute_cluster_markers(synthetic_pav, embedding)
    return {
        "pav": synthetic_pav,
        "gene_freq": gene_freq,
        "line_stats": line_stats,
        "embedding": embedding,
        "similarity": similarity,
        "gff_index": synthetic_gff,
        "protein": synthetic_protein,
        "hotspots": hotspots,
        "markers": markers,
    }
@pytest.fixture
def precomputed_dir():
    """Path to the project's ``precomputed`` directory, or None when absent."""
    candidate = Path(__file__).resolve().parent.parent / "precomputed"
    return candidate if candidate.exists() else None