# PanGenomeWatchAI / tests / test_precompute.py
# Ashkan Taghipour (The University of Western Australia)
# Initial deploy: Pigeon Pea Pangenome Atlas
# Commit: 16e4ad5
"""Tests for src/precompute.py."""
import pytest
import numpy as np
import pandas as pd
from pathlib import Path
from src.precompute import (
compute_gene_frequency, compute_line_stats,
compute_similarity_topk, compute_hotspot_bins, compute_cluster_markers,
)
class TestGeneFrequency:
    """Tests for compute_gene_frequency: output schema and value ranges."""

    def test_freq_count_range(self, synthetic_pav):
        """freq_count is bounded by [0, number of lines].

        BUG FIX: the original assertion was
        ``(df["freq_count"] >= 1).all() or (df["freq_count"] >= 0).all()``
        which is a tautology — the ``>= 0`` clause always holds for counts,
        so the ``or`` meant the first clause was never actually enforced.
        Assert the meaningful lower bound directly instead.
        """
        df = compute_gene_frequency(synthetic_pav)
        assert (df["freq_count"] >= 0).all()
        # A gene cannot be present in more lines than the matrix has columns.
        assert (df["freq_count"] <= synthetic_pav.shape[1]).all()

    def test_freq_classes(self, synthetic_pav):
        """Every core_class label is one of the three known categories."""
        df = compute_gene_frequency(synthetic_pav)
        assert set(df["core_class"].unique()).issubset({"core", "shell", "cloud"})

    def test_freq_pct_range(self, synthetic_pav):
        """freq_pct is a percentage in [0, 100]."""
        df = compute_gene_frequency(synthetic_pav)
        assert (df["freq_pct"] >= 0).all()
        assert (df["freq_pct"] <= 100).all()

    def test_gene_id_column(self, synthetic_pav):
        """Output carries a gene_id column with one row per input gene."""
        df = compute_gene_frequency(synthetic_pav)
        assert "gene_id" in df.columns
        assert len(df) == len(synthetic_pav)
class TestLineStats:
    """Tests for compute_line_stats: row count and required columns."""

    def test_line_count(self, synthetic_pav):
        """One stats row is produced per line (per PAV matrix column)."""
        stats = compute_line_stats(synthetic_pav)
        assert len(stats) == synthetic_pav.shape[1]

    def test_columns(self, synthetic_pav):
        """All columns the downstream app relies on are present."""
        stats = compute_line_stats(synthetic_pav)
        required = (
            "line_id",
            "country",
            "genes_present_count",
            "unique_genes_count",
        )
        for column in required:
            assert column in stats.columns
class TestSimilarity:
    """Tests for compute_similarity_topk: score range and symmetry."""

    def test_jaccard_range(self, synthetic_pav):
        """Jaccard scores must lie in the closed interval [0, 1]."""
        topk = compute_similarity_topk(synthetic_pav, k=5)
        scores = topk["jaccard_score"]
        assert (scores >= 0).all()
        assert (scores <= 1).all()

    def test_jaccard_symmetry(self, synthetic_pav):
        """jaccard(a, b) == jaccard(b, a) whenever both directions survive top-k."""
        topk = compute_similarity_topk(synthetic_pav, k=5)
        # Spot-check the first few pairs; the mirrored pair may have been
        # pruned by the top-k cut, so only compare when it exists.
        for _, pair in topk.head(5).iterrows():
            mirrored = topk[
                (topk["line_id"] == pair["neighbor_line_id"])
                & (topk["neighbor_line_id"] == pair["line_id"])
            ]
            if len(mirrored) > 0:
                delta = mirrored.iloc[0]["jaccard_score"] - pair["jaccard_score"]
                assert abs(delta) < 0.001
class TestHotspots:
    """Tests for compute_hotspot_bins: structural sanity of emitted bins."""

    def test_bins_valid(self, synthetic_gff, synthetic_pav, synthetic_contig_index):
        """Every bin (if any were produced) has start < end and a
        non-negative gene total."""
        freqs = compute_gene_frequency(synthetic_pav)
        bins = compute_hotspot_bins(synthetic_gff, freqs, synthetic_contig_index)
        if len(bins) == 0:
            # Synthetic input may yield no hotspot bins; nothing to validate.
            return
        assert (bins["bin_start"] < bins["bin_end"]).all()
        assert (bins["total_genes"] >= 0).all()
class TestClusterMarkers:
    """Tests for compute_cluster_markers: per-cluster marker cap."""

    def test_marker_count(self, synthetic_pav):
        """Each cluster yields at most ``top_n`` marker genes.

        Fixes two issues in the original test:
        - the unseeded ``np.random.randn`` made the fixture (and thus any
          failure) non-reproducible; a seeded Generator keeps it deterministic.
        - the final assertion hard-coded ``5`` instead of reusing the
          ``top_n`` value passed to compute_cluster_markers, so the two
          could silently drift apart.
        """
        top_n = 5
        line_ids = list(synthetic_pav.columns)
        rng = np.random.default_rng(42)  # seeded for deterministic coordinates
        embedding = pd.DataFrame({
            "line_id": line_ids,
            "umap_x": rng.standard_normal(len(line_ids)),
            "umap_y": rng.standard_normal(len(line_ids)),
            # Round-robin assignment spreads lines over 3 synthetic clusters.
            "cluster_id": [i % 3 for i in range(len(line_ids))],
        })
        df = compute_cluster_markers(synthetic_pav, embedding, top_n=top_n)
        # Each cluster should have at most top_n markers.
        for cid in df["cluster_id"].unique():
            assert len(df[df["cluster_id"] == cid]) <= top_n
class TestPrecomputedFiles:
    """Smoke test that the precompute pipeline wrote every expected artifact."""

    # Artifacts the precompute step must leave behind.
    EXPECTED_FILES = (
        "pav_gene_frequency.parquet",
        "line_stats.parquet",
        "line_embedding.parquet",
        "line_similarity_topk.parquet",
        "gff_gene_index.parquet",
        "protein_index.parquet",
        "genome_contig_index.json",
        "hotspot_bins.parquet",
        "cluster_markers.parquet",
    )

    def test_files_exist(self, precomputed_dir):
        """Every expected output file exists in the precomputed directory."""
        if precomputed_dir is None:
            pytest.skip("Precomputed directory not available")
        for f in self.EXPECTED_FILES:
            assert (precomputed_dir / f).exists(), f"Missing: {f}"