"""Tests for src/precompute.py.""" import pytest import numpy as np import pandas as pd from pathlib import Path from src.precompute import ( compute_gene_frequency, compute_line_stats, compute_similarity_topk, compute_hotspot_bins, compute_cluster_markers, ) class TestGeneFrequency: def test_freq_count_range(self, synthetic_pav): df = compute_gene_frequency(synthetic_pav) assert (df["freq_count"] >= 1).all() or (df["freq_count"] >= 0).all() assert (df["freq_count"] <= synthetic_pav.shape[1]).all() def test_freq_classes(self, synthetic_pav): df = compute_gene_frequency(synthetic_pav) assert set(df["core_class"].unique()).issubset({"core", "shell", "cloud"}) def test_freq_pct_range(self, synthetic_pav): df = compute_gene_frequency(synthetic_pav) assert (df["freq_pct"] >= 0).all() assert (df["freq_pct"] <= 100).all() def test_gene_id_column(self, synthetic_pav): df = compute_gene_frequency(synthetic_pav) assert "gene_id" in df.columns assert len(df) == len(synthetic_pav) class TestLineStats: def test_line_count(self, synthetic_pav): df = compute_line_stats(synthetic_pav) assert len(df) == synthetic_pav.shape[1] def test_columns(self, synthetic_pav): df = compute_line_stats(synthetic_pav) assert "line_id" in df.columns assert "country" in df.columns assert "genes_present_count" in df.columns assert "unique_genes_count" in df.columns class TestSimilarity: def test_jaccard_range(self, synthetic_pav): df = compute_similarity_topk(synthetic_pav, k=5) assert (df["jaccard_score"] >= 0).all() assert (df["jaccard_score"] <= 1).all() def test_jaccard_symmetry(self, synthetic_pav): df = compute_similarity_topk(synthetic_pav, k=5) # Check a few pairs for _, row in df.head(5).iterrows(): reverse = df[ (df["line_id"] == row["neighbor_line_id"]) & (df["neighbor_line_id"] == row["line_id"]) ] if len(reverse) > 0: assert abs(reverse.iloc[0]["jaccard_score"] - row["jaccard_score"]) < 0.001 class TestHotspots: def test_bins_valid(self, synthetic_gff, synthetic_pav, synthetic_contig_index): gene_freq = compute_gene_frequency(synthetic_pav) df = compute_hotspot_bins(synthetic_gff, gene_freq, synthetic_contig_index) if len(df) > 0: assert (df["bin_start"] < df["bin_end"]).all() assert (df["total_genes"] >= 0).all() class TestClusterMarkers: def test_marker_count(self, synthetic_pav): line_ids = list(synthetic_pav.columns) embedding = pd.DataFrame({ "line_id": line_ids, "umap_x": np.random.randn(len(line_ids)), "umap_y": np.random.randn(len(line_ids)), "cluster_id": [i % 3 for i in range(len(line_ids))], }) df = compute_cluster_markers(synthetic_pav, embedding, top_n=5) # Each cluster should have at most top_n markers for cid in df["cluster_id"].unique(): assert len(df[df["cluster_id"] == cid]) <= 5 class TestPrecomputedFiles: def test_files_exist(self, precomputed_dir): if precomputed_dir is None: pytest.skip("Precomputed directory not available") expected = [ "pav_gene_frequency.parquet", "line_stats.parquet", "line_embedding.parquet", "line_similarity_topk.parquet", "gff_gene_index.parquet", "protein_index.parquet", "genome_contig_index.json", "hotspot_bins.parquet", "cluster_markers.parquet", ] for f in expected: assert (precomputed_dir / f).exists(), f"Missing: {f}"