Spaces:
Build error
Build error
Ashkan Taghipour (The University of Western Australia)
Initial deploy: Pigeon Pea Pangenome Atlas
16e4ad5 | """Tests for src/precompute.py.""" | |
| import pytest | |
| import numpy as np | |
| import pandas as pd | |
| from pathlib import Path | |
| from src.precompute import ( | |
| compute_gene_frequency, compute_line_stats, | |
| compute_similarity_topk, compute_hotspot_bins, compute_cluster_markers, | |
| ) | |
class TestGeneFrequency:
    """Sanity checks on the table returned by ``compute_gene_frequency``."""

    def test_freq_count_range(self, synthetic_pav):
        """``freq_count`` lies between 0 and the number of PAV columns."""
        df = compute_gene_frequency(synthetic_pav)
        # The previous form, ``(>= 1).all() or (>= 0).all()``, carried a
        # redundant first disjunct: whenever it held, the second held too.
        # Only the effective lower bound of 0 is asserted.
        assert (df["freq_count"] >= 0).all()
        assert (df["freq_count"] <= synthetic_pav.shape[1]).all()

    def test_freq_classes(self, synthetic_pav):
        """``core_class`` only contains the labels core, shell and cloud."""
        df = compute_gene_frequency(synthetic_pav)
        assert set(df["core_class"].unique()).issubset({"core", "shell", "cloud"})

    def test_freq_pct_range(self, synthetic_pav):
        """``freq_pct`` stays within the closed interval [0, 100]."""
        df = compute_gene_frequency(synthetic_pav)
        assert (df["freq_pct"] >= 0).all()
        assert (df["freq_pct"] <= 100).all()

    def test_gene_id_column(self, synthetic_pav):
        """The result exposes ``gene_id`` and has one row per PAV row."""
        df = compute_gene_frequency(synthetic_pav)
        assert "gene_id" in df.columns
        assert len(df) == len(synthetic_pav)
class TestLineStats:
    """Shape and schema checks for the output of ``compute_line_stats``."""

    def test_line_count(self, synthetic_pav):
        """The result has as many rows as ``synthetic_pav`` has columns."""
        stats = compute_line_stats(synthetic_pav)
        n_columns = synthetic_pav.shape[1]
        assert len(stats) == n_columns

    def test_columns(self, synthetic_pav):
        """Every expected column name is present in the result."""
        stats = compute_line_stats(synthetic_pav)
        required = (
            "line_id",
            "country",
            "genes_present_count",
            "unique_genes_count",
        )
        for column in required:
            assert column in stats.columns
class TestSimilarity:
    """Checks on the neighbour table produced by ``compute_similarity_topk``."""

    def test_jaccard_range(self, synthetic_pav):
        """Every ``jaccard_score`` lies in the closed interval [0, 1]."""
        scores = compute_similarity_topk(synthetic_pav, k=5)["jaccard_score"]
        assert (scores >= 0).all()
        assert (scores <= 1).all()

    def test_jaccard_symmetry(self, synthetic_pav):
        """For the first five rows, a reverse pair (if listed) has the same score."""
        topk = compute_similarity_topk(synthetic_pav, k=5)
        # Only a handful of pairs is sampled.
        for _, pair in topk.head(5).iterrows():
            is_mirror = (topk["line_id"] == pair["neighbor_line_id"]) & (
                topk["neighbor_line_id"] == pair["line_id"]
            )
            mirrored = topk[is_mirror]
            if len(mirrored) == 0:
                # No reverse row in the table, so there is nothing to compare.
                continue
            delta = mirrored.iloc[0]["jaccard_score"] - pair["jaccard_score"]
            assert abs(delta) < 0.001
class TestHotspots:
    """Validity checks for the bins returned by ``compute_hotspot_bins``."""

    def test_bins_valid(self, synthetic_gff, synthetic_pav, synthetic_contig_index):
        """Each bin starts before it ends and has a non-negative gene count."""
        bins = compute_hotspot_bins(
            synthetic_gff,
            compute_gene_frequency(synthetic_pav),
            synthetic_contig_index,
        )
        if len(bins) == 0:
            # An empty result carries nothing to validate.
            return
        assert (bins["bin_start"] < bins["bin_end"]).all()
        assert (bins["total_genes"] >= 0).all()
class TestClusterMarkers:
    """Checks on the marker table returned by ``compute_cluster_markers``."""

    def test_marker_count(self, synthetic_pav):
        """No cluster is reported with more than ``top_n`` marker rows."""
        # Single source of truth for the limit; it was previously written
        # once in the call and once, separately, in the assertion.
        top_n = 5
        line_ids = list(synthetic_pav.columns)
        # Seeded generator: the coordinates used to come from the unseeded
        # global NumPy RNG, so the input differed on every run.
        rng = np.random.default_rng(0)
        embedding = pd.DataFrame({
            "line_id": line_ids,
            "umap_x": rng.standard_normal(len(line_ids)),
            "umap_y": rng.standard_normal(len(line_ids)),
            # Round-robin assignment of lines to three synthetic clusters.
            "cluster_id": [i % 3 for i in range(len(line_ids))],
        })
        markers = compute_cluster_markers(synthetic_pav, embedding, top_n=top_n)
        # Each cluster should have at most top_n markers.
        for cid in markers["cluster_id"].unique():
            n_markers = len(markers[markers["cluster_id"] == cid])
            assert n_markers <= top_n, (
                f"cluster {cid} has {n_markers} markers, expected <= {top_n}"
            )
class TestPrecomputedFiles:
    """Checks that the precomputed artefact files are present on disk."""

    def test_files_exist(self, precomputed_dir):
        """Every expected artefact exists under ``precomputed_dir``.

        The test is skipped when ``precomputed_dir`` is ``None``. All missing
        files are reported together instead of stopping at the first one.
        """
        if precomputed_dir is None:
            pytest.skip("Precomputed directory not available")
        expected = (
            "pav_gene_frequency.parquet",
            "line_stats.parquet",
            "line_embedding.parquet",
            "line_similarity_topk.parquet",
            "gff_gene_index.parquet",
            "protein_index.parquet",
            "genome_contig_index.json",
            "hotspot_bins.parquet",
            "cluster_markers.parquet",
        )
        # Gather every absent file so one run shows the full picture; with a
        # single missing file the message is the same as before.
        missing = [
            name for name in expected if not (precomputed_dir / name).exists()
        ]
        assert not missing, f"Missing: {', '.join(missing)}"