# PanGenomeWatchAI / tests / test_precompute.py
# Ashkan Taghipour (The University of Western Australia)
# Initial deploy: Pigeon Pea Pangenome Atlas
# Commit: 16e4ad5
"""Tests for src/precompute.py."""
import pytest
import numpy as np
import pandas as pd
from pathlib import Path
from src.precompute import (
compute_gene_frequency, compute_line_stats,
compute_similarity_topk, compute_hotspot_bins, compute_cluster_markers,
)
class TestGeneFrequency:
    """Tests for compute_gene_frequency: output schema and value ranges."""

    def test_freq_count_range(self, synthetic_pav):
        """freq_count is bounded by [0, number of lines].

        BUG FIX: the original assertion was
        ``(df["freq_count"] >= 1).all() or (df["freq_count"] >= 0).all()``
        which is a tautology — the ``>= 0`` clause always holds for counts,
        so the ``or`` meant the first clause was never actually enforced.
        Assert the meaningful lower bound directly instead.
        """
        df = compute_gene_frequency(synthetic_pav)
        assert (df["freq_count"] >= 0).all()
        # A gene cannot be present in more lines than the matrix has columns.
        assert (df["freq_count"] <= synthetic_pav.shape[1]).all()

    def test_freq_classes(self, synthetic_pav):
        """Every core_class label is one of the three known categories."""
        df = compute_gene_frequency(synthetic_pav)
        assert set(df["core_class"].unique()).issubset({"core", "shell", "cloud"})

    def test_freq_pct_range(self, synthetic_pav):
        """freq_pct is a percentage in [0, 100]."""
        df = compute_gene_frequency(synthetic_pav)
        assert (df["freq_pct"] >= 0).all()
        assert (df["freq_pct"] <= 100).all()

    def test_gene_id_column(self, synthetic_pav):
        """Output carries a gene_id column with one row per input gene."""
        df = compute_gene_frequency(synthetic_pav)
        assert "gene_id" in df.columns
        assert len(df) == len(synthetic_pav)
class TestLineStats:
    """Tests for compute_line_stats: row count and required columns."""

    def test_line_count(self, synthetic_pav):
        """One stats row is produced per line (per PAV matrix column)."""
        stats = compute_line_stats(synthetic_pav)
        assert len(stats) == synthetic_pav.shape[1]

    def test_columns(self, synthetic_pav):
        """All columns the downstream app relies on are present."""
        stats = compute_line_stats(synthetic_pav)
        required = (
            "line_id",
            "country",
            "genes_present_count",
            "unique_genes_count",
        )
        for column in required:
            assert column in stats.columns
class TestSimilarity:
    """Tests for compute_similarity_topk: score range and symmetry."""

    def test_jaccard_range(self, synthetic_pav):
        """Jaccard scores must lie in the closed interval [0, 1]."""
        topk = compute_similarity_topk(synthetic_pav, k=5)
        scores = topk["jaccard_score"]
        assert (scores >= 0).all()
        assert (scores <= 1).all()

    def test_jaccard_symmetry(self, synthetic_pav):
        """jaccard(a, b) == jaccard(b, a) whenever both directions survive top-k."""
        topk = compute_similarity_topk(synthetic_pav, k=5)
        # Spot-check the first few pairs; the mirrored pair may have been
        # pruned by the top-k cut, so only compare when it exists.
        for _, pair in topk.head(5).iterrows():
            mirrored = topk[
                (topk["line_id"] == pair["neighbor_line_id"])
                & (topk["neighbor_line_id"] == pair["line_id"])
            ]
            if len(mirrored) > 0:
                delta = mirrored.iloc[0]["jaccard_score"] - pair["jaccard_score"]
                assert abs(delta) < 0.001
class TestHotspots:
    """Tests for compute_hotspot_bins: structural sanity of emitted bins."""

    def test_bins_valid(self, synthetic_gff, synthetic_pav, synthetic_contig_index):
        """Every bin (if any were produced) has start < end and a
        non-negative gene total."""
        freqs = compute_gene_frequency(synthetic_pav)
        bins = compute_hotspot_bins(synthetic_gff, freqs, synthetic_contig_index)
        if len(bins) == 0:
            # Synthetic input may yield no hotspot bins; nothing to validate.
            return
        assert (bins["bin_start"] < bins["bin_end"]).all()
        assert (bins["total_genes"] >= 0).all()
class TestClusterMarkers:
    """Tests for compute_cluster_markers: per-cluster marker cap."""

    def test_marker_count(self, synthetic_pav):
        """Each cluster yields at most ``top_n`` marker genes.

        Fixes two issues in the original test:
        - the unseeded ``np.random.randn`` made the fixture (and thus any
          failure) non-reproducible; a seeded Generator keeps it deterministic.
        - the final assertion hard-coded ``5`` instead of reusing the
          ``top_n`` value passed to compute_cluster_markers, so the two
          could silently drift apart.
        """
        top_n = 5
        line_ids = list(synthetic_pav.columns)
        rng = np.random.default_rng(42)  # seeded for deterministic coordinates
        embedding = pd.DataFrame({
            "line_id": line_ids,
            "umap_x": rng.standard_normal(len(line_ids)),
            "umap_y": rng.standard_normal(len(line_ids)),
            # Round-robin assignment spreads lines over 3 synthetic clusters.
            "cluster_id": [i % 3 for i in range(len(line_ids))],
        })
        df = compute_cluster_markers(synthetic_pav, embedding, top_n=top_n)
        # Each cluster should have at most top_n markers.
        for cid in df["cluster_id"].unique():
            assert len(df[df["cluster_id"] == cid]) <= top_n
class TestPrecomputedFiles:
    """Smoke test that the precompute pipeline wrote every expected artifact."""

    # Artifacts the precompute step must leave behind.
    EXPECTED_FILES = (
        "pav_gene_frequency.parquet",
        "line_stats.parquet",
        "line_embedding.parquet",
        "line_similarity_topk.parquet",
        "gff_gene_index.parquet",
        "protein_index.parquet",
        "genome_contig_index.json",
        "hotspot_bins.parquet",
        "cluster_markers.parquet",
    )

    def test_files_exist(self, precomputed_dir):
        """Every expected output file exists in the precomputed directory."""
        if precomputed_dir is None:
            pytest.skip("Precomputed directory not available")
        for f in self.EXPECTED_FILES:
            assert (precomputed_dir / f).exists(), f"Missing: {f}"