# PanGenomeWatchAI / tests/conftest.py
# Author: Ashkan Taghipour (The University of Western Australia)
# Initial deploy: Pigeon Pea Pangenome Atlas
# Commit: 16e4ad5
"""Shared test fixtures for the Pigeon Pea Pangenome Atlas."""
import os
import sys
import pytest
import numpy as np
import pandas as pd
from pathlib import Path
# Ensure project root is in path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from src.state import AppState
@pytest.fixture
def synthetic_pav():
    """Deterministic 20-gene x 10-line presence/absence matrix of random 0/1.

    Seeds the global NumPy RNG with 42 so every test session sees the
    same matrix.
    """
    np.random.seed(42)
    matrix = np.random.randint(0, 2, size=(20, 10))
    gene_ids = ["g%05d" % idx for idx in range(20)]
    line_ids = ["Line_%d_India" % idx for idx in range(10)]
    return pd.DataFrame(matrix, index=gene_ids, columns=line_ids)
@pytest.fixture
def synthetic_gff():
    """Gene-model table: 20 genes laid out round-robin across 3 contigs.

    Each gene occupies a 500 bp span starting at index*1000, with strand
    alternating +/- by gene index.
    """
    rows = [
        {
            "gene_id": f"g{idx:05d}",
            "contig_id": f"contig_{idx % 3}",
            "start": idx * 1000,
            "end": idx * 1000 + 500,
            "strand": "-" if idx % 2 else "+",
        }
        for idx in range(20)
    ]
    return pd.DataFrame(rows)
@pytest.fixture
def synthetic_protein():
    """Protein annotation table for the 20 synthetic genes.

    Returns a DataFrame with columns ``gene_id``, ``protein_length``
    (integer in [50, 1000)) and a fixed ``composition_summary`` string.

    Fix: the original called the *unseeded* global ``np.random.randint``,
    so the lengths changed on every run and depended on whether another
    fixture had already seeded the global RNG. A locally seeded Generator
    makes the fixture reproducible and order-independent.
    """
    rng = np.random.default_rng(42)
    records = [
        {
            "gene_id": f"g{i:05d}",
            # int() keeps the column a plain integer dtype, as before.
            "protein_length": int(rng.integers(50, 1000)),
            "composition_summary": "L:9.0%, A:8.0%, G:7.0%, S:6.0%, V:5.0%",
        }
        for i in range(20)
    ]
    return pd.DataFrame(records)
@pytest.fixture
def synthetic_contig_index():
    """Mapping of the 3 synthetic contig ids to their lengths in bp."""
    sizes = (50000, 80000, 30000)
    return {f"contig_{idx}": size for idx, size in enumerate(sizes)}
@pytest.fixture
def empty_state():
    """A brand-new AppState with nothing selected."""
    state = AppState()
    return state
@pytest.fixture
def populated_state():
    """AppState pre-loaded with a selected line, two backpack genes and one achievement."""
    app_state = AppState()
    app_state.backpack_genes = ["g00001", "g00002"]
    app_state.achievements = {"Explorer"}
    app_state.selected_line = "Line_0_India"
    return app_state
@pytest.fixture
def synthetic_data(synthetic_pav, synthetic_gff, synthetic_protein, synthetic_contig_index):
    """Complete synthetic data dict matching the app's DATA format.

    Runs the real precompute pipeline over the toy fixtures, except for
    the embedding, which is hand-rolled (10 lines are too few for a
    meaningful UMAP).

    Fixes: removed the never-used ``compute_line_embedding`` import, and
    replaced the unseeded global ``np.random.randn`` with a locally
    seeded Generator so embedding coordinates are reproducible and do
    not perturb NumPy's global RNG state.
    """
    from src.precompute import (
        compute_gene_frequency, compute_line_stats,
        compute_similarity_topk, compute_hotspot_bins,
        compute_cluster_markers,
    )
    gene_freq = compute_gene_frequency(synthetic_pav)
    line_stats = compute_line_stats(synthetic_pav)
    # Simple deterministic embedding without UMAP (dataset too small).
    rng = np.random.default_rng(0)
    line_ids = list(synthetic_pav.columns)
    embedding = pd.DataFrame({
        "line_id": line_ids,
        "umap_x": rng.standard_normal(len(line_ids)),
        "umap_y": rng.standard_normal(len(line_ids)),
        "cluster_id": [i % 3 for i in range(len(line_ids))],
    })
    similarity = compute_similarity_topk(synthetic_pav, k=5)
    hotspots = compute_hotspot_bins(synthetic_gff, gene_freq, synthetic_contig_index)
    markers = compute_cluster_markers(synthetic_pav, embedding)
    return {
        "pav": synthetic_pav,
        "gene_freq": gene_freq,
        "line_stats": line_stats,
        "embedding": embedding,
        "similarity": similarity,
        "gff_index": synthetic_gff,
        "protein": synthetic_protein,
        "hotspots": hotspots,
        "markers": markers,
    }
@pytest.fixture
def precomputed_dir():
    """Path to the project's ``precomputed`` directory, or None when absent."""
    candidate = Path(__file__).resolve().parent.parent / "precomputed"
    return candidate if candidate.exists() else None