"""Tests for src/precompute.py."""

import pytest
import numpy as np
import pandas as pd
from pathlib import Path

from src.precompute import (
    compute_gene_frequency, compute_line_stats,
    compute_similarity_topk, compute_hotspot_bins, compute_cluster_markers,
)


class TestGeneFrequency:
    """Sanity checks on the table returned by ``compute_gene_frequency``."""

    def test_freq_count_range(self, synthetic_pav):
        """``freq_count`` is non-negative and at most the input's column count."""
        df = compute_gene_frequency(synthetic_pav)
        n_columns = synthetic_pav.shape[1]
        # Zero is an accepted count; only negative values are rejected.
        assert (df["freq_count"] >= 0).all()
        assert (df["freq_count"] <= n_columns).all()

    def test_freq_classes(self, synthetic_pav):
        """``core_class`` only takes the values core, shell or cloud."""
        df = compute_gene_frequency(synthetic_pav)
        observed = set(df["core_class"].unique())
        assert observed <= {"core", "shell", "cloud"}

    def test_freq_pct_range(self, synthetic_pav):
        """``freq_pct`` lies in the closed interval [0, 100]."""
        df = compute_gene_frequency(synthetic_pav)
        assert (df["freq_pct"] >= 0).all()
        assert (df["freq_pct"] <= 100).all()

    def test_gene_id_column(self, synthetic_pav):
        """The result has a ``gene_id`` column and one row per input row."""
        df = compute_gene_frequency(synthetic_pav)
        assert "gene_id" in df.columns
        assert len(df) == len(synthetic_pav)


class TestLineStats:
    """Shape and schema checks for the output of ``compute_line_stats``."""

    def test_line_count(self, synthetic_pav):
        """The result has one row per column of ``synthetic_pav``."""
        stats = compute_line_stats(synthetic_pav)
        n_columns = synthetic_pav.shape[1]
        assert len(stats) == n_columns

    def test_columns(self, synthetic_pav):
        """Every expected column name is present in the result."""
        stats = compute_line_stats(synthetic_pav)
        required = (
            "line_id",
            "country",
            "genes_present_count",
            "unique_genes_count",
        )
        for column in required:
            assert column in stats.columns


class TestSimilarity:
    """Checks on the neighbour table returned by ``compute_similarity_topk``."""

    def test_jaccard_range(self, synthetic_pav):
        """Every ``jaccard_score`` lies in the closed interval [0, 1]."""
        topk = compute_similarity_topk(synthetic_pav, k=5)
        scores = topk["jaccard_score"]
        assert (scores >= 0).all()
        assert (scores <= 1).all()

    def test_jaccard_symmetry(self, synthetic_pav):
        """Where both directions of a pair are listed, their scores agree."""
        topk = compute_similarity_topk(synthetic_pav, k=5)
        # Spot-check only the first five rows.
        for _, pair in topk.head(5).iterrows():
            is_mirror = (
                (topk["line_id"] == pair["neighbor_line_id"])
                & (topk["neighbor_line_id"] == pair["line_id"])
            )
            mirrored = topk[is_mirror]
            # The reverse direction may be absent from the table; in that
            # case there is nothing to compare for this row.
            if len(mirrored) == 0:
                continue
            delta = mirrored.iloc[0]["jaccard_score"] - pair["jaccard_score"]
            assert abs(delta) < 0.001


class TestHotspots:
    """Checks on the bins returned by ``compute_hotspot_bins``."""

    def test_bins_valid(self, synthetic_gff, synthetic_pav, synthetic_contig_index):
        """Each bin has ``bin_start < bin_end`` and a non-negative ``total_genes``."""
        frequencies = compute_gene_frequency(synthetic_pav)
        bins = compute_hotspot_bins(synthetic_gff, frequencies, synthetic_contig_index)
        # An empty result is accepted as-is; the checks below apply to rows only.
        if len(bins) == 0:
            return
        assert (bins["bin_start"] < bins["bin_end"]).all()
        assert (bins["total_genes"] >= 0).all()


class TestClusterMarkers:
    """Checks on the marker table returned by ``compute_cluster_markers``."""

    def test_marker_count(self, synthetic_pav):
        """No cluster is reported with more than ``top_n`` marker rows."""
        top_n = 5
        line_ids = list(synthetic_pav.columns)
        n_lines = len(line_ids)
        # Seeded generator: the embedding coordinates are identical on every
        # run, so any failure can be reproduced.
        rng = np.random.default_rng(0)
        embedding = pd.DataFrame({
            "line_id": line_ids,
            "umap_x": rng.standard_normal(n_lines),
            "umap_y": rng.standard_normal(n_lines),
            # Round-robin assignment of lines to three clusters.
            "cluster_id": [i % 3 for i in range(n_lines)],
        })
        df = compute_cluster_markers(synthetic_pav, embedding, top_n=top_n)
        # Count rows per cluster in one pass; an empty result passes vacuously.
        markers_per_cluster = df["cluster_id"].value_counts()
        oversized = markers_per_cluster[markers_per_cluster > top_n]
        assert oversized.empty, (
            f"Clusters with more than {top_n} markers: {oversized.to_dict()}"
        )


class TestPrecomputedFiles:

    def test_files_exist(self, precomputed_dir):
        if precomputed_dir is None:
            pytest.skip("Precomputed directory not available")
        expected = [
            "pav_gene_frequency.parquet",
            "line_stats.parquet",
            "line_embedding.parquet",
            "line_similarity_topk.parquet",
            "gff_gene_index.parquet",
            "protein_index.parquet",
            "genome_contig_index.json",
            "hotspot_bins.parquet",
            "cluster_markers.parquet",
        ]
        for f in expected:
            assert (precomputed_dir / f).exists(), f"Missing: {f}"