"""
Unit tests for drift detection — detect_drift() only.
No model loading, no IO, no telemetry.
"""

import sys
from pathlib import Path

import numpy as np
import pytest

sys.path.insert(0, str(Path(__file__).parent.parent.parent / "eval"))

from drift import ALPHA, MIN_CURRENT_SAMPLES, MetricDrift, detect_drift

# Metric columns that _scores() fills in and that the tests below expect
# detect_drift() to report on.
METRICS = [
    "faithfulness",
    "answer_relevancy",
    "pii_leakage",
    "token_budget",
    "chain_terminology",
]


def _scores(n: int, **col_values: list[float]) -> dict[str, list[float]]:
    """Build a scores dict with one list of values for every metric in METRICS.

    Args:
        n: Length of the default column generated for each metric that has no
            explicit override.
        **col_values: Per-metric overrides keyed by metric name. An override
            is used as given; its length is not forced to ``n``.

    Returns:
        A dict mapping each name in METRICS to a copy of its override, or to
        ``[0.9] * n`` when no override was supplied.

    Raises:
        TypeError: If a keyword is not one of METRICS. Without this check a
            misspelled metric name would be dropped silently and the test
            would run against all-default data.
    """
    unknown = sorted(set(col_values) - set(METRICS))
    if unknown:
        raise TypeError(f"_scores() got unknown metric name(s): {', '.join(unknown)}")
    # Copy overrides so two dicts built from the same list never share it.
    return {
        metric: list(col_values[metric]) if metric in col_values else [0.9] * n
        for metric in METRICS
    }


class TestDetectDrift:
    """Tests for detect_drift(current, reference[, alpha=...]).

    Every test builds in-memory score dicts and inspects the returned
    MetricDrift entries; nothing here touches models, files or telemetry.
    """

    @staticmethod
    def _faithfulness(results: "list[MetricDrift]") -> "MetricDrift":
        """Return the first result for the faithfulness metric.

        Fails with a readable assertion message, rather than a bare
        StopIteration, when detect_drift() produced no such entry.
        """
        matches = [r for r in results if r.metric == "faithfulness"]
        assert matches, "detect_drift() returned no result for 'faithfulness'"
        return matches[0]

    def test_identical_distributions_no_drift(self) -> None:
        """The same sample used as reference and current is not flagged."""
        rng = np.random.default_rng(42)
        scores = rng.uniform(0.5, 1.0, 50).tolist()
        ref = _scores(50, faithfulness=scores)
        cur = _scores(50, faithfulness=scores)
        faith = self._faithfulness(detect_drift(cur, ref))
        assert faith.drifted is False

    def test_shifted_distribution_detected(self) -> None:
        """A wholesale shift from 0.9 to 0.1 is flagged with p_value < ALPHA."""
        ref = _scores(50, faithfulness=[0.9] * 50)
        cur = _scores(50, faithfulness=[0.1] * 50)
        faith = self._faithfulness(detect_drift(cur, ref))
        assert faith.drifted is True
        assert faith.p_value < ALPHA

    def test_below_min_samples_excluded(self) -> None:
        """One sample short of MIN_CURRENT_SAMPLES yields no results at all."""
        ref = _scores(50)
        cur = _scores(MIN_CURRENT_SAMPLES - 1)
        assert detect_drift(cur, ref) == []

    def test_exactly_min_samples_included(self) -> None:
        """Exactly MIN_CURRENT_SAMPLES samples yields one result per metric."""
        ref = _scores(50)
        cur = _scores(MIN_CURRENT_SAMPLES)
        assert len(detect_drift(cur, ref)) == len(METRICS)

    def test_ks_statistic_in_range(self) -> None:
        """The reported ks_statistic lies within [0, 1]."""
        ref = _scores(50, faithfulness=[0.9] * 50)
        cur = _scores(50, faithfulness=[0.1] * 50)
        faith = self._faithfulness(detect_drift(cur, ref))
        assert 0.0 <= faith.ks_statistic <= 1.0

    def test_means_computed_correctly(self) -> None:
        """ref_mean and cur_mean reflect the respective input columns."""
        # Never go below the inclusion threshold, otherwise there is no
        # faithfulness entry to inspect.
        n = max(10, MIN_CURRENT_SAMPLES)
        ref = _scores(n, faithfulness=[0.8] * n)
        cur = _scores(n, faithfulness=[0.4] * n)
        faith = self._faithfulness(detect_drift(cur, ref))
        assert faith.ref_mean == pytest.approx(0.8, abs=1e-3)
        assert faith.cur_mean == pytest.approx(0.4, abs=1e-3)

    def test_all_metrics_returned(self) -> None:
        """Every metric present in both inputs appears in the results."""
        ref = _scores(30)
        cur = _scores(30)
        result_names = {r.metric for r in detect_drift(cur, ref)}
        assert result_names == set(METRICS)

    def test_result_is_metric_drift_dataclass(self) -> None:
        """Each result is a MetricDrift with plain bool/float fields."""
        n = max(20, MIN_CURRENT_SAMPLES)
        results = detect_drift(_scores(n), _scores(n))
        # Guard against the loop passing vacuously on an empty result list.
        assert results, "expected at least one MetricDrift result"
        for r in results:
            assert isinstance(r, MetricDrift)
            assert isinstance(r.drifted, bool)
            assert isinstance(r.ks_statistic, float)
            assert isinstance(r.p_value, float)

    def test_custom_alpha_respected(self) -> None:
        """The alpha argument, not just the data, decides the drifted flag."""
        # Overlapping samples offset by 0.1: expected to give a moderate
        # p-value, so a strict and a lenient alpha must disagree. The bounds
        # assertion below verifies that expectation instead of assuming it.
        ref = _scores(50, faithfulness=np.linspace(0.5, 1.0, 50).tolist())
        cur = _scores(50, faithfulness=np.linspace(0.4, 0.9, 50).tolist())
        strict = self._faithfulness(detect_drift(cur, ref, alpha=0.001))
        lenient = self._faithfulness(detect_drift(cur, ref, alpha=0.999))
        # Same data, so the p-value itself must not depend on alpha.
        assert strict.p_value == pytest.approx(lenient.p_value)
        assert 0.001 < strict.p_value < 0.999
        assert strict.drifted is False
        assert lenient.drifted is True

    def test_missing_metric_column_skipped(self) -> None:
        """Only metrics present in the inputs are reported."""
        ref: dict[str, list[float]] = {"faithfulness": [0.9] * 20}
        cur: dict[str, list[float]] = {"faithfulness": [0.4] * 20}
        results = detect_drift(cur, ref)
        assert all(r.metric == "faithfulness" for r in results)
        assert len(results) == 1

    def test_empty_reference_skipped(self) -> None:
        """A metric whose reference column is empty produces no result."""
        ref: dict[str, list[float]] = {"faithfulness": []}
        cur: dict[str, list[float]] = {"faithfulness": [0.4] * 20}
        assert detect_drift(cur, ref) == []

    def test_sample_counts_in_result(self) -> None:
        """ref_n and cur_n report the sizes of the respective inputs."""
        # Keep the two sizes different so swapped counts would be caught, and
        # keep the current sample at or above the inclusion threshold.
        cur_n = max(10, MIN_CURRENT_SAMPLES)
        ref_n = cur_n + 20
        results = detect_drift(_scores(cur_n), _scores(ref_n))
        # Guard against the loop passing vacuously on an empty result list.
        assert len(results) == len(METRICS)
        for r in results:
            assert r.ref_n == ref_n
            assert r.cur_n == cur_n