"""Tests for PermutedJudge and Jury — aggregation, quorum, sidecar."""

from __future__ import annotations

import json
from unittest.mock import AsyncMock

import pytest

from agent_bench.agents.orchestrator import AgentResponse, SourceReference
from agent_bench.core.provider import LLMProvider
from agent_bench.core.types import CompletionResponse, TokenUsage
from agent_bench.evaluation.harness import GoldenQuestion
from agent_bench.evaluation.judges.base import Rubric
from agent_bench.evaluation.judges.relevance import RelevanceJudge


def _mk_response(content: str) -> CompletionResponse:
    return CompletionResponse(
        content=content,
        tool_calls=[],
        usage=TokenUsage(input_tokens=10, output_tokens=10, estimated_cost_usd=0.0001),
        provider="mock",
        model="m",
        latency_ms=1.0,
    )


def _vj(score) -> str:
    return json.dumps({"reasoning": "r", "evidence_quotes": [], "score": score})


def _item(item_id: str = "i1") -> GoldenQuestion:
    return GoldenQuestion(
        id=item_id,
        question="?",
        expected_answer_keywords=[],
        expected_sources=[],
        category="retrieval",
        difficulty="easy",
        requires_calculator=False,
    )


def _output(answer: str = "A.") -> AgentResponse:
    return AgentResponse(
        answer=answer,
        sources=[SourceReference(source="x.md")],
        iterations=1,
        usage=TokenUsage(input_tokens=0, output_tokens=0, estimated_cost_usd=0),
        latency_ms=0,
    )


def _relevance_judge_with_responses(responses: list[str]) -> RelevanceJudge:
    rubric = Rubric.from_markdown_file("agent_bench/evaluation/rubrics/relevance.md")
    provider = AsyncMock(spec=LLMProvider)
    provider.complete.side_effect = [_mk_response(r) for r in responses]
    return RelevanceJudge(judge_provider=provider, rubric=rubric, model_id="m")


class TestPermutedJudge:
    @pytest.mark.asyncio
    async def test_runs_n_permutations_and_means(self, tmp_path):
        from agent_bench.evaluation.variance.rubric_permute import rubric_permute

        # Two seeds produce two scores: 1 and 2; mean=1.5; ties→lower → 1
        judge = _relevance_judge_with_responses([_vj(1), _vj(2)])
        permuted = rubric_permute(
            judge, n=2, seeds=[1, 2], sidecar_path=tmp_path / "side.jsonl"
        )
        result = await permuted.score(_item(), _output())
        assert result.score == 1
        assert result.judge_id == "m_relevance_perm2"
        assert result.prompt_seed == 0

    @pytest.mark.asyncio
    async def test_any_abstain_propagates_unknown(self, tmp_path):
        from agent_bench.evaluation.variance.rubric_permute import rubric_permute

        judge = _relevance_judge_with_responses([_vj(1), _vj("Unknown")])
        permuted = rubric_permute(
            judge, n=2, seeds=[1, 2], sidecar_path=tmp_path / "side.jsonl"
        )
        result = await permuted.score(_item(), _output())
        assert result.score == "Unknown"
        assert result.abstained

    @pytest.mark.asyncio
    async def test_writes_per_permutation_sidecar(self, tmp_path):
        from agent_bench.evaluation.variance.rubric_permute import rubric_permute

        sidecar = tmp_path / "perm_members.jsonl"
        judge = _relevance_judge_with_responses([_vj(2), _vj(2)])
        permuted = rubric_permute(judge, n=2, seeds=[5, 7], sidecar_path=sidecar)
        await permuted.score(_item(), _output())

        lines = sidecar.read_text().strip().split("\n")
        assert len(lines) == 2
        records = [json.loads(line) for line in lines]
        assert {r["prompt_seed"] for r in records} == {5, 7}


class TestJury:
    @pytest.mark.asyncio
    async def test_mean_aggregation_two_judges(self, tmp_path):
        from agent_bench.evaluation.variance.jury import jury

        j1 = _relevance_judge_with_responses([_vj(2)])
        j2 = _relevance_judge_with_responses([_vj(2)])
        j1.judge_id = "claude-haiku_relevance"
        j2.judge_id = "gpt-4o-mini_relevance"

        ju = jury(
            judges=[j1, j2], aggregation="mean", sidecar_path=tmp_path / "jury.jsonl"
        )
        result = await ju.score(_item(), _output())
        assert result.score == 2
        assert result.judge_id == "jury_v1_mean"

    @pytest.mark.asyncio
    async def test_strict_quorum_default_abstains_on_one_failure(self, tmp_path):
        from agent_bench.evaluation.variance.jury import jury

        j1 = _relevance_judge_with_responses([_vj(1)])
        j1.judge_id = "claude-haiku_relevance"
        # Both attempts return garbage → abstain via schema-parse-after-retry
        j2 = _relevance_judge_with_responses(["garbage", "garbage"])
        j2.judge_id = "gpt-4o-mini_relevance"

        ju = jury(
            judges=[j1, j2], aggregation="mean", sidecar_path=tmp_path / "jury.jsonl"
        )
        result = await ju.score(_item(), _output())
        assert result.score == "Unknown"
        assert "jury_below_quorum" in result.reasoning
        assert "1/2" in result.reasoning

    @pytest.mark.asyncio
    async def test_sidecar_captures_both_members_including_abstain(self, tmp_path):
        from agent_bench.evaluation.variance.jury import jury

        j1 = _relevance_judge_with_responses([_vj(1)])
        j1.judge_id = "claude-haiku_relevance"
        j2 = _relevance_judge_with_responses(["garbage", "garbage"])
        j2.judge_id = "gpt-4o-mini_relevance"

        sidecar = tmp_path / "jury.jsonl"
        ju = jury(judges=[j1, j2], aggregation="mean", sidecar_path=sidecar)
        await ju.score(_item(), _output())

        records = [
            json.loads(line) for line in sidecar.read_text().strip().split("\n")
        ]
        assert len(records) == 2
        scores = [r["score"] for r in records]
        assert 1 in scores
        assert "Unknown" in scores

    @pytest.mark.asyncio
    async def test_kappa_weighted_requires_weights(self, tmp_path):
        from agent_bench.evaluation.variance.jury import jury

        j1 = _relevance_judge_with_responses([_vj(2)])
        with pytest.raises(ValueError, match="weights"):
            jury(judges=[j1], aggregation="kappa_weighted")

    @pytest.mark.asyncio
    async def test_kappa_weighted_with_equal_weights_matches_mean(self, tmp_path):
        """Regression for ties-to-lower divergence between mean and
        kappa_weighted paths. Two judges score [1, 2] with equal weights;
        weighted mean == 1.5. The mean path returns 1 (ties-to-lower); the
        kappa_weighted path must also return 1 — banker's rounding would
        return 2 and silently violate the policy.
        """
        from agent_bench.evaluation.variance.jury import jury

        j1 = _relevance_judge_with_responses([_vj(1)])
        j1.judge_id = "claude-haiku_relevance"
        j2 = _relevance_judge_with_responses([_vj(2)])
        j2.judge_id = "gpt-4o-mini_relevance"

        weights = {"claude-haiku_relevance": 1.0, "gpt-4o-mini_relevance": 1.0}
        ju = jury(
            judges=[j1, j2],
            aggregation="kappa_weighted",
            weights=weights,
            sidecar_path=tmp_path / "jury.jsonl",
        )
        result = await ju.score(_item(), _output())
        assert result.score == 1, (
            f"kappa_weighted with equal weights on [1, 2] returned "
            f"{result.score}; expected 1 (ties-to-lower per "
            f"_aggregate_scores policy). banker's-rounding bug?"
        )

    @pytest.mark.asyncio
    async def test_kappa_weighted_reasoning_reports_applied_weights(
        self, tmp_path
    ):
        """The reasoning string must surface the per-member weights actually
        used so the aggregation is auditable from the sidecar alone (no need
        to re-derive weights from the source).
        """
        from agent_bench.evaluation.variance.jury import jury

        j1 = _relevance_judge_with_responses([_vj(2)])
        j1.judge_id = "claude-haiku_relevance"
        j2 = _relevance_judge_with_responses([_vj(2)])
        j2.judge_id = "gpt-4o-mini_relevance"

        weights = {
            "claude-haiku_relevance": 5.0,
            "gpt-4o-mini_relevance": 0.25,
        }
        ju = jury(
            judges=[j1, j2],
            aggregation="kappa_weighted",
            weights=weights,
            sidecar_path=tmp_path / "jury.jsonl",
        )
        result = await ju.score(_item(), _output())
        assert "5.0" in result.reasoning, (
            f"applied weight 5.0 missing from reasoning: {result.reasoning!r}"
        )
        assert "0.25" in result.reasoning, (
            f"applied weight 0.25 missing from reasoning: {result.reasoning!r}"
        )

    @pytest.mark.asyncio
    async def test_kappa_weighted_hard_errors_on_missing_weight(self, tmp_path):
        """v1.1 regression: a member judge_id missing from the weights dict
        is a hard error, not a silent fallback to 1.0. v1's silent fallback
        let an asymmetric weights source amplify the unweighted member —
        see DECISIONS "v1.1 jury rescue" entry for the calibration evidence.
        """
        from agent_bench.evaluation.variance.jury import jury

        j1 = _relevance_judge_with_responses([_vj(1)])
        j1.judge_id = "claude-haiku_relevance"
        j2 = _relevance_judge_with_responses([_vj(1)])
        j2.judge_id = "gpt-4o-mini_relevance"

        weights = {"claude-haiku_relevance": 1.0}  # j2 missing
        ju = jury(
            judges=[j1, j2],
            aggregation="kappa_weighted",
            weights=weights,
            sidecar_path=tmp_path / "jury.jsonl",
        )
        with pytest.raises(ValueError, match="missing entries.*gpt-4o-mini"):
            await ju.score(_item(), _output())

    @pytest.mark.asyncio
    async def test_cancel_on_non_retryable(self, tmp_path):
        """Non-retryable exception in any member must propagate immediately."""
        from agent_bench.evaluation.judges.base import Rubric
        from agent_bench.evaluation.variance.jury import jury

        rubric = Rubric.from_markdown_file(
            "agent_bench/evaluation/rubrics/relevance.md"
        )
        # j1 raises ValueError (caller bug — not in retryable taxonomy)
        provider1 = AsyncMock(spec=LLMProvider)
        provider1.complete.side_effect = ValueError("auth_error")
        j1 = RelevanceJudge(judge_provider=provider1, rubric=rubric, model_id="m1")

        # j2 would succeed if it ran
        provider2 = AsyncMock(spec=LLMProvider)
        provider2.complete.return_value = _mk_response(_vj(1))
        j2 = RelevanceJudge(judge_provider=provider2, rubric=rubric, model_id="m2")

        ju = jury(
            judges=[j1, j2], aggregation="mean", sidecar_path=tmp_path / "jury.jsonl"
        )
        with pytest.raises(ValueError, match="auth_error"):
            await ju.score(_item(), _output())