"""
Tests for Step 4 — Context Summarization Between Agents.

Covers:
  • summarize_pipeline_context() output < 5KB JSON for large datasets
  • All required fields present for EvalAgent (label_noise_estimate, ece, per_class_f1)
  • Secrets (hf_token) NOT present in summary
  • label_distribution truncated to top-10 when dataset has many classes
  • epoch_metrics NOT included in summary (replaced by loss_history_tail of last 5)
  • loss_history_tail is the last ≤5 entries of epoch_metrics
  • Training skipped / None → summary still valid (no KeyError)
  • Empty context → summary returns valid empty/zero fields
"""
from __future__ import annotations

import json

import pytest

from agents.base import AgentContext
from agents.pipeline import summarize_pipeline_context


# ── Fixtures ──────────────────────────────────────────────────────────────────

def _make_context(
    num_classes: int = 3,
    epoch_count: int = 5,
    label_noise: float = 0.05,
    hf_token: str | None = "secret-token",
) -> AgentContext:
    """Build a fully populated ``AgentContext`` for the summarization tests.

    Args:
        num_classes: Number of synthetic classes, named ``class_0`` onwards.
            Class ``i`` is given a count of ``100 + 10 * i``, so higher
            indices are more frequent.
        epoch_count: Number of entries generated for ``epoch_metrics``.
        label_noise: Value stored as ``data_profile["label_noise_estimate"]``;
            also scales ``label_noise_count`` against the 1000 fixture rows.
        hf_token: Token passed to the ``AgentContext`` constructor.

    Returns:
        A context with ``task_spec``, ``data_profile``, ``training_result``
        and ``eval_result`` all assigned.
    """
    class_names = [f"class_{idx}" for idx in range(num_classes)]
    label_dist = {name: 100 + idx * 10 for idx, name in enumerate(class_names)}
    per_class_f1 = {
        name: round(0.85 + idx * 0.01, 2) for idx, name in enumerate(class_names)
    }

    # One record per epoch: train loss and eval loss both decrease linearly.
    epoch_metrics = []
    for epoch in range(epoch_count):
        epoch_metrics.append(
            {
                "epoch": epoch,
                "step": epoch,
                "loss": 1.0 - epoch * 0.1,
                "eval_loss": 1.1 - epoch * 0.09,
            }
        )

    task_spec = {
        "task_type": "text_classification",
        "num_labels": num_classes,
        # Fresh list so task_spec and training_result never share an object.
        "label_names": list(class_names),
        "input_column": "text",
        "label_column": "label",
    }
    data_profile = {
        "num_rows": 1000,
        "num_classes": num_classes,
        "label_distribution": label_dist,
        "label_noise_estimate": label_noise,
        "label_noise_count": int(1000 * label_noise),
        "text_quality_score": 0.95,
        "issues": [],
    }
    training_result = {
        "base_model": "bert-base-uncased",
        "training_approach": "lora",
        "device": "cuda",
        "num_epochs_completed": epoch_count,
        "final_train_loss": 0.35,
        "training_time_seconds": 120,
        "accuracy": 0.88,
        "f1": 0.87,
        "precision": 0.86,
        "recall": 0.88,
        "ece": 0.04,
        "per_class_f1": per_class_f1,
        "num_labels": num_classes,
        "label_names": list(class_names),
        "train_samples": 800,
        "eval_samples": 200,
        "warnings": [],
        "epoch_metrics": epoch_metrics,
        "model_path": "/runs/test_run/model",
    }
    eval_result = {
        "evaluation_grade": "B",
        "summary": "Good performance.",
        "concerns": [],
        "next_steps": ["Add more training data."],
    }

    ctx = AgentContext(
        run_id="test_run",
        user_intent="classify support tickets",
        hf_token=hf_token,
    )
    ctx.task_spec = task_spec
    ctx.data_profile = data_profile
    ctx.training_result = training_result
    ctx.eval_result = eval_result
    return ctx


# ── Size check ────────────────────────────────────────────────────────────────

class TestSummarySize:
    """The JSON-encoded summary must stay under 5,000 bytes at every dataset size."""

    @staticmethod
    def _encoded_size(num_classes: int, epoch_count: int) -> int:
        """Return the byte length of the JSON-serialized summary for a fixture."""
        context = _make_context(num_classes=num_classes, epoch_count=epoch_count)
        payload = json.dumps(summarize_pipeline_context(context))
        return len(payload.encode())

    def test_summary_under_5kb_for_3_class(self):
        """Small dataset: 3 classes, 10 epochs."""
        size = self._encoded_size(num_classes=3, epoch_count=10)
        assert size < 5_000, f"Summary is {size} bytes (limit: 5000)"

    def test_summary_under_5kb_for_50_class(self):
        """50-class dataset: label_distribution must be truncated to stay compact."""
        size = self._encoded_size(num_classes=50, epoch_count=20)
        assert size < 5_000, f"Summary is {size} bytes (limit: 5000)"

    def test_summary_under_5kb_for_100_class(self):
        """100-class dataset: extreme case still stays compact."""
        size = self._encoded_size(num_classes=100, epoch_count=30)
        assert size < 5_000, f"Summary is {size} bytes (limit: 5000)"


# ── Required fields for EvalAgent ────────────────────────────────────────────

class TestRequiredFields:
    """Fields the module docstring lists as required for EvalAgent are carried over."""

    def test_label_noise_estimate_present(self):
        """The fixture's noise estimate is readable from the data_profile section."""
        profile = summarize_pipeline_context(_make_context(label_noise=0.12))["data_profile"]
        assert profile["label_noise_estimate"] == pytest.approx(0.12)

    def test_ece_present(self):
        """The fixture's ECE value (0.04) is readable from the training_result section."""
        training = summarize_pipeline_context(_make_context())["training_result"]
        assert training["ece"] == pytest.approx(0.04)

    def test_per_class_f1_present(self):
        """per_class_f1 is a dict that still contains the first class."""
        training = summarize_pipeline_context(_make_context(num_classes=3))["training_result"]
        per_class = training["per_class_f1"]
        assert isinstance(per_class, dict)
        assert "class_0" in per_class

    def test_eval_grade_present(self):
        """The evaluation grade set by the fixture is passed through."""
        evaluation = summarize_pipeline_context(_make_context())["eval_result"]
        assert evaluation["evaluation_grade"] == "B"

    def test_all_top_level_keys_present(self):
        """All four pipeline sections exist at the top level of the summary."""
        summary = summarize_pipeline_context(_make_context())
        expected_sections = ("task_spec", "data_profile", "training_result", "eval_result")
        for section in expected_sections:
            assert section in summary, f"Missing top-level key: {section}"


# ── Secrets exclusion ─────────────────────────────────────────────────────────

class TestSecretsExclusion:
    """Secrets and filesystem paths must not appear in the summary."""

    def test_hf_token_not_in_summary(self):
        """Neither the token value nor the ``hf_token`` field name is serialized."""
        token = "hf_super_secret_token_12345"
        serialized = json.dumps(summarize_pipeline_context(_make_context(hf_token=token)))
        assert token not in serialized
        assert "hf_token" not in serialized

    def test_model_path_not_in_summary(self):
        """``model_path`` is dropped from the training_result section."""
        training = summarize_pipeline_context(_make_context())["training_result"]
        assert "model_path" not in training

    def test_dataset_path_not_in_summary(self):
        """A dataset path set on the context does not show up anywhere in the JSON."""
        context = _make_context()
        context.dataset_path = "/private/uploads/user_data.csv"
        serialized = json.dumps(summarize_pipeline_context(context))
        assert "/private/uploads" not in serialized


# ── label_distribution truncation ────────────────────────────────────────────

class TestLabelDistTruncation:
    """``label_distribution`` keeps at most ten classes plus one overflow bucket."""

    @staticmethod
    def _summarized_distribution(num_classes: int) -> dict:
        """Return the summarized ``label_distribution`` for a fixture of that size."""
        ctx = _make_context(num_classes=num_classes)
        return summarize_pipeline_context(ctx)["data_profile"]["label_distribution"]

    def test_3_classes_not_truncated(self):
        """All 3 classes are present (not truncated)."""
        assert len(self._summarized_distribution(3)) == 3

    def test_10_classes_not_truncated(self):
        """Exactly ten classes is still within the limit."""
        assert len(self._summarized_distribution(10)) == 10

    def test_11_classes_truncated_to_11_entries(self):
        """11th entry is 'N more classes' bucket — total dict len is 11 (10 + 1 bucket)."""
        assert len(self._summarized_distribution(11)) == 11  # 10 real + 1 "N more" bucket

    def test_50_classes_truncated(self):
        """Large class counts collapse to the same 10 + 1 shape."""
        assert len(self._summarized_distribution(50)) == 11  # 10 + 1 bucket

    def test_truncated_bucket_has_remaining_class_count(self):
        """The overflow bucket's key mentions how many classes were folded away."""
        dist = self._summarized_distribution(15)
        bucket_key = [k for k in dist if "more" in k]
        assert len(bucket_key) == 1
        assert "5" in bucket_key[0]  # "… 5 more classes"

    def test_top_classes_are_most_frequent(self):
        """Top-10 should be the most frequent classes, not arbitrary."""
        dist = self._summarized_distribution(20)
        # The fixture gives class_i a count of 100 + i*10, so class_10..class_19
        # are the ten most frequent and class_0..class_9 the ten least frequent.
        most_frequent = {f"class_{i}" for i in range(10, 20)}
        least_frequent = {f"class_{i}" for i in range(10)}

        missing = most_frequent - set(dist)
        assert not missing, f"Most frequent classes dropped: {sorted(missing)}"

        # Checking only that class_19 is present would let a partially wrong
        # selection through; the rare classes must have been folded away.
        leaked = least_frequent & set(dist)
        assert not leaked, f"Least frequent classes kept: {sorted(leaked)}"


# ── epoch_metrics exclusion + loss_history_tail ───────────────────────────────

class TestEpochMetrics:
    """``epoch_metrics`` is replaced by a short ``loss_history_tail``."""

    @staticmethod
    def _training_summary(epoch_count: int) -> dict:
        """Return the summarized training_result for a run of ``epoch_count`` epochs."""
        context = _make_context(epoch_count=epoch_count)
        return summarize_pipeline_context(context)["training_result"]

    def test_epoch_metrics_not_in_summary(self):
        """The full per-epoch history is not copied into the summary."""
        training = self._training_summary(20)
        assert "epoch_metrics" not in training

    def test_loss_history_tail_has_at_most_5_entries(self):
        """A 20-epoch run yields a tail of no more than five records."""
        history_tail = self._training_summary(20)["loss_history_tail"]
        assert len(history_tail) <= 5

    def test_loss_history_tail_is_last_5(self):
        """For ten epochs (0..9) the tail holds epochs 5 through 9, in order."""
        history_tail = self._training_summary(10)["loss_history_tail"]
        tail_epochs = [record["epoch"] for record in history_tail]
        assert tail_epochs == list(range(5, 10))

    def test_short_training_loss_tail_is_full(self):
        """Only 3 epochs → loss_history_tail has 3 entries (not padded)."""
        history_tail = self._training_summary(3)["loss_history_tail"]
        assert len(history_tail) == 3

    def test_empty_epoch_metrics_gives_empty_tail(self):
        """Zero epochs → the tail is an empty list."""
        training = self._training_summary(0)
        assert training["loss_history_tail"] == []


# ── Edge cases ────────────────────────────────────────────────────────────────

class TestEdgeCases:
    """Sparse or partially filled contexts must still produce a usable summary."""

    def test_empty_context_no_crash(self):
        """A context with only run_id and user_intent summarizes without raising."""
        ctx = AgentContext(run_id="r", user_intent="test")
        # All context dicts are empty — should not raise
        summary = summarize_pipeline_context(ctx)
        assert "task_spec" in summary
        assert "data_profile" in summary
        assert "training_result" in summary

    def test_no_eval_result_gives_empty_dict(self):
        """An empty eval_result still yields a dict-valued ``eval_result`` section."""
        ctx = _make_context()
        ctx.eval_result = {}
        summary = summarize_pipeline_context(ctx)
        eval_summary = summary["eval_result"]
        # The earlier check, `== {} or is not None`, was logically just
        # `is not None`, so it could never fail on a wrong type. Requiring a
        # dict is strictly stronger and holds whether the summarizer returns
        # `{}` or a dict of defaults.
        # NOTE(review): tighten to `== {}` once that contract is confirmed.
        assert isinstance(eval_summary, dict), (
            f"eval_result should be a dict, got {type(eval_summary).__name__}"
        )

    def test_training_skipped_context_no_crash(self):
        """A skipped training run (no metrics at all) still exposes empty warnings."""
        ctx = AgentContext(run_id="r", user_intent="test")
        ctx.training_result = {"status": "skipped", "reason": "no GPU"}
        ctx.data_profile = {"num_rows": 100, "label_distribution": {}}
        summary = summarize_pipeline_context(ctx)
        assert summary["training_result"]["warnings"] == []