"""Gradio UI handlers for the official three-metric evaluation."""

from __future__ import annotations

from datetime import UTC, datetime
from pathlib import Path

import gradio as gr

from config import AppConfig
from evaluation.runner import AssistantKind, SafetyEvaluator, format_markdown_report, save_report
from evaluation.suites import METRIC_SUITES, describe_official_plan

# Assistant identifiers offered by the evaluation tab's checkbox group. The same
# list is the allow-list `_normalize_assistants` filters against, and the
# selection it falls back to when nothing is chosen.
ASSISTANT_CHOICES = ["oss", "frontier"]


def _normalize_assistants(assistants: list[str]) -> list[AssistantKind]:
    """Return the recognised assistant names contained in *assistants*.

    A falsy selection (for example an empty list) falls back to a fresh copy
    of every entry in ``ASSISTANT_CHOICES``. Otherwise the supplied names are
    kept in their given order and anything not listed in
    ``ASSISTANT_CHOICES`` is dropped, so the result may be empty when none of
    the supplied names is recognised.
    """
    if not assistants:
        return list(ASSISTANT_CHOICES)

    recognised: list[AssistantKind] = []
    for candidate in assistants:
        if candidate in ASSISTANT_CHOICES:
            recognised.append(candidate)  # type: ignore[arg-type]
    return recognised


def preview_evaluation_plan(
    assistants: list[str],
    benchmark_samples: int,
    seed: int,
    config: AppConfig,
    *,
    max_preview_items: int = 30,
    max_prompt_chars: int = 90,
) -> str:
    """Render a Markdown summary of the evaluation that would be run.

    The summary starts with the text from ``describe_official_plan``, lists
    the selected assistants, the prompt count, the number of model runs
    (prompts x assistants) and the judge model, then previews the first
    prompts of the set built by ``SafetyEvaluator.build_prompt_set``.

    Args:
        assistants: Assistant names chosen in the UI; normalised with
            ``_normalize_assistants``.
        benchmark_samples: Public benchmark samples per metric. Coerced with
            ``int()`` because UI widgets may deliver floats.
        seed: Seed forwarded to ``build_prompt_set``; also coerced to ``int``.
        config: Application configuration; supplies ``judge_model_id`` and is
            handed to ``SafetyEvaluator``.
        max_preview_items: Maximum number of prompts listed individually;
            the remainder is summarised in a single "…and N more" bullet.
        max_prompt_chars: Maximum characters of each prompt shown before it
            is cut and suffixed with ``...``.

    Returns:
        The Markdown document as a single newline-joined string.
    """
    sample_count = int(benchmark_samples)
    selected_assistants = _normalize_assistants(assistants)
    evaluator = SafetyEvaluator(config)
    prompts = evaluator.build_prompt_set(
        benchmark_samples=sample_count,
        seed=int(seed),
    )

    lines = [
        describe_official_plan(sample_count),
        "",
        f"**Assistants:** {', '.join(selected_assistants)}",
        f"**Total prompts:** {len(prompts)}",
        f"**Total model runs:** {len(prompts) * len(selected_assistants)}",
        f"**Judge model:** `{config.judge_model_id}`",
        "",
        "### Prompt preview",
    ]

    # Guard against negative limits, which would otherwise slice from the end.
    item_limit = max(max_preview_items, 0)
    char_limit = max(max_prompt_chars, 0)

    for item in prompts[:item_limit]:
        tag = item.benchmark or "custom"
        # Collapse all whitespace so a multi-line prompt stays on one bullet.
        snippet = " ".join(item.prompt.split())
        # Only mark the text as shortened when something was actually cut.
        if len(snippet) > char_limit:
            snippet = f"{snippet[:char_limit]}..."
        lines.append(
            f"- **{METRIC_SUITES[item.metric].label}** `[{tag}]` "
            f"`{item.id}`: {snippet}"
        )

    hidden = len(prompts) - item_limit
    if hidden > 0:
        lines.append(f"- _…and {hidden} more_")

    return "\n".join(lines)


def run_evaluation_ui(
    assistants: list[str],
    benchmark_samples: int,
    seed: int,
    save_results: bool,
    config: AppConfig,
    progress: gr.Progress | None = None,
):
    """Run the evaluation and stream progress updates for the UI.

    This is a generator. Every yielded value is a 4-tuple
    ``(report_markdown, status_markdown, saved_json_path, fraction_done)``:
    an initial "starting" tuple, one tuple per result produced by
    ``SafetyEvaluator.iter_eval`` (empty report, no path), and a final tuple
    carrying the formatted report, a completion message and, when saving was
    requested, the absolute path of the JSON report.

    Args:
        assistants: Assistant names chosen in the UI; normalised with
            ``_normalize_assistants``.
        benchmark_samples: Public benchmark samples per metric (coerced to
            ``int``).
        seed: Seed forwarded to prompt-set construction and the report
            (coerced to ``int``).
        save_results: When true, write ``results/eval_ui_<UTC stamp>.json``
            and a matching ``.md`` file.
        config: Application configuration handed to ``SafetyEvaluator``.
        progress: Progress tracker to update; a fresh ``gr.Progress()`` is
            used when omitted.

    Raises:
        gr.Error: If no recognised assistant remains after normalisation, or
            if the prompt set is empty. Because this is a generator, these
            checks run when iteration starts, not when the function is
            called.
    """
    selected_assistants = _normalize_assistants(assistants)
    if not selected_assistants:
        raise gr.Error("Select at least one assistant to evaluate.")

    sample_count = int(benchmark_samples)
    seed_value = int(seed)

    evaluator = SafetyEvaluator(config)
    prompts = evaluator.build_prompt_set(
        benchmark_samples=sample_count,
        seed=seed_value,
    )
    if not prompts:
        raise gr.Error("No evaluation prompts were loaded.")

    total_steps = len(prompts) * len(selected_assistants)
    # Compare against None explicitly. A truthiness test on the tracker would
    # consult its sized/iterable protocol instead of answering "was a tracker
    # supplied?". NOTE(review): gr.Progress appears to define __len__ over its
    # tracked iterables - confirm for the pinned Gradio version.
    tracker = progress if progress is not None else gr.Progress()
    results = []
    model_ids: dict[AssistantKind, str] = {}

    yield "", "_Starting evaluation…_", None, 0.0

    scored = enumerate(evaluator.iter_eval(prompts, selected_assistants), start=1)
    for completed, (kind, item, result) in scored:
        message = f"{METRIC_SUITES[item.metric].label} · {kind} · {item.id}"
        model_ids[kind] = result.model_id
        results.append(result)
        pct = completed / total_steps
        tracker(pct, desc=message, total=total_steps)
        yield "", f"**Progress:** {completed}/{total_steps} — `{message}`", None, pct

    report = evaluator.build_report(
        results,
        model_ids,
        benchmark_samples=sample_count,
        seed=seed_value,
    )
    markdown = format_markdown_report(report)
    saved_path: str | None = None
    if save_results:
        stamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
        results_dir = Path("results")
        # Ensure the target directory exists; nothing in this function
        # guarantees it does before the files are written.
        results_dir.mkdir(parents=True, exist_ok=True)
        json_path = results_dir / f"eval_ui_{stamp}.json"
        md_path = results_dir / f"eval_ui_{stamp}.md"
        save_report(report, json_path)
        md_path.write_text(markdown, encoding="utf-8")
        saved_path = str(json_path.resolve())

    status = (
        f"Completed **{len(report.results)}** scored responses across "
        f"**{len(selected_assistants)}** assistants."
    )
    if saved_path:
        status += f" Saved to `{saved_path}`."

    yield markdown, status, saved_path, 1.0


def build_evaluation_tab(config: AppConfig) -> None:
    """Lay out the "Evaluation" tab and connect its two buttons.

    The tab opens with an introduction listing every suite in
    ``METRIC_SUITES`` and the judge model from *config*. Below it, a narrow
    column holds the run settings (assistants, benchmark sample count, seed,
    save toggle) and the "Preview plan" / "Run evaluation" buttons; a wider
    column shows the status line, a read-only progress slider, the report
    and a JSON download slot.

    "Preview plan" renders ``preview_evaluation_plan`` and resets the
    download and progress outputs. "Run evaluation" streams the tuples
    yielded by ``run_evaluation_ui`` into the same four outputs.

    Presumably called while a Gradio layout is being built - confirm at the
    call site.
    """
    suite_bullets = "\n".join(
        [
            f"- **{suite.label}:** 10 custom prompts + `{suite.benchmark}`"
            for suite in METRIC_SUITES.values()
        ]
    )

    with gr.Tab("Evaluation"):
        heading = (
            "### Official three-metric evaluation\n"
            "Compare OSS vs frontier on exactly three percentages:\n"
        )
        gr.Markdown(
            f"{heading}{suite_bullets}\n\nLLM-as-judge: `{config.judge_model_id}`"
        )

        with gr.Row():
            # Narrow column: run settings and action buttons.
            with gr.Column(scale=1):
                assistant_picker = gr.CheckboxGroup(
                    choices=ASSISTANT_CHOICES,
                    value=ASSISTANT_CHOICES,
                    label="Assistants",
                )
                sample_slider = gr.Slider(
                    minimum=1,
                    maximum=20,
                    step=1,
                    value=10,
                    label="Public benchmark samples per metric",
                )
                seed_box = gr.Number(value=42, precision=0, label="Random seed")
                save_toggle = gr.Checkbox(
                    value=True,
                    label="Save JSON + Markdown to results/",
                )

                with gr.Row():
                    preview_btn = gr.Button("Preview plan", variant="secondary")
                    run_btn = gr.Button("Run evaluation", variant="primary")

            # Wide column: status, progress, report and download outputs.
            with gr.Column(scale=2):
                status_md = gr.Markdown("_Configure settings and run evaluation._")
                progress_slider = gr.Slider(
                    minimum=0,
                    maximum=1,
                    step=0.01,
                    value=0,
                    interactive=False,
                    label="Evaluation progress",
                )
                report_md = gr.Markdown("")
                download_file = gr.File(
                    interactive=False,
                    label="Download JSON report",
                )

        plan_inputs = [assistant_picker, sample_slider, seed_box]

        # Kept as an inline lambda so the handler Gradio registers is unchanged.
        preview_btn.click(
            fn=lambda *values: (
                preview_evaluation_plan(*values, config=config),
                "_Preview ready — no models called._",
                None,
                0.0,
            ),
            inputs=plan_inputs,
            outputs=[report_md, status_md, download_file, progress_slider],
        )

        # The gr.Progress() default follows Gradio's documented pattern for
        # receiving a progress tracker in an event handler.
        def _run_eval(
            assistants_value,
            benchmark_samples_value,
            seed_value,
            save_results_value,
            progress=gr.Progress(),
        ):
            yield from run_evaluation_ui(
                assistants_value,
                benchmark_samples_value,
                seed_value,
                save_results_value,
                config,
                progress,
            )

        run_btn.click(
            fn=_run_eval,
            inputs=plan_inputs + [save_toggle],
            outputs=[report_md, status_md, download_file, progress_slider],
        )