"""
Evaluate inference over N random episodes (default 100).

Requires OpenEnv server and OPENAI_API_KEY. The server's active task (``EARNINGS_ANALYST_TASK_ID``)
must match what you are measuring; ``--task`` here selects which task spec is used for report labels.

Usage:
    uv run python evaluate.py
    uv run python evaluate.py --samples 50 --quiet
    uv run python evaluate.py --samples 10 -o results.csv
"""

from __future__ import annotations

import argparse
import asyncio
import csv
import os
import sys
from collections import defaultdict

from dotenv import load_dotenv

from earnings_analyst.environment_config import DEFAULT_TASK, TASKS
from inference import run_episode

# Load variables from a local .env file before any os.environ lookups below
# (the CLI defaults in main() and the model fallback in run_evaluation()).
load_dotenv()


def _normalize_label(label_text: str) -> str:
    return str(label_text).strip().lower()


def exact_match(predicted_label: str, ground_truth_label: str) -> bool:
    """Return True when both labels are equal after normalization.

    Comparison ignores surrounding whitespace and letter case.
    """
    normalized_prediction = _normalize_label(predicted_label)
    normalized_truth = _normalize_label(ground_truth_label)
    return normalized_prediction == normalized_truth


def confusion_key(predicted_label: str, ground_truth_label: str) -> tuple[str, str]:
    """Build the ``(predicted, ground_truth)`` key used in the confusion counts.

    Both components are normalized so that case and whitespace variants of
    the same label share one bucket.
    """
    predicted_key = _normalize_label(predicted_label)
    truth_key = _normalize_label(ground_truth_label)
    return predicted_key, truth_key


# Column order of the per-episode CSV written by run_evaluation(); the names
# must match the keys of the row dicts built there.
_CSV_FIELDNAMES = (
    "sample_index",
    "task_id",
    "model",
    "predicted",
    "ground_truth",
    "exact_match",
    "reward",
    "done",
    "model_response",
)


async def run_evaluation(
    *,
    samples: int,
    base_url: str | None,
    model: str | None,
    task_id: str,
    quiet: bool,
    output_path: str | None,
) -> None:
    """Run episodes sequentially, print an aggregate report, optionally write a CSV.

    Each episode is obtained by awaiting ``run_episode``; its ``reward``,
    ``predicted``, ``ground_truth``, ``done`` and ``model_response_text``
    attributes are read. The report contains the mean reward, the exact-match
    accuracy, a per-ground-truth-label match rate for the labels of the
    selected task spec, and confusion counts grouped by ground truth.

    Args:
        samples: Number of episodes to run.
        base_url: Environment server URL, forwarded to ``run_episode``.
        model: Model name; when falsy, ``OPENAI_MODEL`` from the environment
            (or ``"gpt-4o"``) is used. The resolved name is both forwarded to
            ``run_episode`` and recorded in the CSV so the two cannot disagree.
        task_id: Key into ``TASKS`` selecting the label list for the report.
            If it has no usable spec, ``DEFAULT_TASK`` is used and a warning
            is written to stderr.
        quiet: Suppress per-episode console output.
        output_path: When set, one CSV row per episode is written to this
            path (UTF-8, columns per ``_CSV_FIELDNAMES``).
    """
    spec = TASKS.get(task_id)
    if not spec:
        # Previously this fallback was silent, so a typo in --task produced a
        # report labelled with another task's labels without any hint.
        print(
            f"warning: no task spec for {task_id!r}; "
            f"using labels of default task {DEFAULT_TASK!r}",
            file=sys.stderr,
        )
        spec = TASKS[DEFAULT_TASK]
    label_values = list(spec["label_values"])
    resolved_model = model or os.environ.get("OPENAI_MODEL", "gpt-4o")

    rewards: list[float] = []
    exact_match_count = 0
    confusion: dict[tuple[str, str], int] = defaultdict(int)
    per_ground_truth_label: dict[str, dict[str, int]] = defaultdict(
        lambda: {"n": 0, "correct": 0}
    )
    csv_rows: list[dict[str, str | int | float | bool]] = []

    for episode_index in range(samples):
        episode_number = episode_index + 1
        if not quiet:
            print(f"episode {episode_number}/{samples} ...", flush=True)
        # Respect --quiet: verbose used to be hard-coded to True.
        # NOTE(review): assumes run_episode's ``verbose`` flag only controls
        # its console output — confirm against inference.run_episode.
        episode_result = await run_episode(
            base_url=base_url,
            model=resolved_model,
            verbose=not quiet,
        )
        # A missing reward is counted as 0.0 so it still enters the mean.
        episode_reward = float(
            episode_result.reward if episode_result.reward is not None else 0.0
        )
        rewards.append(episode_reward)
        ground_truth_label = episode_result.ground_truth
        predicted_label = episode_result.predicted

        is_exact = exact_match(predicted_label, ground_truth_label)
        confusion[confusion_key(predicted_label, ground_truth_label)] += 1
        label_stats = per_ground_truth_label[_normalize_label(ground_truth_label)]
        label_stats["n"] += 1
        if is_exact:
            exact_match_count += 1
            label_stats["correct"] += 1

        # Console-only per-episode summary; has no effect on the metrics.
        if not quiet:
            print(f"\n--- Episode {episode_number}/{samples} Summary ---")
            print(f"Reward: {episode_reward:.4f}")
            print(f"Predicted: {predicted_label}")
            print(f"Ground Truth: {ground_truth_label}")
            print(f"Model Response: {episode_result.model_response_text}")
            print("-" * 40)

        csv_rows.append(
            {
                "sample_index": episode_number,
                "task_id": task_id,
                "model": resolved_model,
                "predicted": predicted_label,
                "ground_truth": ground_truth_label,
                "exact_match": is_exact,
                "reward": episode_reward,
                "done": episode_result.done,
                "model_response": episode_result.model_response_text or "",
            }
        )

    # Use the number of episodes actually run as the denominator; with a
    # non-positive ``samples`` the loop body never executes and dividing by
    # ``samples`` would report a nonsensical (negative) count.
    completed = len(rewards)
    mean_reward = sum(rewards) / completed if completed else 0.0
    exact_accuracy = exact_match_count / completed if completed else 0.0

    print("\n=== Evaluation summary ===")
    print(f"samples: {completed}")
    print(f"mean_reward: {mean_reward:.4f}")
    print(f"exact_accuracy: {exact_accuracy:.4f} ({exact_match_count}/{completed})")
    if label_values:
        print("\nPer ground-truth label (exact match rate):")
        for lab in label_values:
            normalized_key = _normalize_label(lab)
            row = per_ground_truth_label.get(normalized_key, {"n": 0, "correct": 0})
            total_count, correct_count = row["n"], row["correct"]
            rate = (correct_count / total_count) if total_count else 0.0
            print(f"  {lab!r}: {rate:.4f} ({correct_count}/{total_count})")
    else:
        print(
            "\n(No label_values in selected task spec — add them when the task is implemented.)"
        )

    print("\nConfusion (predicted -> counts by ground_truth):")
    # Keys are (predicted, truth); iterating them in sorted order means each
    # truth bucket is filled in ascending predicted order, so no second sort
    # of the bucket is needed.
    counts_by_truth: dict[str, list[tuple[str, int]]] = defaultdict(list)
    for (predicted_normalized, truth_normalized), occurrence_count in sorted(
        confusion.items()
    ):
        counts_by_truth[truth_normalized].append(
            (predicted_normalized, occurrence_count)
        )
    for truth_normalized in sorted(counts_by_truth):
        parts = ", ".join(
            f"{predicted_normalized!r}:{occurrence_count}"
            for predicted_normalized, occurrence_count in counts_by_truth[
                truth_normalized
            ]
        )
        print(f"  truth={truth_normalized!r}: {parts}")

    if output_path:
        with open(output_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=_CSV_FIELDNAMES)
            writer.writeheader()
            writer.writerows(csv_rows)
        print(f"\nWrote {len(csv_rows)} row(s) to {output_path}")


def main() -> None:
    """Parse the command line, run the evaluation, and exit with status 1 on error.

    Defaults for ``--base-url`` and ``--model`` are taken from the
    ``ENV_SERVER_URL`` and ``OPENAI_MODEL`` environment variables when set.
    """
    cli = argparse.ArgumentParser(
        description="Evaluate over N episodes (align server EARNINGS_ANALYST_TASK_ID with --task)"
    )
    cli.add_argument("--samples", type=int, default=100, help="Number of episodes")

    default_base_url = os.environ.get("ENV_SERVER_URL", "http://localhost:8000")
    cli.add_argument("--base-url", default=default_base_url)

    default_model = os.environ.get("OPENAI_MODEL", "gpt-4o")
    cli.add_argument("--model", default=default_model)

    cli.add_argument(
        "--task",
        default=DEFAULT_TASK,
        help="Task id for label list in report (must match server task)",
    )
    cli.add_argument(
        "--quiet", action="store_true", help="Suppress per-episode lines"
    )
    cli.add_argument(
        "-o",
        "--output",
        default=None,
        metavar="PATH",
        help="Write one row per episode to this CSV file (UTF-8)",
    )
    options = cli.parse_args()

    settings = {
        "samples": options.samples,
        "base_url": options.base_url,
        "model": options.model,
        "task_id": options.task,
        "quiet": options.quiet,
        "output_path": options.output,
    }
    # Top-level boundary: report any failure on stderr and signal it through
    # the exit status instead of a traceback.
    try:
        asyncio.run(run_evaluation(**settings))
    except Exception as exc:
        print(f"error: {exc}", file=sys.stderr)
        sys.exit(1)


# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()