#!/usr/bin/env python3
"""
eval_scores_by_style.py

Score a random sample of (source, summary) pairs from each summary style
and check whether the resulting uncertainty/consistency scores are
meaningfully differentiated across styles.

Scores are fetched from a running API instance (local or deployed).
The Kruskal-Wallis H-test is used to check for a statistically significant
difference in medians across the three styles; per-pair Mann-Whitney U tests
check the expected orderings.

Expected orderings (heuristic):
  uncertainty:  informal >= professional >= shorten
  ambiguity:    informal >= professional >= shorten
  consistency:  shorten >= professional >= informal

Usage:
    pipenv run python scripts/eval_scores_by_style.py \\
        --api-url http://localhost:7860 \\
        --n-per-style 50 \\
        --sample-count 3
"""

from __future__ import annotations

import argparse
import json
import logging
import random
import statistics
import sys
from collections import defaultdict
from pathlib import Path

import requests
from scipy import stats

# Configure root logging at import time: INFO level, each message prefixed
# with a time-of-day stamp (HH:MM:SS) and the level name.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s: %(message)s",
    datefmt="%H:%M:%S",
)
# Module-level logger used by the functions in this file.
logger = logging.getLogger(__name__)


def load_by_style(path: str, styles: list[str]) -> dict[str, list[dict]]:
    """Group usable JSONL records from *path* by their ``summary_style``.

    Each non-blank line of the file is parsed as one JSON object. A record is
    kept only when its ``summary_style`` is one of *styles* and both its
    ``paragraph_text`` and ``summary`` fields are truthy; records without a
    ``summary_style`` are treated as style ``"unknown"``.

    Args:
        path: Path to a UTF-8 encoded JSON-lines file.
        styles: Style names to keep.

    Returns:
        A ``defaultdict(list)`` mapping style name to the kept records, in
        file order. Styles with no matching record have no entry.
    """
    grouped: dict[str, list[dict]] = defaultdict(list)
    with open(path, encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        # filter(None, ...) drops the blank lines.
        for text in filter(None, stripped_lines):
            entry = json.loads(text)
            label = entry.get("summary_style", "unknown")
            if label not in styles:
                continue
            if not entry.get("paragraph_text"):
                continue
            if not entry.get("summary"):
                continue
            grouped[label].append(entry)
    return grouped


def score_record(api_url: str, record: dict, sample_count: int, api_token: str | None) -> dict | None:
    """POST one (source, summary) pair to the scoring API and return its JSON.

    Args:
        api_url: Base URL of the scoring API; a trailing slash is tolerated.
        record: Record providing the ``paragraph_text`` and ``summary`` fields.
        sample_count: Value sent as ``sample_count`` in the request payload.
        api_token: Optional token sent in the ``X-Api-Token`` header.

    Returns:
        The decoded JSON object, or ``None`` when the request fails, the
        server answers with an error status, or the body is not a JSON
        object. Failures are logged as warnings rather than raised so that a
        single bad record does not abort a whole evaluation run.

    Raises:
        KeyError: If *record* lacks ``paragraph_text`` or ``summary``.
    """
    payload = {
        "source": record["paragraph_text"],
        "summary": record["summary"],
        "sample_count": sample_count,
        "seed": 42,
    }
    headers = {}
    if api_token:
        headers["X-Api-Token"] = api_token
    # Strip trailing slashes so a base URL like "http://host/" does not
    # produce a "//score" path.
    endpoint = f"{api_url.rstrip('/')}/score"
    try:
        resp = requests.post(endpoint, json=payload, headers=headers, timeout=60)
        resp.raise_for_status()
        result = resp.json()
    except (requests.RequestException, ValueError) as exc:
        # RequestException covers connection, timeout and HTTP-status errors;
        # ValueError covers a body that is not valid JSON.
        logger.warning("Request failed: %s", exc)
        return None
    if not isinstance(result, dict):
        logger.warning("Request failed: unexpected %s response body", type(result).__name__)
        return None
    return result


def extract_sentence_scores(result: dict) -> list[dict]:
    """Return the ``sentence_results`` list from a scoring response.

    Args:
        result: Decoded JSON object returned by the scoring API.

    Returns:
        The value stored under ``sentence_results``, or an empty list when
        the key is missing or its value is empty/``None`` (so callers can
        always iterate over the result).
    """
    return result.get("sentence_results") or []


def report(metric: str, scores_by_style: dict[str, list[float]], expected_order: list[str]) -> None:
    """Print summary statistics and significance tests for one metric.

    For every style in *expected_order* the sample size, median and mean are
    printed. If at least two styles have data, a Kruskal-Wallis H-test is run
    across them. Each adjacent pair ``(a, b)`` of styles with data is then
    compared with a one-sided Mann-Whitney U test (alternative: ``a`` greater
    than ``b``); a pair is marked as confirmed only when that test gives
    ``p < 0.05`` and the median of ``a`` is at least the median of ``b``.

    Args:
        metric: Label printed in the section header.
        scores_by_style: Mapping of style name to the collected scores.
        expected_order: Style names ordered from expected-highest to
            expected-lowest score.
    """
    print(f"\n── {metric} ──")
    medians: dict[str, float] = {}
    for style in expected_order:
        vals = scores_by_style.get(style, [])
        if not vals:
            print(f"  {style:12s}  n=0  (no data)")
            continue
        # statistics.median averages the two middle values of an even-sized
        # sample instead of returning the upper one.
        med = statistics.median(vals)
        medians[style] = med
        print(f"  {style:12s}  n={len(vals):4d}  median={med:6.1f}  mean={sum(vals)/len(vals):6.1f}")

    styles_with_data = [s for s in expected_order if scores_by_style.get(s)]
    groups = [scores_by_style[s] for s in styles_with_data]
    if len(groups) >= 2:
        try:
            h_stat, p_val = stats.kruskal(*groups)
        except ValueError as exc:
            # Some SciPy versions raise when every observation is identical;
            # report that instead of aborting the remaining output.
            print(f"  Kruskal-Wallis not computed: {exc}")
        else:
            print(f"  Kruskal-Wallis H={h_stat:.2f}  p={p_val:.4f}", end="")
            print("  ✓ significant" if p_val < 0.05 else "  ✗ not significant")

    # Pairwise ordering check for adjacent pairs in expected_order
    for a, b in zip(styles_with_data, styles_with_data[1:]):
        u_stat, p = stats.mannwhitneyu(scores_by_style[a], scores_by_style[b], alternative="greater")
        ordered_as_expected = medians[a] >= medians[b]
        direction = ">" if ordered_as_expected else "<"
        ok = "✓" if p < 0.05 and ordered_as_expected else "✗"
        print(f"  {a} {direction} {b}:  U={u_stat:.0f}  p={p:.4f}  {ok}")


def main(args: argparse.Namespace) -> None:
    """Sample records per style, score them via the API and print the report.

    Steps:
      1. Load records for the three styles from ``args.infile``.
      2. Shuffle each style's pool (seeded with ``args.seed``) and keep the
         first ``args.n_per_style`` records.
      3. Score every sampled record and collect the per-sentence
         ``uncertainty_score``, ``ambiguity_score`` and ``consistency_score``
         values that are present in the response.
      4. Print per-metric distributions and significance tests.

    Args:
        args: Parsed command-line arguments; reads ``infile``, ``seed``,
            ``n_per_style``, ``api_url``, ``sample_count`` and ``api_token``.
    """
    styles = ["informal", "professional", "shorten"]
    logger.info("Loading data from %s", args.infile)
    by_style = load_by_style(args.infile, styles)

    random.seed(args.seed)
    sample: dict[str, list[dict]] = {}
    for style in styles:
        pool = by_style.get(style, [])
        random.shuffle(pool)
        sample[style] = pool[: args.n_per_style]
        logger.info("%s: %d records sampled", style, len(sample[style]))

    uncertainty: dict[str, list[float]] = defaultdict(list)
    ambiguity: dict[str, list[float]] = defaultdict(list)
    consistency: dict[str, list[float]] = defaultdict(list)
    # Response key -> accumulator for that metric.
    collectors = {
        "uncertainty_score": uncertainty,
        "ambiguity_score": ambiguity,
        "consistency_score": consistency,
    }

    total = sum(len(v) for v in sample.values())
    done = 0       # requests attempted so far (drives the progress log)
    succeeded = 0  # requests that actually returned a result
    for style, records in sample.items():
        for record in records:
            result = score_record(args.api_url, record, args.sample_count, args.api_token)
            done += 1
            if result is not None:
                succeeded += 1
                for sent in extract_sentence_scores(result):
                    for key, bucket in collectors.items():
                        if key in sent:
                            bucket[style].append(sent[key])
            # Log progress on every 10th attempt, including failed ones.
            if done % 10 == 0:
                logger.info("%d / %d records scored", done, total)

    logger.info("Scoring complete. %d / %d records returned results.", succeeded, total)

    print("\n═══ Score distributions by style ═══")
    # informal tends to be more uncertain/ambiguous; shorten tends to be more consistent
    report("uncertainty_score",  uncertainty,  ["informal", "professional", "shorten"])
    report("ambiguity_score",    ambiguity,    ["informal", "professional", "shorten"])
    report("consistency_score",  consistency,  ["shorten",  "professional", "informal"])


if __name__ == "__main__":
    # Build the command-line interface and hand the parsed options to main().
    parser = argparse.ArgumentParser(
        description="Evaluate score distributions across summary styles.",
    )
    parser.add_argument("--infile", default="data/summaries_v4.jsonl")
    parser.add_argument(
        "--api-url",
        default="http://localhost:7860",
        help="Base URL of the scoring API (default: http://localhost:7860)",
    )
    parser.add_argument("--api-token", default=None, metavar="TOKEN")
    parser.add_argument(
        "--n-per-style",
        type=int,
        default=50,
        help="Records to sample per style (default: 50)",
    )
    parser.add_argument(
        "--sample-count",
        type=int,
        default=3,
        help="Posterior samples per request (default: 3)",
    )
    parser.add_argument("--seed", type=int, default=42)
    cli_args = parser.parse_args()
    main(cli_args)