import json
import logging
import math
import os
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Tuple

import gradio as gr
from huggingface_hub import hf_hub_download
from huggingface_hub.errors import EntryNotFoundError, HfHubHTTPError


# Hugging Face organisation name; used below to build the default results repo id.
HF_ORG = os.getenv("HF_ORG", "map-setup-pilot")
# Dataset repo that holds the results files; defaults to "<HF_ORG>/results".
RESULTS_DATASET_REPO = os.getenv("HF_RESULTS_DATASET_REPO", f"{HF_ORG}/results")
# Access tokens taken from the environment (empty string when unset).
# _token() looks at the read token first and the write token second.
HF_READ_TOKEN = os.getenv("HF_READ_TOKEN", "")
HF_WRITE_TOKEN = os.getenv("HF_WRITE_TOKEN", "")

# Paths of the JSONL result files inside RESULTS_DATASET_REPO.
VALIDATION_RESULTS_JSONL_PATH = "results/validation_results.jsonl"
TEST_RESULTS_JSONL_PATH = "results/test_results.jsonl"

# Canonical label names. _normalize_distribution() returns a dict keyed by
# exactly these labels, in this order.
EXPECTED_LABELS = [
    "Useful & Safe",
    "Safe but not useful",
    "Useful but unsafe",
    "Untruthful",
    "No relevant data",
]

# Column headers of the ranking tables. The order matches the values that
# _rank_table_rows() emits for each row.
RANK_HEADERS = [
    "rank",
    "model_id",
    "useful_safe",
    "safe_not_useful",
    "useful_unsafe",
    "untruthful",
    "no_relevant_data",
    "timestamp_utc",
    "snapshot_id",
    "run_count",
    "variance",
    "confidence_interval",
    "model_revision",
]

# Column headers of the per-evaluation detail tables. The order matches the
# values that _detail_table_rows() emits for each row.
DETAIL_HEADERS = [
    "timestamp_utc",
    "model_id",
    "submission_id",
    "mode",
    "status",
    "useful_safe",
    "safe_not_useful",
    "useful_unsafe",
    "untruthful",
    "no_relevant_data",
    "snapshot_id",
    "run_count",
    "variance",
    "confidence_interval",
    "model_revision",
]


def _token() -> str:
    """Return the Hugging Face token used to read the results dataset.

    ``HF_READ_TOKEN`` is preferred and ``HF_WRITE_TOKEN`` is the fallback.
    Each value is stripped *before* the choice is made, so a blank or
    whitespace-only read token does not hide a usable write token.

    Returns:
        The selected token with surrounding whitespace removed.

    Raises:
        RuntimeError: If neither variable holds a non-blank value.
    """
    token = HF_READ_TOKEN.strip() or HF_WRITE_TOKEN.strip()
    if not token:
        raise RuntimeError("Missing HF_READ_TOKEN or HF_WRITE_TOKEN in Space secrets.")
    return token


def _read_jsonl_rows(path_in_repo: str) -> List[Dict[str, Any]]:
    """Download one JSONL file from the results dataset and parse its rows.

    Args:
        path_in_repo: Path of the JSONL file inside ``RESULTS_DATASET_REPO``.

    Returns:
        One dict per line that parses to a JSON object. Blank lines, lines
        that are not valid JSON, and JSON values that are not objects are
        skipped. An empty list is returned when the file cannot be fetched.

    Raises:
        RuntimeError: Propagated from ``_token()`` when no token is configured.
    """
    try:
        local_file = hf_hub_download(
            repo_id=RESULTS_DATASET_REPO,
            filename=path_in_repo,
            repo_type="dataset",
            token=_token(),
        )
    except EntryNotFoundError:
        # The requested file is not available; treat it as "no rows".
        return []
    except (HfHubHTTPError, FileNotFoundError) as exc:
        # Still best-effort (empty result), but leave a trace so that an
        # unexpectedly empty leaderboard can be diagnosed. Only the exception
        # type is logged, not its message.
        logging.getLogger(__name__).warning(
            "Could not download %s from %s (%s); treating it as empty.",
            path_in_repo,
            RESULTS_DATASET_REPO,
            type(exc).__name__,
        )
        return []

    rows: List[Dict[str, Any]] = []
    # Iterate the file object instead of using str.splitlines(): splitlines()
    # also breaks on U+2028, U+2029 and U+0085, which may appear unescaped
    # inside JSON strings and would cut an otherwise valid record in two.
    with Path(local_file).open(encoding="utf-8") as handle:
        for line in handle:
            text = line.strip()
            if not text:
                continue
            try:
                parsed = json.loads(text)
            except json.JSONDecodeError:
                # Skip malformed lines rather than failing the whole file.
                continue
            if isinstance(parsed, dict):
                rows.append(parsed)
    return rows


def _parse_iso(value: Any) -> datetime:
    """Parse an ISO-8601 timestamp into a timezone-aware ``datetime``.

    A trailing ``Z`` is rewritten to ``+00:00`` before parsing, and a result
    without timezone information is tagged as UTC. Falsy, blank, or
    unparseable input yields the Unix epoch in UTC.
    """
    epoch = datetime.fromtimestamp(0, tz=timezone.utc)

    raw = str(value or "").strip()
    if raw == "":
        return epoch

    iso_text = raw[:-1] + "+00:00" if raw.endswith("Z") else raw
    try:
        parsed = datetime.fromisoformat(iso_text)
    except ValueError:
        return epoch

    # Naive timestamps are interpreted as UTC.
    return parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=timezone.utc)


def _as_float(value: Any) -> float:
    """Coerce *value* to a finite float, falling back to ``0.0``.

    The fallback covers values that ``float()`` rejects (``None``, arbitrary
    strings, containers), integers too large to represent as a float, and
    non-finite results (NaN, +/-inf). Non-finite numbers are excluded because
    the returned values are used as sort keys and NaN does not order
    consistently.

    Args:
        value: Any object, typically a number or numeric string from JSON.

    Returns:
        The finite float value, or ``0.0`` when none can be produced.
    """
    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        return 0.0
    return number if math.isfinite(number) else 0.0


def _row_is_post_eval(row: Dict[str, Any]) -> bool:
    """Decide whether a raw results row should be shown on the leaderboard.

    A row qualifies when all of the following hold:
      * ``metrics`` is a dict whose ``labelDistribution`` is a dict;
      * ``leaderboard_visible`` is not explicitly ``False``;
      * its ``status`` (trimmed, case-insensitive) is one of the accepted
        values, or its ``simulation`` field is truthy.
    """
    metrics = row.get("metrics") or {}
    if not isinstance(metrics, dict):
        return False
    if not isinstance(metrics.get("labelDistribution"), dict):
        return False

    # Only an explicit False hides the row; a missing or None flag does not.
    if row.get("leaderboard_visible") is False:
        return False

    if bool(row.get("simulation")):
        return True

    accepted_statuses = {"completed", "simulated_completed", "published", "official_scored"}
    normalized_status = str(row.get("status") or "").strip().lower()
    return normalized_status in accepted_statuses


def _ci_to_text(value: Any) -> str:
    """Render a confidence-interval value as display text.

    Strings and numbers go through ``str()``; ``None`` becomes an empty
    string; anything else is serialised as JSON without ASCII escaping.
    """
    if isinstance(value, (str, int, float)):
        return str(value)
    return "" if value is None else json.dumps(value, ensure_ascii=False)


def _normalize_distribution(raw_dist: Dict[str, Any]) -> Dict[str, float]:
    """Map a raw label distribution onto the canonical ``EXPECTED_LABELS`` keys.

    Every expected label is present in the result and starts at ``0.0``.
    Incoming keys are stringified and trimmed, then matched against the
    accepted spellings; values of keys that resolve to the same label are
    summed after conversion with ``_as_float``. Unrecognised keys are ignored,
    and a falsy ``raw_dist`` yields the all-zero distribution.
    """
    # Canonical label -> every spelling accepted for it.
    spellings = {
        "Useful & Safe": ("Useful & Safe", "Useful&Safe"),
        "Safe but not useful": ("Safe but not useful", "SafeNotUseful"),
        "Useful but unsafe": ("Useful but unsafe", "UsefulUnsafe"),
        "Untruthful": ("Untruthful",),
        "No relevant data": ("No relevant data", "NoRelevantData"),
    }
    canonical_for = {
        spelling: label
        for label, accepted in spellings.items()
        for spelling in accepted
    }

    totals = dict.fromkeys(EXPECTED_LABELS, 0.0)
    for raw_key, raw_value in (raw_dist or {}).items():
        label = canonical_for.get(str(raw_key).strip())
        if label is None or label not in totals:
            continue
        totals[label] = totals[label] + _as_float(raw_value)
    return totals


def _extract_record(row: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten one raw results row into the record shape used by the tables.

    Text fields are stringified, with missing or falsy values becoming ``""``.
    ``model_id`` falls back to ``model_identifier`` and then ``system_name``.
    Label scores come from ``metrics.labelDistribution`` via
    ``_normalize_distribution``; ``run_count`` and ``variance`` go through
    ``_as_float`` and ``confidence_interval`` through ``_ci_to_text``.
    """

    def text_of(key: str) -> str:
        return str(row.get(key) or "")

    metrics = row.get("metrics") or {}
    scores = _normalize_distribution(metrics.get("labelDistribution") or {})
    model_name = row.get("model_id") or row.get("model_identifier") or row.get("system_name") or ""

    record: Dict[str, Any] = {}
    record["timestamp_utc"] = text_of("timestamp_utc")
    record["model_id"] = str(model_name)
    record["submission_id"] = text_of("submission_id")
    record["mode"] = text_of("mode")
    record["status"] = text_of("status")
    record["useful_safe"] = scores["Useful & Safe"]
    record["safe_not_useful"] = scores["Safe but not useful"]
    record["useful_unsafe"] = scores["Useful but unsafe"]
    record["untruthful"] = scores["Untruthful"]
    record["no_relevant_data"] = scores["No relevant data"]
    record["snapshot_id"] = text_of("snapshot_id")
    record["run_count"] = _as_float(row.get("run_count"))
    record["variance"] = _as_float(row.get("variance"))
    record["confidence_interval"] = _ci_to_text(row.get("confidence_interval"))
    record["model_revision"] = text_of("model_revision")
    return record


def _to_records(rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Convert raw rows into display records, newest first.

    Rows rejected by ``_row_is_post_eval`` are dropped; the rest are flattened
    with ``_extract_record`` and ordered by parsed ``timestamp_utc``,
    descending.
    """
    eligible = [_extract_record(raw) for raw in rows if _row_is_post_eval(raw)]
    return sorted(
        eligible,
        key=lambda record: _parse_iso(record["timestamp_utc"]),
        reverse=True,
    )


def _latest_record_per_model(records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Keep, for each model, the record with the most recent timestamp.

    Records with an empty ``model_id`` are skipped. When two records of the
    same model have equal timestamps, the one seen later in ``records`` wins.
    The result follows the order in which each model was first encountered.
    """
    newest_by_model: Dict[str, Dict[str, Any]] = {}
    for candidate in records:
        model_key = candidate["model_id"]
        if not model_key:
            continue
        if model_key in newest_by_model:
            incumbent = newest_by_model[model_key]
            candidate_time = _parse_iso(candidate["timestamp_utc"])
            if candidate_time < _parse_iso(incumbent["timestamp_utc"]):
                # Strictly older than what is already stored: keep the incumbent.
                continue
        newest_by_model[model_key] = candidate
    return list(newest_by_model.values())


def _rank_records(records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Order records for the leaderboard and attach a 1-based ``rank``.

    Sort precedence: ``useful_safe`` descending, ``untruthful`` ascending,
    ``useful_unsafe`` ascending, ``safe_not_useful`` descending, newer
    timestamp first, then ``model_id`` ascending. The input records are not
    modified; each returned row is a copy with ``rank`` set.
    """

    def sort_key(record: Dict[str, Any]) -> Tuple[Any, ...]:
        # Negated fields sort descending under the default ascending order.
        return (
            -record["useful_safe"],
            record["untruthful"],
            record["useful_unsafe"],
            -record["safe_not_useful"],
            -_parse_iso(record["timestamp_utc"]).timestamp(),
            record["model_id"],
        )

    ordered = sorted(records, key=sort_key)
    return [
        {**record, "rank": position}
        for position, record in enumerate(ordered, start=1)
    ]


def _rank_table_rows(ranked_rows: List[Dict[str, Any]]) -> List[List[Any]]:
    """Turn ranked records into row lists for the ranking table.

    Each output row holds the record's values in the fixed column order
    listed below; any other keys on the record are ignored.
    """
    column_order = (
        "rank",
        "model_id",
        "useful_safe",
        "safe_not_useful",
        "useful_unsafe",
        "untruthful",
        "no_relevant_data",
        "timestamp_utc",
        "snapshot_id",
        "run_count",
        "variance",
        "confidence_interval",
        "model_revision",
    )
    table: List[List[Any]] = []
    for entry in ranked_rows:
        table.append([entry[column] for column in column_order])
    return table


def _detail_table_rows(records: List[Dict[str, Any]]) -> List[List[Any]]:
    """Turn display records into row lists for the detail table.

    Each output row holds the record's values in the fixed column order
    listed below; any other keys on the record are ignored.
    """
    column_order = (
        "timestamp_utc",
        "model_id",
        "submission_id",
        "mode",
        "status",
        "useful_safe",
        "safe_not_useful",
        "useful_unsafe",
        "untruthful",
        "no_relevant_data",
        "snapshot_id",
        "run_count",
        "variance",
        "confidence_interval",
        "model_revision",
    )
    table: List[List[Any]] = []
    for entry in records:
        table.append([entry[column] for column in column_order])
    return table


def refresh_leaderboard() -> Tuple[str, List[List[Any]], List[List[Any]], List[List[Any]], List[List[Any]]]:
    """Reload both result files and build everything the UI displays.

    Returns:
        A 5-tuple: the Markdown summary, validation ranking rows, validation
        detail rows, test ranking rows, and test detail rows. If reading
        either file raises, the summary is an error message naming the
        exception type and all four tables are empty.
    """
    try:
        raw_by_split = [
            _read_jsonl_rows(path)
            for path in (VALIDATION_RESULTS_JSONL_PATH, TEST_RESULTS_JSONL_PATH)
        ]
    except Exception as exc:
        error_md = f"### Error Loading Dataset\n`{type(exc).__name__}` while reading `{RESULTS_DATASET_REPO}`."
        return error_md, [], [], [], []

    validation_records, test_records = (_to_records(raw) for raw in raw_by_split)
    # Rankings use only the latest record of each model.
    validation_ranked, test_ranked = (
        _rank_records(_latest_record_per_model(split_records))
        for split_records in (validation_records, test_records)
    )

    summary_lines = [
        "### MAP Pilot Leaderboard",
        f"- Dataset: `{RESULTS_DATASET_REPO}`",
        f"- Validation/dev rows displayed: **{len(validation_records)}** across **{len(validation_ranked)}** models",
        f"- Official/private-test rows displayed: **{len(test_records)}** across **{len(test_ranked)}** models",
        "- Ranking order: Useful & Safe desc, then Untruthful asc, then Useful but unsafe asc.",
    ]
    return (
        "\n".join(summary_lines),
        _rank_table_rows(validation_ranked),
        _detail_table_rows(validation_records),
        _rank_table_rows(test_ranked),
        _detail_table_rows(test_records),
    )


# Build the UI: a title, a refresh button, a summary, and one tab per split,
# each holding a ranking table and a detail table.
with gr.Blocks(title="MAP Pilot Leaderboard") as demo:
    gr.Markdown("# MAP Pilot Leaderboard")
    gr.Markdown("Latest standings for the first iteration of the MAP challenge")

    refresh_button = gr.Button("Refresh")
    summary_box = gr.Markdown()

    # Per-column datatypes, listed in the same order as RANK_HEADERS and
    # DETAIL_HEADERS respectively. Each Dataframe receives its own copy.
    rank_column_types = [
        "number", "str",
        "number", "number", "number", "number", "number",
        "str", "str",
        "number", "number",
        "str", "str",
    ]
    detail_column_types = [
        "str", "str", "str", "str", "str",
        "number", "number", "number", "number", "number",
        "str",
        "number", "number",
        "str", "str",
    ]

    with gr.Tab("Validation / Dev"):
        validation_rank_df = gr.Dataframe(
            headers=RANK_HEADERS,
            datatype=list(rank_column_types),
            value=[],
            interactive=False,
            label="Model Ranking (latest run per model)",
        )
        validation_detail_df = gr.Dataframe(
            headers=DETAIL_HEADERS,
            datatype=list(detail_column_types),
            value=[],
            interactive=False,
            label="Recent Evaluation Rows",
        )

    with gr.Tab("Official / Private Test"):
        test_rank_df = gr.Dataframe(
            headers=RANK_HEADERS,
            datatype=list(rank_column_types),
            value=[],
            interactive=False,
            label="Model Ranking (latest run per model)",
        )
        test_detail_df = gr.Dataframe(
            headers=DETAIL_HEADERS,
            datatype=list(detail_column_types),
            value=[],
            interactive=False,
            label="Recent Evaluation Rows",
        )

    # Components filled by refresh_leaderboard(), in the order of its return tuple.
    refresh_outputs = [
        summary_box,
        validation_rank_df,
        validation_detail_df,
        test_rank_df,
        test_detail_df,
    ]

    # The same callback runs on button click and on page load.
    refresh_button.click(
        fn=refresh_leaderboard,
        inputs=[],
        outputs=list(refresh_outputs),
        queue=False,
    )
    demo.load(
        fn=refresh_leaderboard,
        inputs=[],
        outputs=list(refresh_outputs),
        queue=False,
    )


if __name__ == "__main__":
    # Listen on all interfaces; the port comes from PORT, defaulting to 7860.
    listen_port = int(os.getenv("PORT", "7860"))
    demo.launch(server_name="0.0.0.0", server_port=listen_port)