Spaces:

QuanticaLab
/

PLainBench

Running

File size: 60,837 Bytes

3bd48fe

"""PLainBench - Polish Text Simplification Leaderboard.

Reads scored anon JSON files from the data/current/ directory and displays a
leaderboard showing how well each LLM simplifies Polish texts, measured
by readability indices, difficulty markers, reference-based similarity
metrics, and a QuestEval-style QA consistency score.
"""

import json
from functools import lru_cache
from pathlib import Path

import gradio as gr
import pandas as pd
import plotly.graph_objects as go

DATA_DIR = Path(__file__).parent / "data" / "current"


@lru_cache(maxsize=1)
def load_records() -> tuple[dict, ...]:
    """Read every ``*_scored_anon.json`` file in ``DATA_DIR`` once and memoise it.

    Only the ``metadata`` and ``summary`` sections of each parsed file are
    retained; everything else in the document (the files are large because
    they also hold per-text records) is dropped so that later loaders and
    refreshes work from the small in-memory copy.

    Returns:
        One ``{"metadata": ..., "summary": ...}`` dict per file, in sorted
        path order, or an empty tuple when ``DATA_DIR`` does not exist.

    Raises:
        KeyError: If a file lacks a ``metadata`` or ``summary`` section.
    """
    if not DATA_DIR.exists():
        return ()

    def _slim(path: Path) -> dict:
        # Parse one file and keep just the two sections the app reads.
        with open(path, encoding="utf-8") as handle:
            payload = json.load(handle)
        return {"metadata": payload["metadata"], "summary": payload["summary"]}

    return tuple(_slim(path) for path in sorted(DATA_DIR.glob("*_scored_anon.json")))


# Model-level filters, mirroring the PLCC benchmark's "Size limit" / "Model type"
# quick-filters. Size options are *upper bounds* in billions of parameters.
# "ALL" disables the corresponding filter (see ``_model_passes``); a numeric cap
# such as "10B" is parsed by stripping the trailing "B". The size list is kept
# in descending order, which is the order ``_visible_size_limits`` preserves.
SIZE_LIMITS = ["ALL", "500B", "100B", "50B", "30B", "20B", "15B", "10B", "5B"]
MODEL_TYPES = ["ALL", "open-weights", "closed-weights"]


def _model_passes(meta: dict, size_limit: str, model_type: str) -> bool:
    """Whether a model's metadata satisfies the size-limit / model-type filters."""
    if model_type and model_type != "ALL":
        want = "open" if model_type == "open-weights" else "closed"
        if meta.get("weights") != want:
            return False
    if size_limit and size_limit != "ALL":
        cap = float(size_limit.rstrip("B"))
        params = meta.get("total_params_b") or 0
        # Unknown / unreported size (0) can't be placed under a cap, so exclude it.
        if params <= 0 or params > cap:
            return False
    return True


def _filtered_records(
    size_limit: str | None = None, model_type: str | None = None
) -> list[dict]:
    """Return the cached records whose model passes both quick-filters.

    A missing or empty ``size_limit`` / ``model_type`` is treated as ``"ALL"``,
    i.e. that filter is not applied.
    """
    effective_size = size_limit if size_limit else "ALL"
    effective_type = model_type if model_type else "ALL"

    selected: list[dict] = []
    for record in load_records():
        if _model_passes(record["metadata"], effective_size, effective_type):
            selected.append(record)
    return selected


def _visible_size_limits() -> list[str]:
    """Return the entries of ``SIZE_LIMITS`` that are worth offering as filters.

    Caps are examined from smallest to largest. A cap is kept only when it
    admits at least one model and admits a different number of models than the
    previously kept cap; a larger cap that selects the same models as a
    smaller one is therefore dropped, as is any cap below every model.
    ``"ALL"`` is always kept and listed first, and the surviving caps keep the
    order they have in ``SIZE_LIMITS``. Models without a positive
    ``total_params_b`` are ignored. The result is derived from the loaded
    records on every call.
    """
    known_sizes = []
    for record in load_records():
        size = record["metadata"].get("total_params_b") or 0
        if size > 0:
            known_sizes.append(size)

    numeric_caps = [
        (float(name.rstrip("B")), name) for name in SIZE_LIMITS if name != "ALL"
    ]
    # Sort on the numeric value only, so equal caps keep their listed order.
    numeric_caps.sort(key=lambda pair: pair[0])

    useful: set[str] = set()
    last_kept_count = -1
    for limit, name in numeric_caps:
        covered = len([size for size in known_sizes if size <= limit])
        if covered <= 0 or covered == last_kept_count:
            continue
        useful.add(name)
        last_kept_count = covered

    return ["ALL", *(name for name in SIZE_LIMITS if name != "ALL" and name in useful)]


# Summary-metric key → display label for the readability indices, "orth"
# (surface-form) variants. ``load_leaderboard_data`` looks each key up in
# ``summary["metrics"]`` and uses the label as the column-name prefix.
READABILITY_ORTH_LABELS = {
    "flesch_reading_ease_orth": "Flesch RE",
    "flesch_kincaid_grade_orth": "Flesch-Kincaid",
    "gunning_fog_orth": "Gunning Fog",
    "ari_orth": "ARI",
    "linsear_write_orth": "Linsear Write",
    "smog_grade_orth": "SMOG",
    "coleman_liau_orth": "Coleman-Liau",
    "pisarek_orth": "Pisarek",
}

# Summary-metric key → display label for the readability indices, "lemma"
# variants (presumably computed on lemmatised text - confirm with the scorer).
# The labels are identical to the orth map, so the two variants produce the
# same column names and must live in separate tables.
READABILITY_LEMMA_LABELS = {
    "flesch_reading_ease_lemma": "Flesch RE",
    "flesch_kincaid_grade_lemma": "Flesch-Kincaid",
    "gunning_fog_lemma": "Gunning Fog",
    "ari_lemma": "ARI",
    "linsear_write_lemma": "Linsear Write",
    "smog_grade_lemma": "SMOG",
    "coleman_liau_lemma": "Coleman-Liau",
    "pisarek_lemma": "Pisarek",
}

# Summary-metric key → display label for the lexical indices (TTR family),
# "orth" variants. Looked up in ``summary["metrics"]`` by
# ``load_leaderboard_data``.
LEXICAL_ORTH_LABELS = {
    "ttr_orth": "TTR",
    "rttr_orth": "RTTR",
    "cttr_orth": "CTTR",
    "herdan_orth": "Herdan",
    "summer_orth": "Summer",
    "dugast_orth": "Dugast",
    "maas_orth": "Maas",
    "mtld_orth": "MTLD",
    "mattr_orth": "MATTR",
}

# Summary-metric key → display label for the lexical indices (TTR family),
# "lemma" variants. Labels match the orth map one-to-one, so both variants
# yield the same column names.
LEXICAL_LEMMA_LABELS = {
    "ttr_lemma": "TTR",
    "rttr_lemma": "RTTR",
    "cttr_lemma": "CTTR",
    "herdan_lemma": "Herdan",
    "summer_lemma": "Summer",
    "dugast_lemma": "Dugast",
    "maas_lemma": "Maas",
    "mtld_lemma": "MTLD",
    "mattr_lemma": "MATTR",
}

# Similarity key → column label. ``load_leaderboard_data`` reads each value
# directly from ``summary["similarity"]`` (a plain value, not a before/after
# record) and stores it under the label.
SIMILARITY_LABELS = {
    "bert_score_precision": "BERTScore P",
    "bert_score_recall": "BERTScore R",
    "bert_score_f1": "BERTScore F1",
    "bleu": "BLEU",
    "chrf": "chrF",
    "chrfpp": "chrF++",
    "nli_precision": "NLI P",
    "nli_recall": "NLI R",
    "nli_f1": "NLI F1",
    "rouge_1_precision": "ROUGE-1 P",
    "rouge_1_recall": "ROUGE-1 R",
    "rouge_1_f1": "ROUGE-1 F1",
    "rouge_2_precision": "ROUGE-2 P",
    "rouge_2_recall": "ROUGE-2 R",
    "rouge_2_f1": "ROUGE-2 F1",
    "rouge_l_precision": "ROUGE-L P",
    "rouge_l_recall": "ROUGE-L R",
    "rouge_l_f1": "ROUGE-L F1",
    "wer": "WER",
    "mer": "MER",
    "wil": "WIL",
    "ne_retention": "NE Retention",
}

# Difficulty-marker key → display label. ``load_leaderboard_data`` looks each
# key up in ``summary["markers"]`` and reads its ``avg_before`` / ``avg_after``
# / ``avg_diff`` / ``avg_diff_pct`` fields.
# NOTE(review): ``CATEGORIES`` also references the marker key
# "adverbial_participle_ratio", which has no entry here - confirm whether it
# should be listed in this table as well.
MARKER_LABELS = {
    # counts
    "paragraph_count": "Paragraph count",
    "sentence_count": "Sentence count",
    "word_count": "Word count",
    "named_entity_count": "Named entity count",
    "difficult_word_count": "Difficult word count",
    "difficult_word_count_orth": "Difficult word count (orth)",
    # average lengths
    "avg_word_syllables": "Avg word syllables",
    "avg_sentence_length": "Avg sentence length",
    "avg_paragraph_length": "Avg paragraph length",
    # lexical difficulty
    "named_entity_ratio": "Named entity ratio",
    "difficult_word_ratio": "Difficult word ratio",
    "difficult_word_ratio_orth": "Difficult word ratio (orth)",
    # POS ratios
    "noun_ratio": "Noun ratio",
    "difficult_noun_ratio": "Difficult noun ratio",
    "difficult_noun_ratio_orth": "Difficult noun ratio (orth)",
    "verb_ratio": "Verb ratio",
    "difficult_verb_ratio": "Difficult verb ratio",
    "difficult_verb_ratio_orth": "Difficult verb ratio (orth)",
    "adjective_ratio": "Adjective ratio",
    "difficult_adjective_ratio": "Difficult adjective ratio",
    "difficult_adjective_ratio_orth": "Difficult adjective ratio (orth)",
    # POS-to-POS ratios
    "noun_to_verb_ratio": "Noun/verb ratio",
    "verbo_nominal_ratio": "Verbo-nominal ratio",
    "adj_to_verb_ratio": "Adj/verb ratio",
    "adj_to_noun_ratio": "Adj/noun ratio",
    # morphological
    "nie_prefix_ratio": "Nie-prefix ratio",
    "participle_ratio": "Participle ratio",
    "gerund_ratio": "Gerund ratio",
    "osc_noun_ratio": "OSC noun ratio",
    "impersonal_verb_ratio": "Impersonal verb ratio",
    "genitive_noun_ratio": "Genitive noun ratio",
    "avg_genitive_chain_length": "Avg genitive chain",
    # syntactic
    "sentence_length_variance": "Sentence length variance",
    "mean_dependency_distance": "Mean dep. distance",
    "subordination_index": "Subordination index",
}

# QuestEval key → column label. Values are read directly from
# ``summary["questeval"]`` by ``load_leaderboard_data``.
QUESTEVAL_LABELS = {
    "precision": "QuestEval P",
    "recall": "QuestEval R",
    "f1": "QuestEval F1",
    "answerable_rate_forward": "Answerable (fwd)",
    "answerable_rate_backward": "Answerable (bwd)",
}

# Constant ``k`` of the reciprocal-rank-fusion term ``weight / (k + rank)``,
# used both for the per-category scores and for the final ranking.
RRF_K = 60

# Category definitions driving the per-category tables and the final ranking.
# Each category dict carries:
#   name        — display name; also the category's column in the final ranking
#   in_rrf      — whether the category's RRF score feeds the final ranking
#   rrf_weight  — weight of the category's term in the final RRF sum
#   description — Markdown help text for the category
#   metrics     — list of entries, each: (source, key, label, ascending_rrf, in_rrf)
#     source        — "metrics" | "markers" → use avg_diff_pct (Δ%)
#                     "similarity" | "questeval" | "ifeval" → use absolute value
#     key           — key looked up in the selected summary bucket of that source
#     label         — column label (a " (Δ%)" suffix is added for Δ% sources)
#     ascending_rrf — True = lower value is better (rank 1 = smallest)
#     in_rrf        — include this metric in category RRF computation; entries
#                     with False are skipped entirely by ``load_category_df``
#                     (neither ranked nor shown as a column)

CATEGORIES: list[dict] = [
    {
        "name": "Readability",
        "in_rrf": True,
        "rrf_weight": 1,
        "description": (
            "Readability indices - **orth** (surface-form) variants. "
            "Δ% = percentage change after simplification. "
            "For **Flesch RE** positive Δ% is better; for all others negative Δ% is better. "
            "**Flesch RE** rates reading ease from average sentence length and syllables per word (higher → easier, ~0–100). "
            "**Gunning Fog** estimates the schooling years needed to follow the text - 0.4 × (words/sentence + 100 × complex-word share), "
            "where complex words have many syllables (lower → easier). "
            "**Coleman-Liau** grades difficulty from average letters per 100 words and sentences per 100 words (lower → easier). "
            "IFEval exclude is the fraction of 'exclude' constraints satisfied by the simplified text (higher is better)."
        ),
        "metrics": [
            ("metrics", "flesch_reading_ease_orth", "Flesch RE",      False, True),
            ("metrics", "gunning_fog_orth",         "Gunning Fog",    True,  True),
            ("metrics", "coleman_liau_orth",        "Coleman-Liau",   True,  True),
            ("ifeval",  "avg_exclude",              "IFEval exclude", False, True),
        ],
    },
    {
        "name": "Lexical Difficulty",
        "in_rrf": True,
        "rrf_weight": 1,
        "description": (
            "Word-level difficulty markers - **orth** variants where available. "
            "Δ% = percentage change. Negative Δ% indicates reduced lexical difficulty. "
            "**Avg word syllables** is the mean number of syllables per word (higher → longer, harder vocabulary). "
            "**Difficult word ratio** is the share of long words (above a syllable threshold on the surface form, excluding named entities) "
            "(higher → harder). "
            "**Verb ratio** is verbs divided by alphabetic tokens; a more verbal, less nominal style is easier, so higher is better here. "
            "**Difficult noun ratio** is the share of long nouns (above the syllable threshold, excluding named entities) among all words "
            "(higher → more complex nominal vocabulary)."
        ),
        "metrics": [
            ("markers", "avg_word_syllables",        "Avg word syllables",   True,  True),
            ("markers", "difficult_word_ratio_orth", "Difficult word ratio", True,  True),
            ("markers", "verb_ratio",                "Verb ratio",           False, True),
            ("markers", "difficult_noun_ratio_orth", "Difficult noun ratio", True,  True),
        ],
    },
    {
        "name": "Syntactic",
        "in_rrf": True,
        "rrf_weight": 1,
        "description": (
            "Sentence and clause structure complexity markers. "
            "Δ% = percentage change. Negative Δ% generally indicates simpler syntax. "
            "**Avg sentence length** is the mean number of words per sentence (higher → harder). "
            "**Mean dep. distance** is the average linear distance between a token and its syntactic head, in tokens (lower → simpler structure). "
            "**Subordination index** is the ratio of subordinate clauses to total clauses (higher → more embedded structure, harder)."
        ),
        "metrics": [
            ("markers", "avg_sentence_length",      "Avg sentence length",  True, True),
            ("markers", "sentence_length_variance", "Sentence length var.", True, False),
            ("markers", "mean_dependency_distance", "Mean dep. distance",   True, True),
            ("markers", "subordination_index",      "Subordination index",  True, True),
        ],
    },
    {
        "name": "Morphological",
        "in_rrf": True,
        "rrf_weight": 1,
        "description": (
            "Polish-specific morphological complexity markers. "
            "Δ% = percentage change. Negative Δ% indicates reduced morphological complexity. "
            "**Adverbial participle ratio** is the share of adverbial participles (imiesłowy przysłówkowe / converbs, e.g. *czytając*, *przeczytawszy*) "
            "among alphabetic tokens - a bookish, formal construction (higher → more complex). "
            "**Gerund ratio** is the share of Polish verbal nouns (rzeczowniki odsłowne, e.g. *czytanie*) among words "
            "(higher → more nominalised, formal). "
            "**Impersonal verb ratio** is the share of impersonal verb forms among all verbs - impersonal modals (*należy*, *trzeba*, *można*), "
            "passive -no/-to forms (*zrobiono*), reflexive-impersonal *się* (*mówi się*) and infinitives - typical of legal/administrative Polish "
            "(higher → more impersonal, harder). "
            "**Genitive noun ratio** is the share of genitive-case nouns among words - a hallmark of formal, bureaucratic Polish "
            "(higher → harder). "
            "**Avg genitive chain** is the mean length of runs of two or more consecutive genitive nouns (dopełniacz spiętrzony) "
            "(higher → more genitive stacking, harder). "
            "**Verbo-nominal ratio** is the share of light-verb + noun periphrases (*dokonać wpłaty*, *podjąć decyzję*) - a hallmark of "
            "administrative Polish (higher → harder). "
            "**OSC noun ratio** is the share of abstract *-ość* nouns (*możliwość*, *konieczność*) among nouns (higher → more abstract, harder)."
        ),
        "metrics": [
            ("markers", "participle_ratio",            "Participle ratio",           True, False),
            ("markers", "adverbial_participle_ratio",  "Adverbial participle ratio", True, True),
            ("markers", "gerund_ratio",                "Gerund ratio",               True, True),
            ("markers", "impersonal_verb_ratio",       "Impersonal verb ratio",      True, True),
            ("markers", "genitive_noun_ratio",         "Genitive noun ratio",        True, True),
            ("markers", "avg_genitive_chain_length",   "Avg genitive chain",         True, True),
            ("markers", "verbo_nominal_ratio",         "Verbo-nominal ratio",        True, True),
            ("markers", "osc_noun_ratio",              "OSC noun ratio",             True, True),
        ],
    },
    {
        "name": "Meaning Preservation",
        "in_rrf": True,
        "rrf_weight": 4,
        "description": (
            "Semantic metrics that directly test whether the simplified text says the same thing as the original. "
            "NLI checks bidirectional entailment; QuestEval checks information preservation via QA. "
            "NE Retention measures what fraction of named entities from the original appear in the simplified text "
            "(matched on lemmatised tokens and surface forms, so inflected forms and full-name → surname shortenings are handled). "
            "IFEval include is the fraction of 'include' constraints satisfied by the simplified text. "
            "Higher is better for all."
        ),
        "metrics": [
            ("similarity", "nli_f1",        "NLI F1",       False, True),
            ("questeval",  "f1",            "QuestEval F1", False, True),
            ("similarity", "ne_retention",  "NE Retention", False, True),
            ("ifeval",     "avg_include",   "IFEval include", False, True),
        ],
    },
]


def _col_name(source: str, label: str) -> str:
    """Column name used in category DataFrames."""
    if source in ("metrics", "markers"):
        return f"{label} (Δ%)"
    return label


def _model_label(data: dict) -> str:
    """Return a unique display name, appending reasoning effort when present.

    The parameter size is shown separately (see :func:`_params_str`), in its
    own column, mirroring the PLCC leaderboard layout.
    """
    model = data["metadata"]["model"]
    effort = (
        data["metadata"]
        .get("model_kwargs", {})
        .get("extra_body", {})
        .get("reasoning", {})
        .get("effort")
    )
    if effort is not None:
        return f"{model} [reasoning: {effort}]"
    return model


def _params_str(params: float | None) -> str | None:
    """PLCC-style parameter label: ``1.6T`` / ``685B`` / ``8B`` (None if unknown)."""
    p = params or 0
    if p <= 0:
        return None
    return f"{p / 1000:g}T" if p >= 1000 else f"{p:g}B"


def _params_map() -> dict[str, str]:
    """Map each model label to its formatted parameter size.

    Sizes come from ``metadata["total_params_b"]`` of the cached records;
    models whose size is unknown (``_params_str`` returns nothing) are left
    out of the mapping.
    """
    return {
        _model_label(record): formatted
        for record in load_records()
        if (formatted := _params_str(record["metadata"].get("total_params_b")))
    }


def _metric_row(
    label_map: dict,
    summary_metrics: dict,
    row: dict,
    detail_row: dict,
    *,
    include_detail: bool = True,
) -> None:
    """Populate leaderboard row and detail row from a label→key map."""
    for key, label in label_map.items():
        vals = summary_metrics.get(key, {})
        row[f"{label} (Δ)"] = vals.get("avg_diff")
        row[f"{label} (Δ%)"] = vals.get("avg_diff_pct")
        if include_detail:
            detail_row[f"{label} before"] = vals.get("avg_before")
            detail_row[f"{label} after"] = vals.get("avg_after")
            detail_row[f"{label} (Δ)"] = vals.get("avg_diff")
            detail_row[f"{label} (Δ%)"] = vals.get("avg_diff_pct")


def load_leaderboard_data() -> tuple[pd.DataFrame, ...]:
    """Build the eight leaderboard tables from the cached records.

    Returns:
        ``(readability_orth_df, readability_lemma_df, lexical_orth_df,
        lexical_lemma_df, similarity_df, questeval_df, markers_df, detail_df)``.
        Every row starts with ``Model`` and ``N``; numeric columns are rounded
        to four decimals. When ``DATA_DIR`` does not exist, the tuple holds
        eight references to one empty frame.
    """
    if not DATA_DIR.exists():
        blank = pd.DataFrame()
        return (blank,) * 8

    # Label maps fed from ``summary["metrics"]``, in the order of the first
    # four returned tables.
    metric_maps = (
        READABILITY_ORTH_LABELS,
        READABILITY_LEMMA_LABELS,
        LEXICAL_ORTH_LABELS,
        LEXICAL_LEMMA_LABELS,
    )
    # One list of row dicts per returned table, in return order.
    tables: list[list[dict]] = [[] for _ in range(8)]

    for record in load_records():
        name = _model_label(record)
        summary = record["summary"]
        base = {"Model": name, "N": summary["n"]}
        metric_stats = summary["metrics"]
        similarity = summary.get("similarity", {})
        questeval = summary.get("questeval", {})
        marker_stats = summary.get("markers", {})

        detail = dict(base)
        # NOTE(review): the orth and lemma label maps share their labels, so
        # the lemma pass overwrites the orth values written to ``detail`` just
        # before it - confirm whether the detail table should keep both.
        for rows, label_map in zip(tables, metric_maps):
            row = dict(base)
            _metric_row(label_map, metric_stats, row, detail)
            rows.append(row)

        tables[4].append(
            base | {label: similarity.get(key) for key, label in SIMILARITY_LABELS.items()}
        )
        tables[5].append(
            base | {label: questeval.get(key) for key, label in QUESTEVAL_LABELS.items()}
        )

        marker_row = dict(base)
        _metric_row(MARKER_LABELS, marker_stats, marker_row, detail)
        tables[6].append(marker_row)
        tables[7].append(detail)

    frames = [pd.DataFrame(rows) for rows in tables]
    for frame in frames:
        numeric = frame.select_dtypes(include="number").columns
        frame[numeric] = frame[numeric].round(4)

    return tuple(frames)


@lru_cache(maxsize=1)
def _load_ifeval_records() -> tuple[tuple[str, tuple[tuple, ...]], ...]:
    """Collect, once, the per-text records scored by both IFEval variants.

    Each ``*_scored_anon.json`` file in ``DATA_DIR`` is re-read for its
    ``results`` array (which ``load_records`` does not keep). A result is used
    only when it has a non-empty ``ifeval_manual`` *and* a non-empty
    ``ifeval`` entry, because manual rules exist for a subset of prompts only.

    Returns:
        One ``(model_label, records)`` pair per file with at least one matched
        result, where every record is the tuple ``(category, prompt_id,
        auto_include, auto_exclude, man_include, man_exclude)``. Empty when
        ``DATA_DIR`` does not exist.
    """
    if not DATA_DIR.exists():
        return ()

    def _matched(results):
        # Yield one flat tuple per result that carries both score variants.
        for item in results:
            manual = item.get("ifeval_manual")
            automatic = item.get("ifeval")
            if manual and automatic:
                yield (
                    item.get("category"),
                    item.get("prompt_id"),
                    automatic.get("include"),
                    automatic.get("exclude"),
                    manual.get("include"),
                    manual.get("exclude"),
                )

    per_model: list[tuple[str, tuple[tuple, ...]]] = []
    for path in sorted(DATA_DIR.glob("*_scored_anon.json")):
        with open(path, encoding="utf-8") as handle:
            payload = json.load(handle)
        label = _model_label(payload)
        matched = tuple(_matched(payload["results"]))
        if matched:
            per_model.append((label, matched))
    return tuple(per_model)


def load_ifeval_comparison_df(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> pd.DataFrame:
    """Tabulate manual (gold) versus automatic IFEval scores per model.

    The ``Manual ...`` and ``Auto ...`` columns are means over the records
    that carry *both* a manual and an automatic score, i.e. the same texts
    scored both ways. The ``Auto ... (all)`` columns come from the model's
    ``ifeval`` summary bucket instead, which is not limited to the matched
    records. ``Δ`` columns are manual − automatic, so a negative value means
    the automatic constraints were satisfied more often than the hand-checked
    ones (the automatic rules are more lenient).

    Args:
        text_category: Source-text category to restrict to; ``None`` or
            ``"All"`` disables the filter.
        prompt: Simplification prompt to restrict to; ``None`` or ``"All"``
            disables the filter.
        size_limit: Size-limit quick-filter for the models shown.
        model_type: Model-type quick-filter for the models shown.

    Returns:
        One row per model with matched records, sorted by ``Model``, numeric
        columns rounded to four decimals; an empty frame when nothing matches.
    """
    wanted_category = text_category if text_category not in (None, "All") else None
    wanted_prompt = prompt if prompt not in (None, "All") else None

    def _mean(total: float, count: int) -> float | None:
        # Average of an accumulated sum; undefined (None) for zero samples.
        return total / count if count else None

    def _gap(manual: float | None, reference: float | None) -> float | None:
        # manual − reference, or None when either side is unavailable.
        if manual is None or reference is None:
            return None
        return manual - reference

    # Only models passing the size / model-type filters are reported.
    allowed = {_model_label(rec) for rec in _filtered_records(size_limit, model_type)}
    summaries: dict[str, dict] = {}
    for record in load_records():
        label = _model_label(record)
        if label in allowed:
            summaries[label] = record["summary"]

    rows: list[dict] = []
    for model, matched in _load_ifeval_records():
        if model not in allowed:
            continue

        auto_inc_sum = man_inc_sum = auto_exc_sum = man_exc_sum = 0.0
        inc_count = exc_count = 0
        for category, prompt_id, a_inc, a_exc, m_inc, m_exc in matched:
            if wanted_category and category != wanted_category:
                continue
            if wanted_prompt and prompt_id != wanted_prompt:
                continue
            if m_inc is not None and a_inc is not None:
                auto_inc_sum += a_inc
                man_inc_sum += m_inc
                inc_count += 1
            if m_exc is not None and a_exc is not None:
                auto_exc_sum += a_exc
                man_exc_sum += m_exc
                exc_count += 1

        if not (inc_count or exc_count):
            continue

        manual_include = _mean(man_inc_sum, inc_count)
        manual_exclude = _mean(man_exc_sum, exc_count)
        auto_include = _mean(auto_inc_sum, inc_count)
        auto_exclude = _mean(auto_exc_sum, exc_count)

        # Automatic scores from the summary bucket, under the same
        # category / prompt filters.
        overall = _source_bucket(
            summaries.get(model, {}), "ifeval", wanted_category, wanted_prompt
        )
        overall_include = overall.get("avg_include")
        overall_exclude = overall.get("avg_exclude")

        rows.append(
            {
                "Model": model,
                "N": inc_count or exc_count,
                "Manual include": manual_include,
                "Manual exclude": manual_exclude,
                "Auto include": auto_include,
                "Auto include (all)": overall_include,
                "Δ include (man−auto)": _gap(manual_include, auto_include),
                "Δ include (man−auto all)": _gap(manual_include, overall_include),
                "Auto exclude": auto_exclude,
                "Auto exclude (all)": overall_exclude,
                "Δ exclude (man−auto)": _gap(manual_exclude, auto_exclude),
                "Δ exclude (man−auto all)": _gap(manual_exclude, overall_exclude),
            }
        )

    table = pd.DataFrame(rows)
    if table.empty:
        return table
    table = table.sort_values("Model").reset_index(drop=True)
    numeric = table.select_dtypes(include="number").columns
    table[numeric] = table[numeric].round(4)
    return table


def text_category_choices() -> list[str]:
    """List the dropdown options for the source-text category filter.

    The options are ``"All"`` followed by the sorted union of the keys of
    every record's ``summary["metrics_by_category"]``.
    """
    seen = {
        name
        for record in load_records()
        for name in record["summary"].get("metrics_by_category", {})
    }
    return ["All", *sorted(seen)]


def prompt_choices() -> list[str]:
    """List the dropdown options for the simplification-prompt filter.

    The options are ``"All"`` followed by the sorted union of the keys of
    every record's ``summary["metrics_by_prompt"]``.
    """
    seen = {
        name
        for record in load_records()
        for name in record["summary"].get("metrics_by_prompt", {})
    }
    return ["All", *sorted(seen)]


def _source_bucket(s: dict, source: str, tc: str | None, prompt: str | None) -> dict:
    """Return the metric bucket for one source, filtered by text category and/or prompt.

    Picks the overall summary when neither filter is set, the ``*_by_category`` /
    ``*_by_prompt`` bucket when one is set, and the ``*_by_category_prompt`` bucket
    (keyed ``"CATEGORY/PROMPT"``) when both are set.
    """
    if source in ("metrics", "markers", "similarity"):
        if tc and prompt:
            return s.get(f"{source}_by_category_prompt", {}).get(f"{tc}/{prompt}", {})
        if tc:
            return s.get(f"{source}_by_category", {}).get(tc, {})
        if prompt:
            return s.get(f"{source}_by_prompt", {}).get(prompt, {})
        return s.get(source, {})
    # questeval / ifeval keep their per-filter buckets nested under the source object
    src = s.get(source, {})
    if tc and prompt:
        return src.get("by_category_prompt", {}).get(f"{tc}/{prompt}", {})
    if tc:
        return src.get("by_category", {}).get(tc, {})
    if prompt:
        return src.get("by_prompt", {}).get(prompt, {})
    return src


def _bucket_n(s: dict, tc: str | None, prompt: str | None) -> int | None:
    """Return the sample count recorded for the selected filters, if any.

    The ``questeval`` bucket is consulted first, then ``ifeval``; the first
    one that has an ``n`` entry wins. ``None`` means neither records a count.
    """
    counts = (
        _source_bucket(s, name, tc, prompt).get("n") for name in ("questeval", "ifeval")
    )
    return next((count for count in counts if count is not None), None)


def load_category_df(
    category: dict,
    text_category: str | None = None,
    prompt: str | None = None,
) -> pd.DataFrame:
    """Build one category's table together with its per-category RRF score.

    Args:
        category: A category definition; its ``metrics`` entries are
            ``(source, key, label, ascending, in_rrf)`` tuples. Entries with a
            false ``in_rrf`` are neither shown nor ranked.
        text_category: Source-text category to restrict to; ``None`` or
            ``"All"`` means no restriction.
        prompt: Simplification prompt to restrict to; ``None`` or ``"All"``
            means no restriction.

    Returns:
        A frame sorted by ``RRF Score`` (best first) whose leading columns are
        ``Rank``, ``Model``, ``Params``, ``N`` and ``RRF Score``, followed by
        one column per ranked metric. Empty when there are no records.

    The score is always computed over **all** models; size-limit / model-type
    filtering is left to the caller as a plain row filter, so it never alters
    a model's rank or score.
    """
    tc = None if text_category in (None, "All") else text_category
    pr = None if prompt in (None, "All") else prompt

    def _ranked_metrics():
        # Metrics taking part in the RRF, with their resolved column name.
        for source, key, label, ascending, in_rrf in category["metrics"]:
            if in_rrf:
                yield source, key, _col_name(source, label), ascending

    rows: list[dict] = []
    for record in load_records():
        summary = record["summary"]
        name = _model_label(record)
        if tc or pr:
            # Prefer the filtered bucket's own count; fall back to the total.
            n = _bucket_n(summary, tc, pr) or summary["n"]
        else:
            n = summary["n"]

        row: dict = {"Model": name, "N": n}
        for source, key, column, _ascending in _ranked_metrics():
            bucket = _source_bucket(summary, source, tc, pr)
            if source in ("metrics", "markers"):
                row[column] = bucket.get(key, {}).get("avg_diff_pct")
            else:
                # similarity, questeval and ifeval store the value directly
                row[column] = bucket.get(key)
        rows.append(row)

    table = pd.DataFrame(rows)
    if table.empty:
        return table

    numeric = table.select_dtypes(include="number").columns
    table[numeric] = table[numeric].round(4)

    score = pd.Series(0.0, index=table.index)
    for _source, _key, column, ascending in _ranked_metrics():
        # A metric with no data at all contributes nothing to any model.
        if column not in table.columns or table[column].isna().all():
            continue
        positions = table[column].rank(
            ascending=ascending, method="min", na_option="bottom"
        )
        score += 1.0 / (RRF_K + positions)

    table.insert(2, "RRF Score", score.round(4))
    table = table.sort_values("RRF Score", ascending=False).reset_index(drop=True)
    table.insert(0, "Rank", range(1, len(table) + 1))
    table.insert(2, "Params", table["Model"].map(_params_map()).fillna(""))
    return table


def _plcc_overall_map() -> dict[str, float]:
    """Map each model label to its external PLCC overall score.

    PLCC is a separate Polish-language-competence benchmark (sdadas/plcc). The
    value is taken as-is from ``metadata["plcc"]["overall"]`` and is meant for
    reference only; it plays no part in the RRF ranking. Models with a
    missing or null ``plcc`` section, or without an ``overall`` value, are
    left out of the mapping.
    """
    scores: dict[str, float] = {}
    for record in load_records():
        overall = (record["metadata"].get("plcc") or {}).get("overall")
        if overall is None:
            continue
        scores[_model_label(record)] = overall
    return scores


def build_final_ranking_df(category_data: list[tuple[dict, pd.DataFrame]]) -> pd.DataFrame:
    """Fuse the per-category RRF scores into the final ranking.

    Args:
        category_data: ``(category, category_table)`` pairs; each table needs
            ``Model`` and ``RRF Score`` columns. Categories with a false
            ``in_rrf`` flag or an empty table are ignored.

    Returns:
        A frame sorted by ``Final RRF`` (best first) with the columns
        ``Rank``, ``Model``, ``Params``, ``N``, ``Final RRF``, ``PLCC`` and one
        column per category holding the model's **rank within that category**
        (1 = best). Those ranks, weighted by each category's ``rrf_weight``
        from ``CATEGORIES``, are what the fusion sums. ``PLCC`` is the
        external benchmark score and does not influence the ranking. An empty
        frame is returned when no category contributes.
    """
    merged: pd.DataFrame | None = None
    for cat, cat_df in category_data:
        if not cat.get("in_rrf", True) or cat_df.empty:
            continue
        scores = cat_df[["Model", "RRF Score"]].rename(columns={"RRF Score": cat["name"]})
        if merged is None:
            merged = scores
        else:
            merged = merged.merge(scores, on="Model", how="outer")

    if merged is None or merged.empty:
        return pd.DataFrame()

    # N (sample count) is identical across categories for a given model, so
    # the first category table that carries it is used.
    n_map: dict = next(
        (
            dict(zip(frame["Model"], frame["N"]))
            for _cat, frame in category_data
            if not frame.empty and {"Model", "N"} <= set(frame.columns)
        ),
        {},
    )

    weights = {cat["name"]: cat.get("rrf_weight", 1) for cat in CATEGORIES}
    category_ranks: dict[str, pd.Series] = {}
    fused = pd.Series(0.0, index=merged.index)
    for column in merged.columns:
        if column == "Model":
            continue
        # Higher category score is better; models missing from a category
        # are ranked last there.
        positions = (
            merged[column]
            .rank(ascending=False, method="min", na_option="bottom")
            .astype(int)
        )
        fused += weights.get(column, 1) / (RRF_K + positions)
        category_ranks[column] = positions

    ranking = merged[["Model"]].copy()
    ranking.insert(1, "Final RRF", fused.round(4))
    ranking.insert(2, "PLCC", ranking["Model"].map(_plcc_overall_map()).round(2))
    for category_name, positions in category_ranks.items():
        ranking[category_name] = positions

    ranking = ranking.sort_values("Final RRF", ascending=False).reset_index(drop=True)
    ranking.insert(0, "Rank", range(1, len(ranking) + 1))
    ranking.insert(2, "Params", ranking["Model"].map(_params_map()).fillna(""))
    ranking.insert(3, "N", ranking["Model"].map(n_map).astype("Int64"))
    return ranking


def _fog_vs_meaning_scatter(
    text_category: str | None,
    prompt: str | None,
    size_limit: str | None,
    model_type: str | None,
    *,
    y_section: str,
    y_key: str,
    y_label: str,
    title: str,
    color: str,
) -> go.Figure | None:
    """Shared builder for the "Gunning Fog Δ% vs <meaning metric>" scatters.

    Both public scatter builders differ only in which summary section/key
    supplies the Y value and in their labels and marker colour, so the common
    logic lives here to keep the two plots from drifting apart.

    Args:
        text_category: Text-category filter; ``None`` or ``"All"`` means no filter.
        prompt: Prompt filter; ``None`` or ``"All"`` means no filter.
        size_limit: Size-limit filter, forwarded to ``_filtered_records``.
        model_type: Model-type filter, forwarded to ``_filtered_records``.
        y_section: Summary section passed to ``_source_bucket`` for the Y value.
        y_key: Key read from that section's bucket.
        y_label: Metric name used in the hover text and the Y-axis title.
        title: Figure title.
        color: Marker fill colour.

    Returns:
        The figure, or ``None`` when no model has both an X and a Y value.
    """
    tc = None if text_category in (None, "All") else text_category
    pr = None if prompt in (None, "All") else prompt

    points = []
    for data in _filtered_records(size_limit, model_type):
        summary = data["summary"]
        model = _model_label(data)
        x = _source_bucket(summary, "metrics", tc, pr).get("gunning_fog_orth", {}).get("avg_diff_pct")
        y = _source_bucket(summary, y_section, tc, pr).get(y_key)
        # A model missing either coordinate cannot be placed on the plot.
        if x is None or y is None:
            continue
        points.append((model, x, y))

    if not points:
        return None

    models, xs, ys = zip(*points)

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=xs,
            y=ys,
            mode="markers+text",
            text=models,
            textposition="top center",
            textfont={"size": 10},
            marker={"size": 12, "color": color, "line": {"width": 1, "color": "white"}},
            # Plain concatenation keeps plotly's ``%{...}`` placeholders literal.
            hovertemplate=(
                "<b>%{text}</b><br>Gunning Fog Δ%: %{x:.2f}<br>"
                + y_label
                + ": %{y:.3f}<extra></extra>"
            ),
        )
    )

    # Dotted guide lines through the midpoint of the observed range split the
    # plot into quadrants.
    x_mid = (min(xs) + max(xs)) / 2
    y_mid = (min(ys) + max(ys)) / 2
    fig.add_hline(y=y_mid, line_dash="dot", line_color="lightgray")
    fig.add_vline(x=x_mid, line_dash="dot", line_color="lightgray")

    fig.update_layout(
        title=title,
        xaxis_title="Gunning Fog orth Δ%  (← easier text)",
        yaxis_title=y_label + "  (↑ meaning preserved)",
        height=560,
        margin={"l": 60, "r": 40, "t": 60, "b": 60},
        plot_bgcolor="white",
    )
    fig.update_xaxes(showgrid=True, gridcolor="#EEE", zeroline=True, zerolinecolor="#CCC")
    fig.update_yaxes(showgrid=True, gridcolor="#EEE")

    return fig


def build_tradeoff_scatter(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> go.Figure | None:
    """Scatter of Gunning Fog reduction vs meaning preservation, one point per model.

    X: Gunning Fog orth Δ% (more negative = greater complexity reduction)
    Y: QuestEval F1 (higher = better meaning preservation)

    Honours the same text-category / prompt / size / model-type filters as the
    RRF rankings. Returns ``None`` when no model has both values.
    """
    return _fog_vs_meaning_scatter(
        text_category,
        prompt,
        size_limit,
        model_type,
        y_section="questeval",
        y_key="f1",
        y_label="QuestEval F1",
        title="Complexity reduction vs meaning preservation",
        color="#4C78A8",
    )


def build_fog_nli_scatter(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> go.Figure | None:
    """Scatter of Gunning Fog reduction vs NLI F1, one point per model.

    X: Gunning Fog orth Δ% (more negative = greater complexity reduction)
    Y: NLI F1 (higher = stronger entailment / meaning preserved)

    Honours the same text-category / prompt / size / model-type filters as the
    RRF rankings. Returns ``None`` when no model has both values.
    """
    return _fog_vs_meaning_scatter(
        text_category,
        prompt,
        size_limit,
        model_type,
        y_section="similarity",
        y_key="nli_f1",
        y_label="NLI F1",
        title="Complexity reduction vs NLI consistency",
        color="#E45756",
    )


# Markdown header and one-paragraph benchmark summary; ``build_app`` renders it
# as the first element of the page.
INTRO = """\
# PLainBench - Polish Text Simplification Leaderboard

This benchmark evaluates how well LLMs simplify difficult Polish texts -
drawn from legal/administrative (BIP/GOV), finance, and science domains - while
preserving the original meaning. Each model simplifies 210 source texts under
5 simplification prompts (1050 outputs per model). Outputs are scored on
readability indices, fine-grained difficulty markers (lexical, syntactic,
morphological), meaning preservation (NLI entailment, QuestEval QA consistency,
named-entity retention), and instruction following (IFEval include/exclude).
The per-category scores are fused into an overall **Final RRF** ranking.
"""

# Markdown reference describing every reported metric and its desired
# direction; ``build_app`` renders it below the result tabs.
METRICS_DOC = """\
## Metrics

### Readability indices

All indices are adapted for Polish syllable counting via `pyhyphen` (pl_PL
dictionary) and counted on surface (orthographic) word forms.

Δ is the absolute change (after − before); Δ% is the average percentage change
from the original text to the simplified text.

| Metric | Formula | Interpretation |
|---|---|---|
| **Flesch Reading Ease** | `206.835 − 1.015 × (words/sentences) − 84.6 × (syllables/words)` | Higher → easier text (0–100 typical range). **Desired Δ%: positive (+)** |
| **Gunning Fog** | `0.4 × [(words/sentences) + 100 × (complex_words/words)]` | School years needed (complex = ≥ 4 syllables). Lower → easier. **Desired Δ%: negative (−)** |
| **Coleman-Liau** | `0.0588 × L − 0.296 × S − 15.8` | Character-based grade level. Lower → easier. **Desired Δ%: negative (−)** |

### Difficulty markers

Fine-grained syntactic, morphological, and lexical features.
Δ is absolute change; Δ% is percentage change.
Difficult words are defined as not a named entity, ≥ 4 syllables, counted on the
surface (orthographic) form.

| Marker | Description | Desired Δ% |
|---|---|---|
| **Avg word syllables** | Mean syllable count per word | − (shorter words) |
| **Difficult word ratio (orth)** | Difficult words / all words (surface, excl. NEs) | − |
| **Difficult noun ratio (orth)** | Difficult nouns / all tokens (surface, excl. NEs) | − |
| **Verb ratio** | Verbs / all tokens | + (more verbal, less nominal) |
| **Avg sentence length** | Mean tokens per sentence | − (shorter sentences) |
| **Mean dep. distance** | Avg linear head-dependent distance (syntax complexity) | − (flatter syntax) |
| **Subordination index** | Subordinate clauses / total clauses | − |
| **Adverbial participle ratio** | Adverbial participles (converbs, e.g. *czytając*, *przeczytawszy*) / all tokens | − |
| **Gerund ratio** | Gerunds / all tokens | − |
| **Impersonal verb ratio** | Impersonal verb forms (modals *należy*/*trzeba*, -no/-to passives, reflexive *się*, infinitives) / all verbs | − |
| **Genitive noun ratio** | Nouns in genitive case / all tokens | − |
| **Avg genitive chain** | Mean length of consecutive genitive noun phrases | − |
| **Verbo-nominal ratio** | Light-verb + noun periphrases (*dokonać wpłaty*, *podjąć decyzję*); administrative style | − |
| **OSC noun ratio** | Abstract *-ość* nouns (*możliwość*, *konieczność*) / all nouns | − |

### Similarity metrics

Reference-based metrics comparing simplified text against the original.

| Metric | Description | Direction |
|---|---|---|
| **NLI P / R / F1** | NLI consistency via stella embeddings + mDeBERTa cross-encoder | Higher = stronger entailment |
| **NE Retention** | Fraction of named entities from the original kept in the simplified text | Higher = more entities preserved |

*Only **NLI F1** feeds the RRF score; P and R are shown for context.*

### QuestEval - QA consistency

| Metric | Description | Direction |
|---|---|---|
| **QuestEval P** | Backward precision - grounding of simplified claims | Higher = fewer hallucinations |
| **QuestEval R** | Forward recall - information preserved | Higher = less content dropped |
| **QuestEval F1** | Harmonic mean of P and R | Higher = better meaning preservation |
| **Answerable (fwd)** | Fraction of forward questions answerable | Higher = stays on-topic |
| **Answerable (bwd)** | Fraction of backward questions answerable | Higher = claims traceable to original |

*Only **QuestEval F1** feeds the RRF score; the other rows are shown for context.*

### IFEval - instruction following

| Metric | Description | Direction |
|---|---|---|
| **IFEval include** | Fraction of *include* constraints (terms the simplification must keep) satisfied | Higher = better |
| **IFEval exclude** | Fraction of *exclude* constraints (terms the simplification must avoid) satisfied | Higher = better |
"""

# Sample-count note shown under each table that carries an ``N`` column
# (``build_app`` places it beneath the final-ranking table and every RRF
# category table).
N_NOTE = "**N** = number of prompt × text evaluations per model."

# The five simplification prompts every model is run with. The keys match the
# "Simplification prompt" filter values (and the ``*_by_prompt`` summary
# buckets); each value is ``(short description, user-message template)``, where
# ``<text>`` marks where the source text is inserted. Kept in sync with
# generation/prompting/instruction.py. Ordered from least to most detailed.
# ``build_app`` renders each entry as a collapsed accordion titled
# ``"<key> - <short description>"`` with the template in a fenced code block,
# so the templates are display-only here and must not be reworded.
PROMPTS: dict[str, tuple[str, str]] = {
    "mini": (
        "Minimal - a single-line instruction, no rules.",
        "Uprość podany tekst, w odpowiedzi podaj jedynie uproszczony tekst, "
        "bez dodatkowych komentarzy. Tekst do uproszczenia:\n\n<text>",
    ),
    "compact": (
        "Compact - a short bulleted rule set.",
        """Uprość poniższy tekst, tak aby był jasny, krótki i zrozumiały dla osoby bez wiedzy specjalistycznej.

Zasady:
- Skup się na najważniejszych informacjach, usuń zbędne treści.
- Uporządkuj tekst: najważniejsze na początku, podziel na krótkie akapity.
- Używaj prostego, codziennego słownictwa; trudne pojęcia zastępuj lub krótko wyjaśniaj.
- Twórz krótkie zdania (jedna myśl = jedno zdanie).
- Pisz bezpośrednio do odbiorcy i używaj strony czynnej.
- Unikaj żargonu, urzędowego stylu, skomplikowanych konstrukcji i podwójnych przeczeń.
- Zachowaj poprawność językową i logiczną spójność.
- W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.

---

### Tekst do uproszczenia:

<text>""",
    ),
    "medium": (
        "Medium - moderately detailed rules with sub-points.",
        """Uprość poniższy tekst tak, aby był zrozumiały dla osoby bez wiedzy specjalistycznej, zachowując jego sens.

### Zasady:
- Skup się na celu tekstu i najważniejszych informacjach — usuń dygresje i zbędne treści.
- Uporządkuj treść: zacznij od najważniejszych informacji, podziel tekst na krótkie akapity.
- Stosuj proste i naturalne słownictwo:
  - zamieniaj trudne lub specjalistyczne wyrazy na prostsze,
  - jeśli trzeba — krótko je wyjaśnij lub podaj przykład.
- Buduj krótkie zdania (jedna myśl = jedno zdanie, ok. 15–20 słów).
- Pisz bezpośrednio do odbiorcy i używaj strony czynnej.
- Unikaj:
  - żargonu, stylu urzędowego i zapożyczeń,
  - form bezosobowych i strony biernej (jeśli nie są konieczne),
  - nadmiaru rzeczowników odczasownikowych,
  - podwójnych przeczeń i zawiłych konstrukcji.
- Zachowaj poprawność językową, spójność i logiczny układ tekstu.
- W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.

---

### Tekst do uproszczenia:

<text>""",
    ),
    "long": (
        "Long - full, sectioned plain-language guidelines.",
        """Uprość poniższy tekst zgodnie z zasadami prostego języka.

### 1. Cel i odbiorca
- Dostosuj tekst do przeciętnego odbiorcy (zakładaj brak wiedzy specjalistycznej).
- Skup się na najważniejszych informacjach.

### 2. Struktura
- Usuń informacje zbędne i poboczne.
- Uporządkuj treść: najważniejsze informacje podaj na początku.
- Podziel tekst na krótkie akapity (1 akapit = 1 myśl).
- Jeśli tekst jest dłuższy, użyj nagłówków lub list.

### 3. Słownictwo
- Zastępuj trudne słowa prostszymi.
- Unikaj:
  - terminów specjalistycznych (chyba że je wyjaśnisz),
  - słów rzadkich, książkowych i urzędowych,
  - zapożyczeń i modnych zwrotów,
  - skrótów niezrozumiałych dla odbiorcy.
- W razie potrzeby:
  - wyjaśnij trudne pojęcia,
  - podaj przykłady,
  - używaj konkretnych nazw zamiast ogólników.

### 4. Składnia
- Twórz krótkie zdania (ok. 20 słów).
- Jedno zdanie = jedna myśl.
- Używaj zdań twierdzących.
- Pisz bezpośrednio do odbiorcy (np. „Ty”, „możesz”, „zrób”).
- Używaj strony czynnej zamiast biernej.
- Unikaj form bezosobowych i skomplikowanych konstrukcji.
- Ogranicz rzeczowniki odczasownikowe (zamieniaj je na czasowniki).

### 5. Styl
- Unikaj podwójnych przeczeń.
- Upraszczaj złożone konstrukcje.
- Zachowaj naturalny, jasny ton.

### 6. Końcowa kontrola
- Sprawdź, czy tekst jest:
  - zrozumiały,
  - poprawny językowo,
  - logiczny i spójny.

### 7. W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.

---

### Tekst do uproszczenia:

<text>""",
    ),
    "step_by_step": (
        "Step by step - role-based, numbered editorial guidelines.",
        """Jesteś redaktorem odpowiedzialnym za wydawanie tekstów pisanych prostym językiem. Krok po kroku upraszczaj tekst kierując się poniższymi regułami:

1. Zachowaj prostotę - unikaj skomplikowanych i trudnych słów oraz zdań.
2. Używaj zrozumiałego języka - pamiętaj, że tekst ma być czytelny dla szerokiego grona odbiorców, nie tylko dla osób z wiedzą specjalistyczną z danej dziedziny.
3. Unikaj języka żargonowego czy specjalistycznego, jeśli czujesz, że nie jest on zrozumiały dla większości czytelników.
4. Unikaj zbytnio skomplikowanych zwrotów - stawiaj na klarowność i prostotę.
5. Utrzymuj spójność - trzymaj się jednego stylu pisania i konsekwentnie go stosuj.
6. Unikaj nadmiernego użycia skrótów czy akronimów – jeśli ich używasz, to upewnij się, że są one łatwe do zrozumienia dla wszystkich czytelników.
7. Przemyśl kolejność i strukturę informacji - rozmieszczenie treści powinno być logiczne i przemyślane.
8. Unikaj zbędnego wydłużania tekstu, pisz zwięźle i konkretnie.
9. W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.

---

### Tekst do uproszczenia:

<text>""",
    ),
}

# ── PLCC-inspired visual style ──────────────────────────────────────────────
# Mirrors the sdadas/plcc leaderboard: clean white background, a system
# sans-serif stack, a blue accent (#2c7be5), and flat tables with shaded
# (#f9fafd) headers, #ddd borders and faintly striped rows. Done entirely in
# CSS — a custom gr.themes.* would tint the component label chips blue, which
# is not part of the PLCC look.
# Passed to ``gr.Blocks(css=...)`` in ``build_app``; the ``plain-table``,
# ``params-col`` and ``filter-bar`` selectors match the ``elem_classes`` that
# ``build_app`` attaches to its tables and filter row. The ``params-col``
# rules target the third column by position, so they rely on "Params" staying
# the third column of any table carrying that class.
PLCC_CSS = """
.gradio-container {
    font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto,
        "Helvetica Neue", Arial, sans-serif !important;
    max-width: 1500px !important;
}
/* PLCC-style data tables */
.plain-table table { border: 1px solid #ddd !important; font-size: 0.9em; }
.plain-table thead th {
    background: #f9fafd !important;
    border-bottom: 2px solid #ddd !important;
    color: #222 !important;
    font-weight: 700 !important;
}
.plain-table tbody td { padding: 8px 10px !important; }
.plain-table tbody tr:nth-child(even) > td { background: rgba(0, 0, 0, 0.02) !important; }
.plain-table tbody tr:hover > td { background: rgba(44, 123, 229, 0.06) !important; }
/* Params column (3rd) — right-aligned, PLCC-style; cells muted grey, header black */
.params-col tbody td:nth-child(3),
.params-col thead th:nth-child(3) {
    text-align: right !important;
    white-space: nowrap;
}
.params-col tbody td:nth-child(3) { color: #999 !important; }
/* Filter bar — the grey rounded block holding the dropdowns */
.filter-bar {
    background: #f9fafd;
    border: 1px solid #ddd;
    border-radius: 0.5rem;
    padding: 10px 14px;
}
"""

# Colour palette for category bars (seven hex colours). The first and sixth
# entries are the same values the two trade-off scatters hard-code for their
# markers.
# NOTE(review): no use of this constant is visible in this part of the file —
# confirm it is still referenced before relying on it.
_CAT_COLORS = ["#4C78A8", "#72B7B2", "#54A24B", "#EECA3B", "#B279A2", "#E45756", "#F58518"]


def _filter_model_rows(df: pd.DataFrame, allowed: set[str]) -> pd.DataFrame:
    """Drop every row whose ``Model`` value is not in ``allowed``.

    Pure row filter: the remaining rows keep their existing rank/score values
    and only the index is renumbered from zero. A frame that is empty or has
    no ``Model`` column is handed back unchanged.
    """
    filterable = not df.empty and "Model" in df.columns
    if not filterable:
        return df
    keep_mask = df["Model"].isin(allowed)
    return df.loc[keep_mask].reset_index(drop=True)


def load_rrf_views(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> tuple[pd.DataFrame, list[tuple[dict, pd.DataFrame]]]:
    """Build the final-ranking table and the per-category tables for the filters.

    Ranking happens in two stages. First, every category table and the fused
    final ranking are computed from the text-category / prompt selection only,
    i.e. across **all** models. Second, the size-limit and model-type
    selections are applied purely as row filters on those finished tables, so
    a model that stays visible keeps the rank it held in the full table.

    Returns:
        ``(final_df, category_data)`` where ``category_data`` pairs each entry
        of ``CATEGORIES`` with its filtered DataFrame, in ``CATEGORIES`` order.
    """
    # Stage 1: rank over the full model set.
    per_category: list[tuple[dict, pd.DataFrame]] = []
    for cat in CATEGORIES:
        per_category.append((cat, load_category_df(cat, text_category, prompt)))
    ranking = build_final_ranking_df(per_category)

    # Stage 2: hide models outside the size / type selection, leaving ranks as-is.
    visible_models = {
        _model_label(record) for record in _filtered_records(size_limit, model_type)
    }
    visible_ranking = _filter_model_rows(ranking, visible_models)
    visible_categories = [
        (cat, _filter_model_rows(frame, visible_models)) for cat, frame in per_category
    ]
    return visible_ranking, visible_categories


def _tradeoff_figs(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> tuple[go.Figure, go.Figure]:
    """Return the QuestEval and NLI trade-off scatters for the given filters.

    Each builder's result is replaced by a blank ``go.Figure()`` when it comes
    back falsy (the builders return ``None`` when there is nothing to plot),
    so callers always receive two figures.
    """
    filters = (text_category, prompt, size_limit, model_type)
    questeval_fig = build_tradeoff_scatter(*filters) or go.Figure()
    nli_fig = build_fog_nli_scatter(*filters) or go.Figure()
    return questeval_fig, nli_fig


def build_app() -> gr.Blocks:
    """Assemble the PLainBench Gradio interface and wire up its filters.

    Page layout, top to bottom: the intro text, a filter bar (text category,
    simplification prompt, size limit, model type), the result tabs (final
    ranking, one tab per in-RRF category, the trade-off plots, a hidden
    detailed-scores tab and - when comparison data exists - the IFEval
    manual-vs-auto tab), the metric documentation, and the prompt templates in
    collapsed accordions. Changing any filter re-renders every reactive table
    and plot. When no leaderboard data is loaded, only the intro and a
    "no data" notice are shown.
    """
    # Only the first frame (emptiness probe) and the last (detail table) are
    # used here; the others are unpacked just to keep the 8-tuple contract.
    (
        read_orth_df, _read_lemma_df,
        _lex_orth_df, _lex_lemma_df,
        _similarity_df, _questeval_df,
        _markers_df, detail_df,
    ) = load_leaderboard_data()

    ifeval_cmp_df = load_ifeval_comparison_df()
    initial_final_df, initial_categories = load_rrf_views(None, None)
    category_options = text_category_choices()
    prompt_options = prompt_choices()
    size_options = _visible_size_limits()
    initial_qa_fig, initial_nli_fig = _tradeoff_figs(None, None)

    with gr.Blocks(title="PLainBench", css=PLCC_CSS) as app:
        gr.Markdown(INTRO)

        if read_orth_df.empty:
            gr.Markdown("*No data found. Upload scored anon JSON files to the `data/current/` directory.*")
            return app

        # Components refreshed by the filter handler. Their order here must
        # match the order of the values ``_refresh_rrf`` returns: final table,
        # one table per in-RRF category, the two scatters, then (optionally)
        # the IFEval comparison table.
        reactive: list = []

        with gr.Row(elem_classes=["filter-bar"]):
            category_dd = gr.Dropdown(
                choices=category_options,
                value="All",
                label="Text category",
                info="Filter the RRF rankings to one source-text category.",
            )
            prompt_dd = gr.Dropdown(
                choices=prompt_options,
                value="All",
                label="Simplification prompt",
                info="Filter the RRF rankings to one simplification prompt.",
            )
            size_dd = gr.Dropdown(
                choices=size_options,
                value="ALL",
                label="Size limit",
                info="Keep only models up to this many parameters.",
            )
            type_dd = gr.Dropdown(
                choices=MODEL_TYPES,
                value="ALL",
                label="Model type",
                info="Filter by open- vs closed-weights models.",
            )

        with gr.Tabs():

            # Final ranking.
            with gr.TabItem("Final Ranking"):
                gr.Markdown(
                    "Final model ranking via **Reciprocal Rank Fusion** (k=60) over per-category RRF scores. "
                    "Each category ranks models by its own RRF score; those ranks are then fused into a "
                    "single **Final RRF** score. Higher = better overall simplification. "
                    "The **PLCC** column shows the model's score on the external "
                    "[PLCC](https://huggingface.co/spaces/sdadas/plcc) Polish-language-competence "
                    "benchmark for reference only - it does not affect the ranking (blank where unavailable)."
                )
                final_table = gr.Dataframe(
                    value=initial_final_df, interactive=False, wrap=True,
                    elem_classes=["plain-table", "params-col"],
                )
                gr.Markdown(N_NOTE)
                reactive.append(final_table)

            # One tab per category that takes part in the RRF fusion.
            for cat, cat_df in initial_categories:
                if cat.get("in_rrf", True):
                    with gr.TabItem(cat["name"]):
                        gr.Markdown(cat["description"])
                        cat_table = gr.Dataframe(
                            value=cat_df, interactive=False, wrap=True,
                            elem_classes=["plain-table", "params-col"],
                        )
                        gr.Markdown(N_NOTE)
                        reactive.append(cat_table)

            # Trade-off plots.
            with gr.TabItem("Trade-off"):
                gr.Markdown(
                    "Complexity reduction (Gunning Fog orth Δ%) versus meaning preservation "
                    "(QuestEval F1), one point per model. Top-left is ideal: "
                    "greater complexity reduction **and** faithful to the original."
                )
                tradeoff_plot = gr.Plot(value=initial_qa_fig)
                gr.Markdown(
                    "---\n"
                    "Gunning Fog orth reduction (Δ%) versus NLI F1. "
                    "Top-left is best: greater complexity reduction **and** strong NLI entailment."
                )
                fog_nli_plot = gr.Plot(value=initial_nli_fig)
                reactive.extend([tradeoff_plot, fog_nli_plot])

            # Static (and currently hidden) detail table; not filter-reactive.
            with gr.TabItem("Detailed scores", visible=False):
                gr.Markdown(
                    "Average scores before and after simplification, plus absolute (Δ) "
                    "and percentage (Δ%) change - for all readability, lexical, and marker metrics."
                )
                gr.Dataframe(
                    value=detail_df, interactive=False, wrap=True,
                    elem_classes=["plain-table"],
                )

            # IFEval: manual vs automatic. The tab exists only when the
            # unfiltered comparison has rows.
            if not ifeval_cmp_df.empty:
                with gr.TabItem("IFEval manual vs auto"):
                    gr.Markdown(
                        "**Automatic** IFEval constraints are generated by an LLM; "
                        "**manual** constraints are hand-written gold rules, available for a "
                        "subset of the prompts. To isolate rule quality from sampling, the "
                        "comparison is restricted to the texts that carry **both** scores "
                        "(N = matched texts per model), so these automatic figures differ from "
                        "the full-sample IFEval used elsewhere.\n\n"
                        "**include** = fraction of *include* constraints satisfied, "
                        "**exclude** = fraction of *exclude* constraints satisfied (higher is "
                        "better for both). **Δ = manual − automatic** (on the matched texts): a "
                        "negative Δ means the automatic rules were easier to satisfy than the "
                        "hand-checked ones (more lenient automatic scoring). The **(all)** columns "
                        "show automatic IFEval over *every* text (the full-sample figure used "
                        "elsewhere). **Δ (man−auto all)** is manual minus that full-sample "
                        "automatic value - useful as a sanity check, but note the two cover "
                        "different text sets (matched subset vs. all texts), so **Δ (man−auto)** "
                        "is the rigorous like-for-like comparison."
                    )
                    ifeval_cmp_table = gr.Dataframe(
                        value=ifeval_cmp_df, interactive=False, wrap=True,
                        elem_classes=["plain-table"],
                    )
                    reactive.append(ifeval_cmp_table)

        # Metric documentation, shown below the results.
        gr.Markdown(METRICS_DOC)

        # Prompt templates, documenting the values of the "Simplification
        # prompt" filter; shown below the metric documentation.
        gr.Markdown(
            "## Simplification prompts\n\n"
            "The five prompt templates every model is run with - these are the "
            "values of the **Simplification prompt** filter above. Each source "
            "text is simplified once per prompt, so they range from a bare "
            "one-line instruction to full plain-language guidelines. "
            "`<text>` marks where the source text is inserted."
        )
        for key, (summary_line, template) in PROMPTS.items():
            with gr.Accordion(f"{key} - {summary_line}", open=False):
                gr.Markdown(f"```\n{template}\n```")

        filter_inputs = [category_dd, prompt_dd, size_dd, type_dd]

        def _refresh_rrf(
            text_category: str, prompt: str, size_limit: str, model_type: str
        ) -> list:
            """Recompute every reactive table/plot for the current filter values."""
            ranking, per_category = load_rrf_views(text_category, prompt, size_limit, model_type)
            refreshed: list = [ranking]
            refreshed.extend(df for cat, df in per_category if cat.get("in_rrf", True))
            refreshed.extend(_tradeoff_figs(text_category, prompt, size_limit, model_type))
            # Mirrors the build-time condition so the value count matches ``reactive``.
            if not ifeval_cmp_df.empty:
                refreshed.append(
                    load_ifeval_comparison_df(text_category, prompt, size_limit, model_type)
                )
            return refreshed

        # Any filter change refreshes all reactive outputs at once.
        for dropdown in filter_inputs:
            dropdown.change(_refresh_rrf, inputs=filter_inputs, outputs=reactive)

    return app


# Build the UI at import time so ``app`` is available as a module attribute;
# running this file as a script additionally calls ``app.launch()``.
app = build_app()

if __name__ == "__main__":
    app.launch()