Spaces:

QuanticaLab
/

PLainBench

Running

App Files Files Community

bartlomiejn87 committed 8 days ago

Commit

3bd48fe

0 Parent(s):

Initial commit

Browse files

Files changed (27) hide show

.gitattributes +35 -0
README.md +14 -0
app.py +1400 -0
data/current/CYFRAGOVPL__Llama-PLLuM-70B-chat-2412_2026-06-15_095534_scored_anon.json +0 -0
data/current/CYFRAGOVPL__Llama-PLLuM-70B-chat-2512_2026-06-12_082622_scored_anon.json +0 -0
data/current/CYFRAGOVPL__Llama-PLLuM-8B-chat-2412_2026-06-02_091112_scored_anon.json +0 -0
data/current/CYFRAGOVPL__Llama-PLLuM-8B-chat-2512_2026-06-02_121044_scored_anon.json +0 -0
data/current/CYFRAGOVPL__PLLuM-12B-chat-2412_2026-06-02_141510_scored_anon.json +0 -0
data/current/CYFRAGOVPL__PLLuM-12B-chat-2512_2026-06-02_195811_scored_anon.json +0 -0
data/current/CYFRAGOVPL__PLLuM-12B-instruct-2512_2026-06-10_102424_scored_anon.json +0 -0
data/current/CYFRAGOVPL__PLLuM-4B-chat-2512_2026-06-02_223411_scored_anon.json +0 -0
data/current/deepseek__deepseek-v4-pro_reasoning-high_2026-05-31_094932_scored_anon.json +0 -0
data/current/google__gemini-3.1-pro-preview_2026-06-11_121124_scored_anon.json +0 -0
data/current/google__gemma-3-4b-it_reasoning-none_2026-06-08_110604_scored_anon.json +0 -0
data/current/google__gemma-4-26b-a4b-it_reasoning-high_2026-05-31_223337_scored_anon.json +0 -0
data/current/google__gemma-4-26b-a4b-it_reasoning-none_2026-06-01_020338_scored_anon.json +0 -0
data/current/google__gemma-4-31b-it_reasoning-high_2026-05-31_124753_scored_anon.json +0 -0
data/current/google__gemma-4-31b-it_reasoning-none_2026-05-31_200347_scored_anon.json +0 -0
data/current/meta-llama__llama-3.1-70b-instruct_reasoning-none_2026-06-08_102826_scored_anon.json +0 -0
data/current/meta-llama__llama-3.1-8b-instruct_reasoning-none_2026-06-08_100015_scored_anon.json +0 -0
data/current/mistralai__ministral-8b-2512_2026-05-31_083128_scored_anon.json +0 -0
data/current/mistralai__mistral-nemo_2026-05-31_084528_scored_anon.json +0 -0
data/current/openai__gpt-oss-120b_2026-06-12_123249_scored_anon.json +0 -0
data/current/openai__gpt-oss-20b_2026-06-12_133408_scored_anon.json +0 -0
data/current/qwen__qwen3.5-35b-a3b_reasoning-high_2026-06-01_023022_scored_anon.json +0 -0
data/current/speakleash__Bielik-11B-v3.0-Instruct_2026-06-01_112337_scored_anon.json +0 -0
requirements.txt +2 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,14 @@

+---
+title: PLainBench
+emoji: ⚡
+colorFrom: green
+colorTo: yellow
+sdk: gradio
+sdk_version: 6.12.0
+app_file: app.py
+pinned: false
+license: apache-2.0
+short_description: Benchmark for scoring LLMs in text simplification
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,1400 @@

+"""PLainBench - Polish Text Simplification Leaderboard.
+Reads scored anon JSON files from the data/current/ directory and displays a
+leaderboard showing how well each LLM simplifies Polish texts, measured
+by readability indices, difficulty markers, reference-based similarity
+metrics, and a QuestEval-style QA consistency score.
+"""
+import json
+from functools import lru_cache
+from pathlib import Path
+import gradio as gr
+import pandas as pd
+import plotly.graph_objects as go
+DATA_DIR = Path(__file__).parent / "data" / "current"
+@lru_cache(maxsize=1)
+def load_records() -> tuple[dict, ...]:
+    """Parse every scored anon JSON once and cache the result.
+    The full files are large (~9 MB each, holding per-text records), but the
+    app only ever reads ``metadata`` and ``summary``. We keep just those two
+    sections so each file is parsed a single time and every loader/refresh
+    reuses the in-memory copy instead of re-reading from disk.
+    """
+    records: list[dict] = []
+    if not DATA_DIR.exists():
+        return ()
+    for fp in sorted(DATA_DIR.glob("*_scored_anon.json")):
+        with open(fp, encoding="utf-8") as f:
+            data = json.load(f)
+        records.append({"metadata": data["metadata"], "summary": data["summary"]})
+    return tuple(records)
+# Model-level filters, mirroring the PLCC benchmark's "Size limit" / "Model type"
+# quick-filters. Size options are *upper bounds* in billions of parameters.
+# ``_model_passes`` parses a numeric option by stripping the trailing "B" and
+# converting the rest to float; "ALL" disables the size filter.
+SIZE_LIMITS = ["ALL", "500B", "100B", "50B", "30B", "20B", "15B", "10B", "5B"]
+# ``_model_passes`` maps "open-weights" / "closed-weights" to the metadata
+# ``weights`` values "open" / "closed"; "ALL" disables the type filter.
+MODEL_TYPES = ["ALL", "open-weights", "closed-weights"]
+def _model_passes(meta: dict, size_limit: str, model_type: str) -> bool:
+    """Whether a model's metadata satisfies the size-limit / model-type filters."""
+    if model_type and model_type != "ALL":
+        want = "open" if model_type == "open-weights" else "closed"
+        if meta.get("weights") != want:
+            return False
+    if size_limit and size_limit != "ALL":
+        cap = float(size_limit.rstrip("B"))
+        params = meta.get("total_params_b") or 0
+        # Unknown / unreported size (0) can't be placed under a cap, so exclude it.
+        if params <= 0 or params > cap:
+            return False
+    return True
+def _filtered_records(
+    size_limit: str | None = None, model_type: str | None = None
+) -> list[dict]:
+    """Records whose model passes the size-limit / model-type filters."""
+    sl = size_limit or "ALL"
+    mt = model_type or "ALL"
+    return [d for d in load_records() if _model_passes(d["metadata"], sl, mt)]
+def _visible_size_limits() -> list[str]:
+    """Prune ``SIZE_LIMITS`` to the caps that actually split the current models.
+    A numeric cap is redundant when it selects the same set of models as the
+    next-smaller cap (no model has a size in the band between them) - those
+    upper duplicates are hidden, as is any cap below every model. ``"ALL"`` is
+    always kept. Recomputed from the data, so adding models later automatically
+    re-expands the list.
+    """
+    params = [
+        p for d in load_records()
+        if (p := d["metadata"].get("total_params_b") or 0) > 0
+    ]
+    # Ascending by value: keep the smallest representative of each distinct
+    # subset; a larger cap with the same model count is the redundant "upper" one.
+    kept: set[str] = set()
+    prev_count = -1
+    for s in sorted(
+        (s for s in SIZE_LIMITS if s != "ALL"), key=lambda s: float(s.rstrip("B"))
+    ):
+        cap = float(s.rstrip("B"))
+        count = sum(1 for p in params if p <= cap)
+        if count > 0 and count != prev_count:
+            kept.add(s)
+            prev_count = count
+    # Preserve the original descending display order, with ALL first.
+    return ["ALL"] + [s for s in SIZE_LIMITS if s != "ALL" and s in kept]
+# The maps below translate summary keys into display labels (key -> label).
+# ``load_leaderboard_data`` looks the readability / lexical keys up in
+# ``summary["metrics"]``.
+# Readability indices, ``_orth`` variants.
+READABILITY_ORTH_LABELS = {
+    "flesch_reading_ease_orth": "Flesch RE",
+    "flesch_kincaid_grade_orth": "Flesch-Kincaid",
+    "gunning_fog_orth": "Gunning Fog",
+    "ari_orth": "ARI",
+    "linsear_write_orth": "Linsear Write",
+    "smog_grade_orth": "SMOG",
+    "coleman_liau_orth": "Coleman-Liau",
+    "pisarek_orth": "Pisarek",
+}
+# Readability indices, ``_lemma`` variants. The display labels are identical to
+# the orth map above, so the two maps collide when written into one row dict.
+READABILITY_LEMMA_LABELS = {
+    "flesch_reading_ease_lemma": "Flesch RE",
+    "flesch_kincaid_grade_lemma": "Flesch-Kincaid",
+    "gunning_fog_lemma": "Gunning Fog",
+    "ari_lemma": "ARI",
+    "linsear_write_lemma": "Linsear Write",
+    "smog_grade_lemma": "SMOG",
+    "coleman_liau_lemma": "Coleman-Liau",
+    "pisarek_lemma": "Pisarek",
+}
+# Lexical-diversity measures, ``_orth`` variants.
+LEXICAL_ORTH_LABELS = {
+    "ttr_orth": "TTR",
+    "rttr_orth": "RTTR",
+    "cttr_orth": "CTTR",
+    "herdan_orth": "Herdan",
+    "summer_orth": "Summer",
+    "dugast_orth": "Dugast",
+    "maas_orth": "Maas",
+    "mtld_orth": "MTLD",
+    "mattr_orth": "MATTR",
+}
+# Lexical-diversity measures, ``_lemma`` variants (same display labels as the
+# orth map above).
+LEXICAL_LEMMA_LABELS = {
+    "ttr_lemma": "TTR",
+    "rttr_lemma": "RTTR",
+    "cttr_lemma": "CTTR",
+    "herdan_lemma": "Herdan",
+    "summer_lemma": "Summer",
+    "dugast_lemma": "Dugast",
+    "maas_lemma": "Maas",
+    "mtld_lemma": "MTLD",
+    "mattr_lemma": "MATTR",
+}
+# Reference-based similarity scores; keys are looked up in
+# ``summary["similarity"]`` and the values are used as-is (no before/after pair).
+SIMILARITY_LABELS = {
+    "bert_score_precision": "BERTScore P",
+    "bert_score_recall": "BERTScore R",
+    "bert_score_f1": "BERTScore F1",
+    "bleu": "BLEU",
+    "chrf": "chrF",
+    "chrfpp": "chrF++",
+    "nli_precision": "NLI P",
+    "nli_recall": "NLI R",
+    "nli_f1": "NLI F1",
+    "rouge_1_precision": "ROUGE-1 P",
+    "rouge_1_recall": "ROUGE-1 R",
+    "rouge_1_f1": "ROUGE-1 F1",
+    "rouge_2_precision": "ROUGE-2 P",
+    "rouge_2_recall": "ROUGE-2 R",
+    "rouge_2_f1": "ROUGE-2 F1",
+    "rouge_l_precision": "ROUGE-L P",
+    "rouge_l_recall": "ROUGE-L R",
+    "rouge_l_f1": "ROUGE-L F1",
+    "wer": "WER",
+    "mer": "MER",
+    "wil": "WIL",
+    "ne_retention": "NE Retention",
+}
+# Difficulty markers; keys are looked up in ``summary["markers"]``.
+# NOTE(review): ``CATEGORIES`` references the markers key
+# "adverbial_participle_ratio", which has no entry in this map - confirm
+# whether it should also appear in the markers / detail tables.
+MARKER_LABELS = {
+    # counts
+    "paragraph_count": "Paragraph count",
+    "sentence_count": "Sentence count",
+    "word_count": "Word count",
+    "named_entity_count": "Named entity count",
+    "difficult_word_count": "Difficult word count",
+    "difficult_word_count_orth": "Difficult word count (orth)",
+    # average lengths
+    "avg_word_syllables": "Avg word syllables",
+    "avg_sentence_length": "Avg sentence length",
+    "avg_paragraph_length": "Avg paragraph length",
+    # lexical difficulty
+    "named_entity_ratio": "Named entity ratio",
+    "difficult_word_ratio": "Difficult word ratio",
+    "difficult_word_ratio_orth": "Difficult word ratio (orth)",
+    # POS ratios
+    "noun_ratio": "Noun ratio",
+    "difficult_noun_ratio": "Difficult noun ratio",
+    "difficult_noun_ratio_orth": "Difficult noun ratio (orth)",
+    "verb_ratio": "Verb ratio",
+    "difficult_verb_ratio": "Difficult verb ratio",
+    "difficult_verb_ratio_orth": "Difficult verb ratio (orth)",
+    "adjective_ratio": "Adjective ratio",
+    "difficult_adjective_ratio": "Difficult adjective ratio",
+    "difficult_adjective_ratio_orth": "Difficult adjective ratio (orth)",
+    # POS-to-POS ratios
+    "noun_to_verb_ratio": "Noun/verb ratio",
+    "verbo_nominal_ratio": "Verbo-nominal ratio",
+    "adj_to_verb_ratio": "Adj/verb ratio",
+    "adj_to_noun_ratio": "Adj/noun ratio",
+    # morphological
+    "nie_prefix_ratio": "Nie-prefix ratio",
+    "participle_ratio": "Participle ratio",
+    "gerund_ratio": "Gerund ratio",
+    "osc_noun_ratio": "OSC noun ratio",
+    "impersonal_verb_ratio": "Impersonal verb ratio",
+    "genitive_noun_ratio": "Genitive noun ratio",
+    "avg_genitive_chain_length": "Avg genitive chain",
+    # syntactic
+    "sentence_length_variance": "Sentence length variance",
+    "mean_dependency_distance": "Mean dep. distance",
+    "subordination_index": "Subordination index",
+}
+# QuestEval-style QA consistency scores; keys are looked up in
+# ``summary["questeval"]`` and the values are used as-is.
+QUESTEVAL_LABELS = {
+    "precision": "QuestEval P",
+    "recall": "QuestEval R",
+    "f1": "QuestEval F1",
+    "answerable_rate_forward": "Answerable (fwd)",
+    "answerable_rate_backward": "Answerable (bwd)",
+}
+# Presumably the ``k`` constant of the RRF score; its use lies outside this
+# excerpt - TODO confirm.
+RRF_K = 60
+# Each category dict carries: "name", "in_rrf", "rrf_weight", a markdown
+# "description", and "metrics" - a list of 5-tuples:
+# (source, key, label, ascending_rrf, in_rrf)
+#   source       — "metrics" | "markers" → use avg_diff_pct (Δ%)
+#                  "similarity" | "questeval" | "ifeval" → use absolute value
+#   ascending_rrf — True = lower value is better (rank 1 = smallest)
+#   in_rrf        — include this metric in category RRF computation
+#                  (``load_category_df`` skips tuples where this is False)
+CATEGORIES: list[dict] = [
+    {
+        "name": "Readability",
+        "in_rrf": True,
+        "rrf_weight": 1,
+        "description": (
+            "Readability indices - **orth** (surface-form) variants. "
+            "Δ% = percentage change after simplification. "
+            "For **Flesch RE** positive Δ% is better; for all others negative Δ% is better. "
+            "**Flesch RE** rates reading ease from average sentence length and syllables per word (higher → easier, ~0–100). "
+            "**Gunning Fog** estimates the schooling years needed to follow the text - 0.4 × (words/sentence + 100 × complex-word share), "
+            "where complex words have many syllables (lower → easier). "
+            "**Coleman-Liau** grades difficulty from average letters per 100 words and sentences per 100 words (lower → easier). "
+            "IFEval exclude is the fraction of 'exclude' constraints satisfied by the simplified text (higher is better)."
+        ),
+        "metrics": [
+            ("metrics", "flesch_reading_ease_orth", "Flesch RE",      False, True),
+            ("metrics", "gunning_fog_orth",         "Gunning Fog",    True,  True),
+            ("metrics", "coleman_liau_orth",        "Coleman-Liau",   True,  True),
+            ("ifeval",  "avg_exclude",              "IFEval exclude", False, True),
+        ],
+    },
+    {
+        "name": "Lexical Difficulty",
+        "in_rrf": True,
+        "rrf_weight": 1,
+        "description": (
+            "Word-level difficulty markers - **orth** variants where available. "
+            "Δ% = percentage change. Negative Δ% indicates reduced lexical difficulty. "
+            "**Avg word syllables** is the mean number of syllables per word (higher → longer, harder vocabulary). "
+            "**Difficult word ratio** is the share of long words (above a syllable threshold on the surface form, excluding named entities) "
+            "(higher → harder). "
+            "**Verb ratio** is verbs divided by alphabetic tokens; a more verbal, less nominal style is easier, so higher is better here. "
+            "**Difficult noun ratio** is the share of long nouns (above the syllable threshold, excluding named entities) among all words "
+            "(higher → more complex nominal vocabulary)."
+        ),
+        "metrics": [
+            ("markers", "avg_word_syllables",        "Avg word syllables",   True,  True),
+            ("markers", "difficult_word_ratio_orth", "Difficult word ratio", True,  True),
+            ("markers", "verb_ratio",                "Verb ratio",           False, True),
+            ("markers", "difficult_noun_ratio_orth", "Difficult noun ratio", True,  True),
+        ],
+    },
+    {
+        "name": "Syntactic",
+        "in_rrf": True,
+        "rrf_weight": 1,
+        "description": (
+            "Sentence and clause structure complexity markers. "
+            "Δ% = percentage change. Negative Δ% generally indicates simpler syntax. "
+            "**Avg sentence length** is the mean number of words per sentence (higher → harder). "
+            "**Mean dep. distance** is the average linear distance between a token and its syntactic head, in tokens (lower → simpler structure). "
+            "**Subordination index** is the ratio of subordinate clauses to total clauses (higher → more embedded structure, harder)."
+        ),
+        "metrics": [
+            ("markers", "avg_sentence_length",      "Avg sentence length",  True, True),
+            ("markers", "sentence_length_variance", "Sentence length var.", True, False),
+            ("markers", "mean_dependency_distance", "Mean dep. distance",   True, True),
+            ("markers", "subordination_index",      "Subordination index",  True, True),
+        ],
+    },
+    {
+        "name": "Morphological",
+        "in_rrf": True,
+        "rrf_weight": 1,
+        "description": (
+            "Polish-specific morphological complexity markers. "
+            "Δ% = percentage change. Negative Δ% indicates reduced morphological complexity. "
+            "**Adverbial participle ratio** is the share of adverbial participles (imiesłowy przysłówkowe / converbs, e.g. *czytając*, *przeczytawszy*) "
+            "among alphabetic tokens - a bookish, formal construction (higher → more complex). "
+            "**Gerund ratio** is the share of Polish verbal nouns (rzeczowniki odsłowne, e.g. *czytanie*) among words "
+            "(higher → more nominalised, formal). "
+            "**Impersonal verb ratio** is the share of impersonal verb forms among all verbs - impersonal modals (*należy*, *trzeba*, *można*), "
+            "passive -no/-to forms (*zrobiono*), reflexive-impersonal *się* (*mówi się*) and infinitives - typical of legal/administrative Polish "
+            "(higher → more impersonal, harder). "
+            "**Genitive noun ratio** is the share of genitive-case nouns among words - a hallmark of formal, bureaucratic Polish "
+            "(higher → harder). "
+            "**Avg genitive chain** is the mean length of runs of two or more consecutive genitive nouns (dopełniacz spiętrzony) "
+            "(higher → more genitive stacking, harder). "
+            "**Verbo-nominal ratio** is the share of light-verb + noun periphrases (*dokonać wpłaty*, *podjąć decyzję*) - a hallmark of "
+            "administrative Polish (higher → harder). "
+            "**OSC noun ratio** is the share of abstract *-ość* nouns (*możliwość*, *konieczność*) among nouns (higher → more abstract, harder)."
+        ),
+        "metrics": [
+            ("markers", "participle_ratio",            "Participle ratio",           True, False),
+            ("markers", "adverbial_participle_ratio",  "Adverbial participle ratio", True, True),
+            ("markers", "gerund_ratio",                "Gerund ratio",               True, True),
+            ("markers", "impersonal_verb_ratio",       "Impersonal verb ratio",      True, True),
+            ("markers", "genitive_noun_ratio",         "Genitive noun ratio",        True, True),
+            ("markers", "avg_genitive_chain_length",   "Avg genitive chain",         True, True),
+            ("markers", "verbo_nominal_ratio",         "Verbo-nominal ratio",        True, True),
+            ("markers", "osc_noun_ratio",              "OSC noun ratio",             True, True),
+        ],
+    },
+    {
+        "name": "Meaning Preservation",
+        "in_rrf": True,
+        # The only category weighted above 1.
+        "rrf_weight": 4,
+        "description": (
+            "Semantic metrics that directly test whether the simplified text says the same thing as the original. "
+            "NLI checks bidirectional entailment; QuestEval checks information preservation via QA. "
+            "NE Retention measures what fraction of named entities from the original appear in the simplified text "
+            "(matched on lemmatised tokens and surface forms, so inflected forms and full-name → surname shortenings are handled). "
+            "IFEval include is the fraction of 'include' constraints satisfied by the simplified text. "
+            "Higher is better for all."
+        ),
+        "metrics": [
+            ("similarity", "nli_f1",        "NLI F1",       False, True),
+            ("questeval",  "f1",            "QuestEval F1", False, True),
+            ("similarity", "ne_retention",  "NE Retention", False, True),
+            ("ifeval",     "avg_include",   "IFEval include", False, True),
+        ],
+    },
+]
+def _col_name(source: str, label: str) -> str:
+    """Column name used in category DataFrames."""
+    if source in ("metrics", "markers"):
+        return f"{label} (Δ%)"
+    return label
+def _model_label(data: dict) -> str:
+    """Return a unique display name, appending reasoning effort when present.
+    The parameter size is shown separately (see :func:`_params_str`), in its
+    own column, mirroring the PLCC leaderboard layout.
+    """
+    model = data["metadata"]["model"]
+    effort = (
+        data["metadata"]
+        .get("model_kwargs", {})
+        .get("extra_body", {})
+        .get("reasoning", {})
+        .get("effort")
+    )
+    if effort is not None:
+        return f"{model} [reasoning: {effort}]"
+    return model
+def _params_str(params: float | None) -> str | None:
+    """PLCC-style parameter label: ``1.6T`` / ``685B`` / ``8B`` (None if unknown)."""
+    p = params or 0
+    if p <= 0:
+        return None
+    return f"{p / 1000:g}T" if p >= 1000 else f"{p:g}B"
+def _params_map() -> dict[str, str]:
+    """Model label → formatted parameter size, read from each file's metadata."""
+    out: dict[str, str] = {}
+    for data in load_records():
+        label = _params_str(data["metadata"].get("total_params_b"))
+        if label:
+            out[_model_label(data)] = label
+    return out
+def _metric_row(
+    label_map: dict,
+    summary_metrics: dict,
+    row: dict,
+    detail_row: dict,
+    *,
+    include_detail: bool = True,
+) -> None:
+    """Populate leaderboard row and detail row from a label→key map."""
+    for key, label in label_map.items():
+        vals = summary_metrics.get(key, {})
+        row[f"{label} (Δ)"] = vals.get("avg_diff")
+        row[f"{label} (Δ%)"] = vals.get("avg_diff_pct")
+        if include_detail:
+            detail_row[f"{label} before"] = vals.get("avg_before")
+            detail_row[f"{label} after"] = vals.get("avg_after")
+            detail_row[f"{label} (Δ)"] = vals.get("avg_diff")
+            detail_row[f"{label} (Δ%)"] = vals.get("avg_diff_pct")
+def load_leaderboard_data() -> tuple[pd.DataFrame, ...]:
+    """Load scored JSON files and build leaderboard DataFrames.
+    Returns:
+        (readability_orth_df, readability_lemma_df,
+         lexical_orth_df, lexical_lemma_df,
+         similarity_df, questeval_df, markers_df, detail_df)
+    """
+    read_orth_rows, read_lemma_rows = [], []
+    lex_orth_rows, lex_lemma_rows = [], []
+    similarity_rows, questeval_rows, markers_rows, detail_rows = [], [], [], []
+    if not DATA_DIR.exists():
+        empty = pd.DataFrame()
+        return empty, empty, empty, empty, empty, empty, empty, empty
+    for data in load_records():
+        model = _model_label(data)
+        n = data["summary"]["n"]
+        metrics = data["summary"]["metrics"]
+        similarity = data["summary"].get("similarity", {})
+        questeval = data["summary"].get("questeval", {})
+        markers = data["summary"].get("markers", {})
+        base = {"Model": model, "N": n}
+        read_orth_row = dict(base)
+        read_lemma_row = dict(base)
+        lex_orth_row = dict(base)
+        lex_lemma_row = dict(base)
+        similarity_row = dict(base)
+        questeval_row = dict(base)
+        markers_row = dict(base)
+        detail_row = dict(base)
+        _metric_row(READABILITY_ORTH_LABELS, metrics, read_orth_row, detail_row)
+        _metric_row(READABILITY_LEMMA_LABELS, metrics, read_lemma_row, detail_row)
+        _metric_row(LEXICAL_ORTH_LABELS, metrics, lex_orth_row, detail_row)
+        _metric_row(LEXICAL_LEMMA_LABELS, metrics, lex_lemma_row, detail_row)
+        for key, label in SIMILARITY_LABELS.items():
+            similarity_row[label] = similarity.get(key)
+        for key, label in QUESTEVAL_LABELS.items():
+            questeval_row[label] = questeval.get(key)
+        for key, label in MARKER_LABELS.items():
+            vals = markers.get(key, {})
+            markers_row[f"{label} (Δ)"] = vals.get("avg_diff")
+            markers_row[f"{label} (Δ%)"] = vals.get("avg_diff_pct")
+            detail_row[f"{label} before"] = vals.get("avg_before")
+            detail_row[f"{label} after"] = vals.get("avg_after")
+            detail_row[f"{label} (Δ)"] = vals.get("avg_diff")
+            detail_row[f"{label} (Δ%)"] = vals.get("avg_diff_pct")
+        read_orth_rows.append(read_orth_row)
+        read_lemma_rows.append(read_lemma_row)
+        lex_orth_rows.append(lex_orth_row)
+        lex_lemma_rows.append(lex_lemma_row)
+        similarity_rows.append(similarity_row)
+        questeval_rows.append(questeval_row)
+        markers_rows.append(markers_row)
+        detail_rows.append(detail_row)
+    dfs = [
+        pd.DataFrame(read_orth_rows),
+        pd.DataFrame(read_lemma_rows),
+        pd.DataFrame(lex_orth_rows),
+        pd.DataFrame(lex_lemma_rows),
+        pd.DataFrame(similarity_rows),
+        pd.DataFrame(questeval_rows),
+        pd.DataFrame(markers_rows),
+        pd.DataFrame(detail_rows),
+    ]
+    for df in dfs:
+        num_cols = df.select_dtypes(include="number").columns
+        df[num_cols] = df[num_cols].round(4)
+    return tuple(dfs)
+@lru_cache(maxsize=1)
+def _load_ifeval_records() -> tuple[tuple[str, tuple[tuple, ...]], ...]:
+    """Per-model matched IFEval records, cached once.
+    Manual IFEval rules are hand-written for a subset of the prompts, so the
+    comparison only makes sense on records carrying *both* an automatic and a
+    manual score. This reads the per-text ``results`` arrays (which
+    ``load_records`` discards) once and keeps, per model, the tuples
+    ``(category, prompt_id, auto_include, auto_exclude, man_include,
+    man_exclude)`` so the dropdown filters can re-aggregate cheaply.
+    """
+    out: list[tuple[str, tuple[tuple, ...]]] = []
+    if not DATA_DIR.exists():
+        return ()
+    for fp in sorted(DATA_DIR.glob("*_scored_anon.json")):
+        with open(fp, encoding="utf-8") as f:
+            data = json.load(f)
+        model = _model_label(data)
+        recs: list[tuple] = []
+        for rec in data["results"]:
+            man = rec.get("ifeval_manual")
+            auto = rec.get("ifeval")
+            if not man or not auto:
+                continue
+            recs.append((
+                rec.get("category"),
+                rec.get("prompt_id"),
+                auto.get("include"), auto.get("exclude"),
+                man.get("include"), man.get("exclude"),
+            ))
+        if recs:
+            out.append((model, tuple(recs)))
+    return tuple(out)
+def load_ifeval_comparison_df(
+    text_category: str | None = None,
+    prompt: str | None = None,
+    size_limit: str | None = None,
+    model_type: str | None = None,
+) -> pd.DataFrame:
+    """Compare manual (gold) IFEval against automatic IFEval, per model.
+    The comparison is restricted to records carrying *both* an automatic and a
+    manual score - the very same texts scored both ways, which isolates the
+    rule-quality gap from sampling differences (the overall ``ifeval`` summary
+    averages over ~5× more texts and so is not directly comparable). ``Δ``
+    columns are manual − automatic: a negative value means the automatic
+    constraints were easier to satisfy than the hand-checked ones, i.e. the
+    automatic rules are more lenient.
+    ``text_category`` and ``prompt`` (each ignored when ``None`` or ``"All"``)
+    restrict the matched records to one source-text category and/or one
+    simplification prompt, mirroring the RRF dropdown filters.
+    """
+    tc = None if text_category in (None, "All") else text_category
+    pr = None if prompt in (None, "All") else prompt
+    # Automatic IFEval over *all* records (not just the manual-matched subset),
+    # from the summary buckets, so it tracks the same category/prompt filters.
+    # Restricted to models passing the size / model-type filters.
+    allowed = {_model_label(d) for d in _filtered_records(size_limit, model_type)}
+    summaries = {
+        _model_label(data): data["summary"]
+        for data in load_records()
+        if _model_label(data) in allowed
+    }
+    rows: list[dict] = []
+    for model, recs in _load_ifeval_records():
+        if model not in allowed:
+            continue
+        ai = ae = mi = me = 0.0
+        ni = ne = 0
+        for cat, prompt_id, a_inc, a_exc, m_inc, m_exc in recs:
+            if tc and cat != tc:
+                continue
+            if pr and prompt_id != pr:
+                continue
+            if m_inc is not None and a_inc is not None:
+                ai += a_inc; mi += m_inc; ni += 1
+            if m_exc is not None and a_exc is not None:
+                ae += a_exc; me += m_exc; ne += 1
+        if ni == 0 and ne == 0:
+            continue
+        auto_inc = ai / ni if ni else None
+        man_inc = mi / ni if ni else None
+        auto_exc = ae / ne if ne else None
+        man_exc = me / ne if ne else None
+        auto_all = _source_bucket(summaries.get(model, {}), "ifeval", tc, pr)
+        all_inc = auto_all.get("avg_include")
+        all_exc = auto_all.get("avg_exclude")
+        rows.append({
+            "Model": model,
+            "N": ni or ne,
+            "Manual include": man_inc,
+            "Manual exclude": man_exc,
+            "Auto include": auto_inc,
+            "Auto include (all)": all_inc,
+            "Δ include (man−auto)": (man_inc - auto_inc) if ni else None,
+            "Δ include (man−auto all)": (man_inc - all_inc) if (ni and all_inc is not None) else None,
+            "Auto exclude": auto_exc,
+            "Auto exclude (all)": all_exc,
+            "Δ exclude (man−auto)": (man_exc - auto_exc) if ne else None,
+            "Δ exclude (man−auto all)": (man_exc - all_exc) if (ne and all_exc is not None) else None,
+        })
+    df = pd.DataFrame(rows)
+    if df.empty:
+        return df
+    df = df.sort_values("Model").reset_index(drop=True)
+    num_cols = df.select_dtypes(include="number").columns
+    df[num_cols] = df[num_cols].round(4)
+    return df
+def text_category_choices() -> list[str]:
+    """All source-text categories present in the data, prefixed with 'All'."""
+    cats: set[str] = set()
+    for data in load_records():
+        cats.update(data["summary"].get("metrics_by_category", {}).keys())
+    return ["All"] + sorted(cats)
+def prompt_choices() -> list[str]:
+    """All simplification prompts present in the data, prefixed with 'All'."""
+    prompts: set[str] = set()
+    for data in load_records():
+        prompts.update(data["summary"].get("metrics_by_prompt", {}).keys())
+    return ["All"] + sorted(prompts)
+def _source_bucket(s: dict, source: str, tc: str | None, prompt: str | None) -> dict:
+    """Select the metric bucket of ``source`` that matches the active filters.
+    With no filter the overall ``s[source]`` object is returned. With a text
+    category, a prompt, or both, the ``by_category`` / ``by_prompt`` /
+    ``by_category_prompt`` table is consulted (the last one keyed
+    ``"CATEGORY/PROMPT"``). ``metrics``, ``markers`` and ``similarity`` keep those
+    tables at the top level of ``s`` as ``"<source>_<table>"``; every other source
+    (questeval / ifeval) nests them inside its own object. Any missing level
+    yields an empty dict.
+    """
+    # Work out which per-filter table applies and the key of the bucket in it.
+    if tc and prompt:
+        table, bucket_key = "by_category_prompt", f"{tc}/{prompt}"
+    elif tc:
+        table, bucket_key = "by_category", tc
+    elif prompt:
+        table, bucket_key = "by_prompt", prompt
+    else:
+        # No filter: the overall summary object of this source.
+        return s.get(source, {})
+    if source in ("metrics", "markers", "similarity"):
+        return s.get(f"{source}_{table}", {}).get(bucket_key, {})
+    return s.get(source, {}).get(table, {}).get(bucket_key, {})
+def _bucket_n(s: dict, tc: str | None, prompt: str | None) -> int | None:
+    """Return the sample count recorded for the selected filters, if any.
+    The ``questeval`` bucket is consulted first and ``ifeval`` second; the first
+    ``"n"`` that is not ``None`` wins. ``None`` is returned when neither has one.
+    """
+    # Lazy generator: the ifeval bucket is only looked up if questeval has no count.
+    counts = (
+        _source_bucket(s, name, tc, prompt).get("n") for name in ("questeval", "ifeval")
+    )
+    return next((count for count in counts if count is not None), None)
+def load_category_df(
+    category: dict,
+    text_category: str | None = None,
+    prompt: str | None = None,
+) -> pd.DataFrame:
+    """Build the table of one metric category, scored with a per-category RRF.
+    ``text_category`` and ``prompt`` narrow the metrics to one source-text
+    category and/or one simplification prompt through the matching
+    ``*_by_category`` / ``*_by_prompt`` / ``*_by_category_prompt`` buckets; a
+    value of ``None`` or ``"All"`` disables that filter, and with both disabled
+    the overall summary is used.
+    The RRF is always computed over **all** models; the size-limit / model-type
+    filters are applied afterwards (in ``load_rrf_views``) as pure row filters,
+    so they never change a model's rank or score.
+    Returns:
+        Columns ``Rank``, ``Model``, ``Params``, ``N``, ``RRF Score`` followed by
+        one column per in-RRF metric, sorted by ``RRF Score`` descending; an
+        empty DataFrame when there are no records.
+    """
+    tc = text_category if text_category not in (None, "All") else None
+    pr = prompt if prompt not in (None, "All") else None
+    # Only metrics flagged ``in_rrf`` get a column and a vote in the fusion.
+    ranked = [
+        (source, key, _col_name(source, label), ascending)
+        for source, key, label, ascending, in_rrf in category["metrics"]
+        if in_rrf
+    ]
+    rows: list[dict] = []
+    for data in load_records():
+        s = data["summary"]
+        model = _model_label(data)
+        if tc or pr:
+            # Filtered view: prefer the bucket's own count, else the overall one.
+            n = _bucket_n(s, tc, pr) or s["n"]
+        else:
+            n = s["n"]
+        row: dict = {"Model": model, "N": n}
+        for source, key, col, _ascending in ranked:
+            bucket = _source_bucket(s, source, tc, pr)
+            if source in ("metrics", "markers"):
+                # These sources store a stats dict per metric; Δ% is the ranked value.
+                row[col] = bucket.get(key, {}).get("avg_diff_pct")
+            else:
+                # similarity, questeval, ifeval store the value directly
+                row[col] = bucket.get(key)
+        rows.append(row)
+    df = pd.DataFrame(rows)
+    if df.empty:
+        return df
+    numeric = df.select_dtypes(include="number").columns
+    df[numeric] = df[numeric].round(4)
+    score = pd.Series(0.0, index=df.index)
+    for _source, _key, col, ascending in ranked:
+        # A metric with no value for any model contributes nothing.
+        if col in df.columns and not df[col].isna().all():
+            ranks = df[col].rank(ascending=ascending, method="min", na_option="bottom")
+            score += 1.0 / (RRF_K + ranks)
+    df.insert(2, "RRF Score", score.round(4))
+    df = df.sort_values("RRF Score", ascending=False).reset_index(drop=True)
+    df.insert(0, "Rank", range(1, len(df) + 1))
+    df.insert(2, "Params", df["Model"].map(_params_map()).fillna(""))
+    return df
+def _plcc_overall_map() -> dict[str, float]:
+    """Map each model label to its external PLCC overall score.
+    PLCC is a separate Polish-language-competence benchmark (sdadas/plcc). The
+    score is copied as-is from ``metadata.plcc.overall`` of each record and is
+    shown for reference only - it does not feed the RRF ranking. Records with
+    no ``plcc`` entry, or with no ``overall`` value in it, are left out of the
+    mapping (so they become NaN when a table column is mapped through it).
+    """
+    scores: dict[str, float] = {}
+    for record in load_records():
+        overall = (record["metadata"].get("plcc") or {}).get("overall")
+        if overall is None:
+            continue
+        scores[_model_label(record)] = overall
+    return scores
+def build_final_ranking_df(category_data: list[tuple[dict, pd.DataFrame]]) -> pd.DataFrame:
+    """Fuse per-category RRF scores into a final ranking via RRF.
+    Each category column shows the model's **rank within that category** (1 = best);
+    those ranks are what the RRF fusion uses to produce the overall ``Final RRF``.
+    A category's vote is scaled by its ``rrf_weight`` (default 1), taken from the
+    category dicts passed in ``category_data``. A reference ``PLCC`` column
+    carries the external PLCC benchmark score and does not influence the ranking.
+    Args:
+        category_data: ``(category, category table)`` pairs. A pair is left out
+            of the fusion when the category sets ``in_rrf`` to a falsy value or
+            its table is empty.
+    Returns:
+        One row per model sorted by ``Final RRF`` descending, with columns
+        ``Rank``, ``Model``, ``Params``, ``N``, ``Final RRF``, ``PLCC`` and one
+        rank column per fused category; an empty DataFrame when nothing is fused.
+    """
+    merged: pd.DataFrame | None = None
+    weights: dict = {}
+    for cat, cat_df in category_data:
+        if not cat.get("in_rrf", True) or cat_df.empty:
+            continue
+        # The weight comes from the category actually being fused rather than
+        # from a module-level list, so the argument fully determines the result.
+        weights[cat["name"]] = cat.get("rrf_weight", 1)
+        sub = cat_df[["Model", "RRF Score"]].rename(columns={"RRF Score": cat["name"]})
+        merged = sub if merged is None else merged.merge(sub, on="Model", how="outer")
+    if merged is None or merged.empty:
+        return pd.DataFrame()
+    # N (sample count) is identical across categories for a given model, so take
+    # it from whichever category table carries it.
+    n_map: dict = {}
+    for _cat, cat_df in category_data:
+        if not cat_df.empty and {"Model", "N"} <= set(cat_df.columns):
+            n_map = dict(zip(cat_df["Model"], cat_df["N"]))
+            break
+    score_cols = [c for c in merged.columns if c != "Model"]
+    out = merged[["Model"]].copy()
+    rrf = pd.Series(0.0, index=merged.index)
+    rank_cols: dict[str, pd.Series] = {}
+    for col in score_cols:
+        # Higher category score = better; a model absent from a category (NaN
+        # after the outer merge) is ranked last there.
+        ranks = merged[col].rank(ascending=False, method="min", na_option="bottom").astype(int)
+        rrf += weights.get(col, 1) / (RRF_K + ranks)
+        rank_cols[col] = ranks
+    out.insert(1, "Final RRF", rrf.round(4))
+    out.insert(2, "PLCC", out["Model"].map(_plcc_overall_map()).round(2))
+    for name, ranks in rank_cols.items():
+        out[name] = ranks
+    out = out.sort_values("Final RRF", ascending=False).reset_index(drop=True)
+    out.insert(0, "Rank", range(1, len(out) + 1))
+    out.insert(2, "Params", out["Model"].map(_params_map()).fillna(""))
+    out.insert(3, "N", out["Model"].map(n_map).astype("Int64"))
+    return out
+def _fog_vs_meaning_scatter(
+    text_category: str | None,
+    prompt: str | None,
+    size_limit: str | None,
+    model_type: str | None,
+    *,
+    y_source: str,
+    y_key: str,
+    y_label: str,
+    title: str,
+    color: str,
+) -> go.Figure | None:
+    """Scatter of Gunning Fog orth Δ% (x) against one meaning metric (y), one point per model.
+    Shared implementation behind ``build_tradeoff_scatter`` and
+    ``build_fog_nli_scatter``, which differ only in the y metric and styling.
+    Args:
+        text_category: Source-text category filter; ``None`` or ``"All"`` disables it.
+        prompt: Simplification-prompt filter; ``None`` or ``"All"`` disables it.
+        size_limit: Forwarded unchanged to ``_filtered_records``.
+        model_type: Forwarded unchanged to ``_filtered_records``.
+        y_source: Summary source that holds the y metric (e.g. ``"questeval"``).
+        y_key: Key of the y metric inside that source's bucket.
+        y_label: Metric name used in the y-axis title and the hover text.
+        title: Figure title.
+        color: Marker fill colour.
+    Returns:
+        The figure, or ``None`` when no model has both coordinates.
+    """
+    tc = None if text_category in (None, "All") else text_category
+    pr = None if prompt in (None, "All") else prompt
+    points = []
+    for data in _filtered_records(size_limit, model_type):
+        s = data["summary"]
+        x = _source_bucket(s, "metrics", tc, pr).get("gunning_fog_orth", {}).get("avg_diff_pct")
+        y = _source_bucket(s, y_source, tc, pr).get(y_key)
+        # A model missing either coordinate cannot be placed on the plot.
+        if x is None or y is None:
+            continue
+        points.append((_model_label(data), x, y))
+    if not points:
+        return None
+    models, xs, ys = zip(*points)
+    # Built by concatenation: the braces are plotly placeholders, not f-string fields.
+    hover = (
+        "<b>%{text}</b><br>Gunning Fog Δ%: %{x:.2f}<br>"
+        + y_label
+        + ": %{y:.3f}<extra></extra>"
+    )
+    fig = go.Figure()
+    fig.add_trace(
+        go.Scatter(
+            x=xs,
+            y=ys,
+            mode="markers+text",
+            text=models,
+            textposition="top center",
+            textfont={"size": 10},
+            marker={"size": 12, "color": color, "line": {"width": 1, "color": "white"}},
+            hovertemplate=hover,
+        )
+    )
+    # Dotted guides through the midpoint of each axis range split the plot into quadrants.
+    x_mid = (min(xs) + max(xs)) / 2
+    y_mid = (min(ys) + max(ys)) / 2
+    fig.add_hline(y=y_mid, line_dash="dot", line_color="lightgray")
+    fig.add_vline(x=x_mid, line_dash="dot", line_color="lightgray")
+    fig.update_layout(
+        title=title,
+        xaxis_title="Gunning Fog orth Δ%  (← easier text)",
+        yaxis_title=f"{y_label}  (↑ meaning preserved)",
+        height=560,
+        margin={"l": 60, "r": 40, "t": 60, "b": 60},
+        plot_bgcolor="white",
+    )
+    fig.update_xaxes(showgrid=True, gridcolor="#EEE", zeroline=True, zerolinecolor="#CCC")
+    fig.update_yaxes(showgrid=True, gridcolor="#EEE")
+    return fig
+def build_tradeoff_scatter(
+    text_category: str | None = None,
+    prompt: str | None = None,
+    size_limit: str | None = None,
+    model_type: str | None = None,
+) -> go.Figure | None:
+    """Scatter of Gunning Fog reduction vs meaning preservation, one point per model.
+    X: Gunning Fog orth Δ% (more negative = greater complexity reduction)
+    Y: QuestEval F1 (higher = better meaning preservation)
+    Honours the same text-category / prompt / size / model-type filters as the
+    RRF rankings. Returns ``None`` when no model has both values.
+    """
+    return _fog_vs_meaning_scatter(
+        text_category,
+        prompt,
+        size_limit,
+        model_type,
+        y_source="questeval",
+        y_key="f1",
+        y_label="QuestEval F1",
+        title="Complexity reduction vs meaning preservation",
+        color="#4C78A8",
+    )
+def build_fog_nli_scatter(
+    text_category: str | None = None,
+    prompt: str | None = None,
+    size_limit: str | None = None,
+    model_type: str | None = None,
+) -> go.Figure | None:
+    """Scatter of Gunning Fog reduction vs NLI F1, one point per model.
+    X: Gunning Fog orth Δ% (more negative = greater complexity reduction)
+    Y: NLI F1 (higher = stronger entailment / meaning preserved)
+    Honours the same text-category / prompt / size / model-type filters as the
+    RRF rankings. Returns ``None`` when no model has both values.
+    """
+    return _fog_vs_meaning_scatter(
+        text_category,
+        prompt,
+        size_limit,
+        model_type,
+        y_source="similarity",
+        y_key="nli_f1",
+        y_label="NLI F1",
+        title="Complexity reduction vs NLI consistency",
+        color="#E45756",
+    )
+# Markdown introduction rendered at the very top of the app (first component
+# inside the Blocks layout). The trailing backslash after the opening quotes
+# keeps the string from starting with an empty line.
+INTRO = """\
+# PLainBench - Polish Text Simplification Leaderboard
+This benchmark evaluates how well LLMs simplify difficult Polish texts -
+drawn from legal/administrative (BIP/GOV), finance, and science domains - while
+preserving the original meaning. Each model simplifies 210 source texts under
+5 simplification prompts (1050 outputs per model). Outputs are scored on
+readability indices, fine-grained difficulty markers (lexical, syntactic,
+morphological), meaning preservation (NLI entailment, QuestEval QA consistency,
+named-entity retention), and instruction following (IFEval include/exclude).
+The per-category scores are fused into an overall **Final RRF** ranking.
+"""
+# Markdown reference for every reported metric (formula / description and the
+# desired direction of change), rendered below the result tabs.
+METRICS_DOC = """\
+## Metrics
+### Readability indices
+All indices are adapted for Polish syllable counting via `pyhyphen` (pl_PL
+dictionary) and counted on surface (orthographic) word forms.
+Δ is the absolute change (after − before); Δ% is the average percentage change
+from the original text to the simplified text.
+| Metric | Formula | Interpretation |
+|---|---|---|
+| **Flesch Reading Ease** | `206.835 − 1.015 × (words/sentences) − 84.6 × (syllables/words)` | Higher → easier text (0–100 typical range). **Desired Δ%: positive (+)** |
+| **Gunning Fog** | `0.4 × [(words/sentences) + 100 × (complex_words/words)]` | School years needed (complex = ≥ 4 syllables). Lower → easier. **Desired Δ%: negative (−)** |
+| **Coleman-Liau** | `0.0588 × L − 0.296 × S − 15.8` | Character-based grade level. Lower → easier. **Desired Δ%: negative (−)** |
+### Difficulty markers
+Fine-grained syntactic, morphological, and lexical features.
+Δ is absolute change; Δ% is percentage change.
+Difficult words are defined as not a named entity, ≥ 4 syllables, counted on the
+surface (orthographic) form.
+| Marker | Description | Desired Δ% |
+|---|---|---|
+| **Avg word syllables** | Mean syllable count per word | − (shorter words) |
+| **Difficult word ratio (orth)** | Difficult words / all words (surface, excl. NEs) | − |
+| **Difficult noun ratio (orth)** | Difficult nouns / all tokens (surface, excl. NEs) | − |
+| **Verb ratio** | Verbs / all tokens | + (more verbal, less nominal) |
+| **Avg sentence length** | Mean tokens per sentence | − (shorter sentences) |
+| **Mean dep. distance** | Avg linear head-dependent distance (syntax complexity) | − (flatter syntax) |
+| **Subordination index** | Subordinate clauses / total clauses | − |
+| **Adverbial participle ratio** | Adverbial participles (converbs, e.g. *czytając*, *przeczytawszy*) / all tokens | − |
+| **Gerund ratio** | Gerunds / all tokens | − |
+| **Impersonal verb ratio** | Impersonal verb forms (modals *należy*/*trzeba*, -no/-to passives, reflexive *się*, infinitives) / all verbs | − |
+| **Genitive noun ratio** | Nouns in genitive case / all tokens | − |
+| **Avg genitive chain** | Mean length of consecutive genitive noun phrases | − |
+| **Verbo-nominal ratio** | Light-verb + noun periphrases (*dokonać wpłaty*, *podjąć decyzję*); administrative style | − |
+| **OSC noun ratio** | Abstract *-ość* nouns (*możliwość*, *konieczność*) / all nouns | − |
+### Similarity metrics
+Reference-based metrics comparing simplified text against the original.
+| Metric | Description | Direction |
+|---|---|---|
+| **NLI P / R / F1** | NLI consistency via stella embeddings + mDeBERTa cross-encoder | Higher = stronger entailment |
+| **NE Retention** | Fraction of named entities from the original kept in the simplified text | Higher = more entities preserved |
+*Only **NLI F1** feeds the RRF score; P and R are shown for context.*
+### QuestEval - QA consistency
+| Metric | Description | Direction |
+|---|---|---|
+| **QuestEval P** | Backward precision - grounding of simplified claims | Higher = fewer hallucinations |
+| **QuestEval R** | Forward recall - information preserved | Higher = less content dropped |
+| **QuestEval F1** | Harmonic mean of P and R | Higher = better meaning preservation |
+| **Answerable (fwd)** | Fraction of forward questions answerable | Higher = stays on-topic |
+| **Answerable (bwd)** | Fraction of backward questions answerable | Higher = claims traceable to original |
+*Only **QuestEval F1** feeds the RRF score; the other rows are shown for context.*
+### IFEval - instruction following
+| Metric | Description | Direction |
+|---|---|---|
+| **IFEval include** | Fraction of *include* constraints (terms the simplification must keep) satisfied | Higher = better |
+| **IFEval exclude** | Fraction of *exclude* constraints (terms the simplification must avoid) satisfied | Higher = better |
+"""
+# Sample-count note shown under each table that carries an ``N`` column.
+N_NOTE = "**N** = number of prompt × text evaluations per model."
+# The five simplification prompts every model is run with. The keys match the
+# "Simplification prompt" filter values (and the ``*_by_prompt`` summary
+# buckets); each value is ``(short description, user-message template)``, where
+# ``<text>`` marks where the source text is inserted. Kept in sync with
+# generation/prompting/instruction.py. Ordered from least to most detailed.
+PROMPTS: dict[str, tuple[str, str]] = {
+    "mini": (
+        "Minimal - a single-line instruction, no rules.",
+        "Uprość podany tekst, w odpowiedzi podaj jedynie uproszczony tekst, "
+        "bez dodatkowych komentarzy. Tekst do uproszczenia:\n\n<text>",
+    ),
+    "compact": (
+        "Compact - a short bulleted rule set.",
+        """Uprość poniższy tekst, tak aby był jasny, krótki i zrozumiały dla osoby bez wiedzy specjalistycznej.
+Zasady:
+- Skup się na najważniejszych informacjach, usuń zbędne treści.
+- Uporządkuj tekst: najważniejsze na początku, podziel na krótkie akapity.
+- Używaj prostego, codziennego słownictwa; trudne pojęcia zastępuj lub krótko wyjaśniaj.
+- Twórz krótkie zdania (jedna myśl = jedno zdanie).
+- Pisz bezpośrednio do odbiorcy i używaj strony czynnej.
+- Unikaj żargonu, urzędowego stylu, skomplikowanych konstrukcji i podwójnych przeczeń.
+- Zachowaj poprawność językową i logiczną spójność.
+- W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.
+---
+### Tekst do uproszczenia:
+<text>""",
+    ),
+    "medium": (
+        "Medium - moderately detailed rules with sub-points.",
+        """Uprość poniższy tekst tak, aby był zrozumiały dla osoby bez wiedzy specjalistycznej, zachowując jego sens.
+### Zasady:
+- Skup się na celu tekstu i najważniejszych informacjach — usuń dygresje i zbędne treści.
+- Uporządkuj treść: zacznij od najważniejszych informacji, podziel tekst na krótkie akapity.
+- Stosuj proste i naturalne słownictwo:
+  - zamieniaj trudne lub specjalistyczne wyrazy na prostsze,
+  - jeśli trzeba — krótko je wyjaśnij lub podaj przykład.
+- Buduj krótkie zdania (jedna myśl = jedno zdanie, ok. 15–20 słów).
+- Pisz bezpośrednio do odbiorcy i używaj strony czynnej.
+- Unikaj:
+  - żargonu, stylu urzędowego i zapożyczeń,
+  - form bezosobowych i strony biernej (jeśli nie są konieczne),
+  - nadmiaru rzeczowników odczasownikowych,
+  - podwójnych przeczeń i zawiłych konstrukcji.
+- Zachowaj poprawność językową, spójność i logiczny układ tekstu.
+- W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.
+---
+### Tekst do uproszczenia:
+<text>""",
+    ),
+    "long": (
+        "Long - full, sectioned plain-language guidelines.",
+        """Uprość poniższy tekst zgodnie z zasadami prostego języka.
+### 1. Cel i odbiorca
+- Dostosuj tekst do przeciętnego odbiorcy (zakładaj brak wiedzy specjalistycznej).
+- Skup się na najważniejszych informacjach.
+### 2. Struktura
+- Usuń informacje zbędne i poboczne.
+- Uporządkuj treść: najważniejsze informacje podaj na początku.
+- Podziel tekst na krótkie akapity (1 akapit = 1 myśl).
+- Jeśli tekst jest dłuższy, użyj nagłówków lub list.
+### 3. Słownictwo
+- Zastępuj trudne słowa prostszymi.
+- Unikaj:
+  - terminów specjalistycznych (chyba że je wyjaśnisz),
+  - słów rzadkich, książkowych i urzędowych,
+  - zapożyczeń i modnych zwrotów,
+  - skrótów niezrozumiałych dla odbiorcy.
+- W razie potrzeby:
+  - wyjaśnij trudne pojęcia,
+  - podaj przykłady,
+  - używaj konkretnych nazw zamiast ogólników.
+### 4. Składnia
+- Twórz krótkie zdania (ok. 20 słów).
+- Jedno zdanie = jedna myśl.
+- Używaj zdań twierdzących.
+- Pisz bezpośrednio do odbiorcy (np. „Ty”, „możesz”, „zrób”).
+- Używaj strony czynnej zamiast biernej.
+- Unikaj form bezosobowych i skomplikowanych konstrukcji.
+- Ogranicz rzeczowniki odczasownikowe (zamieniaj je na czasowniki).
+### 5. Styl
+- Unikaj podwójnych przeczeń.
+- Upraszczaj złożone konstrukcje.
+- Zachowaj naturalny, jasny ton.
+### 6. Końcowa kontrola
+- Sprawdź, czy tekst jest:
+  - zrozumiały,
+  - poprawny językowo,
+  - logiczny i spójny.
+### 7. W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.
+---
+### Tekst do uproszczenia:
+<text>""",
+    ),
+    "step_by_step": (
+        "Step by step - role-based, numbered editorial guidelines.",
+        """Jesteś redaktorem odpowiedzialnym za wydawanie tekstów pisanych prostym językiem. Krok po kroku upraszczaj tekst kierując się poniższymi regułami:
+1. Zachowaj prostotę - unikaj skomplikowanych i trudnych słów oraz zdań.
+2. Używaj zrozumiałego języka - pamiętaj, że tekst ma być czytelny dla szerokiego grona odbiorców, nie tylko dla osób z wiedzą specjalistyczną z danej dziedziny.
+3. Unikaj języka żargonowego czy specjalistycznego, jeśli czujesz, że nie jest on zrozumiały dla większości czytelników.
+4. Unikaj zbytnio skomplikowanych zwrotów - stawiaj na klarowność i prostotę.
+5. Utrzymuj spójność - trzymaj się jednego stylu pisania i konsekwentnie go stosuj.
+6. Unikaj nadmiernego użycia skrótów czy akronimów – jeśli ich używasz, to upewnij się, że są one łatwe do zrozumienia dla wszystkich czytelników.
+7. Przemyśl kolejność i strukturę informacji - rozmieszczenie treści powinno być logiczne i przemyślane.
+8. Unikaj zbędnego wydłużania tekstu, pisz zwięźle i konkretnie.
+9. W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.
+---
+### Tekst do uproszczenia:
+<text>""",
+    ),
+}
+# ── PLCC-inspired visual style ──────────────────────────────────────────────
+# Mirrors the sdadas/plcc leaderboard: clean white background, a system
+# sans-serif stack, a blue accent (#2c7be5), and flat tables with shaded
+# (#f9fafd) headers, #ddd borders and faintly striped rows. Done entirely in
+# CSS — a custom gr.themes.* would tint the component label chips blue, which
+# is not part of the PLCC look.
+PLCC_CSS = """
+.gradio-container {
+    font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto,
+        "Helvetica Neue", Arial, sans-serif !important;
+    max-width: 1500px !important;
+}
+/* PLCC-style data tables */
+.plain-table table { border: 1px solid #ddd !important; font-size: 0.9em; }
+.plain-table thead th {
+    background: #f9fafd !important;
+    border-bottom: 2px solid #ddd !important;
+    color: #222 !important;
+    font-weight: 700 !important;
+}
+.plain-table tbody td { padding: 8px 10px !important; }
+.plain-table tbody tr:nth-child(even) > td { background: rgba(0, 0, 0, 0.02) !important; }
+.plain-table tbody tr:hover > td { background: rgba(44, 123, 229, 0.06) !important; }
+/* Params column (3rd) — right-aligned, PLCC-style; cells muted grey, header black */
+.params-col tbody td:nth-child(3),
+.params-col thead th:nth-child(3) {
+    text-align: right !important;
+    white-space: nowrap;
+}
+.params-col tbody td:nth-child(3) { color: #999 !important; }
+/* Filter bar — the grey rounded block holding the dropdowns */
+.filter-bar {
+    background: #f9fafd;
+    border: 1px solid #ddd;
+    border-radius: 0.5rem;
+    padding: 10px 14px;
+}
+"""
+# Colour palette for category bars
+_CAT_COLORS = ["#4C78A8", "#72B7B2", "#54A24B", "#EECA3B", "#B279A2", "#E45756", "#F58518"]
+def _filter_model_rows(df: pd.DataFrame, allowed: set[str]) -> pd.DataFrame:
+    """Drop the rows whose ``Model`` is not in ``allowed``, leaving all values as they are.
+    The surviving rows get a fresh 0-based index. A frame that is empty or has
+    no ``Model`` column is handed back unchanged (the same object).
+    """
+    if "Model" in df.columns and not df.empty:
+        keep = df["Model"].isin(allowed)
+        return df.loc[keep].reset_index(drop=True)
+    return df
+def load_rrf_views(
+    text_category: str | None = None,
+    prompt: str | None = None,
+    size_limit: str | None = None,
+    model_type: str | None = None,
+) -> tuple[pd.DataFrame, list[tuple[dict, pd.DataFrame]]]:
+    """Return the final ranking table and the per-category tables for the given filters.
+    Ranking happens first, over **all** models, with only the text-category /
+    prompt filters in effect. The size-limit and model-type selections are then
+    used purely to hide rows: nothing is re-ranked, so every model that remains
+    visible keeps the rank and scores it has in the full table.
+    Returns:
+        ``(final ranking, [(category, category table), ...])`` with one pair per
+        entry of ``CATEGORIES``.
+    """
+    per_category = [
+        (cat, load_category_df(cat, text_category, prompt)) for cat in CATEGORIES
+    ]
+    ranking = build_final_ranking_df(per_category)
+    # Labels of the models that pass the size / type filters.
+    visible = {_model_label(record) for record in _filtered_records(size_limit, model_type)}
+    return (
+        _filter_model_rows(ranking, visible),
+        [(cat, _filter_model_rows(table, visible)) for cat, table in per_category],
+    )
+def _tradeoff_figs(
+    text_category: str | None = None,
+    prompt: str | None = None,
+    size_limit: str | None = None,
+    model_type: str | None = None,
+) -> tuple[go.Figure, go.Figure]:
+    """Return the QuestEval and NLI trade-off scatters for the selected filters.
+    A blank ``go.Figure()`` stands in for a scatter whose builder has nothing to plot.
+    """
+    questeval_fig = build_tradeoff_scatter(text_category, prompt, size_limit, model_type)
+    if not questeval_fig:
+        questeval_fig = go.Figure()
+    nli_fig = build_fog_nli_scatter(text_category, prompt, size_limit, model_type)
+    if not nli_fig:
+        nli_fig = go.Figure()
+    return questeval_fig, nli_fig
+def build_app() -> gr.Blocks:
+    """Assemble the PLainBench Gradio UI and wire the filter dropdowns to it.
+    Every table and figure is computed once up front, with no filter arguments,
+    and used as the initial value of its component. When ``read_orth_df`` is
+    empty only the intro and a "no data" notice are rendered. Otherwise the
+    layout is: filter bar, result tabs (final ranking, one tab per in-RRF
+    category, trade-off plots, a hidden detailed-scores tab, and the IFEval
+    comparison when it has rows), metric documentation, and one accordion per
+    simplification prompt. A change of any dropdown re-runs ``_refresh_rrf``.
+    Returns:
+        The assembled ``gr.Blocks`` app; it is not launched here.
+    """
+    # Of the eight frames returned, only ``read_orth_df`` (emptiness check) and
+    # ``detail_df`` (hidden "Detailed scores" tab) are used in this function.
+    (
+        read_orth_df, read_lemma_df,
+        lex_orth_df, lex_lemma_df,
+        similarity_df, questeval_df,
+        markers_df, detail_df,
+    ) = load_leaderboard_data()
+    ifeval_cmp_df = load_ifeval_comparison_df()
+    # Initial views, computed with no filter arguments.
+    final_df, category_data = load_rrf_views(None, None)
+    tc_choices = text_category_choices()
+    pr_choices = prompt_choices()
+    size_choices = _visible_size_limits()
+    tradeoff_fig, fog_nli_fig = _tradeoff_figs(None, None)
+    with gr.Blocks(title="PLainBench", css=PLCC_CSS) as app:
+        gr.Markdown(INTRO)
+        if read_orth_df.empty:
+            gr.Markdown("*No data found. Upload scored anon JSON files to the `data/current/` directory.*")
+        else:
+            # Reactive output components, gathered in the order the change
+            # handler returns them: final table, then one table per in-RRF
+            # category, then the two trade-off scatters (and the IFEval table).
+            rrf_outputs: list = []
+            with gr.Row(elem_classes=["filter-bar"]):
+                tc_dropdown = gr.Dropdown(
+                    choices=tc_choices,
+                    value="All",
+                    label="Text category",
+                    info="Filter the RRF rankings to one source-text category.",
+                )
+                pr_dropdown = gr.Dropdown(
+                    choices=pr_choices,
+                    value="All",
+                    label="Simplification prompt",
+                    info="Filter the RRF rankings to one simplification prompt.",
+                )
+                size_dropdown = gr.Dropdown(
+                    choices=size_choices,
+                    value="ALL",
+                    label="Size limit",
+                    info="Keep only models up to this many parameters.",
+                )
+                type_dropdown = gr.Dropdown(
+                    choices=MODEL_TYPES,
+                    value="ALL",
+                    label="Model type",
+                    info="Filter by open- vs closed-weights models.",
+                )
+            with gr.Tabs():
+                # ── Final Ranking ──────────────────────────────────────────
+                with gr.TabItem("Final Ranking"):
+                    gr.Markdown(
+                        "Final model ranking via **Reciprocal Rank Fusion** (k=60) over per-category RRF scores. "
+                        "Each category ranks models by its own RRF score; those ranks are then fused into a "
+                        "single **Final RRF** score. Higher = better overall simplification. "
+                        "The **PLCC** column shows the model's score on the external "
+                        "[PLCC](https://huggingface.co/spaces/sdadas/plcc) Polish-language-competence "
+                        "benchmark for reference only - it does not affect the ranking (blank where unavailable)."
+                    )
+                    final_table = gr.Dataframe(
+                        value=final_df, interactive=False, wrap=True,
+                        elem_classes=["plain-table", "params-col"],
+                    )
+                    gr.Markdown(N_NOTE)
+                    rrf_outputs += [final_table]
+                # ── RRF category tabs ──────────────────────────────────────
+                for cat, cat_df in category_data:
+                    # Same ``in_rrf`` test as in ``_refresh_rrf`` below, so the
+                    # tables and the handler's updates line up one-to-one.
+                    if not cat.get("in_rrf", True):
+                        continue
+                    with gr.TabItem(cat["name"]):
+                        gr.Markdown(cat["description"])
+                        cat_table = gr.Dataframe(
+                            value=cat_df, interactive=False, wrap=True,
+                            elem_classes=["plain-table", "params-col"],
+                        )
+                        gr.Markdown(N_NOTE)
+                        rrf_outputs += [cat_table]
+                # ── Trade-off plots ────────────────────────────────────────
+                with gr.TabItem("Trade-off"):
+                    gr.Markdown(
+                        "Complexity reduction (Gunning Fog orth Δ%) versus meaning preservation "
+                        "(QuestEval F1), one point per model. Top-left is ideal: "
+                        "greater complexity reduction **and** faithful to the original."
+                    )
+                    tradeoff_plot = gr.Plot(value=tradeoff_fig)
+                    gr.Markdown(
+                        "---\n"
+                        "Gunning Fog orth reduction (Δ%) versus NLI F1. "
+                        "Top-left is best: greater complexity reduction **and** strong NLI entailment."
+                    )
+                    fog_nli_plot = gr.Plot(value=fog_nli_fig)
+                    rrf_outputs += [tradeoff_plot, fog_nli_plot]
+                # This tab is created hidden (visible=False) and its table is
+                # static: it is not part of ``rrf_outputs``.
+                with gr.TabItem("Detailed scores", visible=False):
+                    gr.Markdown(
+                        "Average scores before and after simplification, plus absolute (Δ) "
+                        "and percentage (Δ%) change - for all readability, lexical, and marker metrics."
+                    )
+                    gr.Dataframe(
+                        value=detail_df, interactive=False, wrap=True,
+                        elem_classes=["plain-table"],
+                    )
+                # ── IFEval: manual vs automatic ────────────────────────────
+                # The tab (and its slot in ``rrf_outputs``) exists only when the
+                # unfiltered comparison has rows.
+                if not ifeval_cmp_df.empty:
+                    with gr.TabItem("IFEval manual vs auto"):
+                        gr.Markdown(
+                            "**Automatic** IFEval constraints are generated by an LLM; "
+                            "**manual** constraints are hand-written gold rules, available for a "
+                            "subset of the prompts. To isolate rule quality from sampling, the "
+                            "comparison is restricted to the texts that carry **both** scores "
+                            "(N = matched texts per model), so these automatic figures differ from "
+                            "the full-sample IFEval used elsewhere.\n\n"
+                            "**include** = fraction of *include* constraints satisfied, "
+                            "**exclude** = fraction of *exclude* constraints satisfied (higher is "
+                            "better for both). **Δ = manual − automatic** (on the matched texts): a "
+                            "negative Δ means the automatic rules were easier to satisfy than the "
+                            "hand-checked ones (more lenient automatic scoring). The **(all)** columns "
+                            "show automatic IFEval over *every* text (the full-sample figure used "
+                            "elsewhere). **Δ (man−auto all)** is manual minus that full-sample "
+                            "automatic value - useful as a sanity check, but note the two cover "
+                            "different text sets (matched subset vs. all texts), so **Δ (man−auto)** "
+                            "is the rigorous like-for-like comparison."
+                        )
+                        ifeval_cmp_table = gr.Dataframe(
+                            value=ifeval_cmp_df, interactive=False, wrap=True,
+                            elem_classes=["plain-table"],
+                        )
+                        rrf_outputs.append(ifeval_cmp_table)
+            # Metric documentation, shown below the results.
+            gr.Markdown(METRICS_DOC)
+            # Simplification prompts, documenting the "Simplification prompt"
+            # filter values — shown below the metric documentation.
+            gr.Markdown(
+                "## Simplification prompts\n\n"
+                "The five prompt templates every model is run with - these are the "
+                "values of the **Simplification prompt** filter above. Each source "
+                "text is simplified once per prompt, so they range from a bare "
+                "one-line instruction to full plain-language guidelines. "
+                "`<text>` marks where the source text is inserted."
+            )
+            # One collapsed accordion per prompt; the template is shown in a
+            # fenced block so its own Markdown is not rendered.
+            for _name, (_desc, _body) in PROMPTS.items():
+                with gr.Accordion(f"{_name} - {_desc}", open=False):
+                    gr.Markdown(f"```\n{_body}\n```")
+            # Recompute the RRF rankings whenever any filter changes.
+            _filters = [tc_dropdown, pr_dropdown, size_dropdown, type_dropdown]
+            def _refresh_rrf(
+                text_category: str, prompt: str, size_limit: str, model_type: str
+            ) -> list:
+                """Recompute every reactive table/figure for the given filter values.
+                The returned list follows the order of ``rrf_outputs``.
+                """
+                f_df, cat_data = load_rrf_views(text_category, prompt, size_limit, model_type)
+                updates: list = [f_df]
+                for cat, df in cat_data:
+                    if not cat.get("in_rrf", True):
+                        continue
+                    updates += [df]
+                updates += list(_tradeoff_figs(text_category, prompt, size_limit, model_type))
+                # Guarded by the same (unfiltered) frame that decided whether the
+                # IFEval table was created, so the list length matches the outputs.
+                if not ifeval_cmp_df.empty:
+                    updates.append(
+                        load_ifeval_comparison_df(text_category, prompt, size_limit, model_type)
+                    )
+                return updates
+            # Each dropdown triggers the same handler, fed with all four filter values.
+            for _dd in _filters:
+                _dd.change(_refresh_rrf, inputs=_filters, outputs=rrf_outputs)
+    return app
+# Built at import time so the module exposes a ready ``app`` object; the server
+# is only started when the file is executed as a script.
+app = build_app()
+if __name__ == "__main__":
+    app.launch()

data/current/CYFRAGOVPL__Llama-PLLuM-70B-chat-2412_2026-06-15_095534_scored_anon.json ADDED Viewed