bartlomiejn87 commited on
Commit
3bd48fe
·
0 Parent(s):

Initial commit

Browse files
Files changed (27) hide show
  1. .gitattributes +35 -0
  2. README.md +14 -0
  3. app.py +1400 -0
  4. data/current/CYFRAGOVPL__Llama-PLLuM-70B-chat-2412_2026-06-15_095534_scored_anon.json +0 -0
  5. data/current/CYFRAGOVPL__Llama-PLLuM-70B-chat-2512_2026-06-12_082622_scored_anon.json +0 -0
  6. data/current/CYFRAGOVPL__Llama-PLLuM-8B-chat-2412_2026-06-02_091112_scored_anon.json +0 -0
  7. data/current/CYFRAGOVPL__Llama-PLLuM-8B-chat-2512_2026-06-02_121044_scored_anon.json +0 -0
  8. data/current/CYFRAGOVPL__PLLuM-12B-chat-2412_2026-06-02_141510_scored_anon.json +0 -0
  9. data/current/CYFRAGOVPL__PLLuM-12B-chat-2512_2026-06-02_195811_scored_anon.json +0 -0
  10. data/current/CYFRAGOVPL__PLLuM-12B-instruct-2512_2026-06-10_102424_scored_anon.json +0 -0
  11. data/current/CYFRAGOVPL__PLLuM-4B-chat-2512_2026-06-02_223411_scored_anon.json +0 -0
  12. data/current/deepseek__deepseek-v4-pro_reasoning-high_2026-05-31_094932_scored_anon.json +0 -0
  13. data/current/google__gemini-3.1-pro-preview_2026-06-11_121124_scored_anon.json +0 -0
  14. data/current/google__gemma-3-4b-it_reasoning-none_2026-06-08_110604_scored_anon.json +0 -0
  15. data/current/google__gemma-4-26b-a4b-it_reasoning-high_2026-05-31_223337_scored_anon.json +0 -0
  16. data/current/google__gemma-4-26b-a4b-it_reasoning-none_2026-06-01_020338_scored_anon.json +0 -0
  17. data/current/google__gemma-4-31b-it_reasoning-high_2026-05-31_124753_scored_anon.json +0 -0
  18. data/current/google__gemma-4-31b-it_reasoning-none_2026-05-31_200347_scored_anon.json +0 -0
  19. data/current/meta-llama__llama-3.1-70b-instruct_reasoning-none_2026-06-08_102826_scored_anon.json +0 -0
  20. data/current/meta-llama__llama-3.1-8b-instruct_reasoning-none_2026-06-08_100015_scored_anon.json +0 -0
  21. data/current/mistralai__ministral-8b-2512_2026-05-31_083128_scored_anon.json +0 -0
  22. data/current/mistralai__mistral-nemo_2026-05-31_084528_scored_anon.json +0 -0
  23. data/current/openai__gpt-oss-120b_2026-06-12_123249_scored_anon.json +0 -0
  24. data/current/openai__gpt-oss-20b_2026-06-12_133408_scored_anon.json +0 -0
  25. data/current/qwen__qwen3.5-35b-a3b_reasoning-high_2026-06-01_023022_scored_anon.json +0 -0
  26. data/current/speakleash__Bielik-11B-v3.0-Instruct_2026-06-01_112337_scored_anon.json +0 -0
  27. requirements.txt +2 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: PLainBench
3
+ emoji: ⚡
4
+ colorFrom: green
5
+ colorTo: yellow
6
+ sdk: gradio
7
+ sdk_version: 6.12.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ short_description: Benchmark for scoring LLMs in text simplification
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,1400 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """PLainBench - Polish Text Simplification Leaderboard.
2
+
3
+ Reads scored anon JSON files from the data/current/ directory and displays a
4
+ leaderboard showing how well each LLM simplifies Polish texts, measured
5
+ by readability indices, difficulty markers, reference-based similarity
6
+ metrics, and a QuestEval-style QA consistency score.
7
+ """
8
+
9
+ import json
10
+ from functools import lru_cache
11
+ from pathlib import Path
12
+
13
+ import gradio as gr
14
+ import pandas as pd
15
+ import plotly.graph_objects as go
16
+
17
+ DATA_DIR = Path(__file__).parent / "data" / "current"
18
+
19
+
20
@lru_cache(maxsize=1)
def load_records() -> tuple[dict, ...]:
    """Read every ``*_scored_anon.json`` file once and memoise a slim copy.

    Only the ``metadata`` and ``summary`` sections of each file are retained;
    the rest of the parsed document (the original note describes it as large
    per-text records) is dropped as soon as the file has been read. Because
    of the ``lru_cache`` wrapper the directory is scanned a single time per
    process and later calls return the same tuple.

    Returns:
        One ``{"metadata": ..., "summary": ...}`` dict per file, ordered by
        sorted file path; an empty tuple when ``DATA_DIR`` does not exist.

    Raises:
        KeyError: if a file lacks a ``metadata`` or ``summary`` section.
    """
    if not DATA_DIR.exists():
        return ()

    def _slim(path: Path) -> dict:
        # Parse the whole file, then keep only the two sections the app reads.
        with open(path, encoding="utf-8") as handle:
            payload = json.load(handle)
        return {"metadata": payload["metadata"], "summary": payload["summary"]}

    return tuple(_slim(path) for path in sorted(DATA_DIR.glob("*_scored_anon.json")))
37
+
38
+
39
# Choices for the model-level quick-filters ("Size limit" / "Model type"),
# mirroring the PLCC benchmark. Every numeric size entry is an *upper bound*
# expressed in billions of parameters; "ALL" disables the filter.
SIZE_LIMITS = ["ALL"] + [f"{cap}B" for cap in (500, 100, 50, 30, 20, 15, 10, 5)]
MODEL_TYPES = ["ALL"] + [f"{kind}-weights" for kind in ("open", "closed")]
43
+
44
+
45
+ def _model_passes(meta: dict, size_limit: str, model_type: str) -> bool:
46
+ """Whether a model's metadata satisfies the size-limit / model-type filters."""
47
+ if model_type and model_type != "ALL":
48
+ want = "open" if model_type == "open-weights" else "closed"
49
+ if meta.get("weights") != want:
50
+ return False
51
+ if size_limit and size_limit != "ALL":
52
+ cap = float(size_limit.rstrip("B"))
53
+ params = meta.get("total_params_b") or 0
54
+ # Unknown / unreported size (0) can't be placed under a cap, so exclude it.
55
+ if params <= 0 or params > cap:
56
+ return False
57
+ return True
58
+
59
+
60
def _filtered_records(
    size_limit: str | None = None, model_type: str | None = None
) -> list[dict]:
    """Return the cached records whose model survives the quick-filters.

    A missing or empty ``size_limit`` / ``model_type`` is treated as ``"ALL"``
    (filter disabled) before delegating to ``_model_passes``.
    """
    effective_size = size_limit if size_limit else "ALL"
    effective_type = model_type if model_type else "ALL"

    selected: list[dict] = []
    for record in load_records():
        if _model_passes(record["metadata"], effective_size, effective_type):
            selected.append(record)
    return selected
67
+
68
+
69
def _visible_size_limits() -> list[str]:
    """Return the subset of ``SIZE_LIMITS`` worth offering for the loaded models.

    Walking the numeric caps from smallest to largest, a cap is kept only when
    it covers at least one model and covers a different number of models than
    the cap just below it. A larger cap that adds no model is therefore hidden,
    as is any cap smaller than every model. ``"ALL"`` is always offered first
    and the surviving caps keep the order they have in ``SIZE_LIMITS``. Models
    without a positive ``total_params_b`` are ignored when counting.
    """
    known_sizes = []
    for record in load_records():
        size_b = record["metadata"].get("total_params_b") or 0
        if size_b > 0:
            known_sizes.append(size_b)

    numeric_caps = [label for label in SIZE_LIMITS if label != "ALL"]
    ascending = sorted(numeric_caps, key=lambda label: float(label.rstrip("B")))

    informative: set[str] = set()
    last_count = -1
    for label in ascending:
        ceiling = float(label.rstrip("B"))
        covered = len([size for size in known_sizes if size <= ceiling])
        # Keep the smallest cap of each distinct non-empty selection.
        if covered and covered != last_count:
            informative.add(label)
        last_count = covered

    # Re-walk the original list so the display order is unchanged.
    return ["ALL", *(label for label in numeric_caps if label in informative)]
96
+
97
+
98
# Display names of the readability indices, keyed by the metric-key stem.
_READABILITY_INDEX_NAMES = {
    "flesch_reading_ease": "Flesch RE",
    "flesch_kincaid_grade": "Flesch-Kincaid",
    "gunning_fog": "Gunning Fog",
    "ari": "ARI",
    "linsear_write": "Linsear Write",
    "smog_grade": "SMOG",
    "coleman_liau": "Coleman-Liau",
    "pisarek": "Pisarek",
}

# Display names of the lexical-diversity indices, keyed by the metric-key stem.
_LEXICAL_INDEX_NAMES = {
    "ttr": "TTR",
    "rttr": "RTTR",
    "cttr": "CTTR",
    "herdan": "Herdan",
    "summer": "Summer",
    "dugast": "Dugast",
    "maas": "Maas",
    "mtld": "MTLD",
    "mattr": "MATTR",
}


def _variant_labels(names: dict[str, str], variant: str) -> dict[str, str]:
    """Map ``<stem>_<variant>`` metric keys to display labels, keeping order."""
    return {f"{stem}_{variant}": label for stem, label in names.items()}


# The orth and lemma variants of an index deliberately share one display label.
READABILITY_ORTH_LABELS = _variant_labels(_READABILITY_INDEX_NAMES, "orth")

READABILITY_LEMMA_LABELS = _variant_labels(_READABILITY_INDEX_NAMES, "lemma")

LEXICAL_ORTH_LABELS = _variant_labels(_LEXICAL_INDEX_NAMES, "orth")

LEXICAL_LEMMA_LABELS = _variant_labels(_LEXICAL_INDEX_NAMES, "lemma")
143
+
144
# Summary key → column label for the reference-based similarity table.
# Insertion order is the column order of the resulting DataFrame.
SIMILARITY_LABELS = dict(
    bert_score_precision="BERTScore P",
    bert_score_recall="BERTScore R",
    bert_score_f1="BERTScore F1",
    bleu="BLEU",
    chrf="chrF",
    chrfpp="chrF++",
    nli_precision="NLI P",
    nli_recall="NLI R",
    nli_f1="NLI F1",
    rouge_1_precision="ROUGE-1 P",
    rouge_1_recall="ROUGE-1 R",
    rouge_1_f1="ROUGE-1 F1",
    rouge_2_precision="ROUGE-2 P",
    rouge_2_recall="ROUGE-2 R",
    rouge_2_f1="ROUGE-2 F1",
    rouge_l_precision="ROUGE-L P",
    rouge_l_recall="ROUGE-L R",
    rouge_l_f1="ROUGE-L F1",
    wer="WER",
    mer="MER",
    wil="WIL",
    ne_retention="NE Retention",
)
168
+
169
# Summary ``markers`` key → column label for the difficulty-marker tables.
# Insertion order is the column order of the resulting DataFrames.
MARKER_LABELS = {
    # counts
    "paragraph_count": "Paragraph count",
    "sentence_count": "Sentence count",
    "word_count": "Word count",
    "named_entity_count": "Named entity count",
    "difficult_word_count": "Difficult word count",
    "difficult_word_count_orth": "Difficult word count (orth)",
    # average lengths
    "avg_word_syllables": "Avg word syllables",
    "avg_sentence_length": "Avg sentence length",
    "avg_paragraph_length": "Avg paragraph length",
    # lexical difficulty
    "named_entity_ratio": "Named entity ratio",
    "difficult_word_ratio": "Difficult word ratio",
    "difficult_word_ratio_orth": "Difficult word ratio (orth)",
    # POS ratios
    "noun_ratio": "Noun ratio",
    "difficult_noun_ratio": "Difficult noun ratio",
    "difficult_noun_ratio_orth": "Difficult noun ratio (orth)",
    "verb_ratio": "Verb ratio",
    "difficult_verb_ratio": "Difficult verb ratio",
    "difficult_verb_ratio_orth": "Difficult verb ratio (orth)",
    "adjective_ratio": "Adjective ratio",
    "difficult_adjective_ratio": "Difficult adjective ratio",
    "difficult_adjective_ratio_orth": "Difficult adjective ratio (orth)",
    # POS-to-POS ratios
    "noun_to_verb_ratio": "Noun/verb ratio",
    "verbo_nominal_ratio": "Verbo-nominal ratio",
    "adj_to_verb_ratio": "Adj/verb ratio",
    "adj_to_noun_ratio": "Adj/noun ratio",
    # morphological
    "nie_prefix_ratio": "Nie-prefix ratio",
    "participle_ratio": "Participle ratio",
    # The Morphological category in CATEGORIES ranks models on this marker, so
    # it is listed here too; otherwise the marker/detail tables would omit a
    # metric that feeds the ranking.
    "adverbial_participle_ratio": "Adverbial participle ratio",
    "gerund_ratio": "Gerund ratio",
    "osc_noun_ratio": "OSC noun ratio",
    "impersonal_verb_ratio": "Impersonal verb ratio",
    "genitive_noun_ratio": "Genitive noun ratio",
    "avg_genitive_chain_length": "Avg genitive chain",
    # syntactic
    "sentence_length_variance": "Sentence length variance",
    "mean_dependency_distance": "Mean dep. distance",
    "subordination_index": "Subordination index",
}
213
+
214
# Summary ``questeval`` key → column label for the QuestEval table.
QUESTEVAL_LABELS = dict(
    precision="QuestEval P",
    recall="QuestEval R",
    f1="QuestEval F1",
    answerable_rate_forward="Answerable (fwd)",
    answerable_rate_backward="Answerable (bwd)",
)
221
+
222
# NOTE(review): presumably the smoothing constant k of Reciprocal Rank Fusion
# (score = sum of 1 / (k + rank)); the code that consumes it is outside this
# part of the file - confirm there.
RRF_K = 60

# Leaderboard categories. Each category dict holds:
#   name         - category title
#   in_rrf       - category-level flag (NOTE(review): presumably whether the
#                  category takes part in the overall RRF - confirm)
#   rrf_weight   - category-level weight (NOTE(review): presumably applied when
#                  the category scores are fused - confirm)
#   description  - Markdown help text shown to the user
#   metrics      - list of metric entries, described below
#
# Each metric entry: (source, key, label, ascending_rrf, in_rrf)
#   source        - "metrics" | "markers"        -> use avg_diff_pct (Δ%)
#                   "similarity" | "questeval"   -> use absolute value
#                   "ifeval"                     -> also used below; `_col_name`
#                   gives it no "(Δ%)" suffix, so presumably an absolute value
#                   as well - TODO confirm against the RRF code
#   key           - key looked up inside the chosen summary section
#   label         - column label shown in the category table
#   ascending_rrf - True = lower value is better (rank 1 = smallest)
#   in_rrf        - include this metric in the category RRF computation

CATEGORIES: list[dict] = [
    {
        "name": "Readability",
        "in_rrf": True,
        "rrf_weight": 1,
        "description": (
            "Readability indices - **orth** (surface-form) variants. "
            "Δ% = percentage change after simplification. "
            "For **Flesch RE** positive Δ% is better; for all others negative Δ% is better. "
            "**Flesch RE** rates reading ease from average sentence length and syllables per word (higher → easier, ~0–100). "
            "**Gunning Fog** estimates the schooling years needed to follow the text - 0.4 × (words/sentence + 100 × complex-word share), "
            "where complex words have many syllables (lower → easier). "
            "**Coleman-Liau** grades difficulty from average letters per 100 words and sentences per 100 words (lower → easier). "
            "IFEval exclude is the fraction of 'exclude' constraints satisfied by the simplified text (higher is better)."
        ),
        "metrics": [
            ("metrics", "flesch_reading_ease_orth", "Flesch RE", False, True),
            ("metrics", "gunning_fog_orth", "Gunning Fog", True, True),
            ("metrics", "coleman_liau_orth", "Coleman-Liau", True, True),
            ("ifeval", "avg_exclude", "IFEval exclude", False, True),
        ],
    },
    {
        "name": "Lexical Difficulty",
        "in_rrf": True,
        "rrf_weight": 1,
        "description": (
            "Word-level difficulty markers - **orth** variants where available. "
            "Δ% = percentage change. Negative Δ% indicates reduced lexical difficulty. "
            "**Avg word syllables** is the mean number of syllables per word (higher → longer, harder vocabulary). "
            "**Difficult word ratio** is the share of long words (above a syllable threshold on the surface form, excluding named entities) "
            "(higher → harder). "
            "**Verb ratio** is verbs divided by alphabetic tokens; a more verbal, less nominal style is easier, so higher is better here. "
            "**Difficult noun ratio** is the share of long nouns (above the syllable threshold, excluding named entities) among all words "
            "(higher → more complex nominal vocabulary)."
        ),
        "metrics": [
            ("markers", "avg_word_syllables", "Avg word syllables", True, True),
            ("markers", "difficult_word_ratio_orth", "Difficult word ratio", True, True),
            # The only marker in this category where a higher value is better.
            ("markers", "verb_ratio", "Verb ratio", False, True),
            ("markers", "difficult_noun_ratio_orth", "Difficult noun ratio", True, True),
        ],
    },
    {
        "name": "Syntactic",
        "in_rrf": True,
        "rrf_weight": 1,
        "description": (
            "Sentence and clause structure complexity markers. "
            "Δ% = percentage change. Negative Δ% generally indicates simpler syntax. "
            "**Avg sentence length** is the mean number of words per sentence (higher → harder). "
            "**Mean dep. distance** is the average linear distance between a token and its syntactic head, in tokens (lower → simpler structure). "
            "**Subordination index** is the ratio of subordinate clauses to total clauses (higher → more embedded structure, harder)."
        ),
        "metrics": [
            ("markers", "avg_sentence_length", "Avg sentence length", True, True),
            # Listed with in_rrf=False: part of the entry list but flagged out of the RRF.
            ("markers", "sentence_length_variance", "Sentence length var.", True, False),
            ("markers", "mean_dependency_distance", "Mean dep. distance", True, True),
            ("markers", "subordination_index", "Subordination index", True, True),
        ],
    },
    {
        "name": "Morphological",
        "in_rrf": True,
        "rrf_weight": 1,
        "description": (
            "Polish-specific morphological complexity markers. "
            "Δ% = percentage change. Negative Δ% indicates reduced morphological complexity. "
            "**Adverbial participle ratio** is the share of adverbial participles (imiesłowy przysłówkowe / converbs, e.g. *czytając*, *przeczytawszy*) "
            "among alphabetic tokens - a bookish, formal construction (higher → more complex). "
            "**Gerund ratio** is the share of Polish verbal nouns (rzeczowniki odsłowne, e.g. *czytanie*) among words "
            "(higher → more nominalised, formal). "
            "**Impersonal verb ratio** is the share of impersonal verb forms among all verbs - impersonal modals (*należy*, *trzeba*, *można*), "
            "passive -no/-to forms (*zrobiono*), reflexive-impersonal *się* (*mówi się*) and infinitives - typical of legal/administrative Polish "
            "(higher → more impersonal, harder). "
            "**Genitive noun ratio** is the share of genitive-case nouns among words - a hallmark of formal, bureaucratic Polish "
            "(higher → harder). "
            "**Avg genitive chain** is the mean length of runs of two or more consecutive genitive nouns (dopełniacz spiętrzony) "
            "(higher → more genitive stacking, harder). "
            "**Verbo-nominal ratio** is the share of light-verb + noun periphrases (*dokonać wpłaty*, *podjąć decyzję*) - a hallmark of "
            "administrative Polish (higher → harder). "
            "**OSC noun ratio** is the share of abstract *-ość* nouns (*możliwość*, *konieczność*) among nouns (higher → more abstract, harder)."
        ),
        "metrics": [
            # Listed with in_rrf=False: part of the entry list but flagged out of the RRF.
            ("markers", "participle_ratio", "Participle ratio", True, False),
            ("markers", "adverbial_participle_ratio", "Adverbial participle ratio", True, True),
            ("markers", "gerund_ratio", "Gerund ratio", True, True),
            ("markers", "impersonal_verb_ratio", "Impersonal verb ratio", True, True),
            ("markers", "genitive_noun_ratio", "Genitive noun ratio", True, True),
            ("markers", "avg_genitive_chain_length", "Avg genitive chain", True, True),
            ("markers", "verbo_nominal_ratio", "Verbo-nominal ratio", True, True),
            ("markers", "osc_noun_ratio", "OSC noun ratio", True, True),
        ],
    },
    {
        "name": "Meaning Preservation",
        "in_rrf": True,
        # The only category whose rrf_weight differs from 1.
        "rrf_weight": 4,
        "description": (
            "Semantic metrics that directly test whether the simplified text says the same thing as the original. "
            "NLI checks bidirectional entailment; QuestEval checks information preservation via QA. "
            "NE Retention measures what fraction of named entities from the original appear in the simplified text "
            "(matched on lemmatised tokens and surface forms, so inflected forms and full-name → surname shortenings are handled). "
            "IFEval include is the fraction of 'include' constraints satisfied by the simplified text. "
            "Higher is better for all."
        ),
        "metrics": [
            ("similarity", "nli_f1", "NLI F1", False, True),
            ("questeval", "f1", "QuestEval F1", False, True),
            ("similarity", "ne_retention", "NE Retention", False, True),
            ("ifeval", "avg_include", "IFEval include", False, True),
        ],
    },
]
344
+
345
+
346
+ def _col_name(source: str, label: str) -> str:
347
+ """Column name used in category DataFrames."""
348
+ if source in ("metrics", "markers"):
349
+ return f"{label} (Δ%)"
350
+ return label
351
+
352
+
353
+ def _model_label(data: dict) -> str:
354
+ """Return a unique display name, appending reasoning effort when present.
355
+
356
+ The parameter size is shown separately (see :func:`_params_str`), in its
357
+ own column, mirroring the PLCC leaderboard layout.
358
+ """
359
+ model = data["metadata"]["model"]
360
+ effort = (
361
+ data["metadata"]
362
+ .get("model_kwargs", {})
363
+ .get("extra_body", {})
364
+ .get("reasoning", {})
365
+ .get("effort")
366
+ )
367
+ if effort is not None:
368
+ return f"{model} [reasoning: {effort}]"
369
+ return model
370
+
371
+
372
+ def _params_str(params: float | None) -> str | None:
373
+ """PLCC-style parameter label: ``1.6T`` / ``685B`` / ``8B`` (None if unknown)."""
374
+ p = params or 0
375
+ if p <= 0:
376
+ return None
377
+ return f"{p / 1000:g}T" if p >= 1000 else f"{p:g}B"
378
+
379
+
380
def _params_map() -> dict[str, str]:
    """Map each model label to its formatted parameter size.

    Sizes come from ``metadata["total_params_b"]`` of every cached record;
    models for which ``_params_str`` yields nothing are left out.
    """
    return {
        _model_label(record): size
        for record in load_records()
        if (size := _params_str(record["metadata"].get("total_params_b")))
    }
388
+
389
+
390
+ def _metric_row(
391
+ label_map: dict,
392
+ summary_metrics: dict,
393
+ row: dict,
394
+ detail_row: dict,
395
+ *,
396
+ include_detail: bool = True,
397
+ ) -> None:
398
+ """Populate leaderboard row and detail row from a label→key map."""
399
+ for key, label in label_map.items():
400
+ vals = summary_metrics.get(key, {})
401
+ row[f"{label} (Δ)"] = vals.get("avg_diff")
402
+ row[f"{label} (Δ%)"] = vals.get("avg_diff_pct")
403
+ if include_detail:
404
+ detail_row[f"{label} before"] = vals.get("avg_before")
405
+ detail_row[f"{label} after"] = vals.get("avg_after")
406
+ detail_row[f"{label} (Δ)"] = vals.get("avg_diff")
407
+ detail_row[f"{label} (Δ%)"] = vals.get("avg_diff_pct")
408
+
409
+
410
def load_leaderboard_data() -> tuple[pd.DataFrame, ...]:
    """Build the leaderboard DataFrames from the cached scored records.

    Every frame has one row per record with ``Model`` and ``N`` columns,
    followed by the columns of its label map. Numeric columns are rounded to
    four decimals.

    NOTE(review): the readability and lexical label maps use the same display
    labels for their orth and lemma variants, and all four are written into
    one shared ``detail_row``. The lemma values, written second, therefore
    overwrite the orth values in ``detail_df`` (e.g. ``"Flesch RE before"``).
    This is left as-is because renaming the detail columns could break
    consumers outside this part of the file - confirm and disambiguate there.

    Returns:
        (readability_orth_df, readability_lemma_df,
        lexical_orth_df, lexical_lemma_df,
        similarity_df, questeval_df, markers_df, detail_df)

        When ``DATA_DIR`` does not exist, eight independent empty frames.

    Raises:
        KeyError: if a record's summary lacks ``n`` or ``metrics``.
    """
    if not DATA_DIR.exists():
        # One fresh frame per slot, so mutating one result cannot leak into
        # the other seven.
        return tuple(pd.DataFrame() for _ in range(8))

    read_orth_rows, read_lemma_rows = [], []
    lex_orth_rows, lex_lemma_rows = [], []
    similarity_rows, questeval_rows, markers_rows, detail_rows = [], [], [], []

    for data in load_records():
        model = _model_label(data)
        summary = data["summary"]
        n = summary["n"]
        metrics = summary["metrics"]
        similarity = summary.get("similarity", {})
        questeval = summary.get("questeval", {})
        markers = summary.get("markers", {})

        base = {"Model": model, "N": n}
        read_orth_row = dict(base)
        read_lemma_row = dict(base)
        lex_orth_row = dict(base)
        lex_lemma_row = dict(base)
        similarity_row = dict(base)
        questeval_row = dict(base)
        markers_row = dict(base)
        detail_row = dict(base)

        # Before/after metrics and markers all share one row layout, so they
        # all go through the same helper.
        _metric_row(READABILITY_ORTH_LABELS, metrics, read_orth_row, detail_row)
        _metric_row(READABILITY_LEMMA_LABELS, metrics, read_lemma_row, detail_row)
        _metric_row(LEXICAL_ORTH_LABELS, metrics, lex_orth_row, detail_row)
        _metric_row(LEXICAL_LEMMA_LABELS, metrics, lex_lemma_row, detail_row)
        _metric_row(MARKER_LABELS, markers, markers_row, detail_row)

        # Similarity and QuestEval scores are plain values, not before/after pairs.
        for key, label in SIMILARITY_LABELS.items():
            similarity_row[label] = similarity.get(key)

        for key, label in QUESTEVAL_LABELS.items():
            questeval_row[label] = questeval.get(key)

        read_orth_rows.append(read_orth_row)
        read_lemma_rows.append(read_lemma_row)
        lex_orth_rows.append(lex_orth_row)
        lex_lemma_rows.append(lex_lemma_row)
        similarity_rows.append(similarity_row)
        questeval_rows.append(questeval_row)
        markers_rows.append(markers_row)
        detail_rows.append(detail_row)

    dfs = [
        pd.DataFrame(read_orth_rows),
        pd.DataFrame(read_lemma_rows),
        pd.DataFrame(lex_orth_rows),
        pd.DataFrame(lex_lemma_rows),
        pd.DataFrame(similarity_rows),
        pd.DataFrame(questeval_rows),
        pd.DataFrame(markers_rows),
        pd.DataFrame(detail_rows),
    ]
    for df in dfs:
        num_cols = df.select_dtypes(include="number").columns
        # Nothing to round for a frame without numeric columns (e.g. no records).
        if len(num_cols):
            df[num_cols] = df[num_cols].round(4)

    return tuple(dfs)
488
+
489
+
490
@lru_cache(maxsize=1)
def _load_ifeval_records() -> tuple[tuple[str, tuple[tuple, ...]], ...]:
    """Collect, per model, the records scored by both IFEval variants.

    Each scored file is re-read in full because the per-text ``results``
    array is not kept by ``load_records``. A result is retained only when both
    its ``ifeval`` (automatic) and ``ifeval_manual`` entries are present and
    non-empty, since the comparison needs the same text scored both ways.
    The outcome is cached by ``lru_cache``, so the files are read once.

    Returns:
        A tuple of ``(model_label, records)`` pairs, one per file with at
        least one matched result, in sorted file order. Every record is the
        tuple ``(category, prompt_id, auto_include, auto_exclude,
        man_include, man_exclude)``. Empty when ``DATA_DIR`` does not exist.

    Raises:
        KeyError: if a file has no ``results`` section.
    """
    if not DATA_DIR.exists():
        return ()

    per_model: list[tuple[str, tuple[tuple, ...]]] = []
    for path in sorted(DATA_DIR.glob("*_scored_anon.json")):
        with open(path, encoding="utf-8") as handle:
            payload = json.load(handle)
        label = _model_label(payload)

        matched: list[tuple] = []
        for entry in payload["results"]:
            manual = entry.get("ifeval_manual")
            automatic = entry.get("ifeval")
            if manual and automatic:
                matched.append(
                    (
                        entry.get("category"),
                        entry.get("prompt_id"),
                        automatic.get("include"),
                        automatic.get("exclude"),
                        manual.get("include"),
                        manual.get("exclude"),
                    )
                )

        # Models without a single matched record are left out entirely.
        if matched:
            per_model.append((label, tuple(matched)))

    return tuple(per_model)
523
+
524
+
525
def load_ifeval_comparison_df(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> pd.DataFrame:
    """Build the per-model manual-vs-automatic IFEval comparison table.

    The manual/automatic averages are computed only over records that carry
    *both* scores (see ``_load_ifeval_records``), so the two columns describe
    the very same texts. The ``(all)`` columns come instead from the model's
    summary ``ifeval`` bucket for the same category/prompt filter, i.e. the
    automatic score over every record rather than the matched subset.

    ``Δ`` columns are manual − automatic: a negative value means the automatic
    constraints were satisfied more often than the hand-checked ones.

    Args:
        text_category: Source-text category to keep; ``None`` or ``"All"``
            disables the filter.
        prompt: Simplification prompt id to keep; ``None`` or ``"All"``
            disables the filter.
        size_limit: Model size cap, forwarded to ``_filtered_records``.
        model_type: Model type filter, forwarded to ``_filtered_records``.

    Returns:
        One row per model with at least one matched record after filtering,
        sorted by ``Model``, numeric columns rounded to four decimals. An
        empty frame when nothing matches.
    """
    tc = text_category if text_category not in (None, "All") else None
    pr = prompt if prompt not in (None, "All") else None

    # Labels of the models that pass the size / model-type quick-filters.
    allowed = set()
    for record in _filtered_records(size_limit, model_type):
        allowed.add(_model_label(record))

    # Summary section of each allowed model, used for the "(all)" columns.
    summaries = {}
    for record in load_records():
        name = _model_label(record)
        if name in allowed:
            summaries[name] = record["summary"]

    rows: list[dict] = []
    for model, recs in _load_ifeval_records():
        if model not in allowed:
            continue

        inc_auto_sum = inc_man_sum = exc_auto_sum = exc_man_sum = 0.0
        inc_n = exc_n = 0
        for cat, prompt_id, a_inc, a_exc, m_inc, m_exc in recs:
            if (tc and cat != tc) or (pr and prompt_id != pr):
                continue
            # A side is counted only when both variants scored it.
            if not (m_inc is None or a_inc is None):
                inc_auto_sum += a_inc
                inc_man_sum += m_inc
                inc_n += 1
            if not (m_exc is None or a_exc is None):
                exc_auto_sum += a_exc
                exc_man_sum += m_exc
                exc_n += 1

        if not (inc_n or exc_n):
            continue

        if inc_n:
            auto_inc = inc_auto_sum / inc_n
            man_inc = inc_man_sum / inc_n
            delta_inc = man_inc - auto_inc
        else:
            auto_inc = man_inc = delta_inc = None

        if exc_n:
            auto_exc = exc_auto_sum / exc_n
            man_exc = exc_man_sum / exc_n
            delta_exc = man_exc - auto_exc
        else:
            auto_exc = man_exc = delta_exc = None

        bucket = _source_bucket(summaries.get(model, {}), "ifeval", tc, pr)
        all_inc = bucket.get("avg_include")
        all_exc = bucket.get("avg_exclude")

        delta_inc_all = None
        if inc_n and all_inc is not None:
            delta_inc_all = man_inc - all_inc
        delta_exc_all = None
        if exc_n and all_exc is not None:
            delta_exc_all = man_exc - all_exc

        rows.append({
            "Model": model,
            "N": inc_n or exc_n,
            "Manual include": man_inc,
            "Manual exclude": man_exc,
            "Auto include": auto_inc,
            "Auto include (all)": all_inc,
            "Δ include (man−auto)": delta_inc,
            "Δ include (man−auto all)": delta_inc_all,
            "Auto exclude": auto_exc,
            "Auto exclude (all)": all_exc,
            "Δ exclude (man−auto)": delta_exc,
            "Δ exclude (man−auto all)": delta_exc_all,
        })

    table = pd.DataFrame(rows)
    if not table.empty:
        table = table.sort_values("Model").reset_index(drop=True)
        numeric = table.select_dtypes(include="number").columns
        table[numeric] = table[numeric].round(4)
    return table
604
+
605
+
606
def text_category_choices() -> list[str]:
    """Values for the text-category filter.

    Collects every key found under ``summary["metrics_by_category"]`` across
    all loaded records and returns them sorted, with ``"All"`` in front.
    """
    seen = {
        name
        for record in load_records()
        for name in record["summary"].get("metrics_by_category", {}).keys()
    }
    return ["All", *sorted(seen)]
612
+
613
+
614
def prompt_choices() -> list[str]:
    """Values for the simplification-prompt filter.

    Collects every key found under ``summary["metrics_by_prompt"]`` across all
    loaded records and returns them sorted, with ``"All"`` in front.
    """
    seen = {
        name
        for record in load_records()
        for name in record["summary"].get("metrics_by_prompt", {}).keys()
    }
    return ["All", *sorted(seen)]
620
+
621
+
622
+ def _source_bucket(s: dict, source: str, tc: str | None, prompt: str | None) -> dict:
623
+ """Return the metric bucket for one source, filtered by text category and/or prompt.
624
+
625
+ Picks the overall summary when neither filter is set, the ``*_by_category`` /
626
+ ``*_by_prompt`` bucket when one is set, and the ``*_by_category_prompt`` bucket
627
+ (keyed ``"CATEGORY/PROMPT"``) when both are set.
628
+ """
629
+ if source in ("metrics", "markers", "similarity"):
630
+ if tc and prompt:
631
+ return s.get(f"{source}_by_category_prompt", {}).get(f"{tc}/{prompt}", {})
632
+ if tc:
633
+ return s.get(f"{source}_by_category", {}).get(tc, {})
634
+ if prompt:
635
+ return s.get(f"{source}_by_prompt", {}).get(prompt, {})
636
+ return s.get(source, {})
637
+ # questeval / ifeval keep their per-filter buckets nested under the source object
638
+ src = s.get(source, {})
639
+ if tc and prompt:
640
+ return src.get("by_category_prompt", {}).get(f"{tc}/{prompt}", {})
641
+ if tc:
642
+ return src.get("by_category", {}).get(tc, {})
643
+ if prompt:
644
+ return src.get("by_prompt", {}).get(prompt, {})
645
+ return src
646
+
647
+
648
def _bucket_n(s: dict, tc: str | None, prompt: str | None) -> int | None:
    """Sample count for the selected filters.

    Looks at the ``"n"`` field of the questeval bucket first and the ifeval
    bucket second, returning the first one that is not ``None`` (or ``None``
    when neither bucket records a count).
    """
    # Lazy generator: the ifeval bucket is only consulted if questeval has no "n".
    counts = (
        _source_bucket(s, name, tc, prompt).get("n")
        for name in ("questeval", "ifeval")
    )
    return next((count for count in counts if count is not None), None)
655
+
656
+
657
def load_category_df(
    category: dict,
    text_category: str | None = None,
    prompt: str | None = None,
) -> pd.DataFrame:
    """Build a DataFrame for one metric category with a per-category RRF score.

    ``text_category`` and ``prompt`` (each ignored when ``None`` or ``"All"``)
    restrict the metrics to one source-text category and/or one simplification
    prompt via the matching ``*_by_category`` / ``*_by_prompt`` /
    ``*_by_category_prompt`` buckets; otherwise the overall summary is used.
    The RRF is always computed over **all** models; the size-limit / model-type
    filters are applied afterwards (in ``load_rrf_views``) as pure row filters,
    so they never change a model's rank or score.

    Args:
        category: Category definition. ``category["metrics"]`` is a sequence of
            ``(source, key, label, ascending, in_rrf)`` tuples; only entries
            with a truthy ``in_rrf`` produce a column and feed the score.
        text_category: Source-text category filter.
        prompt: Simplification-prompt filter.

    Returns:
        A frame with columns ``Rank, Model, Params, N, RRF Score`` followed by
        one column per in-RRF metric, ordered best-first. Empty when no records
        are loaded. ``RRF Score`` is rounded to 4 decimals for display, but the
        row order (and therefore ``Rank``) is decided on the full-precision
        score, with ``Model`` as a deterministic tie-break.
    """
    tc = None if text_category in (None, "All") else text_category
    pr = None if prompt in (None, "All") else prompt

    # Resolve the in-RRF metrics (and their column names) once, up front.
    ranked_metrics = [
        (source, key, _col_name(source, label), ascending)
        for source, key, label, ascending, in_rrf in category["metrics"]
        if in_rrf
    ]

    rows: list[dict] = []
    for data in load_records():
        s = data["summary"]
        model = _model_label(data)
        # With a filter active prefer the bucket's own sample count, falling
        # back to the overall count when the bucket does not record one.
        n = (_bucket_n(s, tc, pr) or s["n"]) if (tc or pr) else s["n"]
        row: dict = {"Model": model, "N": n}

        for source, key, col, _ascending in ranked_metrics:
            bucket = _source_bucket(s, source, tc, pr)
            if source in ("metrics", "markers"):
                row[col] = bucket.get(key, {}).get("avg_diff_pct")
            else:  # similarity, questeval, ifeval store the value directly
                row[col] = bucket.get(key)

        rows.append(row)

    df = pd.DataFrame(rows)
    if df.empty:
        return df

    num_cols = df.select_dtypes(include="number").columns
    df[num_cols] = df[num_cols].round(4)

    rrf = pd.Series(0.0, index=df.index)
    for _source, _key, col, ascending in ranked_metrics:
        # A metric no model reports carries no ranking information: skip it.
        if col not in df.columns or df[col].isna().all():
            continue
        rrf += 1.0 / (RRF_K + df[col].rank(ascending=ascending, method="min", na_option="bottom"))

    df.insert(2, "RRF Score", rrf.round(4))

    # Order on the *unrounded* score: rounding to 4 decimals can collapse
    # genuinely different scores into a tie, and the default sort is not
    # stable, which would make the resulting Rank arbitrary. Model name breaks
    # any exact ties so the table is reproducible.
    order = (
        pd.DataFrame({"score": rrf, "model": df["Model"]})
        .sort_values(["score", "model"], ascending=[False, True])
        .index
    )
    df = df.loc[order].reset_index(drop=True)
    df.insert(0, "Rank", range(1, len(df) + 1))
    df.insert(2, "Params", df["Model"].map(_params_map()).fillna(""))
    return df
715
+
716
+
717
def _plcc_overall_map() -> dict[str, float]:
    """Map each model label to its external PLCC overall score.

    The value is taken as-is from ``metadata["plcc"]["overall"]`` of every
    loaded record. PLCC is a separate Polish-language-competence benchmark
    (sdadas/plcc) shown for reference only; it plays no part in the RRF
    ranking. Records whose ``plcc`` entry is absent, null, or lacks an
    ``overall`` value are left out of the mapping.
    """
    scores: dict[str, float] = {}
    for record in load_records():
        overall = (record["metadata"].get("plcc") or {}).get("overall")
        if overall is None:
            continue
        scores[_model_label(record)] = overall
    return scores
732
+
733
+
734
def build_final_ranking_df(category_data: list[tuple[dict, pd.DataFrame]]) -> pd.DataFrame:
    """Fuse per-category RRF scores into a final ranking via RRF.

    Each category column shows the model's **rank within that category** (1 = best);
    those ranks are what the RRF fusion uses to produce the overall ``Final RRF``.
    A reference ``PLCC`` column carries the external PLCC benchmark score and does
    not influence the ranking.

    Args:
        category_data: ``(category, category_df)`` pairs. A category takes part
            when its ``in_rrf`` flag is truthy (default ``True``) and its frame
            is non-empty; its frame must carry ``Model`` and ``RRF Score``.
            The optional ``rrf_weight`` of each category (default ``1``) scales
            that category's contribution.

    Returns:
        A frame with columns ``Rank, Model, Params, N, Final RRF, PLCC`` plus
        one rank column per participating category, ordered best-first. An
        empty frame when no category participates. ``Final RRF`` is rounded to
        4 decimals for display; the row order is decided on the full-precision
        score with ``Model`` as a deterministic tie-break.
    """
    merged: pd.DataFrame | None = None
    for cat, cat_df in category_data:
        if not cat.get("in_rrf", True) or cat_df.empty:
            continue
        sub = cat_df[["Model", "RRF Score"]].rename(columns={"RRF Score": cat["name"]})
        merged = sub if merged is None else merged.merge(sub, on="Model", how="outer")

    if merged is None or merged.empty:
        return pd.DataFrame()

    # N (sample count) is identical across categories for a given model, so take
    # it from whichever category table carries it.
    n_map: dict = {}
    for _cat, cat_df in category_data:
        if not cat_df.empty and {"Model", "N"} <= set(cat_df.columns):
            n_map = dict(zip(cat_df["Model"], cat_df["N"]))
            break

    score_cols = [c for c in merged.columns if c != "Model"]
    # Weights come from the same category dicts that supplied the names and
    # in_rrf flags, so the function honours exactly what the caller passed in.
    weights = {cat["name"]: cat.get("rrf_weight", 1) for cat, _df in category_data}

    out = merged[["Model"]].copy()
    rrf = pd.Series(0.0, index=merged.index)
    rank_cols: dict[str, pd.Series] = {}
    for col in score_cols:
        # Models absent from a category (NaN after the outer merge) rank last.
        ranks = merged[col].rank(ascending=False, method="min", na_option="bottom").astype(int)
        rrf += weights.get(col, 1) / (RRF_K + ranks)
        rank_cols[col] = ranks

    out.insert(1, "Final RRF", rrf.round(4))
    out.insert(2, "PLCC", out["Model"].map(_plcc_overall_map()).round(2))
    for name, ranks in rank_cols.items():
        out[name] = ranks

    # Order on the *unrounded* fused score: sorting the rounded column with the
    # default (non-stable) sort would order near-ties arbitrarily.
    order = (
        pd.DataFrame({"score": rrf, "model": out["Model"]})
        .sort_values(["score", "model"], ascending=[False, True])
        .index
    )
    out = out.loc[order].reset_index(drop=True)
    out.insert(0, "Rank", range(1, len(out) + 1))
    out.insert(2, "Params", out["Model"].map(_params_map()).fillna(""))
    out.insert(3, "N", out["Model"].map(n_map).astype("Int64"))
    return out
780
+
781
+
782
def _fog_vs_score_scatter(
    text_category: str | None,
    prompt: str | None,
    size_limit: str | None,
    model_type: str | None,
    *,
    y_source: str,
    y_key: str,
    y_label: str,
    title: str,
    color: str,
) -> go.Figure | None:
    """Shared implementation of the Gunning-Fog-vs-quality scatters.

    X is always ``gunning_fog_orth`` ``avg_diff_pct`` from the ``metrics``
    source; Y is ``y_key`` read from the ``y_source`` bucket. Models lacking
    either value are left out.

    Args:
        text_category, prompt: Bucket filters (``None`` / ``"All"`` = no filter).
        size_limit, model_type: Passed to ``_filtered_records`` to pick models.
        y_source: Summary source supplying the Y value.
        y_key: Key of the Y value inside that source's bucket.
        y_label: Human-readable Y name, used in the hover text and axis title.
        title: Figure title.
        color: Marker fill colour.

    Returns:
        The figure, or ``None`` when no model has both values.
    """
    tc = None if text_category in (None, "All") else text_category
    pr = None if prompt in (None, "All") else prompt

    points = []
    for data in _filtered_records(size_limit, model_type):
        s = data["summary"]
        model = _model_label(data)
        x = _source_bucket(s, "metrics", tc, pr).get("gunning_fog_orth", {}).get("avg_diff_pct")
        y = _source_bucket(s, y_source, tc, pr).get(y_key)
        if x is None or y is None:
            continue
        points.append((model, x, y))

    if not points:
        return None

    models, xs, ys = zip(*points)

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=xs,
            y=ys,
            mode="markers+text",
            text=models,
            textposition="top center",
            textfont={"size": 10},
            marker={"size": 12, "color": color, "line": {"width": 1, "color": "white"}},
            # Plain concatenation: the template's own %{...} placeholders would
            # need escaping inside an f-string.
            hovertemplate=(
                "<b>%{text}</b><br>Gunning Fog Δ%: %{x:.2f}<br>"
                + y_label
                + ": %{y:.3f}<extra></extra>"
            ),
        )
    )

    # Dotted cross-hair through the middle of the observed range on each axis.
    x_mid = (min(xs) + max(xs)) / 2
    y_mid = (min(ys) + max(ys)) / 2
    fig.add_hline(y=y_mid, line_dash="dot", line_color="lightgray")
    fig.add_vline(x=x_mid, line_dash="dot", line_color="lightgray")

    fig.update_layout(
        title=title,
        xaxis_title="Gunning Fog orth Δ% (← easier text)",
        yaxis_title=y_label + " (↑ meaning preserved)",
        height=560,
        margin={"l": 60, "r": 40, "t": 60, "b": 60},
        plot_bgcolor="white",
    )
    fig.update_xaxes(showgrid=True, gridcolor="#EEE", zeroline=True, zerolinecolor="#CCC")
    fig.update_yaxes(showgrid=True, gridcolor="#EEE")

    return fig


def build_tradeoff_scatter(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> go.Figure | None:
    """Scatter of Gunning Fog reduction vs meaning preservation, one point per model.

    X: Gunning Fog orth Δ% (more negative = greater complexity reduction)
    Y: QuestEval F1 (higher = better meaning preservation)

    Honours the same text-category / prompt / size / model-type filters as the
    RRF rankings. Returns ``None`` when no model has both values.
    """
    return _fog_vs_score_scatter(
        text_category,
        prompt,
        size_limit,
        model_type,
        y_source="questeval",
        y_key="f1",
        y_label="QuestEval F1",
        title="Complexity reduction vs meaning preservation",
        color="#4C78A8",
    )


def build_fog_nli_scatter(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> go.Figure | None:
    """Scatter of Gunning Fog reduction vs NLI F1, one point per model.

    X: Gunning Fog orth Δ% (more negative = greater complexity reduction)
    Y: NLI F1 (higher = stronger entailment / meaning preserved)

    Honours the same text-category / prompt / size / model-type filters as the
    RRF rankings. Returns ``None`` when no model has both values.
    """
    return _fog_vs_score_scatter(
        text_category,
        prompt,
        size_limit,
        model_type,
        y_source="similarity",
        y_key="nli_f1",
        y_label="NLI F1",
        title="Complexity reduction vs NLI consistency",
        color="#E45756",
    )
908
+
909
+
910
+ INTRO = """\
911
+ # PLainBench - Polish Text Simplification Leaderboard
912
+
913
+ This benchmark evaluates how well LLMs simplify difficult Polish texts -
914
+ drawn from legal/administrative (BIP/GOV), finance, and science domains - while
915
+ preserving the original meaning. Each model simplifies 210 source texts under
916
+ 5 simplification prompts (1050 outputs per model). Outputs are scored on
917
+ readability indices, fine-grained difficulty markers (lexical, syntactic,
918
+ morphological), meaning preservation (NLI entailment, QuestEval QA consistency,
919
+ named-entity retention), and instruction following (IFEval include/exclude).
920
+ The per-category scores are fused into an overall **Final RRF** ranking.
921
+ """
922
+
923
+ METRICS_DOC = """\
924
+ ## Metrics
925
+
926
+ ### Readability indices
927
+
928
+ All indices are adapted for Polish syllable counting via `pyhyphen` (pl_PL
929
+ dictionary) and counted on surface (orthographic) word forms.
930
+
931
+ Δ is the absolute change (after − before); Δ% is the average percentage change
932
+ from the original text to the simplified text.
933
+
934
+ | Metric | Formula | Interpretation |
935
+ |---|---|---|
936
+ | **Flesch Reading Ease** | `206.835 − 1.015 × (words/sentences) − 84.6 × (syllables/words)` | Higher → easier text (0–100 typical range). **Desired Δ%: positive (+)** |
937
+ | **Gunning Fog** | `0.4 × [(words/sentences) + 100 × (complex_words/words)]` | School years needed (complex = ≥ 4 syllables). Lower → easier. **Desired Δ%: negative (−)** |
938
+ | **Coleman-Liau** | `0.0588 × L − 0.296 × S − 15.8` | Character-based grade level. Lower → easier. **Desired Δ%: negative (−)** |
939
+
940
+ ### Difficulty markers
941
+
942
+ Fine-grained syntactic, morphological, and lexical features.
943
+ Δ is absolute change; Δ% is percentage change.
944
+ Difficult words are defined as not a named entity, ≥ 4 syllables, counted on the
945
+ surface (orthographic) form.
946
+
947
+ | Marker | Description | Desired Δ% |
948
+ |---|---|---|
949
+ | **Avg word syllables** | Mean syllable count per word | − (shorter words) |
950
+ | **Difficult word ratio (orth)** | Difficult words / all words (surface, excl. NEs) | − |
951
+ | **Difficult noun ratio (orth)** | Difficult nouns / all tokens (surface, excl. NEs) | − |
952
+ | **Verb ratio** | Verbs / all tokens | + (more verbal, less nominal) |
953
+ | **Avg sentence length** | Mean tokens per sentence | − (shorter sentences) |
954
+ | **Mean dep. distance** | Avg linear head-dependent distance (syntax complexity) | − (flatter syntax) |
955
+ | **Subordination index** | Subordinate clauses / total clauses | − |
956
+ | **Adverbial participle ratio** | Adverbial participles (converbs, e.g. *czytając*, *przeczytawszy*) / all tokens | − |
957
+ | **Gerund ratio** | Gerunds / all tokens | − |
958
+ | **Impersonal verb ratio** | Impersonal verb forms (modals *należy*/*trzeba*, -no/-to passives, reflexive *się*, infinitives) / all verbs | − |
959
+ | **Genitive noun ratio** | Nouns in genitive case / all tokens | − |
960
+ | **Avg genitive chain** | Mean length of consecutive genitive noun phrases | − |
961
+ | **Verbo-nominal ratio** | Light-verb + noun periphrases (*dokonać wpłaty*, *podjąć decyzję*); administrative style | − |
962
+ | **OSC noun ratio** | Abstract *-ość* nouns (*możliwość*, *konieczność*) / all nouns | − |
963
+
964
+ ### Similarity metrics
965
+
966
+ Reference-based metrics comparing simplified text against the original.
967
+
968
+ | Metric | Description | Direction |
969
+ |---|---|---|
970
+ | **NLI P / R / F1** | NLI consistency via stella embeddings + mDeBERTa cross-encoder | Higher = stronger entailment |
971
+ | **NE Retention** | Fraction of named entities from the original kept in the simplified text | Higher = more entities preserved |
972
+
973
+ *Only **NLI F1** feeds the RRF score; P and R are shown for context.*
974
+
975
+ ### QuestEval - QA consistency
976
+
977
+ | Metric | Description | Direction |
978
+ |---|---|---|
979
+ | **QuestEval P** | Backward precision - grounding of simplified claims | Higher = fewer hallucinations |
980
+ | **QuestEval R** | Forward recall - information preserved | Higher = less content dropped |
981
+ | **QuestEval F1** | Harmonic mean of P and R | Higher = better meaning preservation |
982
+ | **Answerable (fwd)** | Fraction of forward questions answerable | Higher = stays on-topic |
983
+ | **Answerable (bwd)** | Fraction of backward questions answerable | Higher = claims traceable to original |
984
+
985
+ *Only **QuestEval F1** feeds the RRF score; the other rows are shown for context.*
986
+
987
+ ### IFEval - instruction following
988
+
989
+ | Metric | Description | Direction |
990
+ |---|---|---|
991
+ | **IFEval include** | Fraction of *include* constraints (terms the simplification must keep) satisfied | Higher = better |
992
+ | **IFEval exclude** | Fraction of *exclude* constraints (terms the simplification must avoid) satisfied | Higher = better |
993
+ """
994
+
995
# Sample-count note shown under each table that carries an ``N`` column.
# It is Markdown text: ``build_app`` renders it with ``gr.Markdown`` directly
# beneath the final-ranking table and beneath every per-category RRF table.
N_NOTE = "**N** = number of prompt × text evaluations per model."
997
+
998
+ # The five simplification prompts every model is run with. The keys match the
999
+ # "Simplification prompt" filter values (and the ``*_by_prompt`` summary
1000
+ # buckets); each value is ``(short description, user-message template)``, where
1001
+ # ``<text>`` marks where the source text is inserted. Kept in sync with
1002
+ # generation/prompting/instruction.py. Ordered from least to most detailed.
1003
+ PROMPTS: dict[str, tuple[str, str]] = {
1004
+ "mini": (
1005
+ "Minimal - a single-line instruction, no rules.",
1006
+ "Uprość podany tekst, w odpowiedzi podaj jedynie uproszczony tekst, "
1007
+ "bez dodatkowych komentarzy. Tekst do uproszczenia:\n\n<text>",
1008
+ ),
1009
+ "compact": (
1010
+ "Compact - a short bulleted rule set.",
1011
+ """Uprość poniższy tekst, tak aby był jasny, krótki i zrozumiały dla osoby bez wiedzy specjalistycznej.
1012
+
1013
+ Zasady:
1014
+ - Skup się na najważniejszych informacjach, usuń zbędne treści.
1015
+ - Uporządkuj tekst: najważniejsze na początku, podziel na krótkie akapity.
1016
+ - Używaj prostego, codziennego słownictwa; trudne pojęcia zastępuj lub krótko wyjaśniaj.
1017
+ - Twórz krótkie zdania (jedna myśl = jedno zdanie).
1018
+ - Pisz bezpośrednio do odbiorcy i używaj strony czynnej.
1019
+ - Unikaj żargonu, urzędowego stylu, skomplikowanych konstrukcji i podwójnych przeczeń.
1020
+ - Zachowaj poprawność językową i logiczną spójność.
1021
+ - W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.
1022
+
1023
+ ---
1024
+
1025
+ ### Tekst do uproszczenia:
1026
+
1027
+ <text>""",
1028
+ ),
1029
+ "medium": (
1030
+ "Medium - moderately detailed rules with sub-points.",
1031
+ """Uprość poniższy tekst tak, aby był zrozumiały dla osoby bez wiedzy specjalistycznej, zachowując jego sens.
1032
+
1033
+ ### Zasady:
1034
+ - Skup się na celu tekstu i najważniejszych informacjach — usuń dygresje i zbędne treści.
1035
+ - Uporządkuj treść: zacznij od najważniejszych informacji, podziel tekst na krótkie akapity.
1036
+ - Stosuj proste i naturalne słownictwo:
1037
+ - zamieniaj trudne lub specjalistyczne wyrazy na prostsze,
1038
+ - jeśli trzeba — krótko je wyjaśnij lub podaj przykład.
1039
+ - Buduj krótkie zdania (jedna myśl = jedno zdanie, ok. 15–20 słów).
1040
+ - Pisz bezpośrednio do odbiorcy i używaj strony czynnej.
1041
+ - Unikaj:
1042
+ - żargonu, stylu urzędowego i zapożyczeń,
1043
+ - form bezosobowych i strony biernej (jeśli nie są konieczne),
1044
+ - nadmiaru rzeczowników odczasownikowych,
1045
+ - podwójnych przeczeń i zawiłych konstrukcji.
1046
+ - Zachowaj poprawność językową, spójność i logiczny układ tekstu.
1047
+ - W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.
1048
+
1049
+ ---
1050
+
1051
+ ### Tekst do uproszczenia:
1052
+
1053
+ <text>""",
1054
+ ),
1055
+ "long": (
1056
+ "Long - full, sectioned plain-language guidelines.",
1057
+ """Uprość poniższy tekst zgodnie z zasadami prostego języka.
1058
+
1059
+ ### 1. Cel i odbiorca
1060
+ - Dostosuj tekst do przeciętnego odbiorcy (zakładaj brak wiedzy specjalistycznej).
1061
+ - Skup się na najważniejszych informacjach.
1062
+
1063
+ ### 2. Struktura
1064
+ - Usuń informacje zbędne i poboczne.
1065
+ - Uporządkuj treść: najważniejsze informacje podaj na początku.
1066
+ - Podziel tekst na krótkie akapity (1 akapit = 1 myśl).
1067
+ - Jeśli tekst jest dłuższy, użyj nagłówków lub list.
1068
+
1069
+ ### 3. Słownictwo
1070
+ - Zastępuj trudne słowa prostszymi.
1071
+ - Unikaj:
1072
+ - terminów specjalistycznych (chyba że je wyjaśnisz),
1073
+ - słów rzadkich, książkowych i urzędowych,
1074
+ - zapożyczeń i modnych zwrotów,
1075
+ - skrótów niezrozumiałych dla odbiorcy.
1076
+ - W razie potrzeby:
1077
+ - wyjaśnij trudne pojęcia,
1078
+ - podaj przykłady,
1079
+ - używaj konkretnych nazw zamiast ogólników.
1080
+
1081
+ ### 4. Składnia
1082
+ - Twórz krótkie zdania (ok. 20 słów).
1083
+ - Jedno zdanie = jedna myśl.
1084
+ - Używaj zdań twierdzących.
1085
+ - Pisz bezpośrednio do odbiorcy (np. „Ty”, „możesz”, „zrób”).
1086
+ - Używaj strony czynnej zamiast biernej.
1087
+ - Unikaj form bezosobowych i skomplikowanych konstrukcji.
1088
+ - Ogranicz rzeczowniki odczasownikowe (zamieniaj je na czasowniki).
1089
+
1090
+ ### 5. Styl
1091
+ - Unikaj podwójnych przeczeń.
1092
+ - Upraszczaj złożone konstrukcje.
1093
+ - Zachowaj naturalny, jasny ton.
1094
+
1095
+ ### 6. Końcowa kontrola
1096
+ - Sprawdź, czy tekst jest:
1097
+ - zrozumiały,
1098
+ - poprawny językowo,
1099
+ - logiczny i spójny.
1100
+
1101
+ ### 7. W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.
1102
+
1103
+ ---
1104
+
1105
+ ### Tekst do uproszczenia:
1106
+
1107
+ <text>""",
1108
+ ),
1109
+ "step_by_step": (
1110
+ "Step by step - role-based, numbered editorial guidelines.",
1111
+ """Jesteś redaktorem odpowiedzialnym za wydawanie tekstów pisanych prostym językiem. Krok po kroku upraszczaj tekst kierując się poniższymi regułami:
1112
+
1113
+ 1. Zachowaj prostotę - unikaj skomplikowanych i trudnych słów oraz zdań.
1114
+ 2. Używaj zrozumiałego języka - pamiętaj, że tekst ma być czytelny dla szerokiego grona odbiorców, nie tylko dla osób z wiedzą specjalistyczną z danej dziedziny.
1115
+ 3. Unikaj języka żargonowego czy specjalistycznego, jeśli czujesz, że nie jest on zrozumiały dla większości czytelników.
1116
+ 4. Unikaj zbytnio skomplikowanych zwrotów - stawiaj na klarowność i prostotę.
1117
+ 5. Utrzymuj spójność - trzymaj się jednego stylu pisania i konsekwentnie go stosuj.
1118
+ 6. Unikaj nadmiernego użycia skrótów czy akronimów – jeśli ich używasz, to upewnij się, że są one łatwe do zrozumienia dla wszystkich czytelników.
1119
+ 7. Przemyśl kolejność i strukturę informacji - rozmieszczenie treści powinno być logiczne i przemyślane.
1120
+ 8. Unikaj zbędnego wydłużania tekstu, pisz zwięźle i konkretnie.
1121
+ 9. W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.
1122
+
1123
+ ---
1124
+
1125
+ ### Tekst do uproszczenia:
1126
+
1127
+ <text>""",
1128
+ ),
1129
+ }
1130
+
1131
+ # ── PLCC-inspired visual style ──────────────────────────────────────────────
1132
+ # Mirrors the sdadas/plcc leaderboard: clean white background, a system
1133
+ # sans-serif stack, a blue accent (#2c7be5), and flat tables with shaded
1134
+ # (#f9fafd) headers, #ddd borders and faintly striped rows. Done entirely in
1135
+ # CSS — a custom gr.themes.* would tint the component label chips blue, which
1136
+ # is not part of the PLCC look.
1137
+ PLCC_CSS = """
1138
+ .gradio-container {
1139
+ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto,
1140
+ "Helvetica Neue", Arial, sans-serif !important;
1141
+ max-width: 1500px !important;
1142
+ }
1143
+ /* PLCC-style data tables */
1144
+ .plain-table table { border: 1px solid #ddd !important; font-size: 0.9em; }
1145
+ .plain-table thead th {
1146
+ background: #f9fafd !important;
1147
+ border-bottom: 2px solid #ddd !important;
1148
+ color: #222 !important;
1149
+ font-weight: 700 !important;
1150
+ }
1151
+ .plain-table tbody td { padding: 8px 10px !important; }
1152
+ .plain-table tbody tr:nth-child(even) > td { background: rgba(0, 0, 0, 0.02) !important; }
1153
+ .plain-table tbody tr:hover > td { background: rgba(44, 123, 229, 0.06) !important; }
1154
+ /* Params column (3rd) — right-aligned, PLCC-style; cells muted grey, header black */
1155
+ .params-col tbody td:nth-child(3),
1156
+ .params-col thead th:nth-child(3) {
1157
+ text-align: right !important;
1158
+ white-space: nowrap;
1159
+ }
1160
+ .params-col tbody td:nth-child(3) { color: #999 !important; }
1161
+ /* Filter bar — the grey rounded block holding the dropdowns */
1162
+ .filter-bar {
1163
+ background: #f9fafd;
1164
+ border: 1px solid #ddd;
1165
+ border-radius: 0.5rem;
1166
+ padding: 10px 14px;
1167
+ }
1168
+ """
1169
+
1170
# Colour palette for category bars: seven hex colours.
# NOTE(review): the consumer of this list is not visible in this part of the
# file; presumably colours are assigned to categories by position - confirm.
_CAT_COLORS = ["#4C78A8", "#72B7B2", "#54A24B", "#EECA3B", "#B279A2", "#E45756", "#F58518"]
1172
+
1173
+
1174
+ def _filter_model_rows(df: pd.DataFrame, allowed: set[str]) -> pd.DataFrame:
1175
+ """Keep only rows whose ``Model`` is in ``allowed`` (ranks/scores untouched)."""
1176
+ if df.empty or "Model" not in df.columns:
1177
+ return df
1178
+ return df[df["Model"].isin(allowed)].reset_index(drop=True)
1179
+
1180
+
1181
def load_rrf_views(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> tuple[pd.DataFrame, list[tuple[dict, pd.DataFrame]]]:
    """Final ranking plus one table per category, for the selected filters.

    Scoring happens first and sees **every** model: only the text-category and
    prompt filters influence ranks and RRF scores. The size-limit and
    model-type selections are applied last, purely to hide rows, so a model
    that stays visible shows the same rank it has in the unfiltered table.

    Returns:
        ``(final_df, category_data)`` where ``category_data`` is a list of
        ``(category, category_df)`` pairs, one per entry of ``CATEGORIES``.
    """
    # 1) Score over the complete model set.
    per_category = []
    for cat in CATEGORIES:
        per_category.append((cat, load_category_df(cat, text_category, prompt)))
    ranking = build_final_ranking_df(per_category)

    # 2) Work out which models pass the size / type filters.
    visible = set()
    for record in _filtered_records(size_limit, model_type):
        visible.add(_model_label(record))

    # 3) Hide everything else without touching ranks or scores.
    ranking = _filter_model_rows(ranking, visible)
    per_category = [
        (cat, _filter_model_rows(table, visible)) for cat, table in per_category
    ]
    return ranking, per_category
1203
+
1204
+
1205
def _tradeoff_figs(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> tuple[go.Figure, go.Figure]:
    """Return the QuestEval and NLI trade-off scatters for the given filters.

    Whenever a builder yields nothing, a blank ``go.Figure()`` takes its place,
    so both slots of the returned pair always hold a figure.
    """
    filters = (text_category, prompt, size_limit, model_type)
    questeval_fig = build_tradeoff_scatter(*filters) or go.Figure()
    nli_fig = build_fog_nli_scatter(*filters) or go.Figure()
    return questeval_fig, nli_fig
1216
+
1217
+
1218
+ def build_app() -> gr.Blocks:
1219
+ (
1220
+ read_orth_df, read_lemma_df,
1221
+ lex_orth_df, lex_lemma_df,
1222
+ similarity_df, questeval_df,
1223
+ markers_df, detail_df,
1224
+ ) = load_leaderboard_data()
1225
+
1226
+ ifeval_cmp_df = load_ifeval_comparison_df()
1227
+ final_df, category_data = load_rrf_views(None, None)
1228
+ tc_choices = text_category_choices()
1229
+ pr_choices = prompt_choices()
1230
+ size_choices = _visible_size_limits()
1231
+ tradeoff_fig, fog_nli_fig = _tradeoff_figs(None, None)
1232
+
1233
+ with gr.Blocks(title="PLainBench", css=PLCC_CSS) as app:
1234
+ gr.Markdown(INTRO)
1235
+
1236
+ if read_orth_df.empty:
1237
+ gr.Markdown("*No data found. Upload scored anon JSON files to the `data/current/` directory.*")
1238
+ else:
1239
+ # Reactive output components, gathered in the order the change
1240
+ # handler returns them: final table, then one table per in-RRF
1241
+ # category, then the two trade-off scatters (and the IFEval table).
1242
+ rrf_outputs: list = []
1243
+
1244
+ with gr.Row(elem_classes=["filter-bar"]):
1245
+ tc_dropdown = gr.Dropdown(
1246
+ choices=tc_choices,
1247
+ value="All",
1248
+ label="Text category",
1249
+ info="Filter the RRF rankings to one source-text category.",
1250
+ )
1251
+ pr_dropdown = gr.Dropdown(
1252
+ choices=pr_choices,
1253
+ value="All",
1254
+ label="Simplification prompt",
1255
+ info="Filter the RRF rankings to one simplification prompt.",
1256
+ )
1257
+ size_dropdown = gr.Dropdown(
1258
+ choices=size_choices,
1259
+ value="ALL",
1260
+ label="Size limit",
1261
+ info="Keep only models up to this many parameters.",
1262
+ )
1263
+ type_dropdown = gr.Dropdown(
1264
+ choices=MODEL_TYPES,
1265
+ value="ALL",
1266
+ label="Model type",
1267
+ info="Filter by open- vs closed-weights models.",
1268
+ )
1269
+
1270
+ with gr.Tabs():
1271
+
1272
+ # ── Final Ranking ──────────────────────────────────────────
1273
+ with gr.TabItem("Final Ranking"):
1274
+ gr.Markdown(
1275
+ "Final model ranking via **Reciprocal Rank Fusion** (k=60) over per-category RRF scores. "
1276
+ "Each category ranks models by its own RRF score; those ranks are then fused into a "
1277
+ "single **Final RRF** score. Higher = better overall simplification. "
1278
+ "The **PLCC** column shows the model's score on the external "
1279
+ "[PLCC](https://huggingface.co/spaces/sdadas/plcc) Polish-language-competence "
1280
+ "benchmark for reference only - it does not affect the ranking (blank where unavailable)."
1281
+ )
1282
+ final_table = gr.Dataframe(
1283
+ value=final_df, interactive=False, wrap=True,
1284
+ elem_classes=["plain-table", "params-col"],
1285
+ )
1286
+ gr.Markdown(N_NOTE)
1287
+ rrf_outputs += [final_table]
1288
+
1289
+ # ── RRF category tabs ──────────────────────────────────────
1290
+ for cat, cat_df in category_data:
1291
+ if not cat.get("in_rrf", True):
1292
+ continue
1293
+ with gr.TabItem(cat["name"]):
1294
+ gr.Markdown(cat["description"])
1295
+ cat_table = gr.Dataframe(
1296
+ value=cat_df, interactive=False, wrap=True,
1297
+ elem_classes=["plain-table", "params-col"],
1298
+ )
1299
+ gr.Markdown(N_NOTE)
1300
+ rrf_outputs += [cat_table]
1301
+
1302
+ # ── Trade-off plots ────────────────────────────────────────
1303
+ with gr.TabItem("Trade-off"):
1304
+ gr.Markdown(
1305
+ "Complexity reduction (Gunning Fog orth Δ%) versus meaning preservation "
1306
+ "(QuestEval F1), one point per model. Top-left is ideal: "
1307
+ "greater complexity reduction **and** faithful to the original."
1308
+ )
1309
+ tradeoff_plot = gr.Plot(value=tradeoff_fig)
1310
+ gr.Markdown(
1311
+ "---\n"
1312
+ "Gunning Fog orth reduction (Δ%) versus NLI F1. "
1313
+ "Top-left is best: greater complexity reduction **and** strong NLI entailment."
1314
+ )
1315
+ fog_nli_plot = gr.Plot(value=fog_nli_fig)
1316
+ rrf_outputs += [tradeoff_plot, fog_nli_plot]
1317
+
1318
+ with gr.TabItem("Detailed scores", visible=False):
1319
+ gr.Markdown(
1320
+ "Average scores before and after simplification, plus absolute (Δ) "
1321
+ "and percentage (Δ%) change - for all readability, lexical, and marker metrics."
1322
+ )
1323
+ gr.Dataframe(
1324
+ value=detail_df, interactive=False, wrap=True,
1325
+ elem_classes=["plain-table"],
1326
+ )
1327
+
1328
+ # ── IFEval: manual vs automatic ────────────────────────────
1329
+ if not ifeval_cmp_df.empty:
1330
+ with gr.TabItem("IFEval manual vs auto"):
1331
+ gr.Markdown(
1332
+ "**Automatic** IFEval constraints are generated by an LLM; "
1333
+ "**manual** constraints are hand-written gold rules, available for a "
1334
+ "subset of the prompts. To isolate rule quality from sampling, the "
1335
+ "comparison is restricted to the texts that carry **both** scores "
1336
+ "(N = matched texts per model), so these automatic figures differ from "
1337
+ "the full-sample IFEval used elsewhere.\n\n"
1338
+ "**include** = fraction of *include* constraints satisfied, "
1339
+ "**exclude** = fraction of *exclude* constraints satisfied (higher is "
1340
+ "better for both). **Δ = manual − automatic** (on the matched texts): a "
1341
+ "negative Δ means the automatic rules were easier to satisfy than the "
1342
+ "hand-checked ones (more lenient automatic scoring). The **(all)** columns "
1343
+ "show automatic IFEval over *every* text (the full-sample figure used "
1344
+ "elsewhere). **Δ (man−auto all)** is manual minus that full-sample "
1345
+ "automatic value - useful as a sanity check, but note the two cover "
1346
+ "different text sets (matched subset vs. all texts), so **Δ (man−auto)** "
1347
+ "is the rigorous like-for-like comparison."
1348
+ )
1349
+ ifeval_cmp_table = gr.Dataframe(
1350
+ value=ifeval_cmp_df, interactive=False, wrap=True,
1351
+ elem_classes=["plain-table"],
1352
+ )
1353
+ rrf_outputs.append(ifeval_cmp_table)
1354
+
1355
+ # Metric documentation, shown below the results.
1356
+ gr.Markdown(METRICS_DOC)
1357
+
1358
+ # Simplification prompts, documenting the "Simplification prompt"
1359
+ # filter values — shown below the metric documentation.
1360
+ gr.Markdown(
1361
+ "## Simplification prompts\n\n"
1362
+ "The five prompt templates every model is run with - these are the "
1363
+ "values of the **Simplification prompt** filter above. Each source "
1364
+ "text is simplified once per prompt, so they range from a bare "
1365
+ "one-line instruction to full plain-language guidelines. "
1366
+ "`<text>` marks where the source text is inserted."
1367
+ )
1368
+ for _name, (_desc, _body) in PROMPTS.items():
1369
+ with gr.Accordion(f"{_name} - {_desc}", open=False):
1370
+ gr.Markdown(f"```\n{_body}\n```")
1371
+
1372
+ # Recompute the RRF rankings whenever any filter changes.
1373
+ _filters = [tc_dropdown, pr_dropdown, size_dropdown, type_dropdown]
1374
+
1375
def _refresh_rrf(
    text_category: str, prompt: str, size_limit: str, model_type: str
) -> list:
    """Rebuild every filter-dependent component value for the current filters.

    Used as the ``change`` callback of each filter dropdown. The returned
    list is positional: it has to line up with ``rrf_outputs``, i.e. the
    overall ranking table, then one table per category that takes part in
    the RRF ranking, then the trade-off figures, and finally the IFEval
    manual-vs-auto table when that tab was created.
    NOTE(review): assumes ``rrf_outputs`` holds no component ahead of the
    overall table - confirm against the part of ``build_app`` above.

    Args:
        text_category: Selected value of the text-category filter.
        prompt: Selected value of the simplification-prompt filter.
        size_limit: Selected value of the model-size filter.
        model_type: Selected value of the model-type filter.

    Returns:
        New values for the components in ``rrf_outputs``, in the same order.
    """
    # All loaders take the same four filter values, in the same order.
    selection = (text_category, prompt, size_limit, model_type)

    overall_df, per_category = load_rrf_views(*selection)
    refreshed: list = [overall_df]

    # Categories flagged in_rrf=False have no tab, hence no output slot.
    refreshed.extend(
        table for meta, table in per_category if meta.get("in_rrf", True)
    )

    # Presumably two figures (trade-off and Fog-vs-NLI) - matches the two
    # gr.Plot components registered in rrf_outputs.
    refreshed.extend(_tradeoff_figs(*selection))

    # The comparison tab exists only if the initial frame was non-empty, so
    # the same build-time condition decides whether a value is emitted here.
    if not ifeval_cmp_df.empty:
        refreshed.append(load_ifeval_comparison_df(*selection))

    return refreshed
1390
+
1391
+ for _dd in _filters:
1392
+ _dd.change(_refresh_rrf, inputs=_filters, outputs=rrf_outputs)
1393
+
1394
+ return app
1395
+
1396
+
1397
# Assemble the UI once at import time and expose it as the module-level `app`.
app = build_app()

# Start the server only when this file is executed directly as a script.
if __name__ == "__main__":
    app.launch()
data/current/CYFRAGOVPL__Llama-PLLuM-70B-chat-2412_2026-06-15_095534_scored_anon.json ADDED
The diff for this file is too large to render. See raw diff
 
data/current/CYFRAGOVPL__Llama-PLLuM-70B-chat-2512_2026-06-12_082622_scored_anon.json ADDED
The diff for this file is too large to render. See raw diff
 
data/current/CYFRAGOVPL__Llama-PLLuM-8B-chat-2412_2026-06-02_091112_scored_anon.json ADDED
The diff for this file is too large to render. See raw diff
 
data/current/CYFRAGOVPL__Llama-PLLuM-8B-chat-2512_2026-06-02_121044_scored_anon.json ADDED
The diff for this file is too large to render. See raw diff
 
data/current/CYFRAGOVPL__PLLuM-12B-chat-2412_2026-06-02_141510_scored_anon.json ADDED
The diff for this file is too large to render. See raw diff
 
data/current/CYFRAGOVPL__PLLuM-12B-chat-2512_2026-06-02_195811_scored_anon.json ADDED
The diff for this file is too large to render. See raw diff
 
data/current/CYFRAGOVPL__PLLuM-12B-instruct-2512_2026-06-10_102424_scored_anon.json ADDED
The diff for this file is too large to render. See raw diff
 
data/current/CYFRAGOVPL__PLLuM-4B-chat-2512_2026-06-02_223411_scored_anon.json ADDED
The diff for this file is too large to render. See raw diff
 
data/current/deepseek__deepseek-v4-pro_reasoning-high_2026-05-31_094932_scored_anon.json ADDED
The diff for this file is too large to render. See raw diff
 
data/current/google__gemini-3.1-pro-preview_2026-06-11_121124_scored_anon.json ADDED
The diff for this file is too large to render. See raw diff
 
data/current/google__gemma-3-4b-it_reasoning-none_2026-06-08_110604_scored_anon.json ADDED
The diff for this file is too large to render. See raw diff
 
data/current/google__gemma-4-26b-a4b-it_reasoning-high_2026-05-31_223337_scored_anon.json ADDED
The diff for this file is too large to render. See raw diff
 
data/current/google__gemma-4-26b-a4b-it_reasoning-none_2026-06-01_020338_scored_anon.json ADDED
The diff for this file is too large to render. See raw diff
 
data/current/google__gemma-4-31b-it_reasoning-high_2026-05-31_124753_scored_anon.json ADDED
The diff for this file is too large to render. See raw diff
 
data/current/google__gemma-4-31b-it_reasoning-none_2026-05-31_200347_scored_anon.json ADDED
The diff for this file is too large to render. See raw diff
 
data/current/meta-llama__llama-3.1-70b-instruct_reasoning-none_2026-06-08_102826_scored_anon.json ADDED
The diff for this file is too large to render. See raw diff
 
data/current/meta-llama__llama-3.1-8b-instruct_reasoning-none_2026-06-08_100015_scored_anon.json ADDED
The diff for this file is too large to render. See raw diff
 
data/current/mistralai__ministral-8b-2512_2026-05-31_083128_scored_anon.json ADDED
The diff for this file is too large to render. See raw diff
 
data/current/mistralai__mistral-nemo_2026-05-31_084528_scored_anon.json ADDED
The diff for this file is too large to render. See raw diff
 
data/current/openai__gpt-oss-120b_2026-06-12_123249_scored_anon.json ADDED
The diff for this file is too large to render. See raw diff
 
data/current/openai__gpt-oss-20b_2026-06-12_133408_scored_anon.json ADDED
The diff for this file is too large to render. See raw diff
 
data/current/qwen__qwen3.5-35b-a3b_reasoning-high_2026-06-01_023022_scored_anon.json ADDED
The diff for this file is too large to render. See raw diff
 
data/current/speakleash__Bielik-11B-v3.0-Instruct_2026-06-01_112337_scored_anon.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ pandas
2
+ plotly