"""PLainBench - Polish Text Simplification Leaderboard.
Reads scored anon JSON files from the data/current/ directory and displays a
leaderboard showing how well each LLM simplifies Polish texts, measured
by readability indices, difficulty markers, reference-based similarity
metrics, and a QuestEval-style QA consistency score.
"""
import json
from functools import lru_cache
from pathlib import Path
import gradio as gr
import pandas as pd
import plotly.graph_objects as go
# Directory scanned for the scored ``*_scored_anon.json`` files, resolved
# relative to this module's own location.
DATA_DIR = Path(__file__).parent / "data" / "current"
@lru_cache(maxsize=1)
def load_records() -> tuple[dict, ...]:
    """Read every scored anon JSON file once and memoise the trimmed result.

    The full files are large (~9 MB each, holding per-text records), yet the
    app only needs their ``metadata`` and ``summary`` sections. Only those two
    sections are retained, so each file is parsed a single time and later
    callers share the cached in-memory copy instead of hitting the disk again.

    Returns:
        One ``{"metadata": ..., "summary": ...}`` dict per
        ``*_scored_anon.json`` file under ``DATA_DIR``, ordered by sorted file
        path; an empty tuple when ``DATA_DIR`` does not exist.
    """
    if not DATA_DIR.exists():
        return ()

    def _trimmed(path) -> dict:
        # Parse the whole file, then drop everything but the two sections used.
        with open(path, encoding="utf-8") as handle:
            payload = json.load(handle)
        return {"metadata": payload["metadata"], "summary": payload["summary"]}

    return tuple(
        _trimmed(path) for path in sorted(DATA_DIR.glob("*_scored_anon.json"))
    )
# Model-level filters, mirroring the PLCC benchmark's "Size limit" / "Model type"
# quick-filters. Size options are *upper bounds* in billions of parameters,
# listed in descending display order. In both lists "ALL" means "no filter".
SIZE_LIMITS = ["ALL", "500B", "100B", "50B", "30B", "20B", "15B", "10B", "5B"]
MODEL_TYPES = ["ALL", "open-weights", "closed-weights"]
def _model_passes(meta: dict, size_limit: str, model_type: str) -> bool:
"""Whether a model's metadata satisfies the size-limit / model-type filters."""
if model_type and model_type != "ALL":
want = "open" if model_type == "open-weights" else "closed"
if meta.get("weights") != want:
return False
if size_limit and size_limit != "ALL":
cap = float(size_limit.rstrip("B"))
params = meta.get("total_params_b") or 0
# Unknown / unreported size (0) can't be placed under a cap, so exclude it.
if params <= 0 or params > cap:
return False
return True
def _filtered_records(
    size_limit: str | None = None, model_type: str | None = None
) -> list[dict]:
    """Return the cached records whose model satisfies both quick-filters.

    A missing or empty ``size_limit`` / ``model_type`` is treated as ``"ALL"``
    (no restriction) before being handed to ``_model_passes``.
    """
    effective_size = size_limit if size_limit else "ALL"
    effective_type = model_type if model_type else "ALL"

    selected: list[dict] = []
    for record in load_records():
        if _model_passes(record["metadata"], effective_size, effective_type):
            selected.append(record)
    return selected
def _visible_size_limits() -> list[str]:
    """Reduce ``SIZE_LIMITS`` to the caps that actually discriminate models.

    Walking the numeric caps from smallest to largest, a cap is shown only if
    it admits at least one model and admits a different number of models than
    the cap just below it. A larger cap that selects the same models as its
    smaller neighbour is therefore hidden, as is any cap below every model.
    ``"ALL"`` is always shown. The result is derived from the loaded data, so
    it widens by itself once models of new sizes are added.
    """
    # Sizes of all models that report a positive parameter count.
    known_sizes = []
    for record in load_records():
        size_b = record["metadata"].get("total_params_b") or 0
        if size_b > 0:
            known_sizes.append(size_b)

    numeric_caps = [
        (float(option.rstrip("B")), option)
        for option in SIZE_LIMITS
        if option != "ALL"
    ]
    numeric_caps.sort(key=lambda pair: pair[0])

    distinct: set[str] = set()
    previous_covered = -1
    for ceiling, option in numeric_caps:
        covered = len([size for size in known_sizes if size <= ceiling])
        if covered > 0 and covered != previous_covered:
            distinct.add(option)
        previous_covered = covered

    # Emit in the original (descending) display order, "ALL" leading.
    visible = ["ALL"]
    visible.extend(
        option for option in SIZE_LIMITS if option != "ALL" and option in distinct
    )
    return visible
# Summary-metric key → display label for the readability indices, ``_orth``
# variants. The labels are shared with READABILITY_LEMMA_LABELS.
READABILITY_ORTH_LABELS = {
    "flesch_reading_ease_orth": "Flesch RE",
    "flesch_kincaid_grade_orth": "Flesch-Kincaid",
    "gunning_fog_orth": "Gunning Fog",
    "ari_orth": "ARI",
    "linsear_write_orth": "Linsear Write",
    "smog_grade_orth": "SMOG",
    "coleman_liau_orth": "Coleman-Liau",
    "pisarek_orth": "Pisarek",
}
# Summary-metric key → display label for the readability indices, ``_lemma``
# variants. The labels are shared with READABILITY_ORTH_LABELS.
READABILITY_LEMMA_LABELS = {
    "flesch_reading_ease_lemma": "Flesch RE",
    "flesch_kincaid_grade_lemma": "Flesch-Kincaid",
    "gunning_fog_lemma": "Gunning Fog",
    "ari_lemma": "ARI",
    "linsear_write_lemma": "Linsear Write",
    "smog_grade_lemma": "SMOG",
    "coleman_liau_lemma": "Coleman-Liau",
    "pisarek_lemma": "Pisarek",
}
# Summary-metric key → display label for the lexical-diversity measures,
# ``_orth`` variants. The labels are shared with LEXICAL_LEMMA_LABELS.
LEXICAL_ORTH_LABELS = {
    "ttr_orth": "TTR",
    "rttr_orth": "RTTR",
    "cttr_orth": "CTTR",
    "herdan_orth": "Herdan",
    "summer_orth": "Summer",
    "dugast_orth": "Dugast",
    "maas_orth": "Maas",
    "mtld_orth": "MTLD",
    "mattr_orth": "MATTR",
}
# Summary-metric key → display label for the lexical-diversity measures,
# ``_lemma`` variants. The labels are shared with LEXICAL_ORTH_LABELS.
LEXICAL_LEMMA_LABELS = {
    "ttr_lemma": "TTR",
    "rttr_lemma": "RTTR",
    "cttr_lemma": "CTTR",
    "herdan_lemma": "Herdan",
    "summer_lemma": "Summer",
    "dugast_lemma": "Dugast",
    "maas_lemma": "Maas",
    "mtld_lemma": "MTLD",
    "mattr_lemma": "MATTR",
}
# ``summary["similarity"]`` key → display label. These values are read
# directly (a single value per key, no before/after delta).
SIMILARITY_LABELS = {
    "bert_score_precision": "BERTScore P",
    "bert_score_recall": "BERTScore R",
    "bert_score_f1": "BERTScore F1",
    "bleu": "BLEU",
    "chrf": "chrF",
    "chrfpp": "chrF++",
    "nli_precision": "NLI P",
    "nli_recall": "NLI R",
    "nli_f1": "NLI F1",
    "rouge_1_precision": "ROUGE-1 P",
    "rouge_1_recall": "ROUGE-1 R",
    "rouge_1_f1": "ROUGE-1 F1",
    "rouge_2_precision": "ROUGE-2 P",
    "rouge_2_recall": "ROUGE-2 R",
    "rouge_2_f1": "ROUGE-2 F1",
    "rouge_l_precision": "ROUGE-L P",
    "rouge_l_recall": "ROUGE-L R",
    "rouge_l_f1": "ROUGE-L F1",
    "wer": "WER",
    "mer": "MER",
    "wil": "WIL",
    "ne_retention": "NE Retention",
}
# ``summary["markers"]`` key → display label for the difficulty markers.
# NOTE(review): the Morphological entry of CATEGORIES reads the marker
# "adverbial_participle_ratio", which has no entry in this map, so loops over
# MARKER_LABELS never emit it - confirm whether that omission is intentional.
MARKER_LABELS = {
    # counts
    "paragraph_count": "Paragraph count",
    "sentence_count": "Sentence count",
    "word_count": "Word count",
    "named_entity_count": "Named entity count",
    "difficult_word_count": "Difficult word count",
    "difficult_word_count_orth": "Difficult word count (orth)",
    # average lengths
    "avg_word_syllables": "Avg word syllables",
    "avg_sentence_length": "Avg sentence length",
    "avg_paragraph_length": "Avg paragraph length",
    # lexical difficulty
    "named_entity_ratio": "Named entity ratio",
    "difficult_word_ratio": "Difficult word ratio",
    "difficult_word_ratio_orth": "Difficult word ratio (orth)",
    # POS ratios
    "noun_ratio": "Noun ratio",
    "difficult_noun_ratio": "Difficult noun ratio",
    "difficult_noun_ratio_orth": "Difficult noun ratio (orth)",
    "verb_ratio": "Verb ratio",
    "difficult_verb_ratio": "Difficult verb ratio",
    "difficult_verb_ratio_orth": "Difficult verb ratio (orth)",
    "adjective_ratio": "Adjective ratio",
    "difficult_adjective_ratio": "Difficult adjective ratio",
    "difficult_adjective_ratio_orth": "Difficult adjective ratio (orth)",
    # POS-to-POS ratios
    "noun_to_verb_ratio": "Noun/verb ratio",
    "verbo_nominal_ratio": "Verbo-nominal ratio",
    "adj_to_verb_ratio": "Adj/verb ratio",
    "adj_to_noun_ratio": "Adj/noun ratio",
    # morphological
    "nie_prefix_ratio": "Nie-prefix ratio",
    "participle_ratio": "Participle ratio",
    "gerund_ratio": "Gerund ratio",
    "osc_noun_ratio": "OSC noun ratio",
    "impersonal_verb_ratio": "Impersonal verb ratio",
    "genitive_noun_ratio": "Genitive noun ratio",
    "avg_genitive_chain_length": "Avg genitive chain",
    # syntactic
    "sentence_length_variance": "Sentence length variance",
    "mean_dependency_distance": "Mean dep. distance",
    "subordination_index": "Subordination index",
}
# ``summary["questeval"]`` key → display label. These values are read
# directly (a single value per key, no before/after delta).
QUESTEVAL_LABELS = {
    "precision": "QuestEval P",
    "recall": "QuestEval R",
    "f1": "QuestEval F1",
    "answerable_rate_forward": "Answerable (fwd)",
    "answerable_rate_backward": "Answerable (bwd)",
}
# Smoothing constant k of the reciprocal-rank-fusion term 1 / (k + rank).
RRF_K = 60
# Metric categories shown on the leaderboard. Category-level keys:
#   name        — display name; also the category's column in the final ranking
#   in_rrf      — whether the category takes part in the final RRF fusion
#   rrf_weight  — weight of the category's term in the final RRF
#   description — help text for the category (uses ** emphasis markup)
#   metrics     — list of (source, key, label, ascending_rrf, in_rrf) tuples:
#       source        — "metrics" | "markers" → use avg_diff_pct (Δ%)
#                       "similarity" | "questeval" | "ifeval" → use absolute value
#       ascending_rrf — True = lower value is better (rank 1 = smallest)
#       in_rrf        — include this metric in category RRF computation
CATEGORIES: list[dict] = [
    {
        "name": "Readability",
        "in_rrf": True,
        "rrf_weight": 1,
        "description": (
            "Readability indices - **orth** (surface-form) variants. "
            "Δ% = percentage change after simplification. "
            "For **Flesch RE** positive Δ% is better; for all others negative Δ% is better. "
            "**Flesch RE** rates reading ease from average sentence length and syllables per word (higher → easier, ~0–100). "
            "**Gunning Fog** estimates the schooling years needed to follow the text - 0.4 × (words/sentence + 100 × complex-word share), "
            "where complex words have many syllables (lower → easier). "
            "**Coleman-Liau** grades difficulty from average letters per 100 words and sentences per 100 words (lower → easier). "
            "IFEval exclude is the fraction of 'exclude' constraints satisfied by the simplified text (higher is better)."
        ),
        "metrics": [
            ("metrics", "flesch_reading_ease_orth", "Flesch RE", False, True),
            ("metrics", "gunning_fog_orth", "Gunning Fog", True, True),
            ("metrics", "coleman_liau_orth", "Coleman-Liau", True, True),
            ("ifeval", "avg_exclude", "IFEval exclude", False, True),
        ],
    },
    {
        "name": "Lexical Difficulty",
        "in_rrf": True,
        "rrf_weight": 1,
        "description": (
            "Word-level difficulty markers - **orth** variants where available. "
            "Δ% = percentage change. Negative Δ% indicates reduced lexical difficulty. "
            "**Avg word syllables** is the mean number of syllables per word (higher → longer, harder vocabulary). "
            "**Difficult word ratio** is the share of long words (above a syllable threshold on the surface form, excluding named entities) "
            "(higher → harder). "
            "**Verb ratio** is verbs divided by alphabetic tokens; a more verbal, less nominal style is easier, so higher is better here. "
            "**Difficult noun ratio** is the share of long nouns (above the syllable threshold, excluding named entities) among all words "
            "(higher → more complex nominal vocabulary)."
        ),
        "metrics": [
            ("markers", "avg_word_syllables", "Avg word syllables", True, True),
            ("markers", "difficult_word_ratio_orth", "Difficult word ratio", True, True),
            ("markers", "verb_ratio", "Verb ratio", False, True),
            ("markers", "difficult_noun_ratio_orth", "Difficult noun ratio", True, True),
        ],
    },
    {
        "name": "Syntactic",
        "in_rrf": True,
        "rrf_weight": 1,
        "description": (
            "Sentence and clause structure complexity markers. "
            "Δ% = percentage change. Negative Δ% generally indicates simpler syntax. "
            "**Avg sentence length** is the mean number of words per sentence (higher → harder). "
            "**Mean dep. distance** is the average linear distance between a token and its syntactic head, in tokens (lower → simpler structure). "
            "**Subordination index** is the ratio of subordinate clauses to total clauses (higher → more embedded structure, harder)."
        ),
        "metrics": [
            ("markers", "avg_sentence_length", "Avg sentence length", True, True),
            # Listed for reference only: excluded from the category RRF.
            ("markers", "sentence_length_variance", "Sentence length var.", True, False),
            ("markers", "mean_dependency_distance", "Mean dep. distance", True, True),
            ("markers", "subordination_index", "Subordination index", True, True),
        ],
    },
    {
        "name": "Morphological",
        "in_rrf": True,
        "rrf_weight": 1,
        "description": (
            "Polish-specific morphological complexity markers. "
            "Δ% = percentage change. Negative Δ% indicates reduced morphological complexity. "
            "**Adverbial participle ratio** is the share of adverbial participles (imiesłowy przysłówkowe / converbs, e.g. *czytając*, *przeczytawszy*) "
            "among alphabetic tokens - a bookish, formal construction (higher → more complex). "
            "**Gerund ratio** is the share of Polish verbal nouns (rzeczowniki odsłowne, e.g. *czytanie*) among words "
            "(higher → more nominalised, formal). "
            "**Impersonal verb ratio** is the share of impersonal verb forms among all verbs - impersonal modals (*należy*, *trzeba*, *można*), "
            "passive -no/-to forms (*zrobiono*), reflexive-impersonal *się* (*mówi się*) and infinitives - typical of legal/administrative Polish "
            "(higher → more impersonal, harder). "
            "**Genitive noun ratio** is the share of genitive-case nouns among words - a hallmark of formal, bureaucratic Polish "
            "(higher → harder). "
            "**Avg genitive chain** is the mean length of runs of two or more consecutive genitive nouns (dopełniacz spiętrzony) "
            "(higher → more genitive stacking, harder). "
            "**Verbo-nominal ratio** is the share of light-verb + noun periphrases (*dokonać wpłaty*, *podjąć decyzję*) - a hallmark of "
            "administrative Polish (higher → harder). "
            "**OSC noun ratio** is the share of abstract *-ość* nouns (*możliwość*, *konieczność*) among nouns (higher → more abstract, harder)."
        ),
        "metrics": [
            # Listed for reference only: excluded from the category RRF.
            ("markers", "participle_ratio", "Participle ratio", True, False),
            ("markers", "adverbial_participle_ratio", "Adverbial participle ratio", True, True),
            ("markers", "gerund_ratio", "Gerund ratio", True, True),
            ("markers", "impersonal_verb_ratio", "Impersonal verb ratio", True, True),
            ("markers", "genitive_noun_ratio", "Genitive noun ratio", True, True),
            ("markers", "avg_genitive_chain_length", "Avg genitive chain", True, True),
            ("markers", "verbo_nominal_ratio", "Verbo-nominal ratio", True, True),
            ("markers", "osc_noun_ratio", "OSC noun ratio", True, True),
        ],
    },
    {
        "name": "Meaning Preservation",
        "in_rrf": True,
        # Weighted four times as heavily as each other category in the final RRF.
        "rrf_weight": 4,
        "description": (
            "Semantic metrics that directly test whether the simplified text says the same thing as the original. "
            "NLI checks bidirectional entailment; QuestEval checks information preservation via QA. "
            "NE Retention measures what fraction of named entities from the original appear in the simplified text "
            "(matched on lemmatised tokens and surface forms, so inflected forms and full-name → surname shortenings are handled). "
            "IFEval include is the fraction of 'include' constraints satisfied by the simplified text. "
            "Higher is better for all."
        ),
        "metrics": [
            ("similarity", "nli_f1", "NLI F1", False, True),
            ("questeval", "f1", "QuestEval F1", False, True),
            ("similarity", "ne_retention", "NE Retention", False, True),
            ("ifeval", "avg_include", "IFEval include", False, True),
        ],
    },
]
def _col_name(source: str, label: str) -> str:
"""Column name used in category DataFrames."""
if source in ("metrics", "markers"):
return f"{label} (Δ%)"
return label
def _model_label(data: dict) -> str:
"""Return a unique display name, appending reasoning effort when present.
The parameter size is shown separately (see :func:`_params_str`), in its
own column, mirroring the PLCC leaderboard layout.
"""
model = data["metadata"]["model"]
effort = (
data["metadata"]
.get("model_kwargs", {})
.get("extra_body", {})
.get("reasoning", {})
.get("effort")
)
if effort is not None:
return f"{model} [reasoning: {effort}]"
return model
def _params_str(params: float | None) -> str | None:
"""PLCC-style parameter label: ``1.6T`` / ``685B`` / ``8B`` (None if unknown)."""
p = params or 0
if p <= 0:
return None
return f"{p / 1000:g}T" if p >= 1000 else f"{p:g}B"
def _params_map() -> dict[str, str]:
    """Build the model label → formatted parameter size lookup.

    Sizes come from each record's ``metadata["total_params_b"]``; models whose
    size is unknown are left out of the mapping.
    """
    return {
        _model_label(record): size_label
        for record in load_records()
        if (size_label := _params_str(record["metadata"].get("total_params_b")))
    }
def _metric_row(
label_map: dict,
summary_metrics: dict,
row: dict,
detail_row: dict,
*,
include_detail: bool = True,
) -> None:
"""Populate leaderboard row and detail row from a label→key map."""
for key, label in label_map.items():
vals = summary_metrics.get(key, {})
row[f"{label} (Δ)"] = vals.get("avg_diff")
row[f"{label} (Δ%)"] = vals.get("avg_diff_pct")
if include_detail:
detail_row[f"{label} before"] = vals.get("avg_before")
detail_row[f"{label} after"] = vals.get("avg_after")
detail_row[f"{label} (Δ)"] = vals.get("avg_diff")
detail_row[f"{label} (Δ%)"] = vals.get("avg_diff_pct")
def load_leaderboard_data() -> tuple[pd.DataFrame, ...]:
    """Assemble the per-family leaderboard DataFrames from the cached records.

    Every frame starts with ``Model`` and ``N`` columns; numeric columns are
    rounded to 4 decimals. When ``DATA_DIR`` does not exist, eight references
    to one empty DataFrame are returned.

    NOTE(review): the orth and lemma label maps use the same display labels
    (e.g. ``"Flesch RE"``, ``"TTR"``), so in the shared detail row the lemma
    values written later overwrite the orth values written first. Confirm
    whether the detail frame is meant to carry both variants.

    Returns:
        (readability_orth_df, readability_lemma_df,
         lexical_orth_df, lexical_lemma_df,
         similarity_df, questeval_df, markers_df, detail_df)
    """
    if not DATA_DIR.exists():
        empty = pd.DataFrame()
        return (empty,) * 8

    # Output tables, in return order.
    table_names = (
        "read_orth",
        "read_lemma",
        "lex_orth",
        "lex_lemma",
        "similarity",
        "questeval",
        "markers",
        "detail",
    )
    # Before/after-style metric families that live under ``summary["metrics"]``.
    metric_families = (
        ("read_orth", READABILITY_ORTH_LABELS),
        ("read_lemma", READABILITY_LEMMA_LABELS),
        ("lex_orth", LEXICAL_ORTH_LABELS),
        ("lex_lemma", LEXICAL_LEMMA_LABELS),
    )
    collected: dict[str, list[dict]] = {name: [] for name in table_names}

    for record in load_records():
        model = _model_label(record)
        summary = record["summary"]
        identity = {"Model": model, "N": summary["n"]}
        metrics = summary["metrics"]
        similarity = summary.get("similarity", {})
        questeval = summary.get("questeval", {})
        markers = summary.get("markers", {})

        rows = {name: dict(identity) for name in table_names}
        detail_row = rows["detail"]

        for table, label_map in metric_families:
            _metric_row(label_map, metrics, rows[table], detail_row)

        # Similarity / QuestEval values are stored directly, one per key.
        rows["similarity"].update(
            (label, similarity.get(key)) for key, label in SIMILARITY_LABELS.items()
        )
        rows["questeval"].update(
            (label, questeval.get(key)) for key, label in QUESTEVAL_LABELS.items()
        )

        # Markers have the same before/after/diff layout as the metrics.
        _metric_row(MARKER_LABELS, markers, rows["markers"], detail_row)

        for name in table_names:
            collected[name].append(rows[name])

    frames = [pd.DataFrame(collected[name]) for name in table_names]
    for frame in frames:
        numeric = frame.select_dtypes(include="number").columns
        frame[numeric] = frame[numeric].round(4)
    return tuple(frames)
@lru_cache(maxsize=1)
def _load_ifeval_records() -> tuple[tuple[str, tuple[tuple, ...]], ...]:
    """Collect, once, each model's records scored by both IFEval variants.

    Manual IFEval rules are hand-written for a subset of the prompts, so a
    manual-vs-automatic comparison is only meaningful on records that carry
    *both* scores. This re-reads the per-text ``results`` arrays (which
    ``load_records`` throws away) and keeps, for every model with at least one
    such record, tuples of ``(category, prompt_id, auto_include,
    auto_exclude, man_include, man_exclude)``, so that the dropdown filters
    can re-aggregate without touching the disk again.

    Returns:
        ``(model_label, records)`` pairs in sorted file-path order; an empty
        tuple when ``DATA_DIR`` does not exist.
    """
    if not DATA_DIR.exists():
        return ()

    per_model: list[tuple[str, tuple[tuple, ...]]] = []
    for path in sorted(DATA_DIR.glob("*_scored_anon.json")):
        with open(path, encoding="utf-8") as handle:
            payload = json.load(handle)
        label = _model_label(payload)

        matched: list[tuple] = []
        for item in payload["results"]:
            manual, auto = item.get("ifeval_manual"), item.get("ifeval")
            if manual and auto:
                matched.append(
                    (
                        item.get("category"),
                        item.get("prompt_id"),
                        auto.get("include"),
                        auto.get("exclude"),
                        manual.get("include"),
                        manual.get("exclude"),
                    )
                )

        # Models without any doubly-scored record are left out entirely.
        if matched:
            per_model.append((label, tuple(matched)))
    return tuple(per_model)
def load_ifeval_comparison_df(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> pd.DataFrame:
    """Contrast manual (gold) IFEval with automatic IFEval, one row per model.

    The matched columns are computed only from records that carry *both* an
    automatic and a manual score - the very same texts scored both ways,
    which isolates the rule-quality gap from sampling differences (the
    overall ``ifeval`` summary averages over ~5× more texts and so is not
    directly comparable). ``Δ`` columns are manual − automatic: a negative
    value means the automatic constraints were easier to satisfy than the
    hand-checked ones, i.e. the automatic rules are more lenient.

    The ``(all)`` columns instead take the automatic score from the summary
    bucket selected by the same category/prompt filters.

    ``text_category`` and ``prompt`` (each ignored when ``None`` or ``"All"``)
    restrict the matched records to one source-text category and/or one
    simplification prompt, mirroring the RRF dropdown filters. ``size_limit``
    and ``model_type`` restrict which models appear at all.

    Returns:
        A DataFrame sorted by ``Model`` with numeric columns rounded to 4
        decimals, or an empty DataFrame when no model has matching records.
    """
    category_filter = text_category if text_category not in (None, "All") else None
    prompt_filter = prompt if prompt not in (None, "All") else None

    # Models passing the size / model-type filters, and their summaries.
    permitted = {
        _model_label(rec) for rec in _filtered_records(size_limit, model_type)
    }
    summary_by_model: dict[str, dict] = {}
    for rec in load_records():
        name = _model_label(rec)
        if name in permitted:
            summary_by_model[name] = rec["summary"]

    def _mean(total: float, count: int) -> float | None:
        return total / count if count else None

    table: list[dict] = []
    for model, matched in _load_ifeval_records():
        if model not in permitted:
            continue

        sum_auto_inc = sum_man_inc = sum_auto_exc = sum_man_exc = 0.0
        n_inc = n_exc = 0
        for cat, prompt_id, a_inc, a_exc, m_inc, m_exc in matched:
            wrong_category = category_filter and cat != category_filter
            wrong_prompt = prompt_filter and prompt_id != prompt_filter
            if wrong_category or wrong_prompt:
                continue
            # Include / exclude are paired independently of each other.
            if m_inc is not None and a_inc is not None:
                sum_auto_inc += a_inc
                sum_man_inc += m_inc
                n_inc += 1
            if m_exc is not None and a_exc is not None:
                sum_auto_exc += a_exc
                sum_man_exc += m_exc
                n_exc += 1

        if not (n_inc or n_exc):
            continue

        auto_inc = _mean(sum_auto_inc, n_inc)
        man_inc = _mean(sum_man_inc, n_inc)
        auto_exc = _mean(sum_auto_exc, n_exc)
        man_exc = _mean(sum_man_exc, n_exc)

        overall = _source_bucket(
            summary_by_model.get(model, {}), "ifeval", category_filter, prompt_filter
        )
        all_inc = overall.get("avg_include")
        all_exc = overall.get("avg_exclude")

        if n_inc:
            delta_inc = man_inc - auto_inc
            delta_inc_all = (man_inc - all_inc) if all_inc is not None else None
        else:
            delta_inc = delta_inc_all = None
        if n_exc:
            delta_exc = man_exc - auto_exc
            delta_exc_all = (man_exc - all_exc) if all_exc is not None else None
        else:
            delta_exc = delta_exc_all = None

        table.append(
            {
                "Model": model,
                "N": n_inc or n_exc,
                "Manual include": man_inc,
                "Manual exclude": man_exc,
                "Auto include": auto_inc,
                "Auto include (all)": all_inc,
                "Δ include (man−auto)": delta_inc,
                "Δ include (man−auto all)": delta_inc_all,
                "Auto exclude": auto_exc,
                "Auto exclude (all)": all_exc,
                "Δ exclude (man−auto)": delta_exc,
                "Δ exclude (man−auto all)": delta_exc_all,
            }
        )

    frame = pd.DataFrame(table)
    if frame.empty:
        return frame
    frame = frame.sort_values("Model").reset_index(drop=True)
    numeric = frame.select_dtypes(include="number").columns
    frame[numeric] = frame[numeric].round(4)
    return frame
def text_category_choices() -> list[str]:
    """List ``"All"`` followed by every source-text category, sorted.

    Categories are the keys found under ``summary["metrics_by_category"]``
    across all loaded records.
    """
    seen = {
        name
        for record in load_records()
        for name in record["summary"].get("metrics_by_category", {})
    }
    return ["All", *sorted(seen)]
def prompt_choices() -> list[str]:
    """List ``"All"`` followed by every simplification prompt, sorted.

    Prompts are the keys found under ``summary["metrics_by_prompt"]`` across
    all loaded records.
    """
    seen = {
        name
        for record in load_records()
        for name in record["summary"].get("metrics_by_prompt", {})
    }
    return ["All", *sorted(seen)]
def _source_bucket(s: dict, source: str, tc: str | None, prompt: str | None) -> dict:
"""Return the metric bucket for one source, filtered by text category and/or prompt.
Picks the overall summary when neither filter is set, the ``*_by_category`` /
``*_by_prompt`` bucket when one is set, and the ``*_by_category_prompt`` bucket
(keyed ``"CATEGORY/PROMPT"``) when both are set.
"""
if source in ("metrics", "markers", "similarity"):
if tc and prompt:
return s.get(f"{source}_by_category_prompt", {}).get(f"{tc}/{prompt}", {})
if tc:
return s.get(f"{source}_by_category", {}).get(tc, {})
if prompt:
return s.get(f"{source}_by_prompt", {}).get(prompt, {})
return s.get(source, {})
# questeval / ifeval keep their per-filter buckets nested under the source object
src = s.get(source, {})
if tc and prompt:
return src.get("by_category_prompt", {}).get(f"{tc}/{prompt}", {})
if tc:
return src.get("by_category", {}).get(tc, {})
if prompt:
return src.get("by_prompt", {}).get(prompt, {})
return src
def _bucket_n(s: dict, tc: str | None, prompt: str | None) -> int | None:
    """Look up the sample count for the selected category/prompt filters.

    The ``questeval`` bucket is consulted first, then ``ifeval``; the first
    one that carries an ``"n"`` entry wins. ``None`` when neither does.
    """
    counts = (
        _source_bucket(s, name, tc, prompt).get("n")
        for name in ("questeval", "ifeval")
    )
    return next((count for count in counts if count is not None), None)
def load_category_df(
    category: dict,
    text_category: str | None = None,
    prompt: str | None = None,
) -> pd.DataFrame:
    """Tabulate one metric category and score every model with a category RRF.

    ``text_category`` and ``prompt`` (each ignored when ``None`` or ``"All"``)
    narrow the metrics to one source-text category and/or one simplification
    prompt through the matching ``*_by_category`` / ``*_by_prompt`` /
    ``*_by_category_prompt`` buckets; without them the overall summary is
    used.

    Only metrics flagged ``in_rrf`` are included. Values are rounded to 4
    decimals *before* ranking, and each metric contributes
    ``1 / (RRF_K + rank)`` to the score. Missing values rank last, and a
    metric that is missing for every model is skipped.

    The RRF is always computed over **all** models; the size-limit /
    model-type filters are applied afterwards (in ``load_rrf_views``) as pure
    row filters, so they never change a model's rank or score.

    Returns:
        A frame with columns ``Rank, Model, Params, N, RRF Score`` followed by
        the metric columns, best score first; an empty frame without records.
    """
    category_filter = text_category if text_category not in (None, "All") else None
    prompt_filter = prompt if prompt not in (None, "All") else None
    filtered = bool(category_filter or prompt_filter)

    # (source, key, column name, ascending) for each metric that is scored.
    scored = [
        (source, key, _col_name(source, label), ascending)
        for source, key, label, ascending, in_rrf in category["metrics"]
        if in_rrf
    ]

    table: list[dict] = []
    for record in load_records():
        summary = record["summary"]
        model = _model_label(record)
        if filtered:
            # Fall back to the overall count when the bucket has none.
            n = _bucket_n(summary, category_filter, prompt_filter) or summary["n"]
        else:
            n = summary["n"]

        entry: dict = {"Model": model, "N": n}
        for source, key, column, _ascending in scored:
            bucket = _source_bucket(summary, source, category_filter, prompt_filter)
            if source in ("metrics", "markers"):
                entry[column] = bucket.get(key, {}).get("avg_diff_pct")
            else:  # similarity, questeval, ifeval store the value directly
                entry[column] = bucket.get(key)
        table.append(entry)

    frame = pd.DataFrame(table)
    if frame.empty:
        return frame

    numeric = frame.select_dtypes(include="number").columns
    frame[numeric] = frame[numeric].round(4)

    score = pd.Series(0.0, index=frame.index)
    for _source, _key, column, ascending in scored:
        if column not in frame.columns or frame[column].isna().all():
            continue
        ranks = frame[column].rank(
            ascending=ascending, method="min", na_option="bottom"
        )
        score += 1.0 / (RRF_K + ranks)

    frame.insert(2, "RRF Score", score.round(4))
    frame = frame.sort_values("RRF Score", ascending=False).reset_index(drop=True)
    frame.insert(0, "Rank", range(1, len(frame) + 1))
    frame.insert(2, "Params", frame["Model"].map(_params_map()).fillna(""))
    return frame
def _plcc_overall_map() -> dict[str, float]:
    """Build the model label → external PLCC overall score lookup.

    PLCC is a separate Polish-language-competence benchmark (sdadas/plcc); the
    score is taken as-is from ``metadata.plcc.overall`` and is shown for
    reference only - it does not feed the RRF ranking. Models without a PLCC
    entry are omitted (and so map to NaN in the table).
    """
    return {
        _model_label(record): overall
        for record in load_records()
        if (overall := (record["metadata"].get("plcc") or {}).get("overall"))
        is not None
    }
def build_final_ranking_df(category_data: list[tuple[dict, pd.DataFrame]]) -> pd.DataFrame:
    """Fuse per-category RRF scores into a final ranking via RRF.

    Each category column shows the model's **rank within that category**
    (1 = best); those ranks are what the RRF fusion uses to produce the overall
    ``Final RRF``. A reference ``PLCC`` column carries the external PLCC
    benchmark score and does not influence the ranking.

    Args:
        category_data: ``(category, table)`` pairs. Categories flagged with
            ``in_rrf=False`` and empty tables are ignored; every remaining
            table must provide ``Model`` and ``RRF Score`` columns.

    Returns:
        A frame with the columns ``Rank``, ``Model``, ``Params``, ``N``,
        ``Final RRF``, ``PLCC`` followed by one rank column per fused category,
        best model first. An empty frame when no category contributes.
    """
    merged: pd.DataFrame | None = None
    for cat, cat_df in category_data:
        if not cat.get("in_rrf", True) or cat_df.empty:
            continue
        sub = cat_df[["Model", "RRF Score"]].rename(columns={"RRF Score": cat["name"]})
        # Outer join: a model missing from a category keeps NaN there and is
        # ranked last for that category below.
        merged = sub if merged is None else merged.merge(sub, on="Model", how="outer")
    if merged is None or merged.empty:
        return pd.DataFrame()

    # N (sample count) is identical across categories for a given model, so take
    # it from whichever category table carries it.
    n_map: dict = {}
    for _cat, cat_df in category_data:
        if not cat_df.empty and {"Model", "N"} <= set(cat_df.columns):
            n_map = dict(zip(cat_df["Model"], cat_df["N"]))
            break

    score_cols = [c for c in merged.columns if c != "Model"]
    weights = {cat["name"]: cat.get("rrf_weight", 1) for cat in CATEGORIES}

    out = merged[["Model"]].copy()
    rrf = pd.Series(0.0, index=merged.index)
    rank_cols: dict[str, pd.Series] = {}
    for col in score_cols:
        # Higher category score = better, so rank descending; NaN goes last.
        ranks = merged[col].rank(ascending=False, method="min", na_option="bottom").astype(int)
        rrf += weights.get(col, 1) / (RRF_K + ranks)
        rank_cols[col] = ranks

    out.insert(1, "Final RRF", rrf.round(4))
    out.insert(2, "PLCC", out["Model"].map(_plcc_overall_map()).round(2))
    for name, ranks in rank_cols.items():
        out[name] = ranks

    # Order by the *unrounded* fused score. The 4-decimal display value can tie
    # models whose real scores differ (e.g. 1/61 + 1/63 vs 2/62), and sorting on
    # it with an unstable sort made their relative order - and therefore their
    # ``Rank`` - arbitrary. Rounding to 12 places only absorbs float summation
    # noise; the stable sort keeps exact ties in merge order.
    positions = (-rrf.round(12)).to_numpy().argsort(kind="stable")
    out = out.iloc[positions].reset_index(drop=True)

    out.insert(0, "Rank", range(1, len(out) + 1))
    out.insert(2, "Params", out["Model"].map(_params_map()).fillna(""))
    out.insert(3, "N", out["Model"].map(n_map).astype("Int64"))
    return out
def _fog_tradeoff_scatter(
    y_source: str,
    y_key: str,
    y_label: str,
    title: str,
    marker_color: str,
    text_category: str | None,
    prompt: str | None,
    size_limit: str | None,
    model_type: str | None,
) -> go.Figure | None:
    """Shared builder for the "Gunning Fog Δ% vs <score>" scatter plots.

    X is always the Gunning Fog orth Δ% taken from the ``metrics`` bucket; Y is
    ``y_key`` read from the ``y_source`` bucket. One labelled point is drawn per
    model; models lacking either value are skipped.

    Args:
        y_source: Summary source holding the Y value (e.g. ``"questeval"``).
        y_key: Key of the Y value inside that source's bucket.
        y_label: Human-readable Y name used in the hover text and axis title.
        title: Figure title.
        marker_color: Fill colour of the markers.
        text_category: Text-category filter; ``None`` or ``"All"`` = no filter.
        prompt: Prompt filter; ``None`` or ``"All"`` = no filter.
        size_limit: Passed through to ``_filtered_records``.
        model_type: Passed through to ``_filtered_records``.

    Returns:
        The figure, or ``None`` when no model has both values.
    """
    tc = None if text_category in (None, "All") else text_category
    pr = None if prompt in (None, "All") else prompt

    points = []
    for data in _filtered_records(size_limit, model_type):
        s = data["summary"]
        model = _model_label(data)
        x = _source_bucket(s, "metrics", tc, pr).get("gunning_fog_orth", {}).get("avg_diff_pct")
        y = _source_bucket(s, y_source, tc, pr).get(y_key)
        if x is None or y is None:
            continue
        points.append((model, x, y))
    if not points:
        return None

    models, xs, ys = zip(*points)
    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=xs,
            y=ys,
            mode="markers+text",
            text=models,
            textposition="top center",
            textfont={"size": 10},
            marker={"size": 12, "color": marker_color, "line": {"width": 1, "color": "white"}},
            # Plain concatenation keeps Plotly's ``%{...}`` placeholders literal.
            hovertemplate=(
                "<b>%{text}</b><br>Gunning Fog Δ%: %{x:.2f}<br>"
                + y_label
                + ": %{y:.3f}<extra></extra>"
            ),
        )
    )

    # Dotted guides through the midpoint of the observed ranges split the plot
    # into four quadrants.
    x_mid = (min(xs) + max(xs)) / 2
    y_mid = (min(ys) + max(ys)) / 2
    fig.add_hline(y=y_mid, line_dash="dot", line_color="lightgray")
    fig.add_vline(x=x_mid, line_dash="dot", line_color="lightgray")

    fig.update_layout(
        title=title,
        xaxis_title="Gunning Fog orth Δ% (← easier text)",
        yaxis_title=y_label + " (↑ meaning preserved)",
        height=560,
        margin={"l": 60, "r": 40, "t": 60, "b": 60},
        plot_bgcolor="white",
    )
    fig.update_xaxes(showgrid=True, gridcolor="#EEE", zeroline=True, zerolinecolor="#CCC")
    fig.update_yaxes(showgrid=True, gridcolor="#EEE")
    return fig


def build_tradeoff_scatter(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> go.Figure | None:
    """Scatter of Gunning Fog reduction vs meaning preservation, one point per model.

    X: Gunning Fog orth Δ% (more negative = greater complexity reduction)
    Y: QuestEval F1 (higher = better meaning preservation)

    Honours the same text-category / prompt / size / model-type filters as the
    RRF rankings. Returns ``None`` when no model has both values.
    """
    return _fog_tradeoff_scatter(
        y_source="questeval",
        y_key="f1",
        y_label="QuestEval F1",
        title="Complexity reduction vs meaning preservation",
        marker_color="#4C78A8",
        text_category=text_category,
        prompt=prompt,
        size_limit=size_limit,
        model_type=model_type,
    )


def build_fog_nli_scatter(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> go.Figure | None:
    """Scatter of Gunning Fog reduction vs NLI F1, one point per model.

    X: Gunning Fog orth Δ% (more negative = greater complexity reduction)
    Y: NLI F1 (higher = stronger entailment / meaning preserved)

    Honours the same text-category / prompt / size / model-type filters as the
    RRF rankings. Returns ``None`` when no model has both values.
    """
    return _fog_tradeoff_scatter(
        y_source="similarity",
        y_key="nli_f1",
        y_label="NLI F1",
        title="Complexity reduction vs NLI consistency",
        marker_color="#E45756",
        text_category=text_category,
        prompt=prompt,
        size_limit=size_limit,
        model_type=model_type,
    )
# Landing-page introduction; ``build_app`` renders it as Markdown at the very
# top of the page.
INTRO = """\
# PLainBench - Polish Text Simplification Leaderboard
This benchmark evaluates how well LLMs simplify difficult Polish texts -
drawn from legal/administrative (BIP/GOV), finance, and science domains - while
preserving the original meaning. Each model simplifies 210 source texts under
5 simplification prompts (1050 outputs per model). Outputs are scored on
readability indices, fine-grained difficulty markers (lexical, syntactic,
morphological), meaning preservation (NLI entailment, QuestEval QA consistency,
named-entity retention), and instruction following (IFEval include/exclude).
The per-category scores are fused into an overall **Final RRF** ranking.
"""
# Metric reference (readability indices, difficulty markers, similarity,
# QuestEval and IFEval). ``build_app`` renders it as Markdown below the result
# tabs.
METRICS_DOC = """\
## Metrics
### Readability indices
All indices are adapted for Polish syllable counting via `pyhyphen` (pl_PL
dictionary) and counted on surface (orthographic) word forms.
Δ is the absolute change (after − before); Δ% is the average percentage change
from the original text to the simplified text.
| Metric | Formula | Interpretation |
|---|---|---|
| **Flesch Reading Ease** | `206.835 − 1.015 × (words/sentences) − 84.6 × (syllables/words)` | Higher → easier text (0–100 typical range). **Desired Δ%: positive (+)** |
| **Gunning Fog** | `0.4 × [(words/sentences) + 100 × (complex_words/words)]` | School years needed (complex = ≥ 4 syllables). Lower → easier. **Desired Δ%: negative (−)** |
| **Coleman-Liau** | `0.0588 × L − 0.296 × S − 15.8` | Character-based grade level. Lower → easier. **Desired Δ%: negative (−)** |
### Difficulty markers
Fine-grained syntactic, morphological, and lexical features.
Δ is absolute change; Δ% is percentage change.
Difficult words are defined as not a named entity, ≥ 4 syllables, counted on the
surface (orthographic) form.
| Marker | Description | Desired Δ% |
|---|---|---|
| **Avg word syllables** | Mean syllable count per word | − (shorter words) |
| **Difficult word ratio (orth)** | Difficult words / all words (surface, excl. NEs) | − |
| **Difficult noun ratio (orth)** | Difficult nouns / all tokens (surface, excl. NEs) | − |
| **Verb ratio** | Verbs / all tokens | + (more verbal, less nominal) |
| **Avg sentence length** | Mean tokens per sentence | − (shorter sentences) |
| **Mean dep. distance** | Avg linear head-dependent distance (syntax complexity) | − (flatter syntax) |
| **Subordination index** | Subordinate clauses / total clauses | − |
| **Adverbial participle ratio** | Adverbial participles (converbs, e.g. *czytając*, *przeczytawszy*) / all tokens | − |
| **Gerund ratio** | Gerunds / all tokens | − |
| **Impersonal verb ratio** | Impersonal verb forms (modals *należy*/*trzeba*, -no/-to passives, reflexive *się*, infinitives) / all verbs | − |
| **Genitive noun ratio** | Nouns in genitive case / all tokens | − |
| **Avg genitive chain** | Mean length of consecutive genitive noun phrases | − |
| **Verbo-nominal ratio** | Light-verb + noun periphrases (*dokonać wpłaty*, *podjąć decyzję*); administrative style | − |
| **OSC noun ratio** | Abstract *-ość* nouns (*możliwość*, *konieczność*) / all nouns | − |
### Similarity metrics
Reference-based metrics comparing simplified text against the original.
| Metric | Description | Direction |
|---|---|---|
| **NLI P / R / F1** | NLI consistency via stella embeddings + mDeBERTa cross-encoder | Higher = stronger entailment |
| **NE Retention** | Fraction of named entities from the original kept in the simplified text | Higher = more entities preserved |
*Only **NLI F1** feeds the RRF score; P and R are shown for context.*
### QuestEval - QA consistency
| Metric | Description | Direction |
|---|---|---|
| **QuestEval P** | Backward precision - grounding of simplified claims | Higher = fewer hallucinations |
| **QuestEval R** | Forward recall - information preserved | Higher = less content dropped |
| **QuestEval F1** | Harmonic mean of P and R | Higher = better meaning preservation |
| **Answerable (fwd)** | Fraction of forward questions answerable | Higher = stays on-topic |
| **Answerable (bwd)** | Fraction of backward questions answerable | Higher = claims traceable to original |
*Only **QuestEval F1** feeds the RRF score; the other rows are shown for context.*
### IFEval - instruction following
| Metric | Description | Direction |
|---|---|---|
| **IFEval include** | Fraction of *include* constraints (terms the simplification must keep) satisfied | Higher = better |
| **IFEval exclude** | Fraction of *exclude* constraints (terms the simplification must avoid) satisfied | Higher = better |
"""
# Sample-count note shown under each table that carries an ``N`` column
# (``build_app`` places it below the final-ranking table and below every RRF
# category table).
N_NOTE = "**N** = number of prompt × text evaluations per model."
# The five simplification prompts every model is run with. The keys match the
# "Simplification prompt" filter values (and the ``*_by_prompt`` summary
# buckets); each value is ``(short description, user-message template)``, where
# ``<text>`` marks where the source text is inserted. Kept in sync with
# generation/prompting/instruction.py. Ordered from least to most detailed.
# ``build_app`` renders one collapsed accordion per entry, in this insertion
# order, titled ``"<key> - <short description>"`` with the template shown in a
# fenced code block. The templates are runtime text - do not translate or
# reformat them here.
PROMPTS: dict[str, tuple[str, str]] = {
"mini": (
"Minimal - a single-line instruction, no rules.",
"Uprość podany tekst, w odpowiedzi podaj jedynie uproszczony tekst, "
"bez dodatkowych komentarzy. Tekst do uproszczenia:\n\n<text>",
),
"compact": (
"Compact - a short bulleted rule set.",
"""Uprość poniższy tekst, tak aby był jasny, krótki i zrozumiały dla osoby bez wiedzy specjalistycznej.
Zasady:
- Skup się na najważniejszych informacjach, usuń zbędne treści.
- Uporządkuj tekst: najważniejsze na początku, podziel na krótkie akapity.
- Używaj prostego, codziennego słownictwa; trudne pojęcia zastępuj lub krótko wyjaśniaj.
- Twórz krótkie zdania (jedna myśl = jedno zdanie).
- Pisz bezpośrednio do odbiorcy i używaj strony czynnej.
- Unikaj żargonu, urzędowego stylu, skomplikowanych konstrukcji i podwójnych przeczeń.
- Zachowaj poprawność językową i logiczną spójność.
- W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.
---
### Tekst do uproszczenia:
<text>""",
),
"medium": (
"Medium - moderately detailed rules with sub-points.",
"""Uprość poniższy tekst tak, aby był zrozumiały dla osoby bez wiedzy specjalistycznej, zachowując jego sens.
### Zasady:
- Skup się na celu tekstu i najważniejszych informacjach — usuń dygresje i zbędne treści.
- Uporządkuj treść: zacznij od najważniejszych informacji, podziel tekst na krótkie akapity.
- Stosuj proste i naturalne słownictwo:
- zamieniaj trudne lub specjalistyczne wyrazy na prostsze,
- jeśli trzeba — krótko je wyjaśnij lub podaj przykład.
- Buduj krótkie zdania (jedna myśl = jedno zdanie, ok. 15–20 słów).
- Pisz bezpośrednio do odbiorcy i używaj strony czynnej.
- Unikaj:
- żargonu, stylu urzędowego i zapożyczeń,
- form bezosobowych i strony biernej (jeśli nie są konieczne),
- nadmiaru rzeczowników odczasownikowych,
- podwójnych przeczeń i zawiłych konstrukcji.
- Zachowaj poprawność językową, spójność i logiczny układ tekstu.
- W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.
---
### Tekst do uproszczenia:
<text>""",
),
"long": (
"Long - full, sectioned plain-language guidelines.",
"""Uprość poniższy tekst zgodnie z zasadami prostego języka.
### 1. Cel i odbiorca
- Dostosuj tekst do przeciętnego odbiorcy (zakładaj brak wiedzy specjalistycznej).
- Skup się na najważniejszych informacjach.
### 2. Struktura
- Usuń informacje zbędne i poboczne.
- Uporządkuj treść: najważniejsze informacje podaj na początku.
- Podziel tekst na krótkie akapity (1 akapit = 1 myśl).
- Jeśli tekst jest dłuższy, użyj nagłówków lub list.
### 3. Słownictwo
- Zastępuj trudne słowa prostszymi.
- Unikaj:
- terminów specjalistycznych (chyba że je wyjaśnisz),
- słów rzadkich, książkowych i urzędowych,
- zapożyczeń i modnych zwrotów,
- skrótów niezrozumiałych dla odbiorcy.
- W razie potrzeby:
- wyjaśnij trudne pojęcia,
- podaj przykłady,
- używaj konkretnych nazw zamiast ogólników.
### 4. Składnia
- Twórz krótkie zdania (ok. 20 słów).
- Jedno zdanie = jedna myśl.
- Używaj zdań twierdzących.
- Pisz bezpośrednio do odbiorcy (np. „Ty”, „możesz”, „zrób”).
- Używaj strony czynnej zamiast biernej.
- Unikaj form bezosobowych i skomplikowanych konstrukcji.
- Ogranicz rzeczowniki odczasownikowe (zamieniaj je na czasowniki).
### 5. Styl
- Unikaj podwójnych przeczeń.
- Upraszczaj złożone konstrukcje.
- Zachowaj naturalny, jasny ton.
### 6. Końcowa kontrola
- Sprawdź, czy tekst jest:
- zrozumiały,
- poprawny językowo,
- logiczny i spójny.
### 7. W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.
---
### Tekst do uproszczenia:
<text>""",
),
"step_by_step": (
"Step by step - role-based, numbered editorial guidelines.",
"""Jesteś redaktorem odpowiedzialnym za wydawanie tekstów pisanych prostym językiem. Krok po kroku upraszczaj tekst kierując się poniższymi regułami:
1. Zachowaj prostotę - unikaj skomplikowanych i trudnych słów oraz zdań.
2. Używaj zrozumiałego języka - pamiętaj, że tekst ma być czytelny dla szerokiego grona odbiorców, nie tylko dla osób z wiedzą specjalistyczną z danej dziedziny.
3. Unikaj języka żargonowego czy specjalistycznego, jeśli czujesz, że nie jest on zrozumiały dla większości czytelników.
4. Unikaj zbytnio skomplikowanych zwrotów - stawiaj na klarowność i prostotę.
5. Utrzymuj spójność - trzymaj się jednego stylu pisania i konsekwentnie go stosuj.
6. Unikaj nadmiernego użycia skrótów czy akronimów – jeśli ich używasz, to upewnij się, że są one łatwe do zrozumienia dla wszystkich czytelników.
7. Przemyśl kolejność i strukturę informacji - rozmieszczenie treści powinno być logiczne i przemyślane.
8. Unikaj zbędnego wydłużania tekstu, pisz zwięźle i konkretnie.
9. W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.
---
### Tekst do uproszczenia:
<text>""",
),
}
# ── PLCC-inspired visual style ──────────────────────────────────────────────
# Mirrors the sdadas/plcc leaderboard: clean white background, a system
# sans-serif stack, a blue accent (#2c7be5), and flat tables with shaded
# (#f9fafd) headers, #ddd borders and faintly striped rows. Done entirely in
# CSS — a custom gr.themes.* would tint the component label chips blue, which
# is not part of the PLCC look.
#
# The stylesheet is handed to ``gr.Blocks(css=...)`` in ``build_app``; the
# ``plain-table``, ``params-col`` and ``filter-bar`` selectors match the
# ``elem_classes`` attached to components there. The ``nth-child(3)`` rules
# rely on ``Params`` being the third column (``Rank``, ``Model``, ``Params``)
# of the tables that carry the ``params-col`` class.
PLCC_CSS = """
.gradio-container {
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto,
"Helvetica Neue", Arial, sans-serif !important;
max-width: 1500px !important;
}
/* PLCC-style data tables */
.plain-table table { border: 1px solid #ddd !important; font-size: 0.9em; }
.plain-table thead th {
background: #f9fafd !important;
border-bottom: 2px solid #ddd !important;
color: #222 !important;
font-weight: 700 !important;
}
.plain-table tbody td { padding: 8px 10px !important; }
.plain-table tbody tr:nth-child(even) > td { background: rgba(0, 0, 0, 0.02) !important; }
.plain-table tbody tr:hover > td { background: rgba(44, 123, 229, 0.06) !important; }
/* Params column (3rd) — right-aligned, PLCC-style; cells muted grey, header black */
.params-col tbody td:nth-child(3),
.params-col thead th:nth-child(3) {
text-align: right !important;
white-space: nowrap;
}
.params-col tbody td:nth-child(3) { color: #999 !important; }
/* Filter bar — the grey rounded block holding the dropdowns */
.filter-bar {
background: #f9fafd;
border: 1px solid #ddd;
border-radius: 0.5rem;
padding: 10px 14px;
}
"""
# Colour palette for category bars. The first and sixth entries are the same
# hex values used for the markers of the two trade-off scatter plots.
# NOTE(review): not referenced in the part of the file reviewed here - confirm
# it is still used elsewhere.
_CAT_COLORS = ["#4C78A8", "#72B7B2", "#54A24B", "#EECA3B", "#B279A2", "#E45756", "#F58518"]
def _filter_model_rows(df: pd.DataFrame, allowed: set[str]) -> pd.DataFrame:
    """Drop every row whose ``Model`` is not in ``allowed``.

    This is a pure row filter: the surviving rows keep whatever rank and score
    values they already had, and only the index is renumbered from zero. A
    frame that is empty or has no ``Model`` column is handed back as-is (the
    same object, not a copy).
    """
    filterable = "Model" in df.columns and not df.empty
    if not filterable:
        return df
    keep_mask = df["Model"].isin(allowed)
    return df.loc[keep_mask].reset_index(drop=True)
def load_rrf_views(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> tuple[pd.DataFrame, list[tuple[dict, pd.DataFrame]]]:
    """Build the final ranking and the per-category tables for the given filters.

    Ranking happens in two stages. First every category table and the fused
    final table are computed over **all** models, with only the text-category
    and prompt filters applied. Then the size-limit and model-type selections
    hide rows from those finished tables. Nothing is re-ranked in the second
    stage, so a model that stays visible shows the rank it holds in the full
    table.

    Returns:
        ``(final_df, category_data)`` where ``category_data`` holds one
        ``(category, table)`` pair per entry of ``CATEGORIES``, in that order.
    """
    # Stage 1: rank over the full model set.
    unfiltered: list[tuple[dict, pd.DataFrame]] = []
    for category in CATEGORIES:
        unfiltered.append((category, load_category_df(category, text_category, prompt)))
    ranking = build_final_ranking_df(unfiltered)

    # Stage 2: hide the models excluded by the size / model-type filters.
    visible: set[str] = set()
    for record in _filtered_records(size_limit, model_type):
        visible.add(_model_label(record))

    shown_ranking = _filter_model_rows(ranking, visible)
    shown_categories = [
        (category, _filter_model_rows(table, visible)) for category, table in unfiltered
    ]
    return shown_ranking, shown_categories
def _tradeoff_figs(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> tuple[go.Figure, go.Figure]:
    """Return the QuestEval and NLI trade-off scatters for the given filters.

    Each builder's result is replaced by a blank ``go.Figure()`` when it is
    falsy (the builders return ``None`` when there is nothing to plot), so
    callers always receive two figures: ``(fog vs QuestEval F1, fog vs NLI F1)``.
    """
    filters = (text_category, prompt, size_limit, model_type)
    questeval_fig = build_tradeoff_scatter(*filters) or go.Figure()
    nli_fig = build_fog_nli_scatter(*filters) or go.Figure()
    return questeval_fig, nli_fig
def build_app() -> gr.Blocks:
"""Assemble the Gradio ``Blocks`` UI for the leaderboard.
Loads the leaderboard tables, the IFEval manual-vs-automatic comparison and
the unfiltered RRF views once, then lays out the intro, the filter bar, the
result tabs, the metric documentation and one accordion per simplification
prompt. When the orthographic readability table is empty, a "no data" notice
is rendered in place of the filters and tabs. A change to any of the four
filter dropdowns re-runs ``_refresh_rrf``, which refreshes every component
collected in ``rrf_outputs``.
Returns:
The constructed ``gr.Blocks`` app; it is not launched here.
"""
# Only ``read_orth_df`` (used as the "is there any data" probe) and
# ``detail_df`` are used below; the other tables are unpacked but unused in
# this function.
(
read_orth_df, read_lemma_df,
lex_orth_df, lex_lemma_df,
similarity_df, questeval_df,
markers_df, detail_df,
) = load_leaderboard_data()
ifeval_cmp_df = load_ifeval_comparison_df()
# Initial, unfiltered views: no text-category / prompt filter, and the
# size-limit / model-type arguments left at their ``None`` defaults.
final_df, category_data = load_rrf_views(None, None)
tc_choices = text_category_choices()
pr_choices = prompt_choices()
size_choices = _visible_size_limits()
tradeoff_fig, fog_nli_fig = _tradeoff_figs(None, None)
with gr.Blocks(title="PLainBench", css=PLCC_CSS) as app:
gr.Markdown(INTRO)
if read_orth_df.empty:
gr.Markdown("*No data found. Upload scored anon JSON files to the `data/current/` directory.*")
else:
# Reactive output components, gathered in the order the change
# handler returns them: final table, then one table per in-RRF
# category, then the two trade-off scatters (and the IFEval table).
rrf_outputs: list = []
with gr.Row(elem_classes=["filter-bar"]):
# Note the two different "no filter" sentinels: "All" for text
# category / prompt, "ALL" for size limit / model type.
# NOTE(review): "ALL" is presumably treated as "no filter" by
# ``_filtered_records`` - confirm there.
tc_dropdown = gr.Dropdown(
choices=tc_choices,
value="All",
label="Text category",
info="Filter the RRF rankings to one source-text category.",
)
pr_dropdown = gr.Dropdown(
choices=pr_choices,
value="All",
label="Simplification prompt",
info="Filter the RRF rankings to one simplification prompt.",
)
size_dropdown = gr.Dropdown(
choices=size_choices,
value="ALL",
label="Size limit",
info="Keep only models up to this many parameters.",
)
type_dropdown = gr.Dropdown(
choices=MODEL_TYPES,
value="ALL",
label="Model type",
info="Filter by open- vs closed-weights models.",
)
with gr.Tabs():
# ── Final Ranking ──────────────────────────────────────────
with gr.TabItem("Final Ranking"):
gr.Markdown(
"Final model ranking via **Reciprocal Rank Fusion** (k=60) over per-category RRF scores. "
"Each category ranks models by its own RRF score; those ranks are then fused into a "
"single **Final RRF** score. Higher = better overall simplification. "
"The **PLCC** column shows the model's score on the external "
"[PLCC](https://huggingface.co/spaces/sdadas/plcc) Polish-language-competence "
"benchmark for reference only - it does not affect the ranking (blank where unavailable)."
)
final_table = gr.Dataframe(
value=final_df, interactive=False, wrap=True,
elem_classes=["plain-table", "params-col"],
)
gr.Markdown(N_NOTE)
rrf_outputs += [final_table]
# ── RRF category tabs ──────────────────────────────────────
# Categories flagged ``in_rrf=False`` get no tab; ``_refresh_rrf``
# applies the same skip so outputs and updates stay aligned.
for cat, cat_df in category_data:
if not cat.get("in_rrf", True):
continue
with gr.TabItem(cat["name"]):
gr.Markdown(cat["description"])
cat_table = gr.Dataframe(
value=cat_df, interactive=False, wrap=True,
elem_classes=["plain-table", "params-col"],
)
gr.Markdown(N_NOTE)
rrf_outputs += [cat_table]
# ── Trade-off plots ────────────────────────────────────────
with gr.TabItem("Trade-off"):
gr.Markdown(
"Complexity reduction (Gunning Fog orth Δ%) versus meaning preservation "
"(QuestEval F1), one point per model. Top-left is ideal: "
"greater complexity reduction **and** faithful to the original."
)
tradeoff_plot = gr.Plot(value=tradeoff_fig)
gr.Markdown(
"---\n"
"Gunning Fog orth reduction (Δ%) versus NLI F1. "
"Top-left is best: greater complexity reduction **and** strong NLI entailment."
)
fog_nli_plot = gr.Plot(value=fog_nli_fig)
rrf_outputs += [tradeoff_plot, fog_nli_plot]
# This tab is created with ``visible=False`` and its table is not
# added to ``rrf_outputs``, so the filters never update it.
with gr.TabItem("Detailed scores", visible=False):
gr.Markdown(
"Average scores before and after simplification, plus absolute (Δ) "
"and percentage (Δ%) change - for all readability, lexical, and marker metrics."
)
gr.Dataframe(
value=detail_df, interactive=False, wrap=True,
elem_classes=["plain-table"],
)
# ── IFEval: manual vs automatic ────────────────────────────
# The tab only exists when the unfiltered comparison has rows.
if not ifeval_cmp_df.empty:
with gr.TabItem("IFEval manual vs auto"):
gr.Markdown(
"**Automatic** IFEval constraints are generated by an LLM; "
"**manual** constraints are hand-written gold rules, available for a "
"subset of the prompts. To isolate rule quality from sampling, the "
"comparison is restricted to the texts that carry **both** scores "
"(N = matched texts per model), so these automatic figures differ from "
"the full-sample IFEval used elsewhere.\n\n"
"**include** = fraction of *include* constraints satisfied, "
"**exclude** = fraction of *exclude* constraints satisfied (higher is "
"better for both). **Δ = manual − automatic** (on the matched texts): a "
"negative Δ means the automatic rules were easier to satisfy than the "
"hand-checked ones (more lenient automatic scoring). The **(all)** columns "
"show automatic IFEval over *every* text (the full-sample figure used "
"elsewhere). **Δ (man−auto all)** is manual minus that full-sample "
"automatic value - useful as a sanity check, but note the two cover "
"different text sets (matched subset vs. all texts), so **Δ (man−auto)** "
"is the rigorous like-for-like comparison."
)
ifeval_cmp_table = gr.Dataframe(
value=ifeval_cmp_df, interactive=False, wrap=True,
elem_classes=["plain-table"],
)
rrf_outputs.append(ifeval_cmp_table)
# Metric documentation, shown below the results.
gr.Markdown(METRICS_DOC)
# Simplification prompts, documenting the "Simplification prompt"
# filter values — shown below the metric documentation.
gr.Markdown(
"## Simplification prompts\n\n"
"The five prompt templates every model is run with - these are the "
"values of the **Simplification prompt** filter above. Each source "
"text is simplified once per prompt, so they range from a bare "
"one-line instruction to full plain-language guidelines. "
"`<text>` marks where the source text is inserted."
)
for _name, (_desc, _body) in PROMPTS.items():
with gr.Accordion(f"{_name} - {_desc}", open=False):
gr.Markdown(f"```\n{_body}\n```")
# Recompute the RRF rankings whenever any filter changes.
_filters = [tc_dropdown, pr_dropdown, size_dropdown, type_dropdown]
def _refresh_rrf(
text_category: str, prompt: str, size_limit: str, model_type: str
) -> list:
# Builds the update list in the same order in which components were
# appended to ``rrf_outputs``: final table, in-RRF category tables,
# the two scatters, then (only if its tab exists) the IFEval table.
f_df, cat_data = load_rrf_views(text_category, prompt, size_limit, model_type)
updates: list = [f_df]
for cat, df in cat_data:
if not cat.get("in_rrf", True):
continue
updates += [df]
updates += list(_tradeoff_figs(text_category, prompt, size_limit, model_type))
# Keyed on the *initial* comparison frame captured from the enclosing
# scope, i.e. on whether the IFEval table component was created.
if not ifeval_cmp_df.empty:
updates.append(
load_ifeval_comparison_df(text_category, prompt, size_limit, model_type)
)
return updates
# Every dropdown triggers the same handler and passes all four current
# filter values, so the views always reflect the full filter state.
for _dd in _filters:
_dd.change(_refresh_rrf, inputs=_filters, outputs=rrf_outputs)
return app
# The UI is built at import time, so ``app`` is available as a module-level
# attribute. NOTE(review): presumably so a hosting runtime can import it -
# confirm against the deployment setup.
app = build_app()
# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
app.launch()
|