PLainBench / app.py
bartlomiejn87's picture
Initial commit
3bd48fe
Raw
History Blame Contribute Delete
60.8 kB
"""PLainBench - Polish Text Simplification Leaderboard.
Reads scored anon JSON files from the data/current/ directory and displays a
leaderboard showing how well each LLM simplifies Polish texts, measured
by readability indices, difficulty markers, reference-based similarity
metrics, and a QuestEval-style QA consistency score.
"""
import json
from functools import lru_cache
from pathlib import Path
import gradio as gr
import pandas as pd
import plotly.graph_objects as go
# Directory holding the scored result files, resolved next to this module.
DATA_DIR = Path(__file__).parent / "data" / "current"


@lru_cache(maxsize=1)
def load_records() -> tuple[dict, ...]:
    """Load the ``metadata`` and ``summary`` sections of every scored file.

    Every ``*_scored_anon.json`` file in ``DATA_DIR`` is parsed, in sorted
    path order, and reduced to a two-key dict; anything else in the file
    (such as the per-text records) is dropped. The result is memoised, so
    the files are read from disk only on the first call.

    Returns:
        A tuple with one ``{"metadata": ..., "summary": ...}`` dict per file,
        or an empty tuple when ``DATA_DIR`` does not exist.

    Raises:
        KeyError: if a file lacks a ``metadata`` or ``summary`` section.
    """
    if not DATA_DIR.exists():
        return ()

    def _slim(path: Path) -> dict:
        # Keep only the two sections the app reads.
        payload = json.loads(path.read_text(encoding="utf-8"))
        return {"metadata": payload["metadata"], "summary": payload["summary"]}

    return tuple(_slim(path) for path in sorted(DATA_DIR.glob("*_scored_anon.json")))
# Quick-filters applied per model, mirroring the PLCC benchmark's "Size limit"
# and "Model type" controls. Each numeric size option is an *upper bound* on
# the parameter count in billions; the options are listed largest first.
SIZE_LIMITS = ["ALL", *(f"{cap}B" for cap in (500, 100, 50, 30, 20, 15, 10, 5))]
MODEL_TYPES = ["ALL", *(f"{kind}-weights" for kind in ("open", "closed"))]
def _model_passes(meta: dict, size_limit: str, model_type: str) -> bool:
"""Whether a model's metadata satisfies the size-limit / model-type filters."""
if model_type and model_type != "ALL":
want = "open" if model_type == "open-weights" else "closed"
if meta.get("weights") != want:
return False
if size_limit and size_limit != "ALL":
cap = float(size_limit.rstrip("B"))
params = meta.get("total_params_b") or 0
# Unknown / unreported size (0) can't be placed under a cap, so exclude it.
if params <= 0 or params > cap:
return False
return True
def _filtered_records(
    size_limit: str | None = None, model_type: str | None = None
) -> list[dict]:
    """Return the loaded records whose model passes both quick-filters.

    A missing or empty ``size_limit`` / ``model_type`` is treated as
    ``"ALL"``, i.e. that filter is switched off.
    """
    effective_size = size_limit if size_limit else "ALL"
    effective_type = model_type if model_type else "ALL"
    selected: list[dict] = []
    for record in load_records():
        if _model_passes(record["metadata"], effective_size, effective_type):
            selected.append(record)
    return selected
def _visible_size_limits() -> list[str]:
    """Return the ``SIZE_LIMITS`` options that are worth showing.

    Walking the numeric caps from smallest to largest, a cap is kept only
    when it admits at least one model and admits a different number of models
    than the previously kept cap. Caps below every model, and larger caps
    that select the same models as a smaller one, are therefore dropped.
    ``"ALL"`` is always kept and comes first; the surviving caps follow in
    the order they have in ``SIZE_LIMITS``. The result is derived from the
    loaded records on every call.
    """
    known_sizes = []
    for record in load_records():
        size = record["metadata"].get("total_params_b") or 0
        if size > 0:
            known_sizes.append(size)

    def _cap_of(option: str) -> float:
        return float(option.rstrip("B"))

    numeric_options = [opt for opt in SIZE_LIMITS if opt != "ALL"]
    useful: set[str] = set()
    last_count = -1
    for option in sorted(numeric_options, key=_cap_of):
        limit = _cap_of(option)
        covered = len([size for size in known_sizes if size <= limit])
        if covered > 0 and covered != last_count:
            useful.add(option)
            last_count = covered

    return ["ALL", *(opt for opt in numeric_options if opt in useful)]
# Readability indices, ``_orth`` variants: summary-metric key -> display label.
READABILITY_ORTH_LABELS = {
    f"{stem}_orth": title
    for stem, title in (
        ("flesch_reading_ease", "Flesch RE"),
        ("flesch_kincaid_grade", "Flesch-Kincaid"),
        ("gunning_fog", "Gunning Fog"),
        ("ari", "ARI"),
        ("linsear_write", "Linsear Write"),
        ("smog_grade", "SMOG"),
        ("coleman_liau", "Coleman-Liau"),
        ("pisarek", "Pisarek"),
    )
}
# Readability indices, ``_lemma`` variants: summary-metric key -> display label.
READABILITY_LEMMA_LABELS = {
    f"{stem}_lemma": title
    for stem, title in (
        ("flesch_reading_ease", "Flesch RE"),
        ("flesch_kincaid_grade", "Flesch-Kincaid"),
        ("gunning_fog", "Gunning Fog"),
        ("ari", "ARI"),
        ("linsear_write", "Linsear Write"),
        ("smog_grade", "SMOG"),
        ("coleman_liau", "Coleman-Liau"),
        ("pisarek", "Pisarek"),
    )
}
# Lexical-diversity indices, ``_orth`` variants: summary-metric key -> label.
LEXICAL_ORTH_LABELS = {
    f"{stem}_orth": title
    for stem, title in (
        ("ttr", "TTR"),
        ("rttr", "RTTR"),
        ("cttr", "CTTR"),
        ("herdan", "Herdan"),
        ("summer", "Summer"),
        ("dugast", "Dugast"),
        ("maas", "Maas"),
        ("mtld", "MTLD"),
        ("mattr", "MATTR"),
    )
}
# Lexical-diversity indices, ``_lemma`` variants: summary-metric key -> label.
LEXICAL_LEMMA_LABELS = {
    f"{stem}_lemma": title
    for stem, title in (
        ("ttr", "TTR"),
        ("rttr", "RTTR"),
        ("cttr", "CTTR"),
        ("herdan", "Herdan"),
        ("summer", "Summer"),
        ("dugast", "Dugast"),
        ("maas", "Maas"),
        ("mtld", "MTLD"),
        ("mattr", "MATTR"),
    )
}
# Similarity metrics: key in the ``similarity`` summary -> display label.
# Insertion order is the column order of the similarity table.
SIMILARITY_LABELS = dict(
    bert_score_precision="BERTScore P",
    bert_score_recall="BERTScore R",
    bert_score_f1="BERTScore F1",
    bleu="BLEU",
    chrf="chrF",
    chrfpp="chrF++",
    nli_precision="NLI P",
    nli_recall="NLI R",
    nli_f1="NLI F1",
    rouge_1_precision="ROUGE-1 P",
    rouge_1_recall="ROUGE-1 R",
    rouge_1_f1="ROUGE-1 F1",
    rouge_2_precision="ROUGE-2 P",
    rouge_2_recall="ROUGE-2 R",
    rouge_2_f1="ROUGE-2 F1",
    rouge_l_precision="ROUGE-L P",
    rouge_l_recall="ROUGE-L R",
    rouge_l_f1="ROUGE-L F1",
    wer="WER",
    mer="MER",
    wil="WIL",
    ne_retention="NE Retention",
)
# Difficulty markers: key in the ``markers`` summary -> display label.
# Insertion order is the column order of the markers / detail tables.
MARKER_LABELS = {
    # counts
    "paragraph_count": "Paragraph count",
    "sentence_count": "Sentence count",
    "word_count": "Word count",
    "named_entity_count": "Named entity count",
    "difficult_word_count": "Difficult word count",
    "difficult_word_count_orth": "Difficult word count (orth)",
    # average lengths
    "avg_word_syllables": "Avg word syllables",
    "avg_sentence_length": "Avg sentence length",
    "avg_paragraph_length": "Avg paragraph length",
    # lexical difficulty
    "named_entity_ratio": "Named entity ratio",
    "difficult_word_ratio": "Difficult word ratio",
    "difficult_word_ratio_orth": "Difficult word ratio (orth)",
    # POS ratios
    "noun_ratio": "Noun ratio",
    "difficult_noun_ratio": "Difficult noun ratio",
    "difficult_noun_ratio_orth": "Difficult noun ratio (orth)",
    "verb_ratio": "Verb ratio",
    "difficult_verb_ratio": "Difficult verb ratio",
    "difficult_verb_ratio_orth": "Difficult verb ratio (orth)",
    "adjective_ratio": "Adjective ratio",
    "difficult_adjective_ratio": "Difficult adjective ratio",
    "difficult_adjective_ratio_orth": "Difficult adjective ratio (orth)",
    # POS-to-POS ratios
    "noun_to_verb_ratio": "Noun/verb ratio",
    "verbo_nominal_ratio": "Verbo-nominal ratio",
    "adj_to_verb_ratio": "Adj/verb ratio",
    "adj_to_noun_ratio": "Adj/noun ratio",
    # morphological
    "nie_prefix_ratio": "Nie-prefix ratio",
    "participle_ratio": "Participle ratio",
    # The Morphological ranking category reads this marker, so it is listed
    # here as well to make it appear in the full markers / detail tables.
    "adverbial_participle_ratio": "Adverbial participle ratio",
    "gerund_ratio": "Gerund ratio",
    "osc_noun_ratio": "OSC noun ratio",
    "impersonal_verb_ratio": "Impersonal verb ratio",
    "genitive_noun_ratio": "Genitive noun ratio",
    "avg_genitive_chain_length": "Avg genitive chain",
    # syntactic
    "sentence_length_variance": "Sentence length variance",
    "mean_dependency_distance": "Mean dep. distance",
    "subordination_index": "Subordination index",
}
# QuestEval scores: key in the ``questeval`` summary -> display label.
QUESTEVAL_LABELS = dict(
    precision="QuestEval P",
    recall="QuestEval R",
    f1="QuestEval F1",
    answerable_rate_forward="Answerable (fwd)",
    answerable_rate_backward="Answerable (bwd)",
)
# Smoothing constant of the reciprocal-rank fusion: every rank contributes
# ``weight / (RRF_K + rank)`` to a fused score.
RRF_K = 60
# Ranking categories. Each category dict carries:
#   name        - display name; also the column name in the final ranking
#   in_rrf      - whether the category takes part in the final ranking fusion
#   rrf_weight  - weight of the category's rank in the final ranking fusion
#   description - markdown explanation of the category's metrics
#   metrics     - list of (source, key, label, ascending_rrf, in_rrf) tuples
# Fields of a ``metrics`` tuple:
#   source        - "metrics" | "markers" -> the avg_diff_pct (Δ%) is used
#                   "similarity" | "questeval" | "ifeval" -> the stored value
#                   is used as-is
#   key           - metric key inside the source's summary bucket
#   label         - display label (gets a " (Δ%)" suffix for Δ% sources)
#   ascending_rrf - True = lower value is better (rank 1 = smallest)
#   in_rrf        - include this metric in the category's table and RRF
CATEGORIES: list[dict] = [
    {
        "name": "Readability",
        "in_rrf": True,
        "rrf_weight": 1,
        "description": (
            "Readability indices - **orth** (surface-form) variants. "
            "Δ% = percentage change after simplification. "
            "For **Flesch RE** positive Δ% is better; for all others negative Δ% is better. "
            "**Flesch RE** rates reading ease from average sentence length and syllables per word (higher → easier, ~0–100). "
            "**Gunning Fog** estimates the schooling years needed to follow the text - 0.4 × (words/sentence + 100 × complex-word share), "
            "where complex words have many syllables (lower → easier). "
            "**Coleman-Liau** grades difficulty from average letters per 100 words and sentences per 100 words (lower → easier). "
            "IFEval exclude is the fraction of 'exclude' constraints satisfied by the simplified text (higher is better)."
        ),
        "metrics": [
            ("metrics", "flesch_reading_ease_orth", "Flesch RE", False, True),
            ("metrics", "gunning_fog_orth", "Gunning Fog", True, True),
            ("metrics", "coleman_liau_orth", "Coleman-Liau", True, True),
            ("ifeval", "avg_exclude", "IFEval exclude", False, True),
        ],
    },
    {
        "name": "Lexical Difficulty",
        "in_rrf": True,
        "rrf_weight": 1,
        "description": (
            "Word-level difficulty markers - **orth** variants where available. "
            "Δ% = percentage change. Negative Δ% indicates reduced lexical difficulty. "
            "**Avg word syllables** is the mean number of syllables per word (higher → longer, harder vocabulary). "
            "**Difficult word ratio** is the share of long words (above a syllable threshold on the surface form, excluding named entities) "
            "(higher → harder). "
            "**Verb ratio** is verbs divided by alphabetic tokens; a more verbal, less nominal style is easier, so higher is better here. "
            "**Difficult noun ratio** is the share of long nouns (above the syllable threshold, excluding named entities) among all words "
            "(higher → more complex nominal vocabulary)."
        ),
        "metrics": [
            ("markers", "avg_word_syllables", "Avg word syllables", True, True),
            ("markers", "difficult_word_ratio_orth", "Difficult word ratio", True, True),
            ("markers", "verb_ratio", "Verb ratio", False, True),
            ("markers", "difficult_noun_ratio_orth", "Difficult noun ratio", True, True),
        ],
    },
    {
        "name": "Syntactic",
        "in_rrf": True,
        "rrf_weight": 1,
        "description": (
            "Sentence and clause structure complexity markers. "
            "Δ% = percentage change. Negative Δ% generally indicates simpler syntax. "
            "**Avg sentence length** is the mean number of words per sentence (higher → harder). "
            "**Mean dep. distance** is the average linear distance between a token and its syntactic head, in tokens (lower → simpler structure). "
            "**Subordination index** is the ratio of subordinate clauses to total clauses (higher → more embedded structure, harder)."
        ),
        "metrics": [
            ("markers", "avg_sentence_length", "Avg sentence length", True, True),
            # Listed for reference only: in_rrf is False, so it is not ranked.
            ("markers", "sentence_length_variance", "Sentence length var.", True, False),
            ("markers", "mean_dependency_distance", "Mean dep. distance", True, True),
            ("markers", "subordination_index", "Subordination index", True, True),
        ],
    },
    {
        "name": "Morphological",
        "in_rrf": True,
        "rrf_weight": 1,
        "description": (
            "Polish-specific morphological complexity markers. "
            "Δ% = percentage change. Negative Δ% indicates reduced morphological complexity. "
            "**Adverbial participle ratio** is the share of adverbial participles (imiesłowy przysłówkowe / converbs, e.g. *czytając*, *przeczytawszy*) "
            "among alphabetic tokens - a bookish, formal construction (higher → more complex). "
            "**Gerund ratio** is the share of Polish verbal nouns (rzeczowniki odsłowne, e.g. *czytanie*) among words "
            "(higher → more nominalised, formal). "
            "**Impersonal verb ratio** is the share of impersonal verb forms among all verbs - impersonal modals (*należy*, *trzeba*, *można*), "
            "passive -no/-to forms (*zrobiono*), reflexive-impersonal *się* (*mówi się*) and infinitives - typical of legal/administrative Polish "
            "(higher → more impersonal, harder). "
            "**Genitive noun ratio** is the share of genitive-case nouns among words - a hallmark of formal, bureaucratic Polish "
            "(higher → harder). "
            "**Avg genitive chain** is the mean length of runs of two or more consecutive genitive nouns (dopełniacz spiętrzony) "
            "(higher → more genitive stacking, harder). "
            "**Verbo-nominal ratio** is the share of light-verb + noun periphrases (*dokonać wpłaty*, *podjąć decyzję*) - a hallmark of "
            "administrative Polish (higher → harder). "
            "**OSC noun ratio** is the share of abstract *-ość* nouns (*możliwość*, *konieczność*) among nouns (higher → more abstract, harder)."
        ),
        "metrics": [
            # Listed for reference only: in_rrf is False, so it is not ranked.
            ("markers", "participle_ratio", "Participle ratio", True, False),
            ("markers", "adverbial_participle_ratio", "Adverbial participle ratio", True, True),
            ("markers", "gerund_ratio", "Gerund ratio", True, True),
            ("markers", "impersonal_verb_ratio", "Impersonal verb ratio", True, True),
            ("markers", "genitive_noun_ratio", "Genitive noun ratio", True, True),
            ("markers", "avg_genitive_chain_length", "Avg genitive chain", True, True),
            ("markers", "verbo_nominal_ratio", "Verbo-nominal ratio", True, True),
            ("markers", "osc_noun_ratio", "OSC noun ratio", True, True),
        ],
    },
    {
        "name": "Meaning Preservation",
        "in_rrf": True,
        # Weighted four times as heavily as each other category in the fusion.
        "rrf_weight": 4,
        "description": (
            "Semantic metrics that directly test whether the simplified text says the same thing as the original. "
            "NLI checks bidirectional entailment; QuestEval checks information preservation via QA. "
            "NE Retention measures what fraction of named entities from the original appear in the simplified text "
            "(matched on lemmatised tokens and surface forms, so inflected forms and full-name → surname shortenings are handled). "
            "IFEval include is the fraction of 'include' constraints satisfied by the simplified text. "
            "Higher is better for all."
        ),
        "metrics": [
            ("similarity", "nli_f1", "NLI F1", False, True),
            ("questeval", "f1", "QuestEval F1", False, True),
            ("similarity", "ne_retention", "NE Retention", False, True),
            ("ifeval", "avg_include", "IFEval include", False, True),
        ],
    },
]
def _col_name(source: str, label: str) -> str:
"""Column name used in category DataFrames."""
if source in ("metrics", "markers"):
return f"{label} (Δ%)"
return label
def _model_label(data: dict) -> str:
"""Return a unique display name, appending reasoning effort when present.
The parameter size is shown separately (see :func:`_params_str`), in its
own column, mirroring the PLCC leaderboard layout.
"""
model = data["metadata"]["model"]
effort = (
data["metadata"]
.get("model_kwargs", {})
.get("extra_body", {})
.get("reasoning", {})
.get("effort")
)
if effort is not None:
return f"{model} [reasoning: {effort}]"
return model
def _params_str(params: float | None) -> str | None:
"""PLCC-style parameter label: ``1.6T`` / ``685B`` / ``8B`` (None if unknown)."""
p = params or 0
if p <= 0:
return None
return f"{p / 1000:g}T" if p >= 1000 else f"{p:g}B"
def _params_map() -> dict[str, str]:
    """Map each model label to its formatted parameter size.

    The size comes from ``metadata["total_params_b"]`` of every loaded
    record; models whose size cannot be formatted are left out of the map.
    """
    return {
        _model_label(record): size
        for record in load_records()
        if (size := _params_str(record["metadata"].get("total_params_b")))
    }
def _metric_row(
label_map: dict,
summary_metrics: dict,
row: dict,
detail_row: dict,
*,
include_detail: bool = True,
) -> None:
"""Populate leaderboard row and detail row from a label→key map."""
for key, label in label_map.items():
vals = summary_metrics.get(key, {})
row[f"{label} (Δ)"] = vals.get("avg_diff")
row[f"{label} (Δ%)"] = vals.get("avg_diff_pct")
if include_detail:
detail_row[f"{label} before"] = vals.get("avg_before")
detail_row[f"{label} after"] = vals.get("avg_after")
detail_row[f"{label} (Δ)"] = vals.get("avg_diff")
detail_row[f"{label} (Δ%)"] = vals.get("avg_diff_pct")
def load_leaderboard_data() -> tuple[pd.DataFrame, ...]:
    """Build the raw leaderboard tables from the loaded records.

    Every table has one row per record, starting with ``Model`` and ``N``
    columns; numeric columns are rounded to 4 decimals.

    Returns:
        An 8-tuple of DataFrames, in this order:
        ``(readability_orth_df, readability_lemma_df, lexical_orth_df,
        lexical_lemma_df, similarity_df, questeval_df, markers_df,
        detail_df)``. ``detail_df`` gathers the before / after / Δ / Δ%
        values of every readability, lexical and marker metric. When no
        records are available, eight *separate* empty DataFrames are
        returned.

    Raises:
        KeyError: if a record's summary lacks ``n`` or ``metrics``.
    """
    records = load_records()
    if not records:
        # Distinct objects, so a caller mutating one table cannot affect the
        # other seven.
        return tuple(pd.DataFrame() for _ in range(8))

    read_orth_rows, read_lemma_rows = [], []
    lex_orth_rows, lex_lemma_rows = [], []
    similarity_rows, questeval_rows, markers_rows, detail_rows = [], [], [], []

    for data in records:
        summary = data["summary"]
        metrics = summary["metrics"]
        similarity = summary.get("similarity", {})
        questeval = summary.get("questeval", {})
        markers = summary.get("markers", {})

        base = {"Model": _model_label(data), "N": summary["n"]}
        read_orth_row = dict(base)
        read_lemma_row = dict(base)
        lex_orth_row = dict(base)
        lex_lemma_row = dict(base)
        similarity_row = dict(base)
        questeval_row = dict(base)
        markers_row = dict(base)
        detail_row = dict(base)

        # Δ-style families: each fills its own table plus the shared detail row.
        _metric_row(READABILITY_ORTH_LABELS, metrics, read_orth_row, detail_row)
        _metric_row(READABILITY_LEMMA_LABELS, metrics, read_lemma_row, detail_row)
        _metric_row(LEXICAL_ORTH_LABELS, metrics, lex_orth_row, detail_row)
        _metric_row(LEXICAL_LEMMA_LABELS, metrics, lex_lemma_row, detail_row)
        _metric_row(MARKER_LABELS, markers, markers_row, detail_row)

        # Absolute-value families are copied straight from their summary.
        for key, label in SIMILARITY_LABELS.items():
            similarity_row[label] = similarity.get(key)
        for key, label in QUESTEVAL_LABELS.items():
            questeval_row[label] = questeval.get(key)

        read_orth_rows.append(read_orth_row)
        read_lemma_rows.append(read_lemma_row)
        lex_orth_rows.append(lex_orth_row)
        lex_lemma_rows.append(lex_lemma_row)
        similarity_rows.append(similarity_row)
        questeval_rows.append(questeval_row)
        markers_rows.append(markers_row)
        detail_rows.append(detail_row)

    dfs = [
        pd.DataFrame(rows)
        for rows in (
            read_orth_rows,
            read_lemma_rows,
            lex_orth_rows,
            lex_lemma_rows,
            similarity_rows,
            questeval_rows,
            markers_rows,
            detail_rows,
        )
    ]
    for df in dfs:
        num_cols = df.select_dtypes(include="number").columns
        df[num_cols] = df[num_cols].round(4)
    return tuple(dfs)
@lru_cache(maxsize=1)
def _load_ifeval_records() -> tuple[tuple[str, tuple[tuple, ...]], ...]:
    """Collect, per model, the texts scored by both automatic and manual IFEval.

    Each ``*_scored_anon.json`` file in ``DATA_DIR`` is read in full, in
    sorted path order, and its ``results`` array is scanned. A result is
    kept only when both its ``ifeval`` and ``ifeval_manual`` entries are
    present and non-empty, and is reduced to the tuple
    ``(category, prompt_id, auto_include, auto_exclude, man_include,
    man_exclude)``. The outcome is memoised.

    Returns:
        A tuple of ``(model_label, records)`` pairs, omitting models with no
        matched record; an empty tuple when ``DATA_DIR`` does not exist.

    Raises:
        KeyError: if a file has no ``results`` array.
    """
    if not DATA_DIR.exists():
        return ()

    per_model: list[tuple[str, tuple[tuple, ...]]] = []
    for path in sorted(DATA_DIR.glob("*_scored_anon.json")):
        payload = json.loads(path.read_text(encoding="utf-8"))
        label = _model_label(payload)

        matched: list[tuple] = []
        for item in payload["results"]:
            manual, auto = item.get("ifeval_manual"), item.get("ifeval")
            if manual and auto:
                matched.append(
                    (
                        item.get("category"),
                        item.get("prompt_id"),
                        auto.get("include"),
                        auto.get("exclude"),
                        manual.get("include"),
                        manual.get("exclude"),
                    )
                )

        if matched:
            per_model.append((label, tuple(matched)))
    return tuple(per_model)
def load_ifeval_comparison_df(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> pd.DataFrame:
    """Tabulate manual versus automatic IFEval scores, one row per model.

    The ``Manual ...`` and ``Auto ...`` columns are averages over the matched
    records only, i.e. texts that carry both an automatic and a manual score.
    The ``... (all)`` columns come from the model's ``ifeval`` summary bucket
    for the same filters. Every ``Δ`` column is manual minus automatic, so a
    negative value means the automatic score was the higher one.

    Args:
        text_category: Restrict to one source-text category; ``None`` or
            ``"All"`` disables the restriction.
        prompt: Restrict to one simplification prompt; ``None`` or ``"All"``
            disables the restriction.
        size_limit: Size quick-filter, forwarded to ``_filtered_records``.
        model_type: Model-type quick-filter, forwarded to
            ``_filtered_records``.

    Returns:
        A DataFrame sorted by ``Model`` with numeric columns rounded to
        4 decimals, or an empty DataFrame when no model has matched records
        under the chosen filters.
    """
    tc = text_category if text_category not in (None, "All") else None
    pr = prompt if prompt not in (None, "All") else None

    # Only models passing the size / model-type filters are reported.
    allowed = {_model_label(rec) for rec in _filtered_records(size_limit, model_type)}
    summaries: dict = {}
    for rec in load_records():
        name = _model_label(rec)
        if name in allowed:
            summaries[name] = rec["summary"]

    def _ratio(total: float, count: int) -> float | None:
        return total / count if count else None

    rows: list[dict] = []
    for model, recs in _load_ifeval_records():
        if model not in allowed:
            continue

        auto_inc_sum = man_inc_sum = auto_exc_sum = man_exc_sum = 0.0
        inc_count = exc_count = 0
        for cat, prompt_id, a_inc, a_exc, m_inc, m_exc in recs:
            if (tc and cat != tc) or (pr and prompt_id != pr):
                continue
            if m_inc is not None and a_inc is not None:
                auto_inc_sum += a_inc
                man_inc_sum += m_inc
                inc_count += 1
            if m_exc is not None and a_exc is not None:
                auto_exc_sum += a_exc
                man_exc_sum += m_exc
                exc_count += 1
        if not (inc_count or exc_count):
            continue

        auto_inc = _ratio(auto_inc_sum, inc_count)
        man_inc = _ratio(man_inc_sum, inc_count)
        auto_exc = _ratio(auto_exc_sum, exc_count)
        man_exc = _ratio(man_exc_sum, exc_count)

        # Automatic score over the whole summary bucket, not just matched texts.
        overall = _source_bucket(summaries.get(model, {}), "ifeval", tc, pr)
        all_inc = overall.get("avg_include")
        all_exc = overall.get("avg_exclude")

        inc_gap = (man_inc - auto_inc) if inc_count else None
        exc_gap = (man_exc - auto_exc) if exc_count else None
        inc_gap_all = (man_inc - all_inc) if (inc_count and all_inc is not None) else None
        exc_gap_all = (man_exc - all_exc) if (exc_count and all_exc is not None) else None

        rows.append(
            {
                "Model": model,
                "N": inc_count or exc_count,
                "Manual include": man_inc,
                "Manual exclude": man_exc,
                "Auto include": auto_inc,
                "Auto include (all)": all_inc,
                "Δ include (man−auto)": inc_gap,
                "Δ include (man−auto all)": inc_gap_all,
                "Auto exclude": auto_exc,
                "Auto exclude (all)": all_exc,
                "Δ exclude (man−auto)": exc_gap,
                "Δ exclude (man−auto all)": exc_gap_all,
            }
        )

    table = pd.DataFrame(rows)
    if table.empty:
        return table
    table = table.sort_values("Model").reset_index(drop=True)
    numeric = table.select_dtypes(include="number").columns
    table[numeric] = table[numeric].round(4)
    return table
def text_category_choices() -> list[str]:
    """Return ``"All"`` followed by the sorted source-text categories.

    The categories are the keys of each record's ``metrics_by_category``
    summary section, merged across all loaded records.
    """
    names = {
        name
        for record in load_records()
        for name in record["summary"].get("metrics_by_category", {})
    }
    return ["All", *sorted(names)]
def prompt_choices() -> list[str]:
    """Return ``"All"`` followed by the sorted simplification prompts.

    The prompts are the keys of each record's ``metrics_by_prompt`` summary
    section, merged across all loaded records.
    """
    names = {
        name
        for record in load_records()
        for name in record["summary"].get("metrics_by_prompt", {})
    }
    return ["All", *sorted(names)]
def _source_bucket(s: dict, source: str, tc: str | None, prompt: str | None) -> dict:
"""Return the metric bucket for one source, filtered by text category and/or prompt.
Picks the overall summary when neither filter is set, the ``*_by_category`` /
``*_by_prompt`` bucket when one is set, and the ``*_by_category_prompt`` bucket
(keyed ``"CATEGORY/PROMPT"``) when both are set.
"""
if source in ("metrics", "markers", "similarity"):
if tc and prompt:
return s.get(f"{source}_by_category_prompt", {}).get(f"{tc}/{prompt}", {})
if tc:
return s.get(f"{source}_by_category", {}).get(tc, {})
if prompt:
return s.get(f"{source}_by_prompt", {}).get(prompt, {})
return s.get(source, {})
# questeval / ifeval keep their per-filter buckets nested under the source object
src = s.get(source, {})
if tc and prompt:
return src.get("by_category_prompt", {}).get(f"{tc}/{prompt}", {})
if tc:
return src.get("by_category", {}).get(tc, {})
if prompt:
return src.get("by_prompt", {}).get(prompt, {})
return src
def _bucket_n(s: dict, tc: str | None, prompt: str | None) -> int | None:
    """Return the sample count recorded for the selected filters.

    The ``questeval`` bucket is consulted first, then the ``ifeval`` bucket;
    the first ``n`` that is not ``None`` wins. ``None`` is returned when
    neither bucket records a count.
    """
    counts = (
        _source_bucket(s, name, tc, prompt).get("n") for name in ("questeval", "ifeval")
    )
    return next((count for count in counts if count is not None), None)
def load_category_df(
    category: dict,
    text_category: str | None = None,
    prompt: str | None = None,
) -> pd.DataFrame:
    """Build the table of one metric category, ranked by its RRF score.

    Only metrics flagged ``in_rrf`` are shown and ranked. Each contributes
    ``1 / (RRF_K + rank)`` to the row's ``RRF Score``, where the rank is
    taken across all loaded models (ties share the lowest rank, missing
    values rank last); a metric missing for every model contributes nothing.
    Size / model-type filtering is not done here.

    Args:
        category: One entry of ``CATEGORIES``.
        text_category: Restrict the metrics to one source-text category;
            ``None`` or ``"All"`` uses all categories.
        prompt: Restrict the metrics to one simplification prompt; ``None``
            or ``"All"`` uses all prompts.

    Returns:
        A DataFrame with columns ``Rank``, ``Model``, ``Params``, ``N``,
        ``RRF Score`` and one column per ranked metric, sorted by descending
        RRF score; an empty DataFrame when there are no records.
    """
    tc = text_category if text_category not in (None, "All") else None
    pr = prompt if prompt not in (None, "All") else None

    ranked_metrics = [
        (source, key, _col_name(source, label), ascending)
        for source, key, label, ascending, in_rrf in category["metrics"]
        if in_rrf
    ]

    rows: list[dict] = []
    for record in load_records():
        summary = record["summary"]
        # With a filter active prefer the bucket's own sample count and fall
        # back to the overall one when it is missing or zero.
        sample_count = _bucket_n(summary, tc, pr) if (tc or pr) else None
        if not sample_count:
            sample_count = summary["n"]

        row: dict = {"Model": _model_label(record), "N": sample_count}
        for source, key, column, _ascending in ranked_metrics:
            bucket = _source_bucket(summary, source, tc, pr)
            if source in ("metrics", "markers"):
                row[column] = bucket.get(key, {}).get("avg_diff_pct")
            else:
                # similarity, questeval and ifeval store the value directly
                row[column] = bucket.get(key)
        rows.append(row)

    table = pd.DataFrame(rows)
    if table.empty:
        return table

    numeric = table.select_dtypes(include="number").columns
    table[numeric] = table[numeric].round(4)

    fused = pd.Series(0.0, index=table.index)
    for _source, _key, column, ascending in ranked_metrics:
        if column not in table.columns or table[column].isna().all():
            continue
        ranks = table[column].rank(ascending=ascending, method="min", na_option="bottom")
        fused += 1.0 / (RRF_K + ranks)

    table.insert(2, "RRF Score", fused.round(4))
    table = table.sort_values("RRF Score", ascending=False).reset_index(drop=True)
    table.insert(0, "Rank", range(1, len(table) + 1))
    table.insert(2, "Params", table["Model"].map(_params_map()).fillna(""))
    return table
def _plcc_overall_map() -> dict[str, float]:
    """Map each model label to its external PLCC overall score.

    The score is taken unchanged from ``metadata["plcc"]["overall"]`` of each
    loaded record. Models whose metadata has no ``plcc`` entry, or no
    ``overall`` value in it, are left out of the map.
    """
    scores: dict[str, float] = {}
    for record in load_records():
        overall = (record["metadata"].get("plcc") or {}).get("overall")
        if overall is None:
            continue
        scores[_model_label(record)] = overall
    return scores
def build_final_ranking_df(category_data: list[tuple[dict, pd.DataFrame]]) -> pd.DataFrame:
    """Fuse the per-category RRF scores into one overall ranking.

    Each category's ``RRF Score`` column is converted to a rank (1 = best,
    ties share the lowest rank, models missing from a category rank last).
    ``Final RRF`` is the sum over categories of
    ``rrf_weight / (RRF_K + rank)``, with weights looked up by category name
    in ``CATEGORIES`` (default 1). The category columns of the result hold
    those ranks, not the scores. ``PLCC`` is a reference column that does
    not enter the computation.

    Args:
        category_data: ``(category, category_df)`` pairs; categories flagged
            ``in_rrf: False`` and empty tables are ignored.

    Returns:
        A DataFrame with columns ``Rank``, ``Model``, ``Params``, ``N``,
        ``Final RRF``, ``PLCC`` and one rank column per category, sorted by
        descending ``Final RRF``; an empty DataFrame when nothing is usable.
    """
    per_category = [
        frame[["Model", "RRF Score"]].rename(columns={"RRF Score": spec["name"]})
        for spec, frame in category_data
        if spec.get("in_rrf", True) and not frame.empty
    ]
    if not per_category:
        return pd.DataFrame()

    merged = per_category[0]
    for extra in per_category[1:]:
        merged = merged.merge(extra, on="Model", how="outer")
    if merged.empty:
        return pd.DataFrame()

    # The sample count is taken from the first table that carries one.
    n_map: dict = next(
        (
            dict(zip(frame["Model"], frame["N"]))
            for _spec, frame in category_data
            if not frame.empty and {"Model", "N"} <= set(frame.columns)
        ),
        {},
    )

    category_weights = {spec["name"]: spec.get("rrf_weight", 1) for spec in CATEGORIES}
    rank_by_category: dict[str, pd.Series] = {
        name: merged[name].rank(ascending=False, method="min", na_option="bottom").astype(int)
        for name in merged.columns
        if name != "Model"
    }

    fused = pd.Series(0.0, index=merged.index)
    for name, ranks in rank_by_category.items():
        fused += category_weights.get(name, 1) / (RRF_K + ranks)

    table = merged[["Model"]].copy()
    table.insert(1, "Final RRF", fused.round(4))
    table.insert(2, "PLCC", table["Model"].map(_plcc_overall_map()).round(2))
    for name, ranks in rank_by_category.items():
        table[name] = ranks

    table = table.sort_values("Final RRF", ascending=False).reset_index(drop=True)
    table.insert(0, "Rank", range(1, len(table) + 1))
    table.insert(2, "Params", table["Model"].map(_params_map()).fillna(""))
    table.insert(3, "N", table["Model"].map(n_map).astype("Int64"))
    return table
def _build_fog_scatter(
    *,
    y_source: str,
    y_key: str,
    y_name: str,
    title: str,
    marker_color: str,
    text_category: str | None,
    prompt: str | None,
    size_limit: str | None,
    model_type: str | None,
) -> go.Figure | None:
    """Shared builder: Gunning Fog orth Δ% (x) against one quality score (y).

    One labelled marker is drawn per model that passes the size / model-type
    filters and has both values for the selected text category / prompt.
    Dotted guide lines mark the midpoint of the x and y ranges.

    Args:
        y_source: Summary source holding the y value (e.g. ``"questeval"``).
        y_key: Key of the y value inside that source's bucket.
        y_name: Human-readable y metric name, used in the hover text and the
            y-axis title.
        title: Figure title.
        marker_color: Marker fill colour.
        text_category: Text-category filter; ``None`` or ``"All"`` for none.
        prompt: Prompt filter; ``None`` or ``"All"`` for none.
        size_limit: Size quick-filter, forwarded to ``_filtered_records``.
        model_type: Model-type quick-filter, forwarded to
            ``_filtered_records``.

    Returns:
        The figure, or ``None`` when no model has both values.
    """
    tc = None if text_category in (None, "All") else text_category
    pr = None if prompt in (None, "All") else prompt

    points = []
    for data in _filtered_records(size_limit, model_type):
        s = data["summary"]
        x = _source_bucket(s, "metrics", tc, pr).get("gunning_fog_orth", {}).get("avg_diff_pct")
        y = _source_bucket(s, y_source, tc, pr).get(y_key)
        if x is None or y is None:
            continue
        points.append((_model_label(data), x, y))
    if not points:
        return None

    models, xs, ys = zip(*points)
    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=xs,
            y=ys,
            mode="markers+text",
            text=models,
            textposition="top center",
            textfont={"size": 10},
            marker={"size": 12, "color": marker_color, "line": {"width": 1, "color": "white"}},
            # Doubled braces keep plotly's %{...} placeholders literal.
            hovertemplate=(
                f"<b>%{{text}}</b><br>Gunning Fog Δ%: %{{x:.2f}}"
                f"<br>{y_name}: %{{y:.3f}}<extra></extra>"
            ),
        )
    )

    # Guide lines at the midpoint of the observed ranges.
    x_mid = (min(xs) + max(xs)) / 2
    y_mid = (min(ys) + max(ys)) / 2
    fig.add_hline(y=y_mid, line_dash="dot", line_color="lightgray")
    fig.add_vline(x=x_mid, line_dash="dot", line_color="lightgray")

    fig.update_layout(
        title=title,
        xaxis_title="Gunning Fog orth Δ% (← easier text)",
        yaxis_title=f"{y_name} (↑ meaning preserved)",
        height=560,
        margin={"l": 60, "r": 40, "t": 60, "b": 60},
        plot_bgcolor="white",
    )
    fig.update_xaxes(showgrid=True, gridcolor="#EEE", zeroline=True, zerolinecolor="#CCC")
    fig.update_yaxes(showgrid=True, gridcolor="#EEE")
    return fig


def build_tradeoff_scatter(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> go.Figure | None:
    """Scatter of Gunning Fog reduction vs meaning preservation, one point per model.

    X: Gunning Fog orth Δ% (more negative = greater complexity reduction)
    Y: QuestEval F1 (higher = better meaning preservation)

    The text-category / prompt / size / model-type filters are all applied.
    Returns ``None`` when no model has both values.
    """
    return _build_fog_scatter(
        y_source="questeval",
        y_key="f1",
        y_name="QuestEval F1",
        title="Complexity reduction vs meaning preservation",
        marker_color="#4C78A8",
        text_category=text_category,
        prompt=prompt,
        size_limit=size_limit,
        model_type=model_type,
    )


def build_fog_nli_scatter(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> go.Figure | None:
    """Scatter of Gunning Fog reduction vs NLI F1, one point per model.

    X: Gunning Fog orth Δ% (more negative = greater complexity reduction)
    Y: NLI F1 (higher = stronger entailment / meaning preserved)

    The text-category / prompt / size / model-type filters are all applied.
    Returns ``None`` when no model has both values.
    """
    return _build_fog_scatter(
        y_source="similarity",
        y_key="nli_f1",
        y_name="NLI F1",
        title="Complexity reduction vs NLI consistency",
        marker_color="#E45756",
        text_category=text_category,
        prompt=prompt,
        size_limit=size_limit,
        model_type=model_type,
    )
# Markdown introduction of the leaderboard (presumably rendered at the top of
# the page; the place where it is used lies outside this part of the file).
# The leading backslash suppresses the newline right after the opening quotes.
INTRO = """\
# PLainBench - Polish Text Simplification Leaderboard
This benchmark evaluates how well LLMs simplify difficult Polish texts -
drawn from legal/administrative (BIP/GOV), finance, and science domains - while
preserving the original meaning. Each model simplifies 210 source texts under
5 simplification prompts (1050 outputs per model). Outputs are scored on
readability indices, fine-grained difficulty markers (lexical, syntactic,
morphological), meaning preservation (NLI entailment, QuestEval QA consistency,
named-entity retention), and instruction following (IFEval include/exclude).
The per-category scores are fused into an overall **Final RRF** ranking.
"""
# Markdown reference for the reported metrics, rendered by ``build_app`` below
# the result tabs. Sections, in order: readability indices, difficulty markers,
# similarity metrics, QuestEval QA consistency, and IFEval instruction following.
METRICS_DOC = """\
## Metrics
### Readability indices
All indices are adapted for Polish syllable counting via `pyhyphen` (pl_PL
dictionary) and counted on surface (orthographic) word forms.
Δ is the absolute change (after − before); Δ% is the average percentage change
from the original text to the simplified text.
| Metric | Formula | Interpretation |
|---|---|---|
| **Flesch Reading Ease** | `206.835 − 1.015 × (words/sentences) − 84.6 × (syllables/words)` | Higher → easier text (0–100 typical range). **Desired Δ%: positive (+)** |
| **Gunning Fog** | `0.4 × [(words/sentences) + 100 × (complex_words/words)]` | School years needed (complex = ≥ 4 syllables). Lower → easier. **Desired Δ%: negative (−)** |
| **Coleman-Liau** | `0.0588 × L − 0.296 × S − 15.8` | Character-based grade level. Lower → easier. **Desired Δ%: negative (−)** |
### Difficulty markers
Fine-grained syntactic, morphological, and lexical features.
Δ is absolute change; Δ% is percentage change.
Difficult words are defined as not a named entity, ≥ 4 syllables, counted on the
surface (orthographic) form.
| Marker | Description | Desired Δ% |
|---|---|---|
| **Avg word syllables** | Mean syllable count per word | − (shorter words) |
| **Difficult word ratio (orth)** | Difficult words / all words (surface, excl. NEs) | − |
| **Difficult noun ratio (orth)** | Difficult nouns / all tokens (surface, excl. NEs) | − |
| **Verb ratio** | Verbs / all tokens | + (more verbal, less nominal) |
| **Avg sentence length** | Mean tokens per sentence | − (shorter sentences) |
| **Mean dep. distance** | Avg linear head-dependent distance (syntax complexity) | − (flatter syntax) |
| **Subordination index** | Subordinate clauses / total clauses | − |
| **Adverbial participle ratio** | Adverbial participles (converbs, e.g. *czytając*, *przeczytawszy*) / all tokens | − |
| **Gerund ratio** | Gerunds / all tokens | − |
| **Impersonal verb ratio** | Impersonal verb forms (modals *należy*/*trzeba*, -no/-to passives, reflexive *się*, infinitives) / all verbs | − |
| **Genitive noun ratio** | Nouns in genitive case / all tokens | − |
| **Avg genitive chain** | Mean length of consecutive genitive noun phrases | − |
| **Verbo-nominal ratio** | Light-verb + noun periphrases (*dokonać wpłaty*, *podjąć decyzję*); administrative style | − |
| **OSC noun ratio** | Abstract *-ość* nouns (*możliwość*, *konieczność*) / all nouns | − |
### Similarity metrics
Reference-based metrics comparing simplified text against the original.
| Metric | Description | Direction |
|---|---|---|
| **NLI P / R / F1** | NLI consistency via stella embeddings + mDeBERTa cross-encoder | Higher = stronger entailment |
| **NE Retention** | Fraction of named entities from the original kept in the simplified text | Higher = more entities preserved |
*Only **NLI F1** feeds the RRF score; P and R are shown for context.*
### QuestEval - QA consistency
| Metric | Description | Direction |
|---|---|---|
| **QuestEval P** | Backward precision - grounding of simplified claims | Higher = fewer hallucinations |
| **QuestEval R** | Forward recall - information preserved | Higher = less content dropped |
| **QuestEval F1** | Harmonic mean of P and R | Higher = better meaning preservation |
| **Answerable (fwd)** | Fraction of forward questions answerable | Higher = stays on-topic |
| **Answerable (bwd)** | Fraction of backward questions answerable | Higher = claims traceable to original |
*Only **QuestEval F1** feeds the RRF score; the other rows are shown for context.*
### IFEval - instruction following
| Metric | Description | Direction |
|---|---|---|
| **IFEval include** | Fraction of *include* constraints (terms the simplification must keep) satisfied | Higher = better |
| **IFEval exclude** | Fraction of *exclude* constraints (terms the simplification must avoid) satisfied | Higher = better |
"""
# Sample-count note shown under each table that carries an ``N`` column.
# ``build_app`` renders it as markdown directly beneath the final-ranking table
# and beneath every per-category table.
N_NOTE = "**N** = number of prompt × text evaluations per model."
# The five simplification prompts every model is run with. The keys match the
# "Simplification prompt" filter values (and the ``*_by_prompt`` summary
# buckets); each value is ``(short description, user-message template)``, where
# ``<text>`` marks where the source text is inserted. Kept in sync with
# generation/prompting/instruction.py. Ordered from least to most detailed.
# ``build_app`` iterates this dict in insertion order and shows each entry as a
# collapsed accordion titled "<key> - <description>" with the template inside a
# fenced code block, so the templates below are user-visible text.
PROMPTS: dict[str, tuple[str, str]] = {
    "mini": (
        "Minimal - a single-line instruction, no rules.",
        "Uprość podany tekst, w odpowiedzi podaj jedynie uproszczony tekst, "
        "bez dodatkowych komentarzy. Tekst do uproszczenia:\n\n<text>",
    ),
    "compact": (
        "Compact - a short bulleted rule set.",
        """Uprość poniższy tekst, tak aby był jasny, krótki i zrozumiały dla osoby bez wiedzy specjalistycznej.
Zasady:
- Skup się na najważniejszych informacjach, usuń zbędne treści.
- Uporządkuj tekst: najważniejsze na początku, podziel na krótkie akapity.
- Używaj prostego, codziennego słownictwa; trudne pojęcia zastępuj lub krótko wyjaśniaj.
- Twórz krótkie zdania (jedna myśl = jedno zdanie).
- Pisz bezpośrednio do odbiorcy i używaj strony czynnej.
- Unikaj żargonu, urzędowego stylu, skomplikowanych konstrukcji i podwójnych przeczeń.
- Zachowaj poprawność językową i logiczną spójność.
- W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.
---
### Tekst do uproszczenia:
<text>""",
    ),
    "medium": (
        "Medium - moderately detailed rules with sub-points.",
        """Uprość poniższy tekst tak, aby był zrozumiały dla osoby bez wiedzy specjalistycznej, zachowując jego sens.
### Zasady:
- Skup się na celu tekstu i najważniejszych informacjach — usuń dygresje i zbędne treści.
- Uporządkuj treść: zacznij od najważniejszych informacji, podziel tekst na krótkie akapity.
- Stosuj proste i naturalne słownictwo:
- zamieniaj trudne lub specjalistyczne wyrazy na prostsze,
- jeśli trzeba — krótko je wyjaśnij lub podaj przykład.
- Buduj krótkie zdania (jedna myśl = jedno zdanie, ok. 15–20 słów).
- Pisz bezpośrednio do odbiorcy i używaj strony czynnej.
- Unikaj:
- żargonu, stylu urzędowego i zapożyczeń,
- form bezosobowych i strony biernej (jeśli nie są konieczne),
- nadmiaru rzeczowników odczasownikowych,
- podwójnych przeczeń i zawiłych konstrukcji.
- Zachowaj poprawność językową, spójność i logiczny układ tekstu.
- W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.
---
### Tekst do uproszczenia:
<text>""",
    ),
    "long": (
        "Long - full, sectioned plain-language guidelines.",
        """Uprość poniższy tekst zgodnie z zasadami prostego języka.
### 1. Cel i odbiorca
- Dostosuj tekst do przeciętnego odbiorcy (zakładaj brak wiedzy specjalistycznej).
- Skup się na najważniejszych informacjach.
### 2. Struktura
- Usuń informacje zbędne i poboczne.
- Uporządkuj treść: najważniejsze informacje podaj na początku.
- Podziel tekst na krótkie akapity (1 akapit = 1 myśl).
- Jeśli tekst jest dłuższy, użyj nagłówków lub list.
### 3. Słownictwo
- Zastępuj trudne słowa prostszymi.
- Unikaj:
- terminów specjalistycznych (chyba że je wyjaśnisz),
- słów rzadkich, książkowych i urzędowych,
- zapożyczeń i modnych zwrotów,
- skrótów niezrozumiałych dla odbiorcy.
- W razie potrzeby:
- wyjaśnij trudne pojęcia,
- podaj przykłady,
- używaj konkretnych nazw zamiast ogólników.
### 4. Składnia
- Twórz krótkie zdania (ok. 20 słów).
- Jedno zdanie = jedna myśl.
- Używaj zdań twierdzących.
- Pisz bezpośrednio do odbiorcy (np. „Ty”, „możesz”, „zrób”).
- Używaj strony czynnej zamiast biernej.
- Unikaj form bezosobowych i skomplikowanych konstrukcji.
- Ogranicz rzeczowniki odczasownikowe (zamieniaj je na czasowniki).
### 5. Styl
- Unikaj podwójnych przeczeń.
- Upraszczaj złożone konstrukcje.
- Zachowaj naturalny, jasny ton.
### 6. Końcowa kontrola
- Sprawdź, czy tekst jest:
- zrozumiały,
- poprawny językowo,
- logiczny i spójny.
### 7. W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.
---
### Tekst do uproszczenia:
<text>""",
    ),
    "step_by_step": (
        "Step by step - role-based, numbered editorial guidelines.",
        """Jesteś redaktorem odpowiedzialnym za wydawanie tekstów pisanych prostym językiem. Krok po kroku upraszczaj tekst kierując się poniższymi regułami:
1. Zachowaj prostotę - unikaj skomplikowanych i trudnych słów oraz zdań.
2. Używaj zrozumiałego języka - pamiętaj, że tekst ma być czytelny dla szerokiego grona odbiorców, nie tylko dla osób z wiedzą specjalistyczną z danej dziedziny.
3. Unikaj języka żargonowego czy specjalistycznego, jeśli czujesz, że nie jest on zrozumiały dla większości czytelników.
4. Unikaj zbytnio skomplikowanych zwrotów - stawiaj na klarowność i prostotę.
5. Utrzymuj spójność - trzymaj się jednego stylu pisania i konsekwentnie go stosuj.
6. Unikaj nadmiernego użycia skrótów czy akronimów – jeśli ich używasz, to upewnij się, że są one łatwe do zrozumienia dla wszystkich czytelników.
7. Przemyśl kolejność i strukturę informacji - rozmieszczenie treści powinno być logiczne i przemyślane.
8. Unikaj zbędnego wydłużania tekstu, pisz zwięźle i konkretnie.
9. W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.
---
### Tekst do uproszczenia:
<text>""",
    ),
}
# ── PLCC-inspired visual style ──────────────────────────────────────────────
# Mirrors the sdadas/plcc leaderboard: clean white background, a system
# sans-serif stack, a blue accent (#2c7be5), and flat tables with shaded
# (#f9fafd) headers, #ddd borders and faintly striped rows. Done entirely in
# CSS — a custom gr.themes.* would tint the component label chips blue, which
# is not part of the PLCC look.
# The stylesheet is handed to ``gr.Blocks(css=...)`` in ``build_app``; the
# ``plain-table``, ``params-col`` and ``filter-bar`` selectors are attached to
# components there through ``elem_classes``.
PLCC_CSS = """
.gradio-container {
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto,
"Helvetica Neue", Arial, sans-serif !important;
max-width: 1500px !important;
}
/* PLCC-style data tables */
.plain-table table { border: 1px solid #ddd !important; font-size: 0.9em; }
.plain-table thead th {
background: #f9fafd !important;
border-bottom: 2px solid #ddd !important;
color: #222 !important;
font-weight: 700 !important;
}
.plain-table tbody td { padding: 8px 10px !important; }
.plain-table tbody tr:nth-child(even) > td { background: rgba(0, 0, 0, 0.02) !important; }
.plain-table tbody tr:hover > td { background: rgba(44, 123, 229, 0.06) !important; }
/* Params column (3rd) — right-aligned, PLCC-style; cells muted grey, header black */
.params-col tbody td:nth-child(3),
.params-col thead th:nth-child(3) {
text-align: right !important;
white-space: nowrap;
}
.params-col tbody td:nth-child(3) { color: #999 !important; }
/* Filter bar — the grey rounded block holding the dropdowns */
.filter-bar {
background: #f9fafd;
border: 1px solid #ddd;
border-radius: 0.5rem;
padding: 10px 14px;
}
"""
# Colour palette for category bars: seven hex colour strings.
# NOTE(review): no use of this constant is visible in this part of the file —
# confirm it is still referenced before changing or removing it.
_CAT_COLORS = ["#4C78A8", "#72B7B2", "#54A24B", "#EECA3B", "#B279A2", "#E45756", "#F58518"]
def _filter_model_rows(df: pd.DataFrame, allowed: set[str]) -> pd.DataFrame:
"""Keep only rows whose ``Model`` is in ``allowed`` (ranks/scores untouched)."""
if df.empty or "Model" not in df.columns:
return df
return df[df["Model"].isin(allowed)].reset_index(drop=True)
def load_rrf_views(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> tuple[pd.DataFrame, list[tuple[dict, pd.DataFrame]]]:
    """Build the final-ranking frame and the per-category frames for one filter set.

    The work happens in two stages. First, each entry of ``CATEGORIES`` is
    loaded for the given text category / prompt and the final ranking is built
    from those frames - the size and model-type selections play no part here.
    Second, ``size_limit`` and ``model_type`` pick the set of model labels that
    stay visible, and every frame is cut down to those rows. Because nothing is
    re-ranked after the cut, a surviving model keeps the rank it had in the
    full table.

    Returns ``(final_df, [(category, category_df), ...])`` with one pair per
    entry of ``CATEGORIES``, in that order.
    """
    # Stage 1: rankings for the chosen text category / prompt.
    ranked_categories = []
    for cat in CATEGORIES:
        ranked_categories.append((cat, load_category_df(cat, text_category, prompt)))
    full_ranking = build_final_ranking_df(ranked_categories)

    # Stage 2: hide models that fail the size / type filters, without re-ranking.
    visible_models = set()
    for record in _filtered_records(size_limit, model_type):
        visible_models.add(_model_label(record))

    visible_ranking = _filter_model_rows(full_ranking, visible_models)
    visible_categories = [
        (cat, _filter_model_rows(cat_df, visible_models))
        for cat, cat_df in ranked_categories
    ]
    return visible_ranking, visible_categories
def _tradeoff_figs(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> tuple[go.Figure, go.Figure]:
    """Return the two trade-off scatter figures for the given filters.

    The first comes from ``build_tradeoff_scatter``, the second from
    ``build_fog_nli_scatter``; both receive the same four filter values. A
    builder that yields a falsy result (e.g. ``None`` when there is nothing to
    plot) is replaced by a blank ``go.Figure()`` so callers always get figures.
    """
    filters = (text_category, prompt, size_limit, model_type)
    primary_fig = build_tradeoff_scatter(*filters) or go.Figure()
    fog_nli_fig = build_fog_nli_scatter(*filters) or go.Figure()
    return primary_fig, fog_nli_fig
def build_app() -> gr.Blocks:
    """Assemble the PLainBench Gradio interface and return the ``gr.Blocks`` app.

    Every table and figure is first computed for the unfiltered view. If the
    first frame returned by ``load_leaderboard_data`` is empty, only the intro
    and a "no data" hint are rendered. Otherwise the page is laid out top to
    bottom as: the filter bar (four dropdowns), the result tabs, the metric
    documentation, and one collapsed accordion per simplification prompt.

    Changing any of the four dropdowns calls ``_refresh_rrf``, which returns
    new values for every reactive component in exactly the order those
    components were collected while building the layout.
    """
    # Only the first and the last frame of the loader's 8-tuple are used here.
    (
        read_orth_df, _read_lemma_df,
        _lex_orth_df, _lex_lemma_df,
        _similarity_df, _questeval_df,
        _markers_df, detail_df,
    ) = load_leaderboard_data()
    ifeval_cmp_df = load_ifeval_comparison_df()
    final_df, category_data = load_rrf_views(None, None)
    tc_choices = text_category_choices()
    pr_choices = prompt_choices()
    size_choices = _visible_size_limits()
    tradeoff_fig, fog_nli_fig = _tradeoff_figs(None, None)

    # Long explanatory copy, kept out of the layout code below.
    final_ranking_note = (
        "Final model ranking via **Reciprocal Rank Fusion** (k=60) over per-category RRF scores. "
        "Each category ranks models by its own RRF score; those ranks are then fused into a "
        "single **Final RRF** score. Higher = better overall simplification. "
        "The **PLCC** column shows the model's score on the external "
        "[PLCC](https://huggingface.co/spaces/sdadas/plcc) Polish-language-competence "
        "benchmark for reference only - it does not affect the ranking (blank where unavailable)."
    )
    ifeval_note = (
        "**Automatic** IFEval constraints are generated by an LLM; "
        "**manual** constraints are hand-written gold rules, available for a "
        "subset of the prompts. To isolate rule quality from sampling, the "
        "comparison is restricted to the texts that carry **both** scores "
        "(N = matched texts per model), so these automatic figures differ from "
        "the full-sample IFEval used elsewhere.\n\n"
        "**include** = fraction of *include* constraints satisfied, "
        "**exclude** = fraction of *exclude* constraints satisfied (higher is "
        "better for both). **Δ = manual − automatic** (on the matched texts): a "
        "negative Δ means the automatic rules were easier to satisfy than the "
        "hand-checked ones (more lenient automatic scoring). The **(all)** columns "
        "show automatic IFEval over *every* text (the full-sample figure used "
        "elsewhere). **Δ (man−auto all)** is manual minus that full-sample "
        "automatic value - useful as a sanity check, but note the two cover "
        "different text sets (matched subset vs. all texts), so **Δ (man−auto)** "
        "is the rigorous like-for-like comparison."
    )
    prompts_note = (
        "## Simplification prompts\n\n"
        "The five prompt templates every model is run with - these are the "
        "values of the **Simplification prompt** filter above. Each source "
        "text is simplified once per prompt, so they range from a bare "
        "one-line instruction to full plain-language guidelines. "
        "`<text>` marks where the source text is inserted."
    )

    with gr.Blocks(title="PLainBench", css=PLCC_CSS) as app:
        gr.Markdown(INTRO)

        # Nothing to show: leave just the intro and a hint, with no filters or tabs.
        if read_orth_df.empty:
            gr.Markdown("*No data found. Upload scored anon JSON files to the `data/current/` directory.*")
            return app

        # Components refreshed by the filter handler, collected in the order
        # the handler returns their values: final table, one table per in-RRF
        # category, the two trade-off plots, then (if present) the IFEval table.
        reactive_components: list = []

        with gr.Row(elem_classes=["filter-bar"]):
            category_filter = gr.Dropdown(
                choices=tc_choices,
                value="All",
                label="Text category",
                info="Filter the RRF rankings to one source-text category.",
            )
            prompt_filter = gr.Dropdown(
                choices=pr_choices,
                value="All",
                label="Simplification prompt",
                info="Filter the RRF rankings to one simplification prompt.",
            )
            size_filter = gr.Dropdown(
                choices=size_choices,
                value="ALL",
                label="Size limit",
                info="Keep only models up to this many parameters.",
            )
            type_filter = gr.Dropdown(
                choices=MODEL_TYPES,
                value="ALL",
                label="Model type",
                info="Filter by open- vs closed-weights models.",
            )

        with gr.Tabs():
            # ── Final Ranking ──────────────────────────────────────────────
            with gr.TabItem("Final Ranking"):
                gr.Markdown(final_ranking_note)
                final_table = gr.Dataframe(
                    value=final_df,
                    interactive=False,
                    wrap=True,
                    elem_classes=["plain-table", "params-col"],
                )
                gr.Markdown(N_NOTE)
            reactive_components.append(final_table)

            # ── RRF category tabs (categories outside the RRF get no tab) ──
            for category, category_df in category_data:
                if category.get("in_rrf", True):
                    with gr.TabItem(category["name"]):
                        gr.Markdown(category["description"])
                        category_table = gr.Dataframe(
                            value=category_df,
                            interactive=False,
                            wrap=True,
                            elem_classes=["plain-table", "params-col"],
                        )
                        gr.Markdown(N_NOTE)
                    reactive_components.append(category_table)

            # ── Trade-off plots ────────────────────────────────────────────
            with gr.TabItem("Trade-off"):
                gr.Markdown(
                    "Complexity reduction (Gunning Fog orth Δ%) versus meaning preservation "
                    "(QuestEval F1), one point per model. Top-left is ideal: "
                    "greater complexity reduction **and** faithful to the original."
                )
                tradeoff_plot = gr.Plot(value=tradeoff_fig)
                gr.Markdown(
                    "---\n"
                    "Gunning Fog orth reduction (Δ%) versus NLI F1. "
                    "Top-left is best: greater complexity reduction **and** strong NLI entailment."
                )
                fog_nli_plot = gr.Plot(value=fog_nli_fig)
            reactive_components.extend([tradeoff_plot, fog_nli_plot])

            # Static tab, created hidden; it is not wired to the filters.
            with gr.TabItem("Detailed scores", visible=False):
                gr.Markdown(
                    "Average scores before and after simplification, plus absolute (Δ) "
                    "and percentage (Δ%) change - for all readability, lexical, and marker metrics."
                )
                gr.Dataframe(
                    value=detail_df,
                    interactive=False,
                    wrap=True,
                    elem_classes=["plain-table"],
                )

            # ── IFEval: manual vs automatic ────────────────────────────────
            if not ifeval_cmp_df.empty:
                with gr.TabItem("IFEval manual vs auto"):
                    gr.Markdown(ifeval_note)
                    ifeval_cmp_table = gr.Dataframe(
                        value=ifeval_cmp_df,
                        interactive=False,
                        wrap=True,
                        elem_classes=["plain-table"],
                    )
                reactive_components.append(ifeval_cmp_table)

        # Metric documentation, shown below the results.
        gr.Markdown(METRICS_DOC)

        # Simplification prompts, documenting the "Simplification prompt"
        # filter values — shown below the metric documentation.
        gr.Markdown(prompts_note)
        for prompt_key, (prompt_summary, prompt_template) in PROMPTS.items():
            with gr.Accordion(f"{prompt_key} - {prompt_summary}", open=False):
                gr.Markdown(f"```\n{prompt_template}\n```")

        filter_inputs = [category_filter, prompt_filter, size_filter, type_filter]

        def _refresh_rrf(
            text_category: str, prompt: str, size_limit: str, model_type: str
        ) -> list:
            """Recompute every reactive value for the current filter selection."""
            final_view, category_views = load_rrf_views(
                text_category, prompt, size_limit, model_type
            )
            fresh_values: list = [final_view]
            # Same in-RRF test as the tab loop above, so values line up with tables.
            fresh_values.extend(
                view for category, view in category_views if category.get("in_rrf", True)
            )
            fresh_values.extend(_tradeoff_figs(text_category, prompt, size_limit, model_type))
            # The IFEval table exists only if the unfiltered comparison had rows.
            if not ifeval_cmp_df.empty:
                fresh_values.append(
                    load_ifeval_comparison_df(text_category, prompt, size_limit, model_type)
                )
            return fresh_values

        # Recompute the RRF rankings whenever any filter changes.
        for dropdown in filter_inputs:
            dropdown.change(_refresh_rrf, inputs=filter_inputs, outputs=reactive_components)

    return app
# Build the interface at import time so the module exposes a ready-made ``app``.
app = build_app()
# Start the Gradio server only when this file is run directly as a script.
if __name__ == "__main__":
    app.launch()