Spaces:

QuanticaLab
/

PLainBench

Running

App Files Files Community

PLainBench / app.py

bartlomiejn87

Initial commit

3bd48fe 8 days ago

Raw

History Blame Contribute Delete

60.8 kB

	"""PLainBench - Polish Text Simplification Leaderboard.

	Reads scored anon JSON files from the data/current/ directory and displays a
	leaderboard showing how well each LLM simplifies Polish texts, measured
	by readability indices, difficulty markers, reference-based similarity
	metrics, and a QuestEval-style QA consistency score.
	"""

	import json
	from functools import lru_cache
	from pathlib import Path

	import gradio as gr
	import pandas as pd
	import plotly.graph_objects as go

	# Directory scanned for ``*_scored_anon.json`` result files. It is resolved
	# relative to this module's own location rather than the working directory.
	DATA_DIR = Path(__file__).parent / "data" / "current"


	@lru_cache(maxsize=1)
	def load_records() -> tuple[dict, ...]:
	    """Read every scored anon JSON file once and memoise a slimmed-down copy.

	    The files are large (~9 MB each, because they hold per-text records), yet
	    the app only consumes the ``metadata`` and ``summary`` sections. Only those
	    two keys are retained, and ``lru_cache`` makes every later call reuse the
	    in-memory tuple instead of hitting the disk again.

	    Returns:
	        One ``{"metadata": ..., "summary": ...}`` dict per file, ordered by
	        sorted file path; an empty tuple when ``DATA_DIR`` does not exist.

	    Raises:
	        KeyError: if a file lacks a ``metadata`` or ``summary`` section.
	    """
	    if not DATA_DIR.exists():
	        return ()
	    slim: list[dict] = []
	    for path in sorted(DATA_DIR.glob("*_scored_anon.json")):
	        with path.open(encoding="utf-8") as handle:
	            payload = json.load(handle)
	        # Drop everything except the two sections the UI actually reads.
	        slim.append({"metadata": payload["metadata"], "summary": payload["summary"]})
	    return tuple(slim)


	# Model-level filters, mirroring the PLCC benchmark's "Size limit" / "Model type"
	# quick-filters. Size options are upper bounds in billions of parameters.
	# Each numeric option is parsed by stripping the trailing "B" and converting
	# the rest to float, so new entries must keep the "<number>B" spelling.
	# The list order (descending, "ALL" first) is the display order.
	SIZE_LIMITS = ["ALL", "500B", "100B", "50B", "30B", "20B", "15B", "10B", "5B"]
	# "open-weights" / "closed-weights" are matched against the metadata
	# ``weights`` values "open" / "closed"; "ALL" disables the filter.
	MODEL_TYPES = ["ALL", "open-weights", "closed-weights"]


	def _model_passes(meta: dict, size_limit: str, model_type: str) -> bool:
	"""Whether a model's metadata satisfies the size-limit / model-type filters."""
	if model_type and model_type != "ALL":
	want = "open" if model_type == "open-weights" else "closed"
	if meta.get("weights") != want:
	return False
	if size_limit and size_limit != "ALL":
	cap = float(size_limit.rstrip("B"))
	params = meta.get("total_params_b") or 0
	# Unknown / unreported size (0) can't be placed under a cap, so exclude it.
	if params <= 0 or params > cap:
	return False
	return True


	def _filtered_records(
	    size_limit: str | None = None, model_type: str | None = None
	) -> list[dict]:
	    """Return the cached records whose model passes both quick-filters.

	    A falsy ``size_limit`` / ``model_type`` is treated as ``"ALL"``.
	    """
	    effective_size = size_limit if size_limit else "ALL"
	    effective_type = model_type if model_type else "ALL"
	    kept: list[dict] = []
	    for record in load_records():
	        if _model_passes(record["metadata"], effective_size, effective_type):
	            kept.append(record)
	    return kept


	def _visible_size_limits() -> list[str]:
	    """Reduce ``SIZE_LIMITS`` to the caps that actually partition the models.

	    Walking the numeric caps from smallest to largest, a cap is kept only when
	    it admits at least one model and admits a different number of models than
	    the cap before it. Caps below every model, and larger caps that select the
	    same models as a smaller one, are therefore hidden. ``"ALL"`` always stays.
	    The result is derived from the loaded data on each call.

	    Returns:
	        ``"ALL"`` followed by the surviving caps in ``SIZE_LIMITS`` order.
	    """
	    known_sizes = []
	    for record in load_records():
	        size_b = record["metadata"].get("total_params_b") or 0
	        if size_b > 0:
	            known_sizes.append(size_b)

	    numeric_caps = [(float(opt.rstrip("B")), opt) for opt in SIZE_LIMITS if opt != "ALL"]
	    numeric_caps.sort(key=lambda pair: pair[0])

	    useful: set[str] = set()
	    last_count = -1
	    for ceiling, opt in numeric_caps:
	        covered = sum(1 for size_b in known_sizes if size_b <= ceiling)
	        # Same count as the previous (smaller) cap means the same model set.
	        if covered > 0 and covered != last_count:
	            useful.add(opt)
	        last_count = covered

	    # Re-walk SIZE_LIMITS so the original descending display order is kept.
	    return ["ALL", *(opt for opt in SIZE_LIMITS if opt != "ALL" and opt in useful)]


	# Summary-metric key -> display label for the readability indices computed on
	# surface (orth) word forms.
	READABILITY_ORTH_LABELS = {
	"flesch_reading_ease_orth": "Flesch RE",
	"flesch_kincaid_grade_orth": "Flesch-Kincaid",
	"gunning_fog_orth": "Gunning Fog",
	"ari_orth": "ARI",
	"linsear_write_orth": "Linsear Write",
	"smog_grade_orth": "SMOG",
	"coleman_liau_orth": "Coleman-Liau",
	"pisarek_orth": "Pisarek",
	}

	# Same indices keyed by their ``_lemma`` variants. The display labels are
	# identical to the orth map above, so columns built from label alone collide
	# if both maps are written into the same row.
	READABILITY_LEMMA_LABELS = {
	"flesch_reading_ease_lemma": "Flesch RE",
	"flesch_kincaid_grade_lemma": "Flesch-Kincaid",
	"gunning_fog_lemma": "Gunning Fog",
	"ari_lemma": "ARI",
	"linsear_write_lemma": "Linsear Write",
	"smog_grade_lemma": "SMOG",
	"coleman_liau_lemma": "Coleman-Liau",
	"pisarek_lemma": "Pisarek",
	}

	# Summary-metric key -> display label for the lexical-diversity measures on
	# surface (orth) forms.
	LEXICAL_ORTH_LABELS = {
	"ttr_orth": "TTR",
	"rttr_orth": "RTTR",
	"cttr_orth": "CTTR",
	"herdan_orth": "Herdan",
	"summer_orth": "Summer",
	"dugast_orth": "Dugast",
	"maas_orth": "Maas",
	"mtld_orth": "MTLD",
	"mattr_orth": "MATTR",
	}

	# ``_lemma`` variants of the same measures; labels match the orth map, so the
	# two maps produce identical column names.
	LEXICAL_LEMMA_LABELS = {
	"ttr_lemma": "TTR",
	"rttr_lemma": "RTTR",
	"cttr_lemma": "CTTR",
	"herdan_lemma": "Herdan",
	"summer_lemma": "Summer",
	"dugast_lemma": "Dugast",
	"maas_lemma": "Maas",
	"mtld_lemma": "MTLD",
	"mattr_lemma": "MATTR",
	}

	# Key in the summary's ``similarity`` section -> display label. These values
	# are read directly (no before/after/Δ breakdown).
	SIMILARITY_LABELS = {
	"bert_score_precision": "BERTScore P",
	"bert_score_recall": "BERTScore R",
	"bert_score_f1": "BERTScore F1",
	"bleu": "BLEU",
	"chrf": "chrF",
	"chrfpp": "chrF++",
	"nli_precision": "NLI P",
	"nli_recall": "NLI R",
	"nli_f1": "NLI F1",
	"rouge_1_precision": "ROUGE-1 P",
	"rouge_1_recall": "ROUGE-1 R",
	"rouge_1_f1": "ROUGE-1 F1",
	"rouge_2_precision": "ROUGE-2 P",
	"rouge_2_recall": "ROUGE-2 R",
	"rouge_2_f1": "ROUGE-2 F1",
	"rouge_l_precision": "ROUGE-L P",
	"rouge_l_recall": "ROUGE-L R",
	"rouge_l_f1": "ROUGE-L F1",
	"wer": "WER",
	"mer": "MER",
	"wil": "WIL",
	"ne_retention": "NE Retention",
	}

	# Key in the summary's ``markers`` section -> display label. Each marker is
	# read as a dict with ``avg_before`` / ``avg_after`` / ``avg_diff`` /
	# ``avg_diff_pct`` entries by the leaderboard loader.
	MARKER_LABELS = {
	# counts
	"paragraph_count": "Paragraph count",
	"sentence_count": "Sentence count",
	"word_count": "Word count",
	"named_entity_count": "Named entity count",
	"difficult_word_count": "Difficult word count",
	"difficult_word_count_orth": "Difficult word count (orth)",
	# average lengths
	"avg_word_syllables": "Avg word syllables",
	"avg_sentence_length": "Avg sentence length",
	"avg_paragraph_length": "Avg paragraph length",
	# lexical difficulty
	"named_entity_ratio": "Named entity ratio",
	"difficult_word_ratio": "Difficult word ratio",
	"difficult_word_ratio_orth": "Difficult word ratio (orth)",
	# POS ratios
	"noun_ratio": "Noun ratio",
	"difficult_noun_ratio": "Difficult noun ratio",
	"difficult_noun_ratio_orth": "Difficult noun ratio (orth)",
	"verb_ratio": "Verb ratio",
	"difficult_verb_ratio": "Difficult verb ratio",
	"difficult_verb_ratio_orth": "Difficult verb ratio (orth)",
	"adjective_ratio": "Adjective ratio",
	"difficult_adjective_ratio": "Difficult adjective ratio",
	"difficult_adjective_ratio_orth": "Difficult adjective ratio (orth)",
	# POS-to-POS ratios
	"noun_to_verb_ratio": "Noun/verb ratio",
	"verbo_nominal_ratio": "Verbo-nominal ratio",
	"adj_to_verb_ratio": "Adj/verb ratio",
	"adj_to_noun_ratio": "Adj/noun ratio",
	# morphological
	"nie_prefix_ratio": "Nie-prefix ratio",
	"participle_ratio": "Participle ratio",
	"gerund_ratio": "Gerund ratio",
	"osc_noun_ratio": "OSC noun ratio",
	"impersonal_verb_ratio": "Impersonal verb ratio",
	"genitive_noun_ratio": "Genitive noun ratio",
	"avg_genitive_chain_length": "Avg genitive chain",
	# syntactic
	"sentence_length_variance": "Sentence length variance",
	"mean_dependency_distance": "Mean dep. distance",
	"subordination_index": "Subordination index",
	}

	# Key in the summary's ``questeval`` section -> display label (values are read
	# directly).
	QUESTEVAL_LABELS = {
	"precision": "QuestEval P",
	"recall": "QuestEval R",
	"f1": "QuestEval F1",
	"answerable_rate_forward": "Answerable (fwd)",
	"answerable_rate_backward": "Answerable (bwd)",
	}

	# Constant k in the reciprocal-rank-fusion term ``weight / (k + rank)`` used
	# by both the per-category and the final ranking.
	RRF_K = 60

	# Category definitions driving the per-category tables and the final ranking.
	# Each category dict carries:
	#   name        — table title; also the column name in the final ranking
	#   in_rrf      — whether the category takes part in the final RRF fusion
	#   rrf_weight  — weight of the category's term in the final RRF sum
	#   description — explanatory text for the category
	#   metrics     — list of metric entries, described below
	#
	# Each metric entry: (source, key, label, ascending_rrf, in_rrf)
	#   source        — "metrics" | "markers" → use avg_diff_pct (Δ%)
	#                   "similarity" | "questeval" | "ifeval" → use the value directly
	#   ascending_rrf — True = lower value is better (rank 1 = smallest)
	#   in_rrf        — include this metric in category RRF computation

	CATEGORIES: list[dict] = [
	{
	"name": "Readability",
	"in_rrf": True,
	"rrf_weight": 1,
	"description": (
	"Readability indices - orth (surface-form) variants. "
	"Δ% = percentage change after simplification. "
	"For Flesch RE positive Δ% is better; for all others negative Δ% is better. "
	"Flesch RE rates reading ease from average sentence length and syllables per word (higher → easier, ~0–100). "
	"Gunning Fog estimates the schooling years needed to follow the text - 0.4 × (words/sentence + 100 × complex-word share), "
	"where complex words have many syllables (lower → easier). "
	"Coleman-Liau grades difficulty from average letters per 100 words and sentences per 100 words (lower → easier). "
	"IFEval exclude is the fraction of 'exclude' constraints satisfied by the simplified text (higher is better)."
	),
	"metrics": [
	("metrics", "flesch_reading_ease_orth", "Flesch RE", False, True),
	("metrics", "gunning_fog_orth", "Gunning Fog", True, True),
	("metrics", "coleman_liau_orth", "Coleman-Liau", True, True),
	("ifeval", "avg_exclude", "IFEval exclude", False, True),
	],
	},
	{
	"name": "Lexical Difficulty",
	"in_rrf": True,
	"rrf_weight": 1,
	"description": (
	"Word-level difficulty markers - orth variants where available. "
	"Δ% = percentage change. Negative Δ% indicates reduced lexical difficulty. "
	"Avg word syllables is the mean number of syllables per word (higher → longer, harder vocabulary). "
	"Difficult word ratio is the share of long words (above a syllable threshold on the surface form, excluding named entities) "
	"(higher → harder). "
	"Verb ratio is verbs divided by alphabetic tokens; a more verbal, less nominal style is easier, so higher is better here. "
	"Difficult noun ratio is the share of long nouns (above the syllable threshold, excluding named entities) among all words "
	"(higher → more complex nominal vocabulary)."
	),
	"metrics": [
	("markers", "avg_word_syllables", "Avg word syllables", True, True),
	("markers", "difficult_word_ratio_orth", "Difficult word ratio", True, True),
	("markers", "verb_ratio", "Verb ratio", False, True),
	("markers", "difficult_noun_ratio_orth", "Difficult noun ratio", True, True),
	],
	},
	{
	"name": "Syntactic",
	"in_rrf": True,
	"rrf_weight": 1,
	"description": (
	"Sentence and clause structure complexity markers. "
	"Δ% = percentage change. Negative Δ% generally indicates simpler syntax. "
	"Avg sentence length is the mean number of words per sentence (higher → harder). "
	"Mean dep. distance is the average linear distance between a token and its syntactic head, in tokens (lower → simpler structure). "
	"Subordination index is the ratio of subordinate clauses to total clauses (higher → more embedded structure, harder)."
	),
	"metrics": [
	("markers", "avg_sentence_length", "Avg sentence length", True, True),
	("markers", "sentence_length_variance", "Sentence length var.", True, False),
	("markers", "mean_dependency_distance", "Mean dep. distance", True, True),
	("markers", "subordination_index", "Subordination index", True, True),
	],
	},
	{
	"name": "Morphological",
	"in_rrf": True,
	"rrf_weight": 1,
	"description": (
	"Polish-specific morphological complexity markers. "
	"Δ% = percentage change. Negative Δ% indicates reduced morphological complexity. "
	"Adverbial participle ratio is the share of adverbial participles (imiesłowy przysłówkowe / converbs, e.g. czytając, przeczytawszy) "
	"among alphabetic tokens - a bookish, formal construction (higher → more complex). "
	"Gerund ratio is the share of Polish verbal nouns (rzeczowniki odsłowne, e.g. czytanie) among words "
	"(higher → more nominalised, formal). "
	"Impersonal verb ratio is the share of impersonal verb forms among all verbs - impersonal modals (należy, trzeba, można), "
	"passive -no/-to forms (zrobiono), reflexive-impersonal się (mówi się) and infinitives - typical of legal/administrative Polish "
	"(higher → more impersonal, harder). "
	"Genitive noun ratio is the share of genitive-case nouns among words - a hallmark of formal, bureaucratic Polish "
	"(higher → harder). "
	"Avg genitive chain is the mean length of runs of two or more consecutive genitive nouns (dopełniacz spiętrzony) "
	"(higher → more genitive stacking, harder). "
	"Verbo-nominal ratio is the share of light-verb + noun periphrases (dokonać wpłaty, podjąć decyzję) - a hallmark of "
	"administrative Polish (higher → harder). "
	"OSC noun ratio is the share of abstract -ość nouns (możliwość, konieczność) among nouns (higher → more abstract, harder)."
	),
	"metrics": [
	("markers", "participle_ratio", "Participle ratio", True, False),
	("markers", "adverbial_participle_ratio", "Adverbial participle ratio", True, True),
	("markers", "gerund_ratio", "Gerund ratio", True, True),
	("markers", "impersonal_verb_ratio", "Impersonal verb ratio", True, True),
	("markers", "genitive_noun_ratio", "Genitive noun ratio", True, True),
	("markers", "avg_genitive_chain_length", "Avg genitive chain", True, True),
	("markers", "verbo_nominal_ratio", "Verbo-nominal ratio", True, True),
	("markers", "osc_noun_ratio", "OSC noun ratio", True, True),
	],
	},
	{
	"name": "Meaning Preservation",
	"in_rrf": True,
	"rrf_weight": 4,
	"description": (
	"Semantic metrics that directly test whether the simplified text says the same thing as the original. "
	"NLI checks bidirectional entailment; QuestEval checks information preservation via QA. "
	"NE Retention measures what fraction of named entities from the original appear in the simplified text "
	"(matched on lemmatised tokens and surface forms, so inflected forms and full-name → surname shortenings are handled). "
	"IFEval include is the fraction of 'include' constraints satisfied by the simplified text. "
	"Higher is better for all."
	),
	"metrics": [
	("similarity", "nli_f1", "NLI F1", False, True),
	("questeval", "f1", "QuestEval F1", False, True),
	("similarity", "ne_retention", "NE Retention", False, True),
	("ifeval", "avg_include", "IFEval include", False, True),
	],
	},
	]


	def _col_name(source: str, label: str) -> str:
	"""Column name used in category DataFrames."""
	if source in ("metrics", "markers"):
	return f"{label} (Δ%)"
	return label


	def _model_label(data: dict) -> str:
	"""Return a unique display name, appending reasoning effort when present.

	The parameter size is shown separately (see :func:`_params_str`), in its
	own column, mirroring the PLCC leaderboard layout.
	"""
	model = data["metadata"]["model"]
	effort = (
	data["metadata"]
	.get("model_kwargs", {})
	.get("extra_body", {})
	.get("reasoning", {})
	.get("effort")
	)
	if effort is not None:
	return f"{model} [reasoning: {effort}]"
	return model


	def _params_str(params: float \| None) -> str \| None:
	"""PLCC-style parameter label: ``1.6T`` / ``685B`` / ``8B`` (None if unknown)."""
	p = params or 0
	if p <= 0:
	return None
	return f"{p / 1000:g}T" if p >= 1000 else f"{p:g}B"


	def _params_map() -> dict[str, str]:
	    """Map each model label to its formatted parameter size.

	    Sizes come from ``metadata.total_params_b`` of every loaded record; models
	    whose size is unknown (no formatted label) are left out of the mapping.
	    """
	    return {
	        _model_label(record): size_label
	        for record in load_records()
	        if (size_label := _params_str(record["metadata"].get("total_params_b")))
	    }


	def _metric_row(
	label_map: dict,
	summary_metrics: dict,
	row: dict,
	detail_row: dict,
	*,
	include_detail: bool = True,
	) -> None:
	"""Populate leaderboard row and detail row from a label→key map."""
	for key, label in label_map.items():
	vals = summary_metrics.get(key, {})
	row[f"{label} (Δ)"] = vals.get("avg_diff")
	row[f"{label} (Δ%)"] = vals.get("avg_diff_pct")
	if include_detail:
	detail_row[f"{label} before"] = vals.get("avg_before")
	detail_row[f"{label} after"] = vals.get("avg_after")
	detail_row[f"{label} (Δ)"] = vals.get("avg_diff")
	detail_row[f"{label} (Δ%)"] = vals.get("avg_diff_pct")


	def load_leaderboard_data() -> tuple[pd.DataFrame, ...]:
	    """Build the eight leaderboard DataFrames from the cached records.

	    Returns:
	        (readability_orth_df, readability_lemma_df,
	        lexical_orth_df, lexical_lemma_df,
	        similarity_df, questeval_df, markers_df, detail_df), each with one row
	        per model and numeric columns rounded to 4 decimals. When ``DATA_DIR``
	        is missing, the same empty DataFrame is returned in all eight slots.

	    NOTE(review): the orth and lemma label maps use identical display labels,
	    and detail-row keys are derived from the label only, so the lemma values
	    (written second) overwrite the orth values in ``detail_df``. Behaviour is
	    kept as-is here; confirm whether that is intended.
	    """
	    if not DATA_DIR.exists():
	        placeholder = pd.DataFrame()
	        return (placeholder,) * 8

	    # (label map, accumulated rows) for the four Δ-based metric tables, in
	    # return order.
	    delta_tables: list[tuple[dict, list[dict]]] = [
	        (READABILITY_ORTH_LABELS, []),
	        (READABILITY_LEMMA_LABELS, []),
	        (LEXICAL_ORTH_LABELS, []),
	        (LEXICAL_LEMMA_LABELS, []),
	    ]
	    similarity_rows: list[dict] = []
	    questeval_rows: list[dict] = []
	    markers_rows: list[dict] = []
	    detail_rows: list[dict] = []

	    for record in load_records():
	        summary = record["summary"]
	        identity = {"Model": _model_label(record), "N": summary["n"]}
	        metric_summary = summary["metrics"]
	        similarity = summary.get("similarity", {})
	        questeval = summary.get("questeval", {})
	        markers = summary.get("markers", {})

	        detail_row = dict(identity)
	        for label_map, table_rows in delta_tables:
	            table_row = dict(identity)
	            _metric_row(label_map, metric_summary, table_row, detail_row)
	            table_rows.append(table_row)

	        similarity_row = dict(identity)
	        similarity_row.update(
	            {label: similarity.get(key) for key, label in SIMILARITY_LABELS.items()}
	        )
	        questeval_row = dict(identity)
	        questeval_row.update(
	            {label: questeval.get(key) for key, label in QUESTEVAL_LABELS.items()}
	        )

	        # Markers share the before/after/Δ/Δ% layout of the metric tables.
	        markers_row = dict(identity)
	        _metric_row(MARKER_LABELS, markers, markers_row, detail_row)

	        similarity_rows.append(similarity_row)
	        questeval_rows.append(questeval_row)
	        markers_rows.append(markers_row)
	        detail_rows.append(detail_row)

	    frames = [pd.DataFrame(table_rows) for _, table_rows in delta_tables]
	    frames.extend(
	        pd.DataFrame(rows)
	        for rows in (similarity_rows, questeval_rows, markers_rows, detail_rows)
	    )
	    for frame in frames:
	        numeric = frame.select_dtypes(include="number").columns
	        frame[numeric] = frame[numeric].round(4)

	    return tuple(frames)


	@lru_cache(maxsize=1)
	def _load_ifeval_records() -> tuple[tuple[str, tuple[tuple, ...]], ...]:
	    """Collect, per model, the texts scored by both automatic and manual IFEval.

	    Manual IFEval rules are hand-written for a subset of the prompts, so a
	    comparison is only meaningful on records that carry both scores. This
	    re-reads the per-text ``results`` arrays (which ``load_records`` drops)
	    and caches the outcome so the dropdown filters can re-aggregate cheaply.

	    Returns:
	        ``(model_label, records)`` pairs, where each record is
	        ``(category, prompt_id, auto_include, auto_exclude, man_include,
	        man_exclude)``. Models without any matched record are omitted; an
	        empty tuple is returned when ``DATA_DIR`` does not exist.
	    """
	    if not DATA_DIR.exists():
	        return ()

	    per_model: list[tuple[str, tuple[tuple, ...]]] = []
	    for path in sorted(DATA_DIR.glob("*_scored_anon.json")):
	        with path.open(encoding="utf-8") as handle:
	            payload = json.load(handle)
	        label = _model_label(payload)

	        matched: list[tuple] = []
	        for item in payload["results"]:
	            manual = item.get("ifeval_manual")
	            automatic = item.get("ifeval")
	            # Keep only texts scored both ways.
	            if manual and automatic:
	                matched.append(
	                    (
	                        item.get("category"),
	                        item.get("prompt_id"),
	                        automatic.get("include"),
	                        automatic.get("exclude"),
	                        manual.get("include"),
	                        manual.get("exclude"),
	                    )
	                )
	        if matched:
	            per_model.append((label, tuple(matched)))
	    return tuple(per_model)


	def load_ifeval_comparison_df(
	    text_category: str | None = None,
	    prompt: str | None = None,
	    size_limit: str | None = None,
	    model_type: str | None = None,
	) -> pd.DataFrame:
	    """Contrast manual (gold) IFEval with automatic IFEval, one row per model.

	    Only texts that carry both an automatic and a manual score are averaged,
	    which isolates the rule-quality gap from sampling differences (the overall
	    ``ifeval`` summary averages over ~5× more texts and is not directly
	    comparable). The ``(all)`` columns come from that overall summary bucket
	    under the same category / prompt filters. ``Δ`` columns are
	    manual − automatic: a negative value means the automatic constraints were
	    easier to satisfy than the hand-checked ones.

	    Args:
	        text_category: Source-text category; ``None`` / ``"All"`` disables it.
	        prompt: Simplification prompt id; ``None`` / ``"All"`` disables it.
	        size_limit: Size cap forwarded to the model filter.
	        model_type: Model-type option forwarded to the model filter.

	    Returns:
	        A DataFrame sorted by model with numeric columns rounded to 4
	        decimals, or an empty DataFrame when nothing matches.
	    """
	    category_filter = text_category if text_category not in (None, "All") else None
	    prompt_filter = prompt if prompt not in (None, "All") else None

	    # Models admitted by the size / model-type filters, and their summaries
	    # (used for the "all records" automatic IFEval figures).
	    allowed = {_model_label(rec) for rec in _filtered_records(size_limit, model_type)}
	    summaries: dict[str, dict] = {}
	    for rec in load_records():
	        label = _model_label(rec)
	        if label in allowed:
	            summaries[label] = rec["summary"]

	    table: list[dict] = []
	    for model, matched in _load_ifeval_records():
	        if model not in allowed:
	            continue

	        auto_inc_sum = man_inc_sum = auto_exc_sum = man_exc_sum = 0.0
	        n_inc = n_exc = 0
	        for cat, prompt_id, a_inc, a_exc, m_inc, m_exc in matched:
	            wrong_category = bool(category_filter) and cat != category_filter
	            wrong_prompt = bool(prompt_filter) and prompt_id != prompt_filter
	            if wrong_category or wrong_prompt:
	                continue
	            if m_inc is not None and a_inc is not None:
	                auto_inc_sum += a_inc
	                man_inc_sum += m_inc
	                n_inc += 1
	            if m_exc is not None and a_exc is not None:
	                auto_exc_sum += a_exc
	                man_exc_sum += m_exc
	                n_exc += 1
	        if not (n_inc or n_exc):
	            continue

	        if n_inc:
	            auto_inc = auto_inc_sum / n_inc
	            man_inc = man_inc_sum / n_inc
	            delta_inc = man_inc - auto_inc
	        else:
	            auto_inc = man_inc = delta_inc = None
	        if n_exc:
	            auto_exc = auto_exc_sum / n_exc
	            man_exc = man_exc_sum / n_exc
	            delta_exc = man_exc - auto_exc
	        else:
	            auto_exc = man_exc = delta_exc = None

	        overall_auto = _source_bucket(
	            summaries.get(model, {}), "ifeval", category_filter, prompt_filter
	        )
	        all_inc = overall_auto.get("avg_include")
	        all_exc = overall_auto.get("avg_exclude")
	        delta_inc_all = (man_inc - all_inc) if (n_inc and all_inc is not None) else None
	        delta_exc_all = (man_exc - all_exc) if (n_exc and all_exc is not None) else None

	        table.append(
	            {
	                "Model": model,
	                "N": n_inc or n_exc,
	                "Manual include": man_inc,
	                "Manual exclude": man_exc,
	                "Auto include": auto_inc,
	                "Auto include (all)": all_inc,
	                "Δ include (man−auto)": delta_inc,
	                "Δ include (man−auto all)": delta_inc_all,
	                "Auto exclude": auto_exc,
	                "Auto exclude (all)": all_exc,
	                "Δ exclude (man−auto)": delta_exc,
	                "Δ exclude (man−auto all)": delta_exc_all,
	            }
	        )

	    frame = pd.DataFrame(table)
	    if frame.empty:
	        return frame
	    frame = frame.sort_values("Model").reset_index(drop=True)
	    numeric = frame.select_dtypes(include="number").columns
	    frame[numeric] = frame[numeric].round(4)
	    return frame


	def text_category_choices() -> list[str]:
	    """List the dropdown options for source-text category.

	    Returns ``"All"`` followed by the sorted union of the keys found under
	    ``summary.metrics_by_category`` across all loaded records.
	    """
	    seen = {
	        name
	        for record in load_records()
	        for name in record["summary"].get("metrics_by_category", {}).keys()
	    }
	    return ["All", *sorted(seen)]


	def prompt_choices() -> list[str]:
	    """List the dropdown options for simplification prompt.

	    Returns ``"All"`` followed by the sorted union of the keys found under
	    ``summary.metrics_by_prompt`` across all loaded records.
	    """
	    seen = {
	        name
	        for record in load_records()
	        for name in record["summary"].get("metrics_by_prompt", {}).keys()
	    }
	    return ["All", *sorted(seen)]


	def _source_bucket(s: dict, source: str, tc: str \| None, prompt: str \| None) -> dict:
	"""Return the metric bucket for one source, filtered by text category and/or prompt.

	Picks the overall summary when neither filter is set, the ``*_by_category`` /
	``_by_prompt`` bucket when one is set, and the ``_by_category_prompt`` bucket
	(keyed ``"CATEGORY/PROMPT"``) when both are set.
	"""
	if source in ("metrics", "markers", "similarity"):
	if tc and prompt:
	return s.get(f"{source}_by_category_prompt", {}).get(f"{tc}/{prompt}", {})
	if tc:
	return s.get(f"{source}_by_category", {}).get(tc, {})
	if prompt:
	return s.get(f"{source}_by_prompt", {}).get(prompt, {})
	return s.get(source, {})
	# questeval / ifeval keep their per-filter buckets nested under the source object
	src = s.get(source, {})
	if tc and prompt:
	return src.get("by_category_prompt", {}).get(f"{tc}/{prompt}", {})
	if tc:
	return src.get("by_category", {}).get(tc, {})
	if prompt:
	return src.get("by_prompt", {}).get(prompt, {})
	return src


	def _bucket_n(s: dict, tc: str | None, prompt: str | None) -> int | None:
	    """Return the sample count recorded for the selected filters, if any.

	    The ``questeval`` bucket is consulted first, then ``ifeval``; the first
	    non-``None`` ``n`` wins. ``None`` means neither bucket records a count.
	    """
	    counts = (
	        _source_bucket(s, origin, tc, prompt).get("n")
	        for origin in ("questeval", "ifeval")
	    )
	    return next((count for count in counts if count is not None), None)


	def load_category_df(
	    category: dict,
	    text_category: str | None = None,
	    prompt: str | None = None,
	) -> pd.DataFrame:
	    """Build one category's table, including its per-category RRF score.

	    ``text_category`` and ``prompt`` (each ignored when ``None`` or ``"All"``)
	    switch the metric values to the matching per-category / per-prompt /
	    per-category-and-prompt buckets; otherwise the overall summary is used.
	    Only metrics flagged ``in_rrf`` appear in the table. Values are rounded to
	    4 decimals before ranking. The RRF is always computed over all models; the
	    size-limit / model-type filters are applied afterwards (in
	    ``load_rrf_views``) as pure row filters, so they never change a model's
	    rank or score.

	    Returns:
	        A DataFrame with columns ``Rank``, ``Model``, ``Params``, ``N``,
	        ``RRF Score`` followed by the metric columns, sorted by descending
	        RRF score; an empty DataFrame when no records are loaded.
	    """
	    tc = text_category if text_category not in (None, "All") else None
	    pr = prompt if prompt not in (None, "All") else None

	    records = load_records()
	    if not records:
	        return pd.DataFrame([])

	    # (source, key, column name, ascending) for every metric that is ranked.
	    ranked_metrics = [
	        (source, key, _col_name(source, label), ascending)
	        for source, key, label, ascending, in_rrf in category["metrics"]
	        if in_rrf
	    ]

	    table: list[dict] = []
	    for record in records:
	        summary = record["summary"]
	        sample_count = None
	        if tc or pr:
	            sample_count = _bucket_n(summary, tc, pr)
	        if not sample_count:
	            # No filter, or the filtered bucket carries no usable count.
	            sample_count = summary["n"]
	        entry: dict = {"Model": _model_label(record), "N": sample_count}

	        for source, key, column, _ascending in ranked_metrics:
	            bucket = _source_bucket(summary, source, tc, pr)
	            if source in ("metrics", "markers"):
	                entry[column] = bucket.get(key, {}).get("avg_diff_pct")
	            else:
	                # similarity, questeval and ifeval store the value directly.
	                entry[column] = bucket.get(key)
	        table.append(entry)

	    frame = pd.DataFrame(table)
	    if frame.empty:
	        return frame

	    numeric = frame.select_dtypes(include="number").columns
	    frame[numeric] = frame[numeric].round(4)

	    fused = pd.Series(0.0, index=frame.index)
	    for _source, _key, column, ascending in ranked_metrics:
	        # A metric nobody reports contributes nothing to the fusion.
	        if column not in frame.columns or frame[column].isna().all():
	            continue
	        positions = frame[column].rank(ascending=ascending, method="min", na_option="bottom")
	        fused += 1.0 / (RRF_K + positions)

	    frame.insert(2, "RRF Score", fused.round(4))
	    frame = frame.sort_values("RRF Score", ascending=False).reset_index(drop=True)
	    frame.insert(0, "Rank", range(1, len(frame) + 1))
	    frame.insert(2, "Params", frame["Model"].map(_params_map()).fillna(""))
	    return frame


	def _plcc_overall_map() -> dict[str, float]:
	    """Map each model label to its external PLCC overall score.

	    PLCC is a separate Polish-language-competence benchmark (sdadas/plcc). The
	    score is taken verbatim from ``metadata.plcc.overall`` and is shown for
	    reference only - it does not feed the RRF ranking. Models with no PLCC
	    entry (or no ``overall`` value) are omitted, so they map to NaN in the
	    table.
	    """
	    scores: dict[str, float] = {}
	    for record in load_records():
	        overall = (record["metadata"].get("plcc") or {}).get("overall")
	        if overall is None:
	            continue
	        scores[_model_label(record)] = overall
	    return scores


	def build_final_ranking_df(category_data: list[tuple[dict, pd.DataFrame]]) -> pd.DataFrame:
	    """Fuse the per-category RRF scores into one final ranking via weighted RRF.

	    Every participating category contributes ``rrf_weight / (RRF_K + rank)``,
	    where ``rank`` is the model's position within that category (1 = best,
	    models missing from a category rank last). The category columns of the
	    result show those ranks. The ``PLCC`` column carries the external PLCC
	    benchmark score for reference and does not influence the ranking.

	    Args:
	        category_data: ``(category definition, category DataFrame)`` pairs as
	            produced by ``load_category_df``.

	    Returns:
	        A DataFrame with ``Rank``, ``Model``, ``Params``, ``N``, ``Final RRF``,
	        ``PLCC`` and one rank column per category, sorted by descending
	        ``Final RRF``; an empty DataFrame when no category contributes.
	    """
	    per_category = [
	        frame[["Model", "RRF Score"]].rename(columns={"RRF Score": spec["name"]})
	        for spec, frame in category_data
	        if spec.get("in_rrf", True) and not frame.empty
	    ]
	    if not per_category:
	        return pd.DataFrame()

	    merged = per_category[0]
	    for extra in per_category[1:]:
	        merged = merged.merge(extra, on="Model", how="outer")
	    if merged.empty:
	        return pd.DataFrame()

	    # N (sample count) is identical across categories for a given model, so the
	    # first category table that carries it is used.
	    n_map: dict = next(
	        (
	            dict(zip(frame["Model"], frame["N"]))
	            for _spec, frame in category_data
	            if not frame.empty and {"Model", "N"} <= set(frame.columns)
	        ),
	        {},
	    )

	    weights = {spec["name"]: spec.get("rrf_weight", 1) for spec in CATEGORIES}

	    result = merged[["Model"]].copy()
	    fused = pd.Series(0.0, index=merged.index)
	    category_ranks: dict[str, pd.Series] = {}
	    for name in merged.columns:
	        if name == "Model":
	            continue
	        position = merged[name].rank(ascending=False, method="min", na_option="bottom").astype(int)
	        fused += weights.get(name, 1) / (RRF_K + position)
	        category_ranks[name] = position

	    result.insert(1, "Final RRF", fused.round(4))
	    result.insert(2, "PLCC", result["Model"].map(_plcc_overall_map()).round(2))
	    for name, position in category_ranks.items():
	        result[name] = position
	    result = result.sort_values("Final RRF", ascending=False).reset_index(drop=True)
	    result.insert(0, "Rank", range(1, len(result) + 1))
	    result.insert(2, "Params", result["Model"].map(_params_map()).fillna(""))
	    result.insert(3, "N", result["Model"].map(n_map).astype("Int64"))
	    return result


	def _fog_scatter(
	    y_source: str,
	    y_key: str,
	    y_label: str,
	    title: str,
	    marker_color: str,
	    text_category: str | None,
	    prompt: str | None,
	    size_limit: str | None,
	    model_type: str | None,
	) -> go.Figure | None:
	    """Shared builder: Gunning Fog Δ% (X) against one meaning metric (Y).

	    Args:
	        y_source: Summary source holding the Y metric (e.g. ``"questeval"``).
	        y_key: Key of the Y metric inside that source's bucket.
	        y_label: Human-readable Y metric name, used in hover text and axis title.
	        title: Figure title.
	        marker_color: Marker fill colour.
	        text_category: Source-text category; ``None`` / ``"All"`` disables it.
	        prompt: Simplification prompt; ``None`` / ``"All"`` disables it.
	        size_limit: Size cap forwarded to the model filter.
	        model_type: Model-type option forwarded to the model filter.

	    Returns:
	        The figure, or ``None`` when no model has both coordinates.
	    """
	    tc = None if text_category in (None, "All") else text_category
	    pr = None if prompt in (None, "All") else prompt

	    points = []
	    for data in _filtered_records(size_limit, model_type):
	        s = data["summary"]
	        model = _model_label(data)
	        x = _source_bucket(s, "metrics", tc, pr).get("gunning_fog_orth", {}).get("avg_diff_pct")
	        y = _source_bucket(s, y_source, tc, pr).get(y_key)
	        # A model missing either coordinate cannot be plotted.
	        if x is None or y is None:
	            continue
	        points.append((model, x, y))

	    if not points:
	        return None

	    models, xs, ys = zip(*points)

	    fig = go.Figure()
	    fig.add_trace(
	        go.Scatter(
	            x=xs,
	            y=ys,
	            mode="markers+text",
	            text=models,
	            textposition="top center",
	            textfont={"size": 10},
	            marker={"size": 12, "color": marker_color, "line": {"width": 1, "color": "white"}},
	            # Plain concatenation keeps Plotly's ``%{...}`` placeholders literal.
	            hovertemplate=(
	                "<b>%{text}</b><br>Gunning Fog Δ%: %{x:.2f}<br>"
	                + y_label
	                + ": %{y:.3f}<extra></extra>"
	            ),
	        )
	    )

	    # Dotted guide lines through the midpoint of the plotted range split the
	    # chart into four quadrants.
	    x_mid = (min(xs) + max(xs)) / 2
	    y_mid = (min(ys) + max(ys)) / 2
	    fig.add_hline(y=y_mid, line_dash="dot", line_color="lightgray")
	    fig.add_vline(x=x_mid, line_dash="dot", line_color="lightgray")

	    fig.update_layout(
	        title=title,
	        xaxis_title="Gunning Fog orth Δ% (← easier text)",
	        yaxis_title=f"{y_label} (↑ meaning preserved)",
	        height=560,
	        margin={"l": 60, "r": 40, "t": 60, "b": 60},
	        plot_bgcolor="white",
	    )
	    fig.update_xaxes(showgrid=True, gridcolor="#EEE", zeroline=True, zerolinecolor="#CCC")
	    fig.update_yaxes(showgrid=True, gridcolor="#EEE")

	    return fig


	def build_tradeoff_scatter(
	    text_category: str | None = None,
	    prompt: str | None = None,
	    size_limit: str | None = None,
	    model_type: str | None = None,
	) -> go.Figure | None:
	    """Scatter of Gunning Fog reduction vs meaning preservation, one point per model.

	    X: Gunning Fog orth Δ% (more negative = greater complexity reduction)
	    Y: QuestEval F1 (higher = better meaning preservation)

	    Honours the same text-category / prompt / size / model-type filters as the
	    RRF rankings. Returns ``None`` when no model has both values.
	    """
	    return _fog_scatter(
	        "questeval",
	        "f1",
	        "QuestEval F1",
	        "Complexity reduction vs meaning preservation",
	        "#4C78A8",
	        text_category,
	        prompt,
	        size_limit,
	        model_type,
	    )


	def build_fog_nli_scatter(
	    text_category: str | None = None,
	    prompt: str | None = None,
	    size_limit: str | None = None,
	    model_type: str | None = None,
	) -> go.Figure | None:
	    """Scatter of Gunning Fog reduction vs NLI F1, one point per model.

	    X: Gunning Fog orth Δ% (more negative = greater complexity reduction)
	    Y: NLI F1 (higher = stronger entailment / meaning preserved)

	    Honours the same text-category / prompt / size / model-type filters as the
	    RRF rankings. Returns ``None`` when no model has both values.
	    """
	    return _fog_scatter(
	        "similarity",
	        "nli_f1",
	        "NLI F1",
	        "Complexity reduction vs NLI consistency",
	        "#E45756",
	        text_category,
	        prompt,
	        size_limit,
	        model_type,
	    )


	# Markdown shown at the very top of the page; build_app() renders it with
	# gr.Markdown before any other component.
	INTRO = """\
	# PLainBench - Polish Text Simplification Leaderboard

	This benchmark evaluates how well LLMs simplify difficult Polish texts -
	drawn from legal/administrative (BIP/GOV), finance, and science domains - while
	preserving the original meaning. Each model simplifies 210 source texts under
	5 simplification prompts (1050 outputs per model). Outputs are scored on
	readability indices, fine-grained difficulty markers (lexical, syntactic,
	morphological), meaning preservation (NLI entailment, QuestEval QA consistency,
	named-entity retention), and instruction following (IFEval include/exclude).
	The per-category scores are fused into an overall Final RRF ranking.
	"""

	# Markdown reference for every metric family shown in the app. Table pipes
	# are written unescaped: a backslash before ``|`` is an invalid escape in a
	# non-raw Python string and also stops Markdown from recognising the table.
	METRICS_DOC = """\
	## Metrics

	### Readability indices

	All indices are adapted for Polish syllable counting via `pyhyphen` (pl_PL
	dictionary) and counted on surface (orthographic) word forms.

	Δ is the absolute change (after − before); Δ% is the average percentage change
	from the original text to the simplified text.

	| Metric | Formula | Interpretation |
	|---|---|---|
	| Flesch Reading Ease | `206.835 − 1.015 × (words/sentences) − 84.6 × (syllables/words)` | Higher → easier text (0–100 typical range). Desired Δ%: positive (+) |
	| Gunning Fog | `0.4 × [(words/sentences) + 100 × (complex_words/words)]` | School years needed (complex = ≥ 4 syllables). Lower → easier. Desired Δ%: negative (−) |
	| Coleman-Liau | `0.0588 × L − 0.296 × S − 15.8` | Character-based grade level. Lower → easier. Desired Δ%: negative (−) |

	### Difficulty markers

	Fine-grained syntactic, morphological, and lexical features.
	Δ is absolute change; Δ% is percentage change.
	Difficult words are defined as not a named entity, ≥ 4 syllables, counted on the
	surface (orthographic) form.

	| Marker | Description | Desired Δ% |
	|---|---|---|
	| Avg word syllables | Mean syllable count per word | − (shorter words) |
	| Difficult word ratio (orth) | Difficult words / all words (surface, excl. NEs) | − |
	| Difficult noun ratio (orth) | Difficult nouns / all tokens (surface, excl. NEs) | − |
	| Verb ratio | Verbs / all tokens | + (more verbal, less nominal) |
	| Avg sentence length | Mean tokens per sentence | − (shorter sentences) |
	| Mean dep. distance | Avg linear head-dependent distance (syntax complexity) | − (flatter syntax) |
	| Subordination index | Subordinate clauses / total clauses | − |
	| Adverbial participle ratio | Adverbial participles (converbs, e.g. czytając, przeczytawszy) / all tokens | − |
	| Gerund ratio | Gerunds / all tokens | − |
	| Impersonal verb ratio | Impersonal verb forms (modals należy/trzeba, -no/-to passives, reflexive się, infinitives) / all verbs | − |
	| Genitive noun ratio | Nouns in genitive case / all tokens | − |
	| Avg genitive chain | Mean length of consecutive genitive noun phrases | − |
	| Verbo-nominal ratio | Light-verb + noun periphrases (dokonać wpłaty, podjąć decyzję); administrative style | − |
	| OSC noun ratio | Abstract -ość nouns (możliwość, konieczność) / all nouns | − |

	### Similarity metrics

	Reference-based metrics comparing simplified text against the original.

	| Metric | Description | Direction |
	|---|---|---|
	| NLI P / R / F1 | NLI consistency via stella embeddings + mDeBERTa cross-encoder | Higher = stronger entailment |
	| NE Retention | Fraction of named entities from the original kept in the simplified text | Higher = more entities preserved |

	*Only **NLI F1** feeds the RRF score; P and R are shown for context.*

	### QuestEval - QA consistency

	| Metric | Description | Direction |
	|---|---|---|
	| QuestEval P | Backward precision - grounding of simplified claims | Higher = fewer hallucinations |
	| QuestEval R | Forward recall - information preserved | Higher = less content dropped |
	| QuestEval F1 | Harmonic mean of P and R | Higher = better meaning preservation |
	| Answerable (fwd) | Fraction of forward questions answerable | Higher = stays on-topic |
	| Answerable (bwd) | Fraction of backward questions answerable | Higher = claims traceable to original |

	*Only **QuestEval F1** feeds the RRF score; the other rows are shown for context.*

	### IFEval - instruction following

	| Metric | Description | Direction |
	|---|---|---|
	| IFEval include | Fraction of include constraints (terms the simplification must keep) satisfied | Higher = better |
	| IFEval exclude | Fraction of exclude constraints (terms the simplification must avoid) satisfied | Higher = better |
	"""

	# Sample-count note shown under each table that carries an ``N`` column.
	# build_app() renders it via gr.Markdown directly below the final-ranking
	# table and below every per-category table.
	N_NOTE: str = "N = number of prompt × text evaluations per model."

	# The five simplification prompts every model is run with. The keys match the
	# "Simplification prompt" filter values (and the ``*_by_prompt`` summary
	# buckets); each value is ``(short description, user-message template)``, where
	# ``<text>`` marks where the source text is inserted. Kept in sync with
	# generation/prompting/instruction.py. Ordered from least to most detailed.
	# build_app() renders one collapsed accordion per entry, titled
	# ``"<key> - <short description>"``, with the template inside a fenced block.
	# NOTE(review): nested sub-points in the "medium" and "long" templates sit at
	# the same bullet level as their parents here — confirm against
	# generation/prompting/instruction.py that no leading indentation was lost.
	PROMPTS: dict[str, tuple[str, str]] = {
	"mini": (
	"Minimal - a single-line instruction, no rules.",
	"Uprość podany tekst, w odpowiedzi podaj jedynie uproszczony tekst, "
	"bez dodatkowych komentarzy. Tekst do uproszczenia:\n\n<text>",
	),
	"compact": (
	"Compact - a short bulleted rule set.",
	"""Uprość poniższy tekst, tak aby był jasny, krótki i zrozumiały dla osoby bez wiedzy specjalistycznej.

	Zasady:
	- Skup się na najważniejszych informacjach, usuń zbędne treści.
	- Uporządkuj tekst: najważniejsze na początku, podziel na krótkie akapity.
	- Używaj prostego, codziennego słownictwa; trudne pojęcia zastępuj lub krótko wyjaśniaj.
	- Twórz krótkie zdania (jedna myśl = jedno zdanie).
	- Pisz bezpośrednio do odbiorcy i używaj strony czynnej.
	- Unikaj żargonu, urzędowego stylu, skomplikowanych konstrukcji i podwójnych przeczeń.
	- Zachowaj poprawność językową i logiczną spójność.
	- W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.

	---

	### Tekst do uproszczenia:

	<text>""",
	),
	"medium": (
	"Medium - moderately detailed rules with sub-points.",
	"""Uprość poniższy tekst tak, aby był zrozumiały dla osoby bez wiedzy specjalistycznej, zachowując jego sens.

	### Zasady:
	- Skup się na celu tekstu i najważniejszych informacjach — usuń dygresje i zbędne treści.
	- Uporządkuj treść: zacznij od najważniejszych informacji, podziel tekst na krótkie akapity.
	- Stosuj proste i naturalne słownictwo:
	- zamieniaj trudne lub specjalistyczne wyrazy na prostsze,
	- jeśli trzeba — krótko je wyjaśnij lub podaj przykład.
	- Buduj krótkie zdania (jedna myśl = jedno zdanie, ok. 15–20 słów).
	- Pisz bezpośrednio do odbiorcy i używaj strony czynnej.
	- Unikaj:
	- żargonu, stylu urzędowego i zapożyczeń,
	- form bezosobowych i strony biernej (jeśli nie są konieczne),
	- nadmiaru rzeczowników odczasownikowych,
	- podwójnych przeczeń i zawiłych konstrukcji.
	- Zachowaj poprawność językową, spójność i logiczny układ tekstu.
	- W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.

	---

	### Tekst do uproszczenia:

	<text>""",
	),
	"long": (
	"Long - full, sectioned plain-language guidelines.",
	"""Uprość poniższy tekst zgodnie z zasadami prostego języka.

	### 1. Cel i odbiorca
	- Dostosuj tekst do przeciętnego odbiorcy (zakładaj brak wiedzy specjalistycznej).
	- Skup się na najważniejszych informacjach.

	### 2. Struktura
	- Usuń informacje zbędne i poboczne.
	- Uporządkuj treść: najważniejsze informacje podaj na początku.
	- Podziel tekst na krótkie akapity (1 akapit = 1 myśl).
	- Jeśli tekst jest dłuższy, użyj nagłówków lub list.

	### 3. Słownictwo
	- Zastępuj trudne słowa prostszymi.
	- Unikaj:
	- terminów specjalistycznych (chyba że je wyjaśnisz),
	- słów rzadkich, książkowych i urzędowych,
	- zapożyczeń i modnych zwrotów,
	- skrótów niezrozumiałych dla odbiorcy.
	- W razie potrzeby:
	- wyjaśnij trudne pojęcia,
	- podaj przykłady,
	- używaj konkretnych nazw zamiast ogólników.

	### 4. Składnia
	- Twórz krótkie zdania (ok. 20 słów).
	- Jedno zdanie = jedna myśl.
	- Używaj zdań twierdzących.
	- Pisz bezpośrednio do odbiorcy (np. „Ty”, „możesz”, „zrób”).
	- Używaj strony czynnej zamiast biernej.
	- Unikaj form bezosobowych i skomplikowanych konstrukcji.
	- Ogranicz rzeczowniki odczasownikowe (zamieniaj je na czasowniki).

	### 5. Styl
	- Unikaj podwójnych przeczeń.
	- Upraszczaj złożone konstrukcje.
	- Zachowaj naturalny, jasny ton.

	### 6. Końcowa kontrola
	- Sprawdź, czy tekst jest:
	- zrozumiały,
	- poprawny językowo,
	- logiczny i spójny.

	### 7. W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.

	---

	### Tekst do uproszczenia:

	<text>""",
	),
	"step_by_step": (
	"Step by step - role-based, numbered editorial guidelines.",
	"""Jesteś redaktorem odpowiedzialnym za wydawanie tekstów pisanych prostym językiem. Krok po kroku upraszczaj tekst kierując się poniższymi regułami:

	1. Zachowaj prostotę - unikaj skomplikowanych i trudnych słów oraz zdań.
	2. Używaj zrozumiałego języka - pamiętaj, że tekst ma być czytelny dla szerokiego grona odbiorców, nie tylko dla osób z wiedzą specjalistyczną z danej dziedziny.
	3. Unikaj języka żargonowego czy specjalistycznego, jeśli czujesz, że nie jest on zrozumiały dla większości czytelników.
	4. Unikaj zbytnio skomplikowanych zwrotów - stawiaj na klarowność i prostotę.
	5. Utrzymuj spójność - trzymaj się jednego stylu pisania i konsekwentnie go stosuj.
	6. Unikaj nadmiernego użycia skrótów czy akronimów – jeśli ich używasz, to upewnij się, że są one łatwe do zrozumienia dla wszystkich czytelników.
	7. Przemyśl kolejność i strukturę informacji - rozmieszczenie treści powinno być logiczne i przemyślane.
	8. Unikaj zbędnego wydłużania tekstu, pisz zwięźle i konkretnie.
	9. W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.

	---

	### Tekst do uproszczenia:

	<text>""",
	),
	}

	# ── PLCC-inspired visual style ──────────────────────────────────────────────
	# Mirrors the sdadas/plcc leaderboard: clean white background, a system
	# sans-serif stack, a blue accent (#2c7be5), and flat tables with shaded
	# (#f9fafd) headers, #ddd borders and faintly striped rows. Done entirely in
	# CSS — a custom gr.themes.* would tint the component label chips blue, which
	# is not part of the PLCC look.
	# Passed to gr.Blocks(css=...) in build_app(); the .plain-table, .params-col
	# and .filter-bar selectors match the elem_classes assigned to components there.
	PLCC_CSS = """
	.gradio-container {
	font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto,
	"Helvetica Neue", Arial, sans-serif !important;
	max-width: 1500px !important;
	}
	/* PLCC-style data tables */
	.plain-table table { border: 1px solid #ddd !important; font-size: 0.9em; }
	.plain-table thead th {
	background: #f9fafd !important;
	border-bottom: 2px solid #ddd !important;
	color: #222 !important;
	font-weight: 700 !important;
	}
	.plain-table tbody td { padding: 8px 10px !important; }
	.plain-table tbody tr:nth-child(even) > td { background: rgba(0, 0, 0, 0.02) !important; }
	.plain-table tbody tr:hover > td { background: rgba(44, 123, 229, 0.06) !important; }
	/* Params column (3rd) — right-aligned, PLCC-style; cells muted grey, header black */
	.params-col tbody td:nth-child(3),
	.params-col thead th:nth-child(3) {
	text-align: right !important;
	white-space: nowrap;
	}
	.params-col tbody td:nth-child(3) { color: #999 !important; }
	/* Filter bar — the grey rounded block holding the dropdowns */
	.filter-bar {
	background: #f9fafd;
	border: 1px solid #ddd;
	border-radius: 0.5rem;
	padding: 10px 14px;
	}
	"""

	# Colour palette for category bars (seven hex colour strings).
	_CAT_COLORS: list[str] = ["#4C78A8", "#72B7B2", "#54A24B", "#EECA3B", "#B279A2", "#E45756", "#F58518"]


	def _filter_model_rows(df: pd.DataFrame, allowed: set[str]) -> pd.DataFrame:
	    """Return the rows of ``df`` whose ``Model`` value is in ``allowed``.

	    Only rows are dropped; rank and score columns are passed through as they
	    are. A frame that is empty or has no ``Model`` column is returned
	    unchanged (the very same object). Otherwise the surviving rows come back
	    with a fresh 0-based index.
	    """
	    if not df.empty and "Model" in df.columns:
	        keep_mask = df["Model"].isin(allowed)
	        return df[keep_mask].reset_index(drop=True)
	    return df


	def load_rrf_views(
	    text_category: str | None = None,
	    prompt: str | None = None,
	    size_limit: str | None = None,
	    model_type: str | None = None,
	) -> tuple[pd.DataFrame, list[tuple[dict, pd.DataFrame]]]:
	    """Final ranking DataFrame and per-category DataFrames for the selected filters.

	    Ranks and RRF scores are computed over all models (honouring only the
	    text-category / prompt filters). The size-limit and model-type selections
	    are then applied as pure row filters that hide models without recomputing
	    any ranking - so a surviving model keeps the rank it held in the full table.

	    Args:
	        text_category: Text-category filter, forwarded to ``load_category_df``.
	        prompt: Simplification-prompt filter, forwarded to ``load_category_df``.
	        size_limit: Size-limit filter, forwarded to ``_filtered_records``.
	        model_type: Model-type filter, forwarded to ``_filtered_records``.

	    Returns:
	        ``(final_df, category_data)``, where ``category_data`` holds one
	        ``(category, DataFrame)`` pair per entry of ``CATEGORIES``, in the
	        same order.
	    """
	    # Rank first, over every model, so the row filters below cannot shift ranks.
	    category_data = [
	        (cat, load_category_df(cat, text_category, prompt)) for cat in CATEGORIES
	    ]
	    final_df = build_final_ranking_df(category_data)

	    # Labels of the models that survive the size / type filters.
	    allowed = {_model_label(d) for d in _filtered_records(size_limit, model_type)}
	    final_df = _filter_model_rows(final_df, allowed)
	    category_data = [(cat, _filter_model_rows(df, allowed)) for cat, df in category_data]
	    return final_df, category_data


	def _tradeoff_figs(
	    text_category: str | None = None,
	    prompt: str | None = None,
	    size_limit: str | None = None,
	    model_type: str | None = None,
	) -> tuple[go.Figure, go.Figure]:
	    """Both trade-off scatters for the selected filters (empty figure when no data).

	    Returns:
	        ``(tradeoff scatter, Fog-vs-NLI scatter)``. Each builder's falsy
	        "no data" result is replaced by a blank ``go.Figure()`` so callers
	        always receive two figures.
	    """
	    return (
	        build_tradeoff_scatter(text_category, prompt, size_limit, model_type) or go.Figure(),
	        build_fog_nli_scatter(text_category, prompt, size_limit, model_type) or go.Figure(),
	    )


	def build_app() -> gr.Blocks:
	    """Assemble the Gradio Blocks page and wire the filter dropdowns to it.

	    Frames and figures for the unfiltered view are loaded once up front. If
	    the first frame returned by ``load_leaderboard_data()`` is empty, the page
	    shows only the intro and a "no data" hint. Otherwise it shows the filter
	    bar, the ranking / category / trade-off tabs, the metric documentation and
	    one accordion per simplification prompt, and every filter change
	    re-renders all reactive tables and plots.
	    """
	    # The loader yields eight frames; only the first (used as the "any data?"
	    # probe) and the last (detailed scores) are needed on this page.
	    read_orth_df, _, _, _, _, _, _, detail_df = load_leaderboard_data()

	    ifeval_frame = load_ifeval_comparison_df()
	    ranking_frame, category_frames = load_rrf_views(None, None)
	    category_options = text_category_choices()
	    prompt_options = prompt_choices()
	    size_options = _visible_size_limits()
	    qa_fig, nli_fig = _tradeoff_figs(None, None)

	    def _ranked_table(frame: pd.DataFrame) -> gr.Dataframe:
	        """Render a ranking table followed by the sample-count note."""
	        table = gr.Dataframe(
	            value=frame,
	            interactive=False,
	            wrap=True,
	            elem_classes=["plain-table", "params-col"],
	        )
	        gr.Markdown(N_NOTE)
	        return table

	    with gr.Blocks(title="PLainBench", css=PLCC_CSS) as app:
	        gr.Markdown(INTRO)

	        if read_orth_df.empty:
	            gr.Markdown("No data found. Upload scored anon JSON files to the `data/current/` directory.")
	        else:
	            # Components refreshed by the filter handler. The order here must
	            # match the order of the list the handler returns: final table,
	            # one table per in-RRF category, the two scatters, and finally
	            # the IFEval comparison table when that tab exists.
	            reactive_outputs: list = []

	            with gr.Row(elem_classes=["filter-bar"]):
	                tc_dropdown = gr.Dropdown(
	                    choices=category_options,
	                    value="All",
	                    label="Text category",
	                    info="Filter the RRF rankings to one source-text category.",
	                )
	                pr_dropdown = gr.Dropdown(
	                    choices=prompt_options,
	                    value="All",
	                    label="Simplification prompt",
	                    info="Filter the RRF rankings to one simplification prompt.",
	                )
	                size_dropdown = gr.Dropdown(
	                    choices=size_options,
	                    value="ALL",
	                    label="Size limit",
	                    info="Keep only models up to this many parameters.",
	                )
	                type_dropdown = gr.Dropdown(
	                    choices=MODEL_TYPES,
	                    value="ALL",
	                    label="Model type",
	                    info="Filter by open- vs closed-weights models.",
	                )

	            with gr.Tabs():

	                # Final ranking across all categories.
	                with gr.TabItem("Final Ranking"):
	                    gr.Markdown(
	                        "Final model ranking via Reciprocal Rank Fusion (k=60) over per-category RRF scores. "
	                        "Each category ranks models by its own RRF score; those ranks are then fused into a "
	                        "single Final RRF score. Higher = better overall simplification. "
	                        "The PLCC column shows the model's score on the external "
	                        "[PLCC](https://huggingface.co/spaces/sdadas/plcc) Polish-language-competence "
	                        "benchmark for reference only - it does not affect the ranking (blank where unavailable)."
	                    )
	                    reactive_outputs.append(_ranked_table(ranking_frame))

	                # One tab per category that takes part in the RRF fusion.
	                for category, category_frame in category_frames:
	                    if not category.get("in_rrf", True):
	                        continue
	                    with gr.TabItem(category["name"]):
	                        gr.Markdown(category["description"])
	                        reactive_outputs.append(_ranked_table(category_frame))

	                # Trade-off scatter plots.
	                with gr.TabItem("Trade-off"):
	                    gr.Markdown(
	                        "Complexity reduction (Gunning Fog orth Δ%) versus meaning preservation "
	                        "(QuestEval F1), one point per model. Top-left is ideal: "
	                        "greater complexity reduction and faithful to the original."
	                    )
	                    tradeoff_plot = gr.Plot(value=qa_fig)
	                    gr.Markdown(
	                        "---\n"
	                        "Gunning Fog orth reduction (Δ%) versus NLI F1. "
	                        "Top-left is best: greater complexity reduction and strong NLI entailment."
	                    )
	                    fog_nli_plot = gr.Plot(value=nli_fig)
	                    reactive_outputs.extend([tradeoff_plot, fog_nli_plot])

	                # Created hidden, and not part of the reactive outputs.
	                with gr.TabItem("Detailed scores", visible=False):
	                    gr.Markdown(
	                        "Average scores before and after simplification, plus absolute (Δ) "
	                        "and percentage (Δ%) change - for all readability, lexical, and marker metrics."
	                    )
	                    gr.Dataframe(
	                        value=detail_df,
	                        interactive=False,
	                        wrap=True,
	                        elem_classes=["plain-table"],
	                    )

	                # IFEval manual vs automatic; the tab exists only when there is data.
	                if not ifeval_frame.empty:
	                    with gr.TabItem("IFEval manual vs auto"):
	                        gr.Markdown(
	                            "Automatic IFEval constraints are generated by an LLM; "
	                            "manual constraints are hand-written gold rules, available for a "
	                            "subset of the prompts. To isolate rule quality from sampling, the "
	                            "comparison is restricted to the texts that carry both scores "
	                            "(N = matched texts per model), so these automatic figures differ from "
	                            "the full-sample IFEval used elsewhere.\n\n"
	                            "include = fraction of include constraints satisfied, "
	                            "exclude = fraction of exclude constraints satisfied (higher is "
	                            "better for both). Δ = manual − automatic (on the matched texts): a "
	                            "negative Δ means the automatic rules were easier to satisfy than the "
	                            "hand-checked ones (more lenient automatic scoring). The (all) columns "
	                            "show automatic IFEval over every text (the full-sample figure used "
	                            "elsewhere). Δ (man−auto all) is manual minus that full-sample "
	                            "automatic value - useful as a sanity check, but note the two cover "
	                            "different text sets (matched subset vs. all texts), so Δ (man−auto) "
	                            "is the rigorous like-for-like comparison."
	                        )
	                        ifeval_cmp_table = gr.Dataframe(
	                            value=ifeval_frame,
	                            interactive=False,
	                            wrap=True,
	                            elem_classes=["plain-table"],
	                        )
	                    reactive_outputs.append(ifeval_cmp_table)

	            # Metric reference, placed below the result tabs.
	            gr.Markdown(METRICS_DOC)

	            # Prompt reference: explains the values offered by the
	            # "Simplification prompt" dropdown, one collapsed accordion each.
	            gr.Markdown(
	                "## Simplification prompts\n\n"
	                "The five prompt templates every model is run with - these are the "
	                "values of the Simplification prompt filter above. Each source "
	                "text is simplified once per prompt, so they range from a bare "
	                "one-line instruction to full plain-language guidelines. "
	                "`<text>` marks where the source text is inserted."
	            )
	            for prompt_key, (prompt_summary, prompt_template) in PROMPTS.items():
	                with gr.Accordion(f"{prompt_key} - {prompt_summary}", open=False):
	                    gr.Markdown(f"```\n{prompt_template}\n```")

	            filter_inputs = [tc_dropdown, pr_dropdown, size_dropdown, type_dropdown]

	            def _refresh_rrf(
	                text_category: str, prompt: str, size_limit: str, model_type: str
	            ) -> list:
	                """Recompute every reactive output for the current filter values."""
	                ranking, per_category = load_rrf_views(
	                    text_category, prompt, size_limit, model_type
	                )
	                updates: list = [ranking]
	                updates.extend(
	                    frame for category, frame in per_category if category.get("in_rrf", True)
	                )
	                updates.extend(_tradeoff_figs(text_category, prompt, size_limit, model_type))
	                # Mirrors the build-time condition under which the IFEval
	                # table was added to the outputs.
	                if not ifeval_frame.empty:
	                    updates.append(
	                        load_ifeval_comparison_df(text_category, prompt, size_limit, model_type)
	                    )
	                return updates

	            # Any single dropdown change refreshes everything, with all four
	            # current filter values passed as inputs.
	            for dropdown in filter_inputs:
	                dropdown.change(_refresh_rrf, inputs=filter_inputs, outputs=reactive_outputs)

	    return app


	# Built at import time, so the Blocks instance is available as the
	# module-level name ``app``.
	app = build_app()

	# Start the server only when this file is executed directly as a script.
	if __name__ == "__main__":
	    app.launch()