"""PLainBench - Polish Text Simplification Leaderboard.

Reads scored anon JSON files from the data/current/ directory and displays a
leaderboard showing how well each LLM simplifies Polish texts, measured by
readability indices, difficulty markers, reference-based similarity metrics,
and a QuestEval-style QA consistency score.
"""

import json
from functools import lru_cache
from pathlib import Path

import gradio as gr
import pandas as pd
import plotly.graph_objects as go

DATA_DIR = Path(__file__).parent / "data" / "current"


@lru_cache(maxsize=1)
def load_records() -> tuple[dict, ...]:
    """Parse every scored anon JSON once and cache the result.

    The full files are large (~9 MB each, holding per-text records), but the
    app only ever reads ``metadata`` and ``summary``. We keep just those two
    sections so each file is parsed a single time and every loader/refresh
    reuses the in-memory copy instead of re-reading from disk.

    Returns:
        One ``{"metadata": ..., "summary": ...}`` dict per
        ``*_scored_anon.json`` file, in sorted file-name order; an empty tuple
        when ``DATA_DIR`` does not exist.
    """
    records: list[dict] = []
    if not DATA_DIR.exists():
        return ()
    for fp in sorted(DATA_DIR.glob("*_scored_anon.json")):
        with open(fp, encoding="utf-8") as f:
            data = json.load(f)
        records.append({"metadata": data["metadata"], "summary": data["summary"]})
    return tuple(records)


# Model-level filters, mirroring the PLCC benchmark's "Size limit" / "Model type"
# quick-filters. Size options are *upper bounds* in billions of parameters.
SIZE_LIMITS = ["ALL", "500B", "100B", "50B", "30B", "20B", "15B", "10B", "5B"]
MODEL_TYPES = ["ALL", "open-weights", "closed-weights"]


def _active_filter(value: str | None) -> str | None:
    """Normalise a dropdown value: ``None`` and ``"All"`` both mean "no filter"."""
    return None if value in (None, "All") else value


def _model_passes(meta: dict, size_limit: str, model_type: str) -> bool:
    """Whether a model's metadata satisfies the size-limit / model-type filters."""
    if model_type and model_type != "ALL":
        want = "open" if model_type == "open-weights" else "closed"
        if meta.get("weights") != want:
            return False
    if size_limit and size_limit != "ALL":
        cap = float(size_limit.rstrip("B"))
        params = meta.get("total_params_b") or 0
        # Unknown / unreported size (0) can't be placed under a cap, so exclude it.
        if params <= 0 or params > cap:
            return False
    return True


def _filtered_records(
    size_limit: str | None = None, model_type: str | None = None
) -> list[dict]:
    """Records whose model passes the size-limit / model-type filters."""
    sl = size_limit or "ALL"
    mt = model_type or "ALL"
    return [d for d in load_records() if _model_passes(d["metadata"], sl, mt)]


def _visible_size_limits() -> list[str]:
    """Prune ``SIZE_LIMITS`` to the caps that actually split the current models.

    A numeric cap is redundant when it selects the same set of models as the
    next-smaller cap (no model has a size in the band between them) - those
    upper duplicates are hidden, as is any cap below every model. ``"ALL"`` is
    always kept. Recomputed from the data, so adding models later
    automatically re-expands the list.
    """
    params = [
        p
        for d in load_records()
        if (p := d["metadata"].get("total_params_b") or 0) > 0
    ]
    # Ascending by value: keep the smallest representative of each distinct
    # subset; a larger cap with the same model count is the redundant "upper" one.
    kept: set[str] = set()
    prev_count = -1
    for s in sorted(
        (s for s in SIZE_LIMITS if s != "ALL"), key=lambda s: float(s.rstrip("B"))
    ):
        cap = float(s.rstrip("B"))
        count = sum(1 for p in params if p <= cap)
        if count > 0 and count != prev_count:
            kept.add(s)
        prev_count = count
    # Preserve the original descending display order, with ALL first.
    return ["ALL"] + [s for s in SIZE_LIMITS if s != "ALL" and s in kept]


READABILITY_ORTH_LABELS = {
    "flesch_reading_ease_orth": "Flesch RE",
    "flesch_kincaid_grade_orth": "Flesch-Kincaid",
    "gunning_fog_orth": "Gunning Fog",
    "ari_orth": "ARI",
    "linsear_write_orth": "Linsear Write",
    "smog_grade_orth": "SMOG",
    "coleman_liau_orth": "Coleman-Liau",
    "pisarek_orth": "Pisarek",
}

READABILITY_LEMMA_LABELS = {
    "flesch_reading_ease_lemma": "Flesch RE",
    "flesch_kincaid_grade_lemma": "Flesch-Kincaid",
    "gunning_fog_lemma": "Gunning Fog",
    "ari_lemma": "ARI",
    "linsear_write_lemma": "Linsear Write",
    "smog_grade_lemma": "SMOG",
    "coleman_liau_lemma": "Coleman-Liau",
    "pisarek_lemma": "Pisarek",
}

LEXICAL_ORTH_LABELS = {
    "ttr_orth": "TTR",
    "rttr_orth": "RTTR",
    "cttr_orth": "CTTR",
    "herdan_orth": "Herdan",
    "summer_orth": "Summer",
    "dugast_orth": "Dugast",
    "maas_orth": "Maas",
    "mtld_orth": "MTLD",
    "mattr_orth": "MATTR",
}

LEXICAL_LEMMA_LABELS = {
    "ttr_lemma": "TTR",
    "rttr_lemma": "RTTR",
    "cttr_lemma": "CTTR",
    "herdan_lemma": "Herdan",
    "summer_lemma": "Summer",
    "dugast_lemma": "Dugast",
    "maas_lemma": "Maas",
    "mtld_lemma": "MTLD",
    "mattr_lemma": "MATTR",
}

SIMILARITY_LABELS = {
    "bert_score_precision": "BERTScore P",
    "bert_score_recall": "BERTScore R",
    "bert_score_f1": "BERTScore F1",
    "bleu": "BLEU",
    "chrf": "chrF",
    "chrfpp": "chrF++",
    "nli_precision": "NLI P",
    "nli_recall": "NLI R",
    "nli_f1": "NLI F1",
    "rouge_1_precision": "ROUGE-1 P",
    "rouge_1_recall": "ROUGE-1 R",
    "rouge_1_f1": "ROUGE-1 F1",
    "rouge_2_precision": "ROUGE-2 P",
    "rouge_2_recall": "ROUGE-2 R",
    "rouge_2_f1": "ROUGE-2 F1",
    "rouge_l_precision": "ROUGE-L P",
    "rouge_l_recall": "ROUGE-L R",
    "rouge_l_f1": "ROUGE-L F1",
    "wer": "WER",
    "mer": "MER",
    "wil": "WIL",
    "ne_retention": "NE Retention",
}

MARKER_LABELS = {
    # counts
    "paragraph_count": "Paragraph count",
    "sentence_count": "Sentence count",
    "word_count": "Word count",
    "named_entity_count": "Named entity count",
    "difficult_word_count": "Difficult word count",
    "difficult_word_count_orth": "Difficult word count (orth)",
    # average lengths
    "avg_word_syllables": "Avg word syllables",
    "avg_sentence_length": "Avg sentence length",
    "avg_paragraph_length": "Avg paragraph length",
    # lexical difficulty
    "named_entity_ratio": "Named entity ratio",
    "difficult_word_ratio": "Difficult word ratio",
    "difficult_word_ratio_orth": "Difficult word ratio (orth)",
    # POS ratios
    "noun_ratio": "Noun ratio",
    "difficult_noun_ratio": "Difficult noun ratio",
    "difficult_noun_ratio_orth": "Difficult noun ratio (orth)",
    "verb_ratio": "Verb ratio",
    "difficult_verb_ratio": "Difficult verb ratio",
    "difficult_verb_ratio_orth": "Difficult verb ratio (orth)",
    "adjective_ratio": "Adjective ratio",
    "difficult_adjective_ratio": "Difficult adjective ratio",
    "difficult_adjective_ratio_orth": "Difficult adjective ratio (orth)",
    # POS-to-POS ratios
    "noun_to_verb_ratio": "Noun/verb ratio",
    "verbo_nominal_ratio": "Verbo-nominal ratio",
    "adj_to_verb_ratio": "Adj/verb ratio",
    "adj_to_noun_ratio": "Adj/noun ratio",
    # morphological
    "nie_prefix_ratio": "Nie-prefix ratio",
    "participle_ratio": "Participle ratio",
    "gerund_ratio": "Gerund ratio",
    "osc_noun_ratio": "OSC noun ratio",
    "impersonal_verb_ratio": "Impersonal verb ratio",
    "genitive_noun_ratio": "Genitive noun ratio",
    "avg_genitive_chain_length": "Avg genitive chain",
    # syntactic
    "sentence_length_variance": "Sentence length variance",
    "mean_dependency_distance": "Mean dep. distance",
    "subordination_index": "Subordination index",
}

QUESTEVAL_LABELS = {
    "precision": "QuestEval P",
    "recall": "QuestEval R",
    "f1": "QuestEval F1",
    "answerable_rate_forward": "Answerable (fwd)",
    "answerable_rate_backward": "Answerable (bwd)",
}

RRF_K = 60

# Each entry: (source, key, label, ascending_rrf, in_rrf)
#   source        — "metrics" | "markers"                 → use avg_diff_pct (Δ%)
#                   "similarity" | "questeval" | "ifeval" → use absolute value
#   ascending_rrf — True = lower value is better (rank 1 = smallest)
#   in_rrf        — include this metric in category RRF computation
CATEGORIES: list[dict] = [
    {
        "name": "Readability",
        "in_rrf": True,
        "rrf_weight": 1,
        "description": (
            "Readability indices - **orth** (surface-form) variants. "
            "Δ% = percentage change after simplification. "
            "For **Flesch RE** positive Δ% is better; for all others negative Δ% is better. "
            "**Flesch RE** rates reading ease from average sentence length and syllables per word (higher → easier, ~0–100). "
            "**Gunning Fog** estimates the schooling years needed to follow the text - 0.4 × (words/sentence + 100 × complex-word share), "
            "where complex words have many syllables (lower → easier). "
            "**Coleman-Liau** grades difficulty from average letters per 100 words and sentences per 100 words (lower → easier). "
            "IFEval exclude is the fraction of 'exclude' constraints satisfied by the simplified text (higher is better)."
        ),
        "metrics": [
            ("metrics", "flesch_reading_ease_orth", "Flesch RE", False, True),
            ("metrics", "gunning_fog_orth", "Gunning Fog", True, True),
            ("metrics", "coleman_liau_orth", "Coleman-Liau", True, True),
            ("ifeval", "avg_exclude", "IFEval exclude", False, True),
        ],
    },
    {
        "name": "Lexical Difficulty",
        "in_rrf": True,
        "rrf_weight": 1,
        "description": (
            "Word-level difficulty markers - **orth** variants where available. "
            "Δ% = percentage change. Negative Δ% indicates reduced lexical difficulty. "
            "**Avg word syllables** is the mean number of syllables per word (higher → longer, harder vocabulary). "
            "**Difficult word ratio** is the share of long words (above a syllable threshold on the surface form, excluding named entities) "
            "(higher → harder). "
            "**Verb ratio** is verbs divided by alphabetic tokens; a more verbal, less nominal style is easier, so higher is better here. "
            "**Difficult noun ratio** is the share of long nouns (above the syllable threshold, excluding named entities) among all words "
            "(higher → more complex nominal vocabulary)."
        ),
        "metrics": [
            ("markers", "avg_word_syllables", "Avg word syllables", True, True),
            ("markers", "difficult_word_ratio_orth", "Difficult word ratio", True, True),
            ("markers", "verb_ratio", "Verb ratio", False, True),
            ("markers", "difficult_noun_ratio_orth", "Difficult noun ratio", True, True),
        ],
    },
    {
        "name": "Syntactic",
        "in_rrf": True,
        "rrf_weight": 1,
        "description": (
            "Sentence and clause structure complexity markers. "
            "Δ% = percentage change. Negative Δ% generally indicates simpler syntax. "
            "**Avg sentence length** is the mean number of words per sentence (higher → harder). "
            "**Mean dep. distance** is the average linear distance between a token and its syntactic head, in tokens (lower → simpler structure). "
            "**Subordination index** is the ratio of subordinate clauses to total clauses (higher → more embedded structure, harder)."
        ),
        "metrics": [
            ("markers", "avg_sentence_length", "Avg sentence length", True, True),
            ("markers", "sentence_length_variance", "Sentence length var.", True, False),
            ("markers", "mean_dependency_distance", "Mean dep. distance", True, True),
            ("markers", "subordination_index", "Subordination index", True, True),
        ],
    },
    {
        "name": "Morphological",
        "in_rrf": True,
        "rrf_weight": 1,
        "description": (
            "Polish-specific morphological complexity markers. "
            "Δ% = percentage change. Negative Δ% indicates reduced morphological complexity. "
            "**Adverbial participle ratio** is the share of adverbial participles (imiesłowy przysłówkowe / converbs, e.g. *czytając*, *przeczytawszy*) "
            "among alphabetic tokens - a bookish, formal construction (higher → more complex). "
            "**Gerund ratio** is the share of Polish verbal nouns (rzeczowniki odsłowne, e.g. *czytanie*) among words "
            "(higher → more nominalised, formal). "
            "**Impersonal verb ratio** is the share of impersonal verb forms among all verbs - impersonal modals (*należy*, *trzeba*, *można*), "
            "passive -no/-to forms (*zrobiono*), reflexive-impersonal *się* (*mówi się*) and infinitives - typical of legal/administrative Polish "
            "(higher → more impersonal, harder). "
            "**Genitive noun ratio** is the share of genitive-case nouns among words - a hallmark of formal, bureaucratic Polish "
            "(higher → harder). "
            "**Avg genitive chain** is the mean length of runs of two or more consecutive genitive nouns (dopełniacz spiętrzony) "
            "(higher → more genitive stacking, harder). "
            "**Verbo-nominal ratio** is the share of light-verb + noun periphrases (*dokonać wpłaty*, *podjąć decyzję*) - a hallmark of "
            "administrative Polish (higher → harder). "
            "**OSC noun ratio** is the share of abstract *-ość* nouns (*możliwość*, *konieczność*) among nouns (higher → more abstract, harder)."
        ),
        "metrics": [
            ("markers", "participle_ratio", "Participle ratio", True, False),
            ("markers", "adverbial_participle_ratio", "Adverbial participle ratio", True, True),
            ("markers", "gerund_ratio", "Gerund ratio", True, True),
            ("markers", "impersonal_verb_ratio", "Impersonal verb ratio", True, True),
            ("markers", "genitive_noun_ratio", "Genitive noun ratio", True, True),
            ("markers", "avg_genitive_chain_length", "Avg genitive chain", True, True),
            ("markers", "verbo_nominal_ratio", "Verbo-nominal ratio", True, True),
            ("markers", "osc_noun_ratio", "OSC noun ratio", True, True),
        ],
    },
    {
        "name": "Meaning Preservation",
        "in_rrf": True,
        "rrf_weight": 4,
        "description": (
            "Semantic metrics that directly test whether the simplified text says the same thing as the original. "
            "NLI checks bidirectional entailment; QuestEval checks information preservation via QA. "
            "NE Retention measures what fraction of named entities from the original appear in the simplified text "
            "(matched on lemmatised tokens and surface forms, so inflected forms and full-name → surname shortenings are handled). "
            "IFEval include is the fraction of 'include' constraints satisfied by the simplified text. "
            "Higher is better for all."
        ),
        "metrics": [
            ("similarity", "nli_f1", "NLI F1", False, True),
            ("questeval", "f1", "QuestEval F1", False, True),
            ("similarity", "ne_retention", "NE Retention", False, True),
            ("ifeval", "avg_include", "IFEval include", False, True),
        ],
    },
]


def _col_name(source: str, label: str) -> str:
    """Column name used in category DataFrames."""
    if source in ("metrics", "markers"):
        return f"{label} (Δ%)"
    return label


def _model_label(data: dict) -> str:
    """Return a unique display name, appending reasoning effort when present.

    The parameter size is shown separately (see :func:`_params_str`), in its
    own column, mirroring the PLCC leaderboard layout.

    The effort is read from
    ``metadata.model_kwargs.extra_body.reasoning.effort``. Any level of that
    path may be missing or JSON ``null``; in both cases the plain model name is
    returned.
    """
    meta = data["metadata"]
    # Walk the nested path defensively: ``dict.get(key, {})`` only covers a
    # missing key, not an explicit ``null`` value, which would otherwise raise
    # AttributeError on the next ``.get``.
    node = meta.get("model_kwargs")
    for key in ("extra_body", "reasoning", "effort"):
        node = node.get(key) if isinstance(node, dict) else None
    effort = node
    model = meta["model"]
    if effort is not None:
        return f"{model} [reasoning: {effort}]"
    return model


def _params_str(params: float | None) -> str | None:
    """PLCC-style parameter label: ``1.6T`` / ``685B`` / ``8B`` (None if unknown)."""
    p = params or 0
    if p <= 0:
        return None
    return f"{p / 1000:g}T" if p >= 1000 else f"{p:g}B"


def _params_map() -> dict[str, str]:
    """Model label → formatted parameter size, read from each file's metadata."""
    out: dict[str, str] = {}
    for data in load_records():
        label = _params_str(data["metadata"].get("total_params_b"))
        if label:
            out[_model_label(data)] = label
    return out


def _metric_row(
    label_map: dict,
    summary_metrics: dict,
    row: dict,
    detail_row: dict,
    *,
    include_detail: bool = True,
) -> None:
    """Populate leaderboard row and detail row from a key→label map.

    For every ``key, label`` pair, ``row`` receives the ``(Δ)`` and ``(Δ%)``
    columns; when ``include_detail`` is true, ``detail_row`` additionally
    receives ``before`` / ``after`` / ``(Δ)`` / ``(Δ%)``. Both dicts are
    mutated in place; metrics missing from ``summary_metrics`` yield ``None``.
    """
    for key, label in label_map.items():
        vals = summary_metrics.get(key, {})
        row[f"{label} (Δ)"] = vals.get("avg_diff")
        row[f"{label} (Δ%)"] = vals.get("avg_diff_pct")
        if include_detail:
            detail_row[f"{label} before"] = vals.get("avg_before")
            detail_row[f"{label} after"] = vals.get("avg_after")
            detail_row[f"{label} (Δ)"] = vals.get("avg_diff")
            detail_row[f"{label} (Δ%)"] = vals.get("avg_diff_pct")


def load_leaderboard_data() -> tuple[pd.DataFrame, ...]:
    """Load scored JSON files and build leaderboard DataFrames.

    Returns:
        (readability_orth_df, readability_lemma_df, lexical_orth_df,
        lexical_lemma_df, similarity_df, questeval_df, markers_df, detail_df)

        All numeric columns are rounded to 4 decimals. When ``DATA_DIR`` does
        not exist, eight independent empty DataFrames are returned.
    """
    if not DATA_DIR.exists():
        # Distinct objects, so a caller mutating one frame cannot affect the rest.
        return tuple(pd.DataFrame() for _ in range(8))

    read_orth_rows, read_lemma_rows = [], []
    lex_orth_rows, lex_lemma_rows = [], []
    similarity_rows, questeval_rows, markers_rows, detail_rows = [], [], [], []

    for data in load_records():
        summary = data["summary"]
        metrics = summary["metrics"]
        similarity = summary.get("similarity", {})
        questeval = summary.get("questeval", {})
        markers = summary.get("markers", {})

        base = {"Model": _model_label(data), "N": summary["n"]}
        read_orth_row = dict(base)
        read_lemma_row = dict(base)
        lex_orth_row = dict(base)
        lex_lemma_row = dict(base)
        similarity_row = dict(base)
        questeval_row = dict(base)
        markers_row = dict(base)
        detail_row = dict(base)

        # NOTE(review): the orth and lemma label maps use identical labels
        # ("Flesch RE", "TTR", ...), so in ``detail_row`` the lemma values
        # written second overwrite the orth values. Left as-is because changing
        # the detail column names would alter the table schema - confirm intent.
        _metric_row(READABILITY_ORTH_LABELS, metrics, read_orth_row, detail_row)
        _metric_row(READABILITY_LEMMA_LABELS, metrics, read_lemma_row, detail_row)
        _metric_row(LEXICAL_ORTH_LABELS, metrics, lex_orth_row, detail_row)
        _metric_row(LEXICAL_LEMMA_LABELS, metrics, lex_lemma_row, detail_row)

        for key, label in SIMILARITY_LABELS.items():
            similarity_row[label] = similarity.get(key)
        for key, label in QUESTEVAL_LABELS.items():
            questeval_row[label] = questeval.get(key)

        # Markers carry the same before/after/Δ/Δ% structure as the metrics.
        _metric_row(MARKER_LABELS, markers, markers_row, detail_row)

        read_orth_rows.append(read_orth_row)
        read_lemma_rows.append(read_lemma_row)
        lex_orth_rows.append(lex_orth_row)
        lex_lemma_rows.append(lex_lemma_row)
        similarity_rows.append(similarity_row)
        questeval_rows.append(questeval_row)
        markers_rows.append(markers_row)
        detail_rows.append(detail_row)

    dfs = [
        pd.DataFrame(read_orth_rows),
        pd.DataFrame(read_lemma_rows),
        pd.DataFrame(lex_orth_rows),
        pd.DataFrame(lex_lemma_rows),
        pd.DataFrame(similarity_rows),
        pd.DataFrame(questeval_rows),
        pd.DataFrame(markers_rows),
        pd.DataFrame(detail_rows),
    ]
    for df in dfs:
        num_cols = df.select_dtypes(include="number").columns
        df[num_cols] = df[num_cols].round(4)
    return tuple(dfs)


@lru_cache(maxsize=1)
def _load_ifeval_records() -> tuple[tuple[str, tuple[tuple, ...]], ...]:
    """Per-model matched IFEval records, cached once.

    Manual IFEval rules are hand-written for a subset of the prompts, so the
    comparison only makes sense on records carrying *both* an automatic and a
    manual score. This reads the per-text ``results`` arrays (which
    ``load_records`` discards) once and keeps, per model, the tuples
    ``(category, prompt_id, auto_include, auto_exclude, man_include,
    man_exclude)`` so the dropdown filters can re-aggregate cheaply.

    Models with no matched record are omitted; a file without a ``results``
    array is treated as having none.
    """
    out: list[tuple[str, tuple[tuple, ...]]] = []
    if not DATA_DIR.exists():
        return ()
    for fp in sorted(DATA_DIR.glob("*_scored_anon.json")):
        with open(fp, encoding="utf-8") as f:
            data = json.load(f)
        model = _model_label(data)
        recs: list[tuple] = []
        for rec in data.get("results") or []:
            man = rec.get("ifeval_manual")
            auto = rec.get("ifeval")
            if not man or not auto:
                continue
            recs.append(
                (
                    rec.get("category"),
                    rec.get("prompt_id"),
                    auto.get("include"),
                    auto.get("exclude"),
                    man.get("include"),
                    man.get("exclude"),
                )
            )
        if recs:
            out.append((model, tuple(recs)))
    return tuple(out)


def load_ifeval_comparison_df(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> pd.DataFrame:
    """Compare manual (gold) IFEval against automatic IFEval, per model.

    The comparison is restricted to records carrying *both* an automatic and a
    manual score - the very same texts scored both ways, which isolates the
    rule-quality gap from sampling differences (the overall ``ifeval`` summary
    averages over ~5× more texts and so is not directly comparable). ``Δ``
    columns are manual − automatic: a negative value means the automatic
    constraints were easier to satisfy than the hand-checked ones, i.e. the
    automatic rules are more lenient.

    ``text_category`` and ``prompt`` (each ignored when ``None`` or ``"All"``)
    restrict the matched records to one source-text category and/or one
    simplification prompt, mirroring the RRF dropdown filters.

    Returns:
        One row per model with at least one matched record under the filters,
        sorted by model name, numeric columns rounded to 4 decimals; an empty
        DataFrame when nothing matches.
    """
    tc = _active_filter(text_category)
    pr = _active_filter(prompt)

    # Automatic IFEval over *all* records (not just the manual-matched subset),
    # from the summary buckets, so it tracks the same category/prompt filters.
    # Restricted to models passing the size / model-type filters.
    allowed = {_model_label(d) for d in _filtered_records(size_limit, model_type)}
    summaries: dict[str, dict] = {}
    for data in load_records():
        label = _model_label(data)
        if label in allowed:
            summaries[label] = data["summary"]

    rows: list[dict] = []
    for model, recs in _load_ifeval_records():
        if model not in allowed:
            continue
        # Running sums (auto/manual × include/exclude) and matched-pair counts.
        ai = ae = mi = me = 0.0
        ni = ne = 0
        for cat, prompt_id, a_inc, a_exc, m_inc, m_exc in recs:
            if tc and cat != tc:
                continue
            if pr and prompt_id != pr:
                continue
            if m_inc is not None and a_inc is not None:
                ai += a_inc
                mi += m_inc
                ni += 1
            if m_exc is not None and a_exc is not None:
                ae += a_exc
                me += m_exc
                ne += 1
        if ni == 0 and ne == 0:
            continue

        auto_inc = ai / ni if ni else None
        man_inc = mi / ni if ni else None
        auto_exc = ae / ne if ne else None
        man_exc = me / ne if ne else None

        auto_all = _source_bucket(summaries.get(model, {}), "ifeval", tc, pr)
        all_inc = auto_all.get("avg_include")
        all_exc = auto_all.get("avg_exclude")

        rows.append(
            {
                "Model": model,
                "N": ni or ne,
                "Manual include": man_inc,
                "Manual exclude": man_exc,
                "Auto include": auto_inc,
                "Auto include (all)": all_inc,
                "Δ include (man−auto)": (man_inc - auto_inc) if ni else None,
                "Δ include (man−auto all)": (
                    (man_inc - all_inc) if (ni and all_inc is not None) else None
                ),
                "Auto exclude": auto_exc,
                "Auto exclude (all)": all_exc,
                "Δ exclude (man−auto)": (man_exc - auto_exc) if ne else None,
                "Δ exclude (man−auto all)": (
                    (man_exc - all_exc) if (ne and all_exc is not None) else None
                ),
            }
        )

    df = pd.DataFrame(rows)
    if df.empty:
        return df
    df = df.sort_values("Model").reset_index(drop=True)
    num_cols = df.select_dtypes(include="number").columns
    df[num_cols] = df[num_cols].round(4)
    return df


def text_category_choices() -> list[str]:
    """All source-text categories present in the data, prefixed with 'All'."""
    cats: set[str] = set()
    for data in load_records():
        cats.update(data["summary"].get("metrics_by_category", {}).keys())
    return ["All"] + sorted(cats)


def prompt_choices() -> list[str]:
    """All simplification prompts present in the data, prefixed with 'All'."""
    prompts: set[str] = set()
    for data in load_records():
        prompts.update(data["summary"].get("metrics_by_prompt", {}).keys())
    return ["All"] + sorted(prompts)


def _source_bucket(s: dict, source: str, tc: str | None, prompt: str | None) -> dict:
    """Return the metric bucket for one source, filtered by text category and/or prompt.

    Picks the overall summary when neither filter is set, the
    ``*_by_category`` / ``*_by_prompt`` bucket when one is set, and the
    ``*_by_category_prompt`` bucket (keyed ``"CATEGORY/PROMPT"``) when both are
    set. A missing bucket yields an empty dict.
    """
    if source in ("metrics", "markers", "similarity"):
        if tc and prompt:
            return s.get(f"{source}_by_category_prompt", {}).get(f"{tc}/{prompt}", {})
        if tc:
            return s.get(f"{source}_by_category", {}).get(tc, {})
        if prompt:
            return s.get(f"{source}_by_prompt", {}).get(prompt, {})
        return s.get(source, {})

    # questeval / ifeval keep their per-filter buckets nested under the source object
    src = s.get(source, {})
    if tc and prompt:
        return src.get("by_category_prompt", {}).get(f"{tc}/{prompt}", {})
    if tc:
        return src.get("by_category", {}).get(tc, {})
    if prompt:
        return src.get("by_prompt", {}).get(prompt, {})
    return src


def _bucket_n(s: dict, tc: str | None, prompt: str | None) -> int | None:
    """Sample count for the selected filters, from whichever source records it."""
    for src in ("questeval", "ifeval"):
        n = _source_bucket(s, src, tc, prompt).get("n")
        if n is not None:
            return n
    return None


def load_category_df(
    category: dict,
    text_category: str | None = None,
    prompt: str | None = None,
) -> pd.DataFrame:
    """Build a DataFrame for one metric category with a per-category RRF score.

    ``text_category`` and ``prompt`` (each ignored when ``None`` or ``"All"``)
    restrict the metrics to one source-text category and/or one simplification
    prompt via the matching ``*_by_category`` / ``*_by_prompt`` /
    ``*_by_category_prompt`` buckets; otherwise the overall summary is used.

    The RRF is always computed over **all** models; the size-limit / model-type
    filters are applied afterwards (in ``load_rrf_views``) as pure row filters,
    so they never change a model's rank or score.

    Returns:
        Columns ``Rank, Model, Params, N, RRF Score`` followed by one column
        per ``in_rrf`` metric, sorted by RRF score (best first); an empty
        DataFrame when there are no records.
    """
    rows: list[dict] = []
    tc = _active_filter(text_category)
    pr = _active_filter(prompt)

    for data in load_records():
        s = data["summary"]
        model = _model_label(data)
        # With a filter active prefer the bucket's own sample count, falling
        # back to the overall count when the bucket does not report one.
        n = (_bucket_n(s, tc, pr) or s["n"]) if (tc or pr) else s["n"]
        row: dict = {"Model": model, "N": n}
        for source, key, label, _asc, in_rrf in category["metrics"]:
            if not in_rrf:
                continue
            col = _col_name(source, label)
            bucket = _source_bucket(s, source, tc, pr)
            if source in ("metrics", "markers"):
                row[col] = bucket.get(key, {}).get("avg_diff_pct")
            else:
                # similarity, questeval, ifeval store the value directly
                row[col] = bucket.get(key)
        rows.append(row)

    df = pd.DataFrame(rows)
    if df.empty:
        return df

    num_cols = df.select_dtypes(include="number").columns
    df[num_cols] = df[num_cols].round(4)

    rrf = pd.Series(0.0, index=df.index)
    for source, key, label, ascending, in_rrf in category["metrics"]:
        if not in_rrf:
            continue
        col = _col_name(source, label)
        # A metric nobody reports carries no ranking information: skip it.
        if col not in df.columns or df[col].isna().all():
            continue
        ranks = df[col].rank(ascending=ascending, method="min", na_option="bottom")
        rrf += 1.0 / (RRF_K + ranks)

    df.insert(2, "RRF Score", rrf.round(4))
    # Stable sort so tied scores keep a deterministic (file) order.
    df = df.sort_values("RRF Score", ascending=False, kind="stable").reset_index(drop=True)
    df.insert(0, "Rank", range(1, len(df) + 1))
    df.insert(2, "Params", df["Model"].map(_params_map()).fillna(""))
    return df


def _plcc_overall_map() -> dict[str, float]:
    """Model label → external PLCC overall score, read from each file's metadata.

    PLCC is a separate Polish-language-competence benchmark (sdadas/plcc); the
    score is carried verbatim in ``metadata.plcc.overall`` and shown for
    reference only - it does not feed the RRF ranking. Models without a PLCC
    entry are omitted (mapped to NaN in the table).
    """
    out: dict[str, float] = {}
    for data in load_records():
        plcc = data["metadata"].get("plcc") or {}
        overall = plcc.get("overall")
        if overall is not None:
            out[_model_label(data)] = overall
    return out


def build_final_ranking_df(category_data: list[tuple[dict, pd.DataFrame]]) -> pd.DataFrame:
    """Fuse per-category RRF scores into a final ranking via RRF.

    Each category column shows the model's **rank within that category**
    (1 = best); those ranks are what the RRF fusion uses to produce the overall
    ``Final RRF``. A reference ``PLCC`` column carries the external PLCC
    benchmark score and does not influence the ranking.

    Args:
        category_data: ``(category dict, category DataFrame)`` pairs, the
            frames as produced by :func:`load_category_df`. Categories with
            ``in_rrf`` false or an empty frame are ignored.

    Returns:
        Columns ``Rank, Model, Params, N, Final RRF, PLCC`` followed by one
        rank column per fused category; an empty DataFrame when nothing can be
        fused.
    """
    merged: pd.DataFrame | None = None
    for cat, cat_df in category_data:
        if not cat.get("in_rrf", True) or cat_df.empty:
            continue
        sub = cat_df[["Model", "RRF Score"]].rename(columns={"RRF Score": cat["name"]})
        merged = sub if merged is None else merged.merge(sub, on="Model", how="outer")

    if merged is None or merged.empty:
        return pd.DataFrame()

    # N (sample count) is identical across categories for a given model, so take
    # it from whichever category table carries it.
    n_map: dict = {}
    for _cat, cat_df in category_data:
        if not cat_df.empty and {"Model", "N"} <= set(cat_df.columns):
            n_map = dict(zip(cat_df["Model"], cat_df["N"]))
            break

    score_cols = [c for c in merged.columns if c != "Model"]

    # Default weights come from the module-level CATEGORIES; a weight declared
    # on a category dict actually passed in takes precedence, so callers that
    # supply their own categories are not silently ignored.
    weights = {cat["name"]: cat.get("rrf_weight", 1) for cat in CATEGORIES}
    for cat, _cat_df in category_data:
        if "rrf_weight" in cat:
            weights[cat["name"]] = cat["rrf_weight"]

    out = merged[["Model"]].copy()
    rrf = pd.Series(0.0, index=merged.index)
    rank_cols: dict[str, pd.Series] = {}
    for col in score_cols:
        # A model missing from a category (NaN after the outer merge) ranks last.
        ranks = merged[col].rank(ascending=False, method="min", na_option="bottom").astype(int)
        rrf += weights.get(col, 1) / (RRF_K + ranks)
        rank_cols[col] = ranks

    out.insert(1, "Final RRF", rrf.round(4))
    out.insert(2, "PLCC", out["Model"].map(_plcc_overall_map()).round(2))
    for name, ranks in rank_cols.items():
        out[name] = ranks

    # Stable sort so tied scores keep a deterministic order.
    out = out.sort_values("Final RRF", ascending=False, kind="stable").reset_index(drop=True)
    out.insert(0, "Rank", range(1, len(out) + 1))
    out.insert(2, "Params", out["Model"].map(_params_map()).fillna(""))
    out.insert(3, "N", out["Model"].map(n_map).astype("Int64"))
    return out


def _fog_scatter(
    y_source: str,
    y_key: str,
    y_label: str,
    title: str,
    color: str,
    text_category: str | None,
    prompt: str | None,
    size_limit: str | None,
    model_type: str | None,
) -> go.Figure | None:
    """Scatter of Gunning Fog orth Δ% (x) against one meaning metric (y).

    Shared implementation of the two public trade-off plots. ``y_source`` /
    ``y_key`` select the y value from the summary bucket, ``y_label`` names it
    in the hover box and axis title. Models lacking either coordinate are
    skipped; ``None`` is returned when no model has both.
    """
    tc = _active_filter(text_category)
    pr = _active_filter(prompt)

    points = []
    for data in _filtered_records(size_limit, model_type):
        s = data["summary"]
        x = _source_bucket(s, "metrics", tc, pr).get("gunning_fog_orth", {}).get("avg_diff_pct")
        y = _source_bucket(s, y_source, tc, pr).get(y_key)
        if x is None or y is None:
            continue
        points.append((_model_label(data), x, y))

    if not points:
        return None

    models, xs, ys = zip(*points)

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=xs,
            y=ys,
            mode="markers+text",
            text=models,
            textposition="top center",
            textfont={"size": 10},
            marker={"size": 12, "color": color, "line": {"width": 1, "color": "white"}},
            # Plotly renders line breaks in hover text from ``<br>``.
            hovertemplate=(
                f"%{{text}}<br>Gunning Fog Δ%: %{{x:.2f}}<br>{y_label}: %{{y:.3f}}"
            ),
        )
    )

    # Dotted guides through the midpoint of the plotted range split the plane
    # into four quadrants.
    x_mid = (min(xs) + max(xs)) / 2
    y_mid = (min(ys) + max(ys)) / 2
    fig.add_hline(y=y_mid, line_dash="dot", line_color="lightgray")
    fig.add_vline(x=x_mid, line_dash="dot", line_color="lightgray")

    fig.update_layout(
        title=title,
        xaxis_title="Gunning Fog orth Δ% (← easier text)",
        yaxis_title=f"{y_label} (↑ meaning preserved)",
        height=560,
        margin={"l": 60, "r": 40, "t": 60, "b": 60},
        plot_bgcolor="white",
    )
    fig.update_xaxes(showgrid=True, gridcolor="#EEE", zeroline=True, zerolinecolor="#CCC")
    fig.update_yaxes(showgrid=True, gridcolor="#EEE")
    return fig


def build_tradeoff_scatter(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> go.Figure | None:
    """Scatter of Gunning Fog reduction vs meaning preservation, one point per model.

    X: Gunning Fog orth Δ% (more negative = greater complexity reduction)
    Y: QuestEval F1 (higher = better meaning preservation)

    Honours the same text-category / prompt / size / model-type filters as the
    RRF rankings. Returns ``None`` when no model has both values.
    """
    return _fog_scatter(
        "questeval",
        "f1",
        "QuestEval F1",
        "Complexity reduction vs meaning preservation",
        "#4C78A8",
        text_category,
        prompt,
        size_limit,
        model_type,
    )


def build_fog_nli_scatter(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> go.Figure | None:
    """Scatter of Gunning Fog reduction vs NLI F1, one point per model.

    X: Gunning Fog orth Δ% (more negative = greater complexity reduction)
    Y: NLI F1 (higher = stronger entailment / meaning preserved)

    Honours the same text-category / prompt / size / model-type filters as the
    RRF rankings. Returns ``None`` when no model has both values.
    """
    return _fog_scatter(
        "similarity",
        "nli_f1",
        "NLI F1",
        "Complexity reduction vs NLI consistency",
        "#E45756",
        text_category,
        prompt,
        size_limit,
        model_type,
    )


INTRO = """\
# PLainBench - Polish Text Simplification Leaderboard

This benchmark evaluates how well LLMs simplify difficult Polish texts - drawn from legal/administrative (BIP/GOV), finance, and science domains - while preserving the original meaning. Each model simplifies 210 source texts under 5 simplification prompts (1050 outputs per model). Outputs are scored on readability indices, fine-grained difficulty markers (lexical, syntactic, morphological), meaning preservation (NLI entailment, QuestEval QA consistency, named-entity retention), and instruction following (IFEval include/exclude). The per-category scores are fused into an overall **Final RRF** ranking.
"""

METRICS_DOC = """\
## Metrics

### Readability indices

All indices are adapted for Polish syllable counting via `pyhyphen` (pl_PL dictionary) and counted on surface (orthographic) word forms. Δ is the absolute change (after − before); Δ% is the average percentage change from the original text to the simplified text.

| Metric | Formula | Interpretation |
|---|---|---|
| **Flesch Reading Ease** | `206.835 − 1.015 × (words/sentences) − 84.6 × (syllables/words)` | Higher → easier text (0–100 typical range). **Desired Δ%: positive (+)** |
| **Gunning Fog** | `0.4 × [(words/sentences) + 100 × (complex_words/words)]` | School years needed (complex = ≥ 4 syllables). Lower → easier. **Desired Δ%: negative (−)** |
| **Coleman-Liau** | `0.0588 × L − 0.296 × S − 15.8` | Character-based grade level. Lower → easier. **Desired Δ%: negative (−)** |

### Difficulty markers

Fine-grained syntactic, morphological, and lexical features. Δ is absolute change; Δ% is percentage change. Difficult words are defined as not a named entity, ≥ 4 syllables, counted on the surface (orthographic) form.

| Marker | Description | Desired Δ% |
|---|---|---|
| **Avg word syllables** | Mean syllable count per word | − (shorter words) |
| **Difficult word ratio (orth)** | Difficult words / all words (surface, excl. NEs) | − |
| **Difficult noun ratio (orth)** | Difficult nouns / all tokens (surface, excl. NEs) | − |
| **Verb ratio** | Verbs / all tokens | + (more verbal, less nominal) |
| **Avg sentence length** | Mean tokens per sentence | − (shorter sentences) |
| **Mean dep. distance** | Avg linear head-dependent distance (syntax complexity) | − (flatter syntax) |
| **Subordination index** | Subordinate clauses / total clauses | − |
| **Adverbial participle ratio** | Adverbial participles (converbs, e.g. *czytając*, *przeczytawszy*) / all tokens | − |
| **Gerund ratio** | Gerunds / all tokens | − |
| **Impersonal verb ratio** | Impersonal verb forms (modals *należy*/*trzeba*, -no/-to passives, reflexive *się*, infinitives) / all verbs | − |
| **Genitive noun ratio** | Nouns in genitive case / all tokens | − |
| **Avg genitive chain** | Mean length of consecutive genitive noun phrases | − |
| **Verbo-nominal ratio** | Light-verb + noun periphrases (*dokonać wpłaty*, *podjąć decyzję*); administrative style | − |
| **OSC noun ratio** | Abstract *-ość* nouns (*możliwość*, *konieczność*) / all nouns | − |

### Similarity metrics

Reference-based metrics comparing simplified text against the original.

| Metric | Description | Direction |
|---|---|---|
| **NLI P / R / F1** | NLI consistency via stella embeddings + mDeBERTa cross-encoder | Higher = stronger entailment |
| **NE Retention** | Fraction of named entities from the original kept in the simplified text | Higher = more entities preserved |

*Only **NLI F1** feeds the RRF score; P and R are shown for context.*

### QuestEval - QA consistency

| Metric | Description | Direction |
|---|---|---|
| **QuestEval P** | Backward precision - grounding of simplified claims | Higher = fewer hallucinations |
| **QuestEval R** | Forward recall - information preserved | Higher = less content dropped |
| **QuestEval F1** | Harmonic mean of P and R | Higher = better meaning preservation |
| **Answerable (fwd)** | Fraction of forward questions answerable | Higher = stays on-topic |
| **Answerable (bwd)** | Fraction of backward questions answerable | Higher = claims traceable to original |

*Only **QuestEval F1** feeds the RRF score; the other rows are shown for context.*

### IFEval - instruction following

| Metric | Description | Direction |
|---|---|---|
| **IFEval include** | Fraction of *include* constraints (terms the simplification must keep) satisfied | Higher = better |
| **IFEval exclude** | Fraction of *exclude* constraints (terms the simplification must avoid) satisfied | Higher = better |
"""

# Sample-count note shown under each table that carries an ``N`` column.
N_NOTE = "**N** = number of prompt × text evaluations per model."

# The five simplification prompts every model is run with. The keys match the
# "Simplification prompt" filter values (and the ``*_by_prompt`` summary
# buckets); each value is ``(short description, user-message template)``, where
# a placeholder marks where the source text is inserted. Kept in sync with
# generation/prompting/instruction.py. Ordered from least to most detailed.
# Maps prompt key -> (short description, user-message template). Insertion
# order is the display order of the accordions built from this mapping.
# The templates are line-structured (bullets, numbered rules, headings), so
# each list item sits on its own line; they are displayed inside a fenced code
# block, where line breaks are shown literally.
# NOTE(review): blank lines and sub-bullet indentation inside the templates
# should be confirmed against generation/prompting/instruction.py.
PROMPTS: dict[str, tuple[str, str]] = {
    "mini": (
        "Minimal - a single-line instruction, no rules.",
        "Uprość podany tekst, w odpowiedzi podaj jedynie uproszczony tekst, "
        "bez dodatkowych komentarzy. Tekst do uproszczenia:\n\n",
    ),
    "compact": (
        "Compact - a short bulleted rule set.",
        """Uprość poniższy tekst, tak aby był jasny, krótki i zrozumiały dla osoby bez wiedzy specjalistycznej.

Zasady:
- Skup się na najważniejszych informacjach, usuń zbędne treści.
- Uporządkuj tekst: najważniejsze na początku, podziel na krótkie akapity.
- Używaj prostego, codziennego słownictwa; trudne pojęcia zastępuj lub krótko wyjaśniaj.
- Twórz krótkie zdania (jedna myśl = jedno zdanie).
- Pisz bezpośrednio do odbiorcy i używaj strony czynnej.
- Unikaj żargonu, urzędowego stylu, skomplikowanych konstrukcji i podwójnych przeczeń.
- Zachowaj poprawność językową i logiczną spójność.
- W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.

---
### Tekst do uproszczenia:
""",
    ),
    "medium": (
        "Medium - moderately detailed rules with sub-points.",
        """Uprość poniższy tekst tak, aby był zrozumiały dla osoby bez wiedzy specjalistycznej, zachowując jego sens.

### Zasady:
- Skup się na celu tekstu i najważniejszych informacjach — usuń dygresje i zbędne treści.
- Uporządkuj treść: zacznij od najważniejszych informacji, podziel tekst na krótkie akapity.
- Stosuj proste i naturalne słownictwo:
  - zamieniaj trudne lub specjalistyczne wyrazy na prostsze,
  - jeśli trzeba — krótko je wyjaśnij lub podaj przykład.
- Buduj krótkie zdania (jedna myśl = jedno zdanie, ok. 15–20 słów).
- Pisz bezpośrednio do odbiorcy i używaj strony czynnej.
- Unikaj:
  - żargonu, stylu urzędowego i zapożyczeń,
  - form bezosobowych i strony biernej (jeśli nie są konieczne),
  - nadmiaru rzeczowników odczasownikowych,
  - podwójnych przeczeń i zawiłych konstrukcji.
- Zachowaj poprawność językową, spójność i logiczny układ tekstu.
- W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.

---
### Tekst do uproszczenia:
""",
    ),
    "long": (
        "Long - full, sectioned plain-language guidelines.",
        """Uprość poniższy tekst zgodnie z zasadami prostego języka.

### 1. Cel i odbiorca
- Dostosuj tekst do przeciętnego odbiorcy (zakładaj brak wiedzy specjalistycznej).
- Skup się na najważniejszych informacjach.

### 2. Struktura
- Usuń informacje zbędne i poboczne.
- Uporządkuj treść: najważniejsze informacje podaj na początku.
- Podziel tekst na krótkie akapity (1 akapit = 1 myśl).
- Jeśli tekst jest dłuższy, użyj nagłówków lub list.

### 3. Słownictwo
- Zastępuj trudne słowa prostszymi.
- Unikaj:
  - terminów specjalistycznych (chyba że je wyjaśnisz),
  - słów rzadkich, książkowych i urzędowych,
  - zapożyczeń i modnych zwrotów,
  - skrótów niezrozumiałych dla odbiorcy.
- W razie potrzeby:
  - wyjaśnij trudne pojęcia,
  - podaj przykłady,
  - używaj konkretnych nazw zamiast ogólników.

### 4. Składnia
- Twórz krótkie zdania (ok. 20 słów).
- Jedno zdanie = jedna myśl.
- Używaj zdań twierdzących.
- Pisz bezpośrednio do odbiorcy (np. „Ty”, „możesz”, „zrób”).
- Używaj strony czynnej zamiast biernej.
- Unikaj form bezosobowych i skomplikowanych konstrukcji.
- Ogranicz rzeczowniki odczasownikowe (zamieniaj je na czasowniki).

### 5. Styl
- Unikaj podwójnych przeczeń.
- Upraszczaj złożone konstrukcje.
- Zachowaj naturalny, jasny ton.

### 6. Końcowa kontrola
- Sprawdź, czy tekst jest:
  - zrozumiały,
  - poprawny językowo,
  - logiczny i spójny.

### 7. W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.

---
### Tekst do uproszczenia:
""",
    ),
    "step_by_step": (
        "Step by step - role-based, numbered editorial guidelines.",
        """Jesteś redaktorem odpowiedzialnym za wydawanie tekstów pisanych prostym językiem. Krok po kroku upraszczaj tekst kierując się poniższymi regułami:
1. Zachowaj prostotę - unikaj skomplikowanych i trudnych słów oraz zdań.
2. Używaj zrozumiałego języka - pamiętaj, że tekst ma być czytelny dla szerokiego grona odbiorców, nie tylko dla osób z wiedzą specjalistyczną z danej dziedziny.
3. Unikaj języka żargonowego czy specjalistycznego, jeśli czujesz, że nie jest on zrozumiały dla większości czytelników.
4. Unikaj zbytnio skomplikowanych zwrotów - stawiaj na klarowność i prostotę.
5. Utrzymuj spójność - trzymaj się jednego stylu pisania i konsekwentnie go stosuj.
6. Unikaj nadmiernego użycia skrótów czy akronimów – jeśli ich używasz, to upewnij się, że są one łatwe do zrozumienia dla wszystkich czytelników.
7. Przemyśl kolejność i strukturę informacji - rozmieszczenie treści powinno być logiczne i przemyślane.
8. Unikaj zbędnego wydłużania tekstu, pisz zwięźle i konkretnie.
9. W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.

---
### Tekst do uproszczenia:
""",
    ),
}

# ── PLCC-inspired visual style ──────────────────────────────────────────────
# Mirrors the sdadas/plcc leaderboard: clean white background, a system
# sans-serif stack, a blue accent (#2c7be5), and flat tables with shaded
# (#f9fafd) headers, #ddd borders and faintly striped rows. Done entirely in
# CSS — a custom gr.themes.* would tint the component label chips blue, which
# is not part of the PLCC look.
PLCC_CSS = """
.gradio-container {
    font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto,
        "Helvetica Neue", Arial, sans-serif !important;
    max-width: 1500px !important;
}

/* PLCC-style data tables */
.plain-table table {
    border: 1px solid #ddd !important;
    font-size: 0.9em;
}
.plain-table thead th {
    background: #f9fafd !important;
    border-bottom: 2px solid #ddd !important;
    color: #222 !important;
    font-weight: 700 !important;
}
.plain-table tbody td {
    padding: 8px 10px !important;
}
.plain-table tbody tr:nth-child(even) > td {
    background: rgba(0, 0, 0, 0.02) !important;
}
.plain-table tbody tr:hover > td {
    background: rgba(44, 123, 229, 0.06) !important;
}

/* Params column (3rd) — right-aligned, PLCC-style; cells muted grey, header black */
.params-col tbody td:nth-child(3),
.params-col thead th:nth-child(3) {
    text-align: right !important;
    white-space: nowrap;
}
.params-col tbody td:nth-child(3) {
    color: #999 !important;
}

/* Filter bar — the grey rounded block holding the dropdowns */
.filter-bar {
    background: #f9fafd;
    border: 1px solid #ddd;
    border-radius: 0.5rem;
    padding: 10px 14px;
}
"""

# Colour palette for category bars
_CAT_COLORS = ["#4C78A8", "#72B7B2", "#54A24B", "#EECA3B", "#B279A2", "#E45756", "#F58518"]


def _filter_model_rows(df: pd.DataFrame, allowed: set[str]) -> pd.DataFrame:
    """Keep only rows whose ``Model`` is in ``allowed`` (ranks/scores untouched).

    An empty frame, or one without a ``Model`` column, is returned unchanged.
    Otherwise the surviving rows get a fresh 0-based index.
    """
    if df.empty or "Model" not in df.columns:
        return df
    return df[df["Model"].isin(allowed)].reset_index(drop=True)


def load_rrf_views(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> tuple[pd.DataFrame, list[tuple[dict, pd.DataFrame]]]:
    """Final ranking DataFrame and per-category DataFrames for the selected filters.

    Ranks and RRF scores are computed over **all** models (honouring only the
    text-category / prompt filters). The size-limit and model-type selections
    are then applied as pure row filters that hide models without recomputing
    any ranking - so a surviving model keeps the rank it held in the full
    table.

    Returns:
        ``(final_df, category_data)`` where ``category_data`` holds one
        ``(category, DataFrame)`` pair per entry of ``CATEGORIES``, in order.
    """
    category_data = [
        (cat, load_category_df(cat, text_category, prompt)) for cat in CATEGORIES
    ]
    # The final ranking is built from the unfiltered category tables; rows
    # are hidden only afterwards.
    final_df = build_final_ranking_df(category_data)

    allowed = {_model_label(d) for d in _filtered_records(size_limit, model_type)}
    final_df = _filter_model_rows(final_df, allowed)
    category_data = [(cat, _filter_model_rows(df, allowed)) for cat, df in category_data]
    return final_df, category_data


def _tradeoff_figs(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> tuple[go.Figure, go.Figure]:
    """Both trade-off scatters for the selected filters (empty figure when no data)."""
    # The builders return None when there is nothing to plot; substitute a
    # blank figure in that case.
    return (
        build_tradeoff_scatter(text_category, prompt, size_limit, model_type) or go.Figure(),
        build_fog_nli_scatter(text_category, prompt, size_limit, model_type) or go.Figure(),
    )


def build_app() -> gr.Blocks:
    """Assemble the leaderboard UI and wire the filter dropdowns to it.

    Loads the tables and figures once for the initial (unfiltered) view, lays
    out the filter bar, tabs and documentation, and registers one change
    handler on every filter dropdown that refreshes all reactive components.

    Returns:
        The constructed ``gr.Blocks`` app (not launched).
    """
    (
        read_orth_df,
        read_lemma_df,
        lex_orth_df,
        lex_lemma_df,
        similarity_df,
        questeval_df,
        markers_df,
        detail_df,
    ) = load_leaderboard_data()
    ifeval_cmp_df = load_ifeval_comparison_df()
    final_df, category_data = load_rrf_views(None, None)
    tc_choices = text_category_choices()
    pr_choices = prompt_choices()
    size_choices = _visible_size_limits()
    tradeoff_fig, fog_nli_fig = _tradeoff_figs(None, None)

    with gr.Blocks(title="PLainBench", css=PLCC_CSS) as app:
        gr.Markdown(INTRO)

        if read_orth_df.empty:
            gr.Markdown(
                "*No data found. Upload scored anon JSON files to the `data/current/` directory.*"
            )
        else:
            # Reactive output components, gathered in the order the change
            # handler returns them: final table, then one table per in-RRF
            # category, then the two trade-off scatters (and the IFEval table).
            rrf_outputs: list = []

            with gr.Row(elem_classes=["filter-bar"]):
                tc_dropdown = gr.Dropdown(
                    choices=tc_choices,
                    value="All",
                    label="Text category",
                    info="Filter the RRF rankings to one source-text category.",
                )
                pr_dropdown = gr.Dropdown(
                    choices=pr_choices,
                    value="All",
                    label="Simplification prompt",
                    info="Filter the RRF rankings to one simplification prompt.",
                )
                size_dropdown = gr.Dropdown(
                    choices=size_choices,
                    value="ALL",
                    label="Size limit",
                    info="Keep only models up to this many parameters.",
                )
                type_dropdown = gr.Dropdown(
                    choices=MODEL_TYPES,
                    value="ALL",
                    label="Model type",
                    info="Filter by open- vs closed-weights models.",
                )

            with gr.Tabs():
                # ── Final Ranking ──────────────────────────────────────────
                with gr.TabItem("Final Ranking"):
                    gr.Markdown(
                        "Final model ranking via **Reciprocal Rank Fusion** (k=60) over per-category RRF scores. "
                        "Each category ranks models by its own RRF score; those ranks are then fused into a "
                        "single **Final RRF** score. Higher = better overall simplification. "
                        "The **PLCC** column shows the model's score on the external "
                        "[PLCC](https://huggingface.co/spaces/sdadas/plcc) Polish-language-competence "
                        "benchmark for reference only - it does not affect the ranking (blank where unavailable)."
                    )
                    final_table = gr.Dataframe(
                        value=final_df,
                        interactive=False,
                        wrap=True,
                        elem_classes=["plain-table", "params-col"],
                    )
                    gr.Markdown(N_NOTE)
                    rrf_outputs.append(final_table)

                # ── RRF category tabs ──────────────────────────────────────
                for cat, cat_df in category_data:
                    # Categories flagged ``in_rrf: False`` get no tab; the
                    # refresh handler skips them the same way.
                    if not cat.get("in_rrf", True):
                        continue
                    with gr.TabItem(cat["name"]):
                        gr.Markdown(cat["description"])
                        cat_table = gr.Dataframe(
                            value=cat_df,
                            interactive=False,
                            wrap=True,
                            elem_classes=["plain-table", "params-col"],
                        )
                        gr.Markdown(N_NOTE)
                        rrf_outputs.append(cat_table)

                # ── Trade-off plots ────────────────────────────────────────
                with gr.TabItem("Trade-off"):
                    gr.Markdown(
                        "Complexity reduction (Gunning Fog orth Δ%) versus meaning preservation "
                        "(QuestEval F1), one point per model. Top-left is ideal: "
                        "greater complexity reduction **and** faithful to the original."
                    )
                    tradeoff_plot = gr.Plot(value=tradeoff_fig)
                    gr.Markdown(
                        "---\n"
                        "Gunning Fog orth reduction (Δ%) versus NLI F1. "
                        "Top-left is best: greater complexity reduction **and** strong NLI entailment."
                    )
                    fog_nli_plot = gr.Plot(value=fog_nli_fig)
                    rrf_outputs += [tradeoff_plot, fog_nli_plot]

                # Built but hidden (``visible=False``); not part of the
                # reactive outputs.
                with gr.TabItem("Detailed scores", visible=False):
                    gr.Markdown(
                        "Average scores before and after simplification, plus absolute (Δ) "
                        "and percentage (Δ%) change - for all readability, lexical, and marker metrics."
                    )
                    gr.Dataframe(
                        value=detail_df,
                        interactive=False,
                        wrap=True,
                        elem_classes=["plain-table"],
                    )

                # ── IFEval: manual vs automatic ────────────────────────────
                if not ifeval_cmp_df.empty:
                    with gr.TabItem("IFEval manual vs auto"):
                        gr.Markdown(
                            "**Automatic** IFEval constraints are generated by an LLM; "
                            "**manual** constraints are hand-written gold rules, available for a "
                            "subset of the prompts. To isolate rule quality from sampling, the "
                            "comparison is restricted to the texts that carry **both** scores "
                            "(N = matched texts per model), so these automatic figures differ from "
                            "the full-sample IFEval used elsewhere.\n\n"
                            "**include** = fraction of *include* constraints satisfied, "
                            "**exclude** = fraction of *exclude* constraints satisfied (higher is "
                            "better for both). **Δ = manual − automatic** (on the matched texts): a "
                            "negative Δ means the automatic rules were easier to satisfy than the "
                            "hand-checked ones (more lenient automatic scoring). The **(all)** columns "
                            "show automatic IFEval over *every* text (the full-sample figure used "
                            "elsewhere). **Δ (man−auto all)** is manual minus that full-sample "
                            "automatic value - useful as a sanity check, but note the two cover "
                            "different text sets (matched subset vs. all texts), so **Δ (man−auto)** "
                            "is the rigorous like-for-like comparison."
                        )
                        ifeval_cmp_table = gr.Dataframe(
                            value=ifeval_cmp_df,
                            interactive=False,
                            wrap=True,
                            elem_classes=["plain-table"],
                        )
                        rrf_outputs.append(ifeval_cmp_table)

            # Metric documentation, shown below the results.
            gr.Markdown(METRICS_DOC)

            # Simplification prompts, documenting the "Simplification prompt"
            # filter values — shown below the metric documentation.
            gr.Markdown(
                "## Simplification prompts\n\n"
                "The five prompt templates every model is run with - these are the "
                "values of the **Simplification prompt** filter above. Each source "
                "text is simplified once per prompt, so they range from a bare "
                "one-line instruction to full plain-language guidelines. "
                # An empty inline-code span used to open this sentence and
                # rendered as stray backticks; state the insertion point in
                # words instead.
                "The source text is appended after the closing "
                "\"Tekst do uproszczenia:\" line of each template."
            )
            for _name, (_desc, _body) in PROMPTS.items():
                with gr.Accordion(f"{_name} - {_desc}", open=False):
                    gr.Markdown(f"```\n{_body}\n```")

            # Recompute the RRF rankings whenever any filter changes.
            _filters = [tc_dropdown, pr_dropdown, size_dropdown, type_dropdown]

            def _refresh_rrf(
                text_category: str, prompt: str, size_limit: str, model_type: str
            ) -> list:
                """Return fresh values for ``rrf_outputs``, in the same order."""
                f_df, cat_data = load_rrf_views(text_category, prompt, size_limit, model_type)
                updates: list = [f_df]
                for cat, df in cat_data:
                    if not cat.get("in_rrf", True):
                        continue
                    updates.append(df)
                updates += list(_tradeoff_figs(text_category, prompt, size_limit, model_type))
                # The IFEval table exists only when the initial comparison
                # frame was non-empty, so test the same frame here to keep
                # the update count equal to the number of outputs.
                if not ifeval_cmp_df.empty:
                    updates.append(
                        load_ifeval_comparison_df(text_category, prompt, size_limit, model_type)
                    )
                return updates

            for _dd in _filters:
                _dd.change(_refresh_rrf, inputs=_filters, outputs=rrf_outputs)

    return app


# Built at import time and exposed as module-level ``app``.
app = build_app()

if __name__ == "__main__":
    app.launch()