Spaces:
Running
Running
| """PLainBench - Polish Text Simplification Leaderboard. | |
| Reads scored anon JSON files from the data/current/ directory and displays a | |
| leaderboard showing how well each LLM simplifies Polish texts, measured | |
| by readability indices, difficulty markers, reference-based similarity | |
| metrics, and a QuestEval-style QA consistency score. | |
| """ | |
| import json | |
| from functools import lru_cache | |
| from pathlib import Path | |
| import gradio as gr | |
| import pandas as pd | |
| import plotly.graph_objects as go | |
# Directory holding the scored, anonymised result files, resolved relative to
# this module so the app works regardless of the current working directory.
DATA_DIR = Path(__file__).parent.joinpath("data", "current")
@lru_cache(maxsize=1)
def load_records() -> tuple[dict, ...]:
    """Parse every scored anon JSON once and cache the result.

    The full files are large (~9 MB each, holding per-text records), but the
    app only ever reads ``metadata`` and ``summary``. We keep just those two
    sections so each file is parsed a single time and every loader/refresh
    reuses the in-memory copy instead of re-reading from disk.

    The result is memoised with ``functools.lru_cache``; call
    ``load_records.cache_clear()`` to force a re-read after the data
    directory changes.

    Returns:
        One ``{"metadata": ..., "summary": ...}`` dict per
        ``*_scored_anon.json`` file, ordered by file path. Empty when the data
        directory does not exist.

    Raises:
        KeyError: If a file lacks a ``metadata`` or ``summary`` section.
    """
    if not DATA_DIR.exists():
        return ()
    records: list[dict] = []
    for fp in sorted(DATA_DIR.glob("*_scored_anon.json")):
        with open(fp, encoding="utf-8") as f:
            data = json.load(f)
        # Drop the bulky per-text payload; only these two sections are used.
        records.append({"metadata": data["metadata"], "summary": data["summary"]})
    # A tuple keeps the cached top-level container immutable for all callers.
    return tuple(records)
# Quick-filter choices at model level (same idea as the PLCC benchmark's
# "Size limit" / "Model type" filters). Every numeric size option is an
# *upper bound* expressed in billions of parameters; "ALL" disables the filter.
SIZE_LIMITS = ["ALL"] + [f"{cap}B" for cap in (500, 100, 50, 30, 20, 15, 10, 5)]
MODEL_TYPES = ["ALL"] + [f"{kind}-weights" for kind in ("open", "closed")]
| def _model_passes(meta: dict, size_limit: str, model_type: str) -> bool: | |
| """Whether a model's metadata satisfies the size-limit / model-type filters.""" | |
| if model_type and model_type != "ALL": | |
| want = "open" if model_type == "open-weights" else "closed" | |
| if meta.get("weights") != want: | |
| return False | |
| if size_limit and size_limit != "ALL": | |
| cap = float(size_limit.rstrip("B")) | |
| params = meta.get("total_params_b") or 0 | |
| # Unknown / unreported size (0) can't be placed under a cap, so exclude it. | |
| if params <= 0 or params > cap: | |
| return False | |
| return True | |
def _filtered_records(
    size_limit: str | None = None, model_type: str | None = None
) -> list[dict]:
    """Return the loaded records whose model survives both quick-filters.

    A missing or empty ``size_limit`` / ``model_type`` is treated as ``"ALL"``.
    """
    size_filter = size_limit if size_limit else "ALL"
    type_filter = model_type if model_type else "ALL"
    selected: list[dict] = []
    for record in load_records():
        if _model_passes(record["metadata"], size_filter, type_filter):
            selected.append(record)
    return selected
def _visible_size_limits() -> list[str]:
    """Return the subset of ``SIZE_LIMITS`` worth offering in the UI.

    Two numeric caps that admit the same number of models are equivalent
    filters; only the smaller of the two is kept. Caps that admit no model at
    all are dropped as well. ``"ALL"`` always comes first and the surviving
    caps keep the order they have in ``SIZE_LIMITS``. Because the answer is
    derived from the loaded data, new models can bring hidden caps back.
    """
    known_sizes = []
    for record in load_records():
        size = record["metadata"].get("total_params_b") or 0
        if size > 0:
            known_sizes.append(size)

    # Walk the caps from smallest to largest so the first cap reaching a given
    # model count is the one that represents it.
    numeric_caps = [
        (float(option.rstrip("B")), option)
        for option in SIZE_LIMITS
        if option != "ALL"
    ]
    numeric_caps.sort(key=lambda pair: pair[0])

    informative: set[str] = set()
    last_count = -1
    for cap, option in numeric_caps:
        covered = sum(1 for size in known_sizes if size <= cap)
        if covered > 0 and covered != last_count:
            informative.add(option)
            last_count = covered

    # Re-emit in the original (descending) display order, "ALL" leading.
    return ["ALL"] + [
        option
        for option in SIZE_LIMITS
        if option != "ALL" and option in informative
    ]
# Readability indices, keyed by the metric stem. The scored files carry each
# index twice - computed on surface forms ("_orth") and on lemmas ("_lemma") -
# so both public label maps are derived from this single table.
_READABILITY_INDEX_LABELS = {
    "flesch_reading_ease": "Flesch RE",
    "flesch_kincaid_grade": "Flesch-Kincaid",
    "gunning_fog": "Gunning Fog",
    "ari": "ARI",
    "linsear_write": "Linsear Write",
    "smog_grade": "SMOG",
    "coleman_liau": "Coleman-Liau",
    "pisarek": "Pisarek",
}

# Metric key -> display label, surface-form variant.
READABILITY_ORTH_LABELS = {
    f"{stem}_orth": label for stem, label in _READABILITY_INDEX_LABELS.items()
}

# Metric key -> display label, lemma variant.
READABILITY_LEMMA_LABELS = {
    f"{stem}_lemma": label for stem, label in _READABILITY_INDEX_LABELS.items()
}
# Lexical-diversity measures, keyed by the metric stem. As with readability,
# each measure exists in an "_orth" (surface-form) and a "_lemma" variant, so
# both public label maps are generated from this one table.
_LEXICAL_DIVERSITY_LABELS = {
    "ttr": "TTR",
    "rttr": "RTTR",
    "cttr": "CTTR",
    "herdan": "Herdan",
    "summer": "Summer",
    "dugast": "Dugast",
    "maas": "Maas",
    "mtld": "MTLD",
    "mattr": "MATTR",
}

# Metric key -> display label, surface-form variant.
LEXICAL_ORTH_LABELS = {
    f"{stem}_orth": label for stem, label in _LEXICAL_DIVERSITY_LABELS.items()
}

# Metric key -> display label, lemma variant.
LEXICAL_LEMMA_LABELS = {
    f"{stem}_lemma": label for stem, label in _LEXICAL_DIVERSITY_LABELS.items()
}
def _prf_labels(prefix: str, label: str) -> dict[str, str]:
    """Expand one metric family into its precision / recall / F1 entries."""
    return {
        f"{prefix}_precision": f"{label} P",
        f"{prefix}_recall": f"{label} R",
        f"{prefix}_f1": f"{label} F1",
    }


# Similarity metric key -> display label. Insertion order is the column order
# of the similarity table, so families are spliced in where they belong.
SIMILARITY_LABELS = {
    **_prf_labels("bert_score", "BERTScore"),
    "bleu": "BLEU",
    "chrf": "chrF",
    "chrfpp": "chrF++",
    **_prf_labels("nli", "NLI"),
    **_prf_labels("rouge_1", "ROUGE-1"),
    **_prf_labels("rouge_2", "ROUGE-2"),
    **_prf_labels("rouge_l", "ROUGE-L"),
    "wer": "WER",
    "mer": "MER",
    "wil": "WIL",
    "ne_retention": "NE Retention",
}
# Difficulty-marker key -> display label, declared one thematic group at a
# time and merged below. The merge order is the column order of the markers
# table.
_COUNT_MARKERS = {
    "paragraph_count": "Paragraph count",
    "sentence_count": "Sentence count",
    "word_count": "Word count",
    "named_entity_count": "Named entity count",
    "difficult_word_count": "Difficult word count",
    "difficult_word_count_orth": "Difficult word count (orth)",
}
_AVERAGE_LENGTH_MARKERS = {
    "avg_word_syllables": "Avg word syllables",
    "avg_sentence_length": "Avg sentence length",
    "avg_paragraph_length": "Avg paragraph length",
}
_LEXICAL_DIFFICULTY_MARKERS = {
    "named_entity_ratio": "Named entity ratio",
    "difficult_word_ratio": "Difficult word ratio",
    "difficult_word_ratio_orth": "Difficult word ratio (orth)",
}
_POS_RATIO_MARKERS = {
    "noun_ratio": "Noun ratio",
    "difficult_noun_ratio": "Difficult noun ratio",
    "difficult_noun_ratio_orth": "Difficult noun ratio (orth)",
    "verb_ratio": "Verb ratio",
    "difficult_verb_ratio": "Difficult verb ratio",
    "difficult_verb_ratio_orth": "Difficult verb ratio (orth)",
    "adjective_ratio": "Adjective ratio",
    "difficult_adjective_ratio": "Difficult adjective ratio",
    "difficult_adjective_ratio_orth": "Difficult adjective ratio (orth)",
}
_POS_TO_POS_MARKERS = {
    "noun_to_verb_ratio": "Noun/verb ratio",
    "verbo_nominal_ratio": "Verbo-nominal ratio",
    "adj_to_verb_ratio": "Adj/verb ratio",
    "adj_to_noun_ratio": "Adj/noun ratio",
}
_MORPHOLOGICAL_MARKERS = {
    "nie_prefix_ratio": "Nie-prefix ratio",
    "participle_ratio": "Participle ratio",
    "gerund_ratio": "Gerund ratio",
    "osc_noun_ratio": "OSC noun ratio",
    "impersonal_verb_ratio": "Impersonal verb ratio",
    "genitive_noun_ratio": "Genitive noun ratio",
    "avg_genitive_chain_length": "Avg genitive chain",
}
_SYNTACTIC_MARKERS = {
    "sentence_length_variance": "Sentence length variance",
    "mean_dependency_distance": "Mean dep. distance",
    "subordination_index": "Subordination index",
}

MARKER_LABELS = {
    **_COUNT_MARKERS,
    **_AVERAGE_LENGTH_MARKERS,
    **_LEXICAL_DIFFICULTY_MARKERS,
    **_POS_RATIO_MARKERS,
    **_POS_TO_POS_MARKERS,
    **_MORPHOLOGICAL_MARKERS,
    **_SYNTACTIC_MARKERS,
}
# QuestEval summary key -> display label (insertion order = column order).
QUESTEVAL_LABELS = dict(
    precision="QuestEval P",
    recall="QuestEval R",
    f1="QuestEval F1",
    answerable_rate_forward="Answerable (fwd)",
    answerable_rate_backward="Answerable (bwd)",
)
# Smoothing constant of reciprocal rank fusion: a rank r contributes
# 1 / (RRF_K + r) to a score.
RRF_K = 60


def _make_category(
    name: str, rrf_weight: int, description: str, metrics: list[tuple]
) -> dict:
    """Assemble one ``CATEGORIES`` entry (every category takes part in the final RRF)."""
    return {
        "name": name,
        "in_rrf": True,
        "rrf_weight": rrf_weight,
        "description": description,
        "metrics": metrics,
    }


# Metric categories shown in the leaderboard.
#
# Each ``metrics`` entry is a tuple (source, key, label, ascending_rrf, in_rrf):
#   source        - "metrics" | "markers"                 -> the avg_diff_pct (Δ%) is used
#                   "similarity" | "questeval" | "ifeval" -> the stored value is used as is
#   ascending_rrf - True when a lower value is better (rank 1 = smallest)
#   in_rrf        - whether the metric takes part in the category RRF
# At category level, ``rrf_weight`` is the weight of the category in the final
# ranking fusion.
CATEGORIES: list[dict] = [
    _make_category(
        "Readability",
        1,
        (
            "Readability indices - **orth** (surface-form) variants. "
            "Δ% = percentage change after simplification. "
            "For **Flesch RE** positive Δ% is better; for all others negative Δ% is better. "
            "**Flesch RE** rates reading ease from average sentence length and syllables per word (higher → easier, ~0–100). "
            "**Gunning Fog** estimates the schooling years needed to follow the text - 0.4 × (words/sentence + 100 × complex-word share), "
            "where complex words have many syllables (lower → easier). "
            "**Coleman-Liau** grades difficulty from average letters per 100 words and sentences per 100 words (lower → easier). "
            "IFEval exclude is the fraction of 'exclude' constraints satisfied by the simplified text (higher is better)."
        ),
        [
            ("metrics", "flesch_reading_ease_orth", "Flesch RE", False, True),
            ("metrics", "gunning_fog_orth", "Gunning Fog", True, True),
            ("metrics", "coleman_liau_orth", "Coleman-Liau", True, True),
            ("ifeval", "avg_exclude", "IFEval exclude", False, True),
        ],
    ),
    _make_category(
        "Lexical Difficulty",
        1,
        (
            "Word-level difficulty markers - **orth** variants where available. "
            "Δ% = percentage change. Negative Δ% indicates reduced lexical difficulty. "
            "**Avg word syllables** is the mean number of syllables per word (higher → longer, harder vocabulary). "
            "**Difficult word ratio** is the share of long words (above a syllable threshold on the surface form, excluding named entities) "
            "(higher → harder). "
            "**Verb ratio** is verbs divided by alphabetic tokens; a more verbal, less nominal style is easier, so higher is better here. "
            "**Difficult noun ratio** is the share of long nouns (above the syllable threshold, excluding named entities) among all words "
            "(higher → more complex nominal vocabulary)."
        ),
        [
            ("markers", "avg_word_syllables", "Avg word syllables", True, True),
            ("markers", "difficult_word_ratio_orth", "Difficult word ratio", True, True),
            ("markers", "verb_ratio", "Verb ratio", False, True),
            ("markers", "difficult_noun_ratio_orth", "Difficult noun ratio", True, True),
        ],
    ),
    _make_category(
        "Syntactic",
        1,
        (
            "Sentence and clause structure complexity markers. "
            "Δ% = percentage change. Negative Δ% generally indicates simpler syntax. "
            "**Avg sentence length** is the mean number of words per sentence (higher → harder). "
            "**Mean dep. distance** is the average linear distance between a token and its syntactic head, in tokens (lower → simpler structure). "
            "**Subordination index** is the ratio of subordinate clauses to total clauses (higher → more embedded structure, harder)."
        ),
        [
            ("markers", "avg_sentence_length", "Avg sentence length", True, True),
            ("markers", "sentence_length_variance", "Sentence length var.", True, False),
            ("markers", "mean_dependency_distance", "Mean dep. distance", True, True),
            ("markers", "subordination_index", "Subordination index", True, True),
        ],
    ),
    _make_category(
        "Morphological",
        1,
        (
            "Polish-specific morphological complexity markers. "
            "Δ% = percentage change. Negative Δ% indicates reduced morphological complexity. "
            "**Adverbial participle ratio** is the share of adverbial participles (imiesłowy przysłówkowe / converbs, e.g. *czytając*, *przeczytawszy*) "
            "among alphabetic tokens - a bookish, formal construction (higher → more complex). "
            "**Gerund ratio** is the share of Polish verbal nouns (rzeczowniki odsłowne, e.g. *czytanie*) among words "
            "(higher → more nominalised, formal). "
            "**Impersonal verb ratio** is the share of impersonal verb forms among all verbs - impersonal modals (*należy*, *trzeba*, *można*), "
            "passive -no/-to forms (*zrobiono*), reflexive-impersonal *się* (*mówi się*) and infinitives - typical of legal/administrative Polish "
            "(higher → more impersonal, harder). "
            "**Genitive noun ratio** is the share of genitive-case nouns among words - a hallmark of formal, bureaucratic Polish "
            "(higher → harder). "
            "**Avg genitive chain** is the mean length of runs of two or more consecutive genitive nouns (dopełniacz spiętrzony) "
            "(higher → more genitive stacking, harder). "
            "**Verbo-nominal ratio** is the share of light-verb + noun periphrases (*dokonać wpłaty*, *podjąć decyzję*) - a hallmark of "
            "administrative Polish (higher → harder). "
            "**OSC noun ratio** is the share of abstract *-ość* nouns (*możliwość*, *konieczność*) among nouns (higher → more abstract, harder)."
        ),
        [
            ("markers", "participle_ratio", "Participle ratio", True, False),
            ("markers", "adverbial_participle_ratio", "Adverbial participle ratio", True, True),
            ("markers", "gerund_ratio", "Gerund ratio", True, True),
            ("markers", "impersonal_verb_ratio", "Impersonal verb ratio", True, True),
            ("markers", "genitive_noun_ratio", "Genitive noun ratio", True, True),
            ("markers", "avg_genitive_chain_length", "Avg genitive chain", True, True),
            ("markers", "verbo_nominal_ratio", "Verbo-nominal ratio", True, True),
            ("markers", "osc_noun_ratio", "OSC noun ratio", True, True),
        ],
    ),
    _make_category(
        "Meaning Preservation",
        4,
        (
            "Semantic metrics that directly test whether the simplified text says the same thing as the original. "
            "NLI checks bidirectional entailment; QuestEval checks information preservation via QA. "
            "NE Retention measures what fraction of named entities from the original appear in the simplified text "
            "(matched on lemmatised tokens and surface forms, so inflected forms and full-name → surname shortenings are handled). "
            "IFEval include is the fraction of 'include' constraints satisfied by the simplified text. "
            "Higher is better for all."
        ),
        [
            ("similarity", "nli_f1", "NLI F1", False, True),
            ("questeval", "f1", "QuestEval F1", False, True),
            ("similarity", "ne_retention", "NE Retention", False, True),
            ("ifeval", "avg_include", "IFEval include", False, True),
        ],
    ),
]
| def _col_name(source: str, label: str) -> str: | |
| """Column name used in category DataFrames.""" | |
| if source in ("metrics", "markers"): | |
| return f"{label} (Δ%)" | |
| return label | |
| def _model_label(data: dict) -> str: | |
| """Return a unique display name, appending reasoning effort when present. | |
| The parameter size is shown separately (see :func:`_params_str`), in its | |
| own column, mirroring the PLCC leaderboard layout. | |
| """ | |
| model = data["metadata"]["model"] | |
| effort = ( | |
| data["metadata"] | |
| .get("model_kwargs", {}) | |
| .get("extra_body", {}) | |
| .get("reasoning", {}) | |
| .get("effort") | |
| ) | |
| if effort is not None: | |
| return f"{model} [reasoning: {effort}]" | |
| return model | |
| def _params_str(params: float | None) -> str | None: | |
| """PLCC-style parameter label: ``1.6T`` / ``685B`` / ``8B`` (None if unknown).""" | |
| p = params or 0 | |
| if p <= 0: | |
| return None | |
| return f"{p / 1000:g}T" if p >= 1000 else f"{p:g}B" | |
def _params_map() -> dict[str, str]:
    """Map each model label to its formatted parameter size.

    Sizes come from ``metadata.total_params_b``; models whose size is unknown
    (no label produced by :func:`_params_str`) are left out of the mapping.
    """
    return {
        _model_label(record): size_label
        for record in load_records()
        if (size_label := _params_str(record["metadata"].get("total_params_b")))
    }
| def _metric_row( | |
| label_map: dict, | |
| summary_metrics: dict, | |
| row: dict, | |
| detail_row: dict, | |
| *, | |
| include_detail: bool = True, | |
| ) -> None: | |
| """Populate leaderboard row and detail row from a label→key map.""" | |
| for key, label in label_map.items(): | |
| vals = summary_metrics.get(key, {}) | |
| row[f"{label} (Δ)"] = vals.get("avg_diff") | |
| row[f"{label} (Δ%)"] = vals.get("avg_diff_pct") | |
| if include_detail: | |
| detail_row[f"{label} before"] = vals.get("avg_before") | |
| detail_row[f"{label} after"] = vals.get("avg_after") | |
| detail_row[f"{label} (Δ)"] = vals.get("avg_diff") | |
| detail_row[f"{label} (Δ%)"] = vals.get("avg_diff_pct") | |
def load_leaderboard_data() -> tuple[pd.DataFrame, ...]:
    """Build the eight raw leaderboard tables from the loaded records.

    Returns:
        (readability_orth_df, readability_lemma_df,
        lexical_orth_df, lexical_lemma_df,
        similarity_df, questeval_df, markers_df, detail_df),
        each with one row per model and numeric columns rounded to 4
        decimals. Eight empty frames when the data directory is missing.
    """
    if not DATA_DIR.exists():
        empty = pd.DataFrame()
        return (empty,) * 8

    # The four "delta" tables, in the order they appear in the returned tuple.
    delta_label_maps = (
        READABILITY_ORTH_LABELS,
        READABILITY_LEMMA_LABELS,
        LEXICAL_ORTH_LABELS,
        LEXICAL_LEMMA_LABELS,
    )
    rows_per_table: list[list[dict]] = [[] for _ in range(8)]

    for data in load_records():
        model = _model_label(data)
        summary = data["summary"]
        base = {"Model": model, "N": summary["n"]}
        metrics = summary["metrics"]
        similarity = summary.get("similarity", {})
        questeval = summary.get("questeval", {})
        markers = summary.get("markers", {})

        detail_row = dict(base)

        # NOTE(review): the orth and lemma label maps use the same display
        # labels, so in ``detail_row`` the lemma pass overwrites the values
        # written by the orth pass. Kept as is to preserve the column set.
        delta_rows = []
        for label_map in delta_label_maps:
            table_row = dict(base)
            _metric_row(label_map, metrics, table_row, detail_row)
            delta_rows.append(table_row)

        similarity_row = dict(base)
        similarity_row.update(
            (label, similarity.get(key)) for key, label in SIMILARITY_LABELS.items()
        )

        questeval_row = dict(base)
        questeval_row.update(
            (label, questeval.get(key)) for key, label in QUESTEVAL_LABELS.items()
        )

        # Markers follow the same Δ / Δ% + before/after layout as the metrics.
        markers_row = dict(base)
        _metric_row(MARKER_LABELS, markers, markers_row, detail_row)

        model_rows = (*delta_rows, similarity_row, questeval_row, markers_row, detail_row)
        for table, model_row in zip(rows_per_table, model_rows):
            table.append(model_row)

    frames = [pd.DataFrame(table) for table in rows_per_table]
    for frame in frames:
        numeric = frame.select_dtypes(include="number").columns
        frame[numeric] = frame[numeric].round(4)
    return tuple(frames)
@lru_cache(maxsize=1)
def _load_ifeval_records() -> tuple[tuple[str, tuple[tuple, ...]], ...]:
    """Per-model matched IFEval records, cached once.

    Manual IFEval rules are hand-written for a subset of the prompts, so the
    comparison only makes sense on records carrying *both* an automatic and a
    manual score. This reads the per-text ``results`` arrays (which
    ``load_records`` discards) once and keeps, per model, the tuples
    ``(category, prompt_id, auto_include, auto_exclude, man_include,
    man_exclude)`` so the dropdown filters can re-aggregate cheaply.

    The result is memoised with ``functools.lru_cache`` so the full files are
    not re-parsed on every filter change; call
    ``_load_ifeval_records.cache_clear()`` to force a re-read.

    Returns:
        ``(model_label, records)`` pairs ordered by file path; models without
        any matched record are omitted. Empty when the data directory does
        not exist.

    Raises:
        KeyError: If a file has no ``results`` array.
    """
    if not DATA_DIR.exists():
        return ()
    out: list[tuple[str, tuple[tuple, ...]]] = []
    for fp in sorted(DATA_DIR.glob("*_scored_anon.json")):
        with open(fp, encoding="utf-8") as f:
            data = json.load(f)
        model = _model_label(data)
        recs: list[tuple] = []
        for rec in data["results"]:
            man = rec.get("ifeval_manual")
            auto = rec.get("ifeval")
            # Only texts scored both ways are comparable.
            if not man or not auto:
                continue
            recs.append((
                rec.get("category"),
                rec.get("prompt_id"),
                auto.get("include"), auto.get("exclude"),
                man.get("include"), man.get("exclude"),
            ))
        if recs:
            out.append((model, tuple(recs)))
    return tuple(out)
def load_ifeval_comparison_df(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> pd.DataFrame:
    """Build the manual-vs-automatic IFEval comparison table, one row per model.

    Manual and automatic averages are computed over the *matched* records
    only (texts that carry both scores), so the two are directly comparable.
    The ``(all)`` columns additionally report the automatic score from the
    summary bucket, which averages over every text of the model. ``Δ``
    columns are manual − automatic: a negative value means the automatic
    constraints were satisfied more often than the hand-checked ones.

    Args:
        text_category: Restrict to one source-text category; ``None`` or
            ``"All"`` disables the filter.
        prompt: Restrict to one simplification prompt; ``None`` or ``"All"``
            disables the filter.
        size_limit: Model size cap, as accepted by ``_filtered_records``.
        model_type: Model weights type, as accepted by ``_filtered_records``.

    Returns:
        A frame sorted by model name with numeric columns rounded to 4
        decimals; empty when no model has matching records.
    """
    category_filter = text_category if text_category not in (None, "All") else None
    prompt_filter = prompt if prompt not in (None, "All") else None

    # Models passing the size / type filters, and their summaries for the
    # "(all)" columns.
    allowed = {_model_label(rec) for rec in _filtered_records(size_limit, model_type)}
    summaries: dict = {}
    for rec in load_records():
        label = _model_label(rec)
        if label in allowed:
            summaries[label] = rec["summary"]

    rows: list[dict] = []
    for model, matched in _load_ifeval_records():
        if model not in allowed:
            continue

        auto_inc_sum = man_inc_sum = auto_exc_sum = man_exc_sum = 0.0
        inc_count = exc_count = 0
        for cat, prompt_id, a_inc, a_exc, m_inc, m_exc in matched:
            if category_filter and cat != category_filter:
                continue
            if prompt_filter and prompt_id != prompt_filter:
                continue
            if not (m_inc is None or a_inc is None):
                auto_inc_sum += a_inc
                man_inc_sum += m_inc
                inc_count += 1
            if not (m_exc is None or a_exc is None):
                auto_exc_sum += a_exc
                man_exc_sum += m_exc
                exc_count += 1

        if not (inc_count or exc_count):
            continue

        if inc_count:
            auto_inc = auto_inc_sum / inc_count
            man_inc = man_inc_sum / inc_count
            delta_inc = man_inc - auto_inc
        else:
            auto_inc = man_inc = delta_inc = None

        if exc_count:
            auto_exc = auto_exc_sum / exc_count
            man_exc = man_exc_sum / exc_count
            delta_exc = man_exc - auto_exc
        else:
            auto_exc = man_exc = delta_exc = None

        overall = _source_bucket(
            summaries.get(model, {}), "ifeval", category_filter, prompt_filter
        )
        all_inc = overall.get("avg_include")
        all_exc = overall.get("avg_exclude")

        delta_inc_all = None
        if inc_count and all_inc is not None:
            delta_inc_all = man_inc - all_inc
        delta_exc_all = None
        if exc_count and all_exc is not None:
            delta_exc_all = man_exc - all_exc

        rows.append({
            "Model": model,
            "N": inc_count or exc_count,
            "Manual include": man_inc,
            "Manual exclude": man_exc,
            "Auto include": auto_inc,
            "Auto include (all)": all_inc,
            "Δ include (man−auto)": delta_inc,
            "Δ include (man−auto all)": delta_inc_all,
            "Auto exclude": auto_exc,
            "Auto exclude (all)": all_exc,
            "Δ exclude (man−auto)": delta_exc,
            "Δ exclude (man−auto all)": delta_exc_all,
        })

    table = pd.DataFrame(rows)
    if table.empty:
        return table
    table = table.sort_values("Model").reset_index(drop=True)
    numeric = table.select_dtypes(include="number").columns
    table[numeric] = table[numeric].round(4)
    return table
def text_category_choices() -> list[str]:
    """Dropdown choices for the text-category filter.

    ``"All"`` first, followed by the sorted union of the
    ``metrics_by_category`` keys found across all loaded summaries.
    """
    per_model = (
        record["summary"].get("metrics_by_category", {}) for record in load_records()
    )
    found: set[str] = set().union(*per_model)
    return ["All", *sorted(found)]
def prompt_choices() -> list[str]:
    """Dropdown choices for the prompt filter.

    ``"All"`` first, followed by the sorted union of the
    ``metrics_by_prompt`` keys found across all loaded summaries.
    """
    per_model = (
        record["summary"].get("metrics_by_prompt", {}) for record in load_records()
    )
    found: set[str] = set().union(*per_model)
    return ["All", *sorted(found)]
| def _source_bucket(s: dict, source: str, tc: str | None, prompt: str | None) -> dict: | |
| """Return the metric bucket for one source, filtered by text category and/or prompt. | |
| Picks the overall summary when neither filter is set, the ``*_by_category`` / | |
| ``*_by_prompt`` bucket when one is set, and the ``*_by_category_prompt`` bucket | |
| (keyed ``"CATEGORY/PROMPT"``) when both are set. | |
| """ | |
| if source in ("metrics", "markers", "similarity"): | |
| if tc and prompt: | |
| return s.get(f"{source}_by_category_prompt", {}).get(f"{tc}/{prompt}", {}) | |
| if tc: | |
| return s.get(f"{source}_by_category", {}).get(tc, {}) | |
| if prompt: | |
| return s.get(f"{source}_by_prompt", {}).get(prompt, {}) | |
| return s.get(source, {}) | |
| # questeval / ifeval keep their per-filter buckets nested under the source object | |
| src = s.get(source, {}) | |
| if tc and prompt: | |
| return src.get("by_category_prompt", {}).get(f"{tc}/{prompt}", {}) | |
| if tc: | |
| return src.get("by_category", {}).get(tc, {}) | |
| if prompt: | |
| return src.get("by_prompt", {}).get(prompt, {}) | |
| return src | |
def _bucket_n(s: dict, tc: str | None, prompt: str | None) -> int | None:
    """Return the sample count recorded for the active filters.

    The questeval bucket is consulted first, then the ifeval bucket; the
    first ``n`` that is not ``None`` wins. ``None`` when neither has one.
    """
    counts = (
        _source_bucket(s, source, tc, prompt).get("n")
        for source in ("questeval", "ifeval")
    )
    return next((count for count in counts if count is not None), None)
def load_category_df(
    category: dict,
    text_category: str | None = None,
    prompt: str | None = None,
) -> pd.DataFrame:
    """Build the table of one metric category, ranked by its RRF score.

    Only metrics flagged ``in_rrf`` are shown. Each contributes
    ``1 / (RRF_K + rank)`` to a model's ``RRF Score``, where the rank follows
    the metric's "lower is better" flag, ties share the best rank and missing
    values rank last. A metric with no value for any model is ignored.

    The score is always computed over **all** models: this function takes no
    size-limit / model-type arguments, so any such row filtering has to happen
    on the returned frame.

    Args:
        category: One ``CATEGORIES`` entry.
        text_category: Restrict to one source-text category; ``None`` or
            ``"All"`` uses the overall summary.
        prompt: Restrict to one simplification prompt; ``None`` or ``"All"``
            uses the overall summary.

    Returns:
        Columns ``Rank, Model, Params, N, RRF Score`` followed by the metric
        columns, best model first; an empty frame when no records are loaded.
    """
    tc = text_category if text_category not in (None, "All") else None
    pr = prompt if prompt not in (None, "All") else None

    records = load_records()
    if not records:
        return pd.DataFrame([])

    # (source, key, column name, lower-is-better) for every ranked metric.
    ranked_metrics = [
        (source, key, _col_name(source, label), ascending)
        for source, key, label, ascending, in_rrf in category["metrics"]
        if in_rrf
    ]

    rows: list[dict] = []
    for data in records:
        s = data["summary"]
        model = _model_label(data)
        if tc or pr:
            # Fall back to the overall count when the bucket records none.
            n = _bucket_n(s, tc, pr) or s["n"]
        else:
            n = s["n"]
        row: dict = {"Model": model, "N": n}
        for source, key, col, _ascending in ranked_metrics:
            bucket = _source_bucket(s, source, tc, pr)
            if source in ("metrics", "markers"):
                row[col] = bucket.get(key, {}).get("avg_diff_pct")
            else:
                # similarity, questeval and ifeval store the value directly
                row[col] = bucket.get(key)
        rows.append(row)

    df = pd.DataFrame(rows)
    numeric = df.select_dtypes(include="number").columns
    df[numeric] = df[numeric].round(4)

    score = pd.Series(0.0, index=df.index)
    for _source, _key, col, ascending in ranked_metrics:
        if col not in df.columns or df[col].isna().all():
            continue
        ranks = df[col].rank(ascending=ascending, method="min", na_option="bottom")
        score += 1.0 / (RRF_K + ranks)

    df.insert(2, "RRF Score", score.round(4))
    df = df.sort_values("RRF Score", ascending=False).reset_index(drop=True)
    df.insert(0, "Rank", range(1, len(df) + 1))
    df.insert(2, "Params", df["Model"].map(_params_map()).fillna(""))
    return df
def _plcc_overall_map() -> dict[str, float]:
    """Map each model label to its external PLCC overall score.

    PLCC is a separate Polish-language-competence benchmark (sdadas/plcc).
    The value is taken as is from ``metadata.plcc.overall``; models with no
    ``plcc`` entry or no ``overall`` value are left out of the mapping.
    """
    scores: dict[str, float] = {}
    for record in load_records():
        overall = (record["metadata"].get("plcc") or {}).get("overall")
        if overall is None:
            continue
        scores[_model_label(record)] = overall
    return scores
def build_final_ranking_df(category_data: list[tuple[dict, pd.DataFrame]]) -> pd.DataFrame:
    """Combine the per-category tables into the overall ranking.

    Every participating category contributes ``weight / (RRF_K + rank)`` to
    ``Final RRF``, where ``rank`` is the model's position by that category's
    ``RRF Score`` (1 = best, ties share the best rank, missing last) and
    ``weight`` is the category's ``rrf_weight`` from ``CATEGORIES``. The
    category columns of the result show those ranks. ``PLCC`` is an external
    reference score and plays no part in the ranking.

    Args:
        category_data: ``(category, category_df)`` pairs as produced by
            ``load_category_df``; categories with ``in_rrf`` false or an
            empty frame are skipped.

    Returns:
        Columns ``Rank, Model, Params, N, Final RRF, PLCC`` followed by one
        rank column per category, best model first; an empty frame when no
        category contributes.
    """
    score_tables = [
        cat_df[["Model", "RRF Score"]].rename(columns={"RRF Score": cat["name"]})
        for cat, cat_df in category_data
        if cat.get("in_rrf", True) and not cat_df.empty
    ]
    merged: pd.DataFrame | None = None
    for table in score_tables:
        merged = table if merged is None else merged.merge(table, on="Model", how="outer")
    if merged is None or merged.empty:
        return pd.DataFrame()

    # The sample count is taken from the first category table that has it.
    n_source = next(
        (
            cat_df
            for _cat, cat_df in category_data
            if not cat_df.empty and {"Model", "N"} <= set(cat_df.columns)
        ),
        None,
    )
    n_map: dict = {} if n_source is None else dict(zip(n_source["Model"], n_source["N"]))

    weights = {cat["name"]: cat.get("rrf_weight", 1) for cat in CATEGORIES}
    category_cols = [col for col in merged.columns if col != "Model"]

    fused = pd.Series(0.0, index=merged.index)
    category_ranks: dict[str, pd.Series] = {}
    for col in category_cols:
        ranks = merged[col].rank(ascending=False, method="min", na_option="bottom").astype(int)
        category_ranks[col] = ranks
        fused += weights.get(col, 1) / (RRF_K + ranks)

    out = merged[["Model"]].copy()
    out.insert(1, "Final RRF", fused.round(4))
    out.insert(2, "PLCC", out["Model"].map(_plcc_overall_map()).round(2))
    for col, ranks in category_ranks.items():
        out[col] = ranks

    out = out.sort_values("Final RRF", ascending=False).reset_index(drop=True)
    out.insert(0, "Rank", range(1, len(out) + 1))
    out.insert(2, "Params", out["Model"].map(_params_map()).fillna(""))
    out.insert(3, "N", out["Model"].map(n_map).astype("Int64"))
    return out
def _build_fog_scatter(
    *,
    y_section: str,
    y_key: str,
    y_label: str,
    title: str,
    marker_color: str,
    text_category: str | None,
    prompt: str | None,
    size_limit: str | None,
    model_type: str | None,
) -> go.Figure | None:
    """Build a "Gunning Fog Δ% vs <y metric>" scatter, one point per model.

    Shared implementation behind ``build_tradeoff_scatter`` and
    ``build_fog_nli_scatter``, which differ only in where the y value comes
    from and in their labels / colour.

    Args:
        y_section: Summary section passed to ``_source_bucket`` for the y value.
        y_key: Key read from that bucket for the y value.
        y_label: Human-readable metric name used in the hover text and y-axis title.
        title: Figure title.
        marker_color: Fill colour of the markers.
        text_category: Text-category filter; ``None`` or ``"All"`` means no filter.
        prompt: Prompt filter; ``None`` or ``"All"`` means no filter.
        size_limit: Forwarded unchanged to ``_filtered_records``.
        model_type: Forwarded unchanged to ``_filtered_records``.

    Returns:
        The figure, or ``None`` when no model has both an x and a y value.
    """
    # "All" (the dropdown default) and None both mean "do not filter".
    tc = None if text_category in (None, "All") else text_category
    pr = None if prompt in (None, "All") else prompt

    points = []
    for data in _filtered_records(size_limit, model_type):
        summary = data["summary"]
        model = _model_label(data)
        # ``or {}`` also covers a metric entry that is present but null, which
        # ``.get(key, {})`` alone would pass through as None and crash on.
        fog = _source_bucket(summary, "metrics", tc, pr).get("gunning_fog_orth") or {}
        x = fog.get("avg_diff_pct")
        y = _source_bucket(summary, y_section, tc, pr).get(y_key)
        if x is None or y is None:
            # A model missing either coordinate cannot be plotted.
            continue
        points.append((model, x, y))
    if not points:
        return None

    models, xs, ys = zip(*points)
    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=xs,
            y=ys,
            mode="markers+text",
            text=models,
            textposition="top center",
            textfont={"size": 10},
            marker={"size": 12, "color": marker_color, "line": {"width": 1, "color": "white"}},
            # Plain concatenation: the template's %{...} placeholders would
            # need brace-escaping inside an f-string.
            hovertemplate=(
                "<b>%{text}</b><br>Gunning Fog Δ%: %{x:.2f}<br>"
                + y_label
                + ": %{y:.3f}<extra></extra>"
            ),
        )
    )
    # Dotted guide lines through the midpoint of the plotted range on each axis.
    x_mid = (min(xs) + max(xs)) / 2
    y_mid = (min(ys) + max(ys)) / 2
    fig.add_hline(y=y_mid, line_dash="dot", line_color="lightgray")
    fig.add_vline(x=x_mid, line_dash="dot", line_color="lightgray")
    fig.update_layout(
        title=title,
        xaxis_title="Gunning Fog orth Δ% (← easier text)",
        yaxis_title=f"{y_label} (↑ meaning preserved)",
        height=560,
        margin={"l": 60, "r": 40, "t": 60, "b": 60},
        plot_bgcolor="white",
    )
    fig.update_xaxes(showgrid=True, gridcolor="#EEE", zeroline=True, zerolinecolor="#CCC")
    fig.update_yaxes(showgrid=True, gridcolor="#EEE")
    return fig


def build_tradeoff_scatter(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> go.Figure | None:
    """Scatter of Gunning Fog reduction vs meaning preservation, one point per model.

    X: Gunning Fog orth Δ% (more negative = greater complexity reduction)
    Y: QuestEval F1 (higher = better meaning preservation)

    Honours the same text-category / prompt / size / model-type filters as the
    RRF rankings. Returns ``None`` when no model has both values.
    """
    return _build_fog_scatter(
        y_section="questeval",
        y_key="f1",
        y_label="QuestEval F1",
        title="Complexity reduction vs meaning preservation",
        marker_color="#4C78A8",
        text_category=text_category,
        prompt=prompt,
        size_limit=size_limit,
        model_type=model_type,
    )


def build_fog_nli_scatter(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> go.Figure | None:
    """Scatter of Gunning Fog reduction vs NLI F1, one point per model.

    X: Gunning Fog orth Δ% (more negative = greater complexity reduction)
    Y: NLI F1 (higher = stronger entailment / meaning preserved)

    Honours the same text-category / prompt / size / model-type filters as the
    RRF rankings. Returns ``None`` when no model has both values.
    """
    return _build_fog_scatter(
        y_section="similarity",
        y_key="nli_f1",
        y_label="NLI F1",
        title="Complexity reduction vs NLI consistency",
        marker_color="#E45756",
        text_category=text_category,
        prompt=prompt,
        size_limit=size_limit,
        model_type=model_type,
    )
# Landing-page Markdown: build_app() renders it as the first component of the
# app via ``gr.Markdown(INTRO)``. The leading backslash after the opening
# quotes suppresses the initial newline of the literal.
INTRO = """\
# PLainBench - Polish Text Simplification Leaderboard
This benchmark evaluates how well LLMs simplify difficult Polish texts -
drawn from legal/administrative (BIP/GOV), finance, and science domains - while
preserving the original meaning. Each model simplifies 210 source texts under
5 simplification prompts (1050 outputs per model). Outputs are scored on
readability indices, fine-grained difficulty markers (lexical, syntactic,
morphological), meaning preservation (NLI entailment, QuestEval QA consistency,
named-entity retention), and instruction following (IFEval include/exclude).
The per-category scores are fused into an overall **Final RRF** ranking.
"""
# Markdown reference describing every metric family shown in the tables
# (readability, difficulty markers, similarity, QuestEval, IFEval) together
# with the desired direction of change. build_app() renders it below the
# result tabs via ``gr.Markdown(METRICS_DOC)``.
METRICS_DOC = """\
## Metrics
### Readability indices
All indices are adapted for Polish syllable counting via `pyhyphen` (pl_PL
dictionary) and counted on surface (orthographic) word forms.
Δ is the absolute change (after − before); Δ% is the average percentage change
from the original text to the simplified text.
| Metric | Formula | Interpretation |
|---|---|---|
| **Flesch Reading Ease** | `206.835 − 1.015 × (words/sentences) − 84.6 × (syllables/words)` | Higher → easier text (0–100 typical range). **Desired Δ%: positive (+)** |
| **Gunning Fog** | `0.4 × [(words/sentences) + 100 × (complex_words/words)]` | School years needed (complex = ≥ 4 syllables). Lower → easier. **Desired Δ%: negative (−)** |
| **Coleman-Liau** | `0.0588 × L − 0.296 × S − 15.8` | Character-based grade level. Lower → easier. **Desired Δ%: negative (−)** |
### Difficulty markers
Fine-grained syntactic, morphological, and lexical features.
Δ is absolute change; Δ% is percentage change.
Difficult words are defined as not a named entity, ≥ 4 syllables, counted on the
surface (orthographic) form.
| Marker | Description | Desired Δ% |
|---|---|---|
| **Avg word syllables** | Mean syllable count per word | − (shorter words) |
| **Difficult word ratio (orth)** | Difficult words / all words (surface, excl. NEs) | − |
| **Difficult noun ratio (orth)** | Difficult nouns / all tokens (surface, excl. NEs) | − |
| **Verb ratio** | Verbs / all tokens | + (more verbal, less nominal) |
| **Avg sentence length** | Mean tokens per sentence | − (shorter sentences) |
| **Mean dep. distance** | Avg linear head-dependent distance (syntax complexity) | − (flatter syntax) |
| **Subordination index** | Subordinate clauses / total clauses | − |
| **Adverbial participle ratio** | Adverbial participles (converbs, e.g. *czytając*, *przeczytawszy*) / all tokens | − |
| **Gerund ratio** | Gerunds / all tokens | − |
| **Impersonal verb ratio** | Impersonal verb forms (modals *należy*/*trzeba*, -no/-to passives, reflexive *się*, infinitives) / all verbs | − |
| **Genitive noun ratio** | Nouns in genitive case / all tokens | − |
| **Avg genitive chain** | Mean length of consecutive genitive noun phrases | − |
| **Verbo-nominal ratio** | Light-verb + noun periphrases (*dokonać wpłaty*, *podjąć decyzję*); administrative style | − |
| **OSC noun ratio** | Abstract *-ość* nouns (*możliwość*, *konieczność*) / all nouns | − |
### Similarity metrics
Reference-based metrics comparing simplified text against the original.
| Metric | Description | Direction |
|---|---|---|
| **NLI P / R / F1** | NLI consistency via stella embeddings + mDeBERTa cross-encoder | Higher = stronger entailment |
| **NE Retention** | Fraction of named entities from the original kept in the simplified text | Higher = more entities preserved |
*Only **NLI F1** feeds the RRF score; P and R are shown for context.*
### QuestEval - QA consistency
| Metric | Description | Direction |
|---|---|---|
| **QuestEval P** | Backward precision - grounding of simplified claims | Higher = fewer hallucinations |
| **QuestEval R** | Forward recall - information preserved | Higher = less content dropped |
| **QuestEval F1** | Harmonic mean of P and R | Higher = better meaning preservation |
| **Answerable (fwd)** | Fraction of forward questions answerable | Higher = stays on-topic |
| **Answerable (bwd)** | Fraction of backward questions answerable | Higher = claims traceable to original |
*Only **QuestEval F1** feeds the RRF score; the other rows are shown for context.*
### IFEval - instruction following
| Metric | Description | Direction |
|---|---|---|
| **IFEval include** | Fraction of *include* constraints (terms the simplification must keep) satisfied | Higher = better |
| **IFEval exclude** | Fraction of *exclude* constraints (terms the simplification must avoid) satisfied | Higher = better |
"""
# Sample-count note shown under each table that carries an ``N`` column.
# build_app() renders it via ``gr.Markdown(N_NOTE)`` beneath the Final Ranking
# table and beneath every in-RRF category table.
N_NOTE = "**N** = number of prompt × text evaluations per model."
# The five simplification prompts every model is run with. The keys match the
# "Simplification prompt" filter values (and the ``*_by_prompt`` summary
# buckets); each value is ``(short description, user-message template)``, where
# ``<text>`` marks where the source text is inserted. Kept in sync with
# generation/prompting/instruction.py. Ordered from least to most detailed.
#
# The templates are display-only in this file: build_app() shows each entry in
# a collapsed accordion titled "<key> - <description>", with the template body
# inside a fenced code block. The Polish template text is runtime data and
# must be edited only together with the generation-side copy mentioned above.
PROMPTS: dict[str, tuple[str, str]] = {
    "mini": (
        "Minimal - a single-line instruction, no rules.",
        "Uprość podany tekst, w odpowiedzi podaj jedynie uproszczony tekst, "
        "bez dodatkowych komentarzy. Tekst do uproszczenia:\n\n<text>",
    ),
    "compact": (
        "Compact - a short bulleted rule set.",
        """Uprość poniższy tekst, tak aby był jasny, krótki i zrozumiały dla osoby bez wiedzy specjalistycznej.
Zasady:
- Skup się na najważniejszych informacjach, usuń zbędne treści.
- Uporządkuj tekst: najważniejsze na początku, podziel na krótkie akapity.
- Używaj prostego, codziennego słownictwa; trudne pojęcia zastępuj lub krótko wyjaśniaj.
- Twórz krótkie zdania (jedna myśl = jedno zdanie).
- Pisz bezpośrednio do odbiorcy i używaj strony czynnej.
- Unikaj żargonu, urzędowego stylu, skomplikowanych konstrukcji i podwójnych przeczeń.
- Zachowaj poprawność językową i logiczną spójność.
- W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.
---
### Tekst do uproszczenia:
<text>""",
    ),
    "medium": (
        "Medium - moderately detailed rules with sub-points.",
        """Uprość poniższy tekst tak, aby był zrozumiały dla osoby bez wiedzy specjalistycznej, zachowując jego sens.
### Zasady:
- Skup się na celu tekstu i najważniejszych informacjach — usuń dygresje i zbędne treści.
- Uporządkuj treść: zacznij od najważniejszych informacji, podziel tekst na krótkie akapity.
- Stosuj proste i naturalne słownictwo:
- zamieniaj trudne lub specjalistyczne wyrazy na prostsze,
- jeśli trzeba — krótko je wyjaśnij lub podaj przykład.
- Buduj krótkie zdania (jedna myśl = jedno zdanie, ok. 15–20 słów).
- Pisz bezpośrednio do odbiorcy i używaj strony czynnej.
- Unikaj:
- żargonu, stylu urzędowego i zapożyczeń,
- form bezosobowych i strony biernej (jeśli nie są konieczne),
- nadmiaru rzeczowników odczasownikowych,
- podwójnych przeczeń i zawiłych konstrukcji.
- Zachowaj poprawność językową, spójność i logiczny układ tekstu.
- W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.
---
### Tekst do uproszczenia:
<text>""",
    ),
    "long": (
        "Long - full, sectioned plain-language guidelines.",
        """Uprość poniższy tekst zgodnie z zasadami prostego języka.
### 1. Cel i odbiorca
- Dostosuj tekst do przeciętnego odbiorcy (zakładaj brak wiedzy specjalistycznej).
- Skup się na najważniejszych informacjach.
### 2. Struktura
- Usuń informacje zbędne i poboczne.
- Uporządkuj treść: najważniejsze informacje podaj na początku.
- Podziel tekst na krótkie akapity (1 akapit = 1 myśl).
- Jeśli tekst jest dłuższy, użyj nagłówków lub list.
### 3. Słownictwo
- Zastępuj trudne słowa prostszymi.
- Unikaj:
- terminów specjalistycznych (chyba że je wyjaśnisz),
- słów rzadkich, książkowych i urzędowych,
- zapożyczeń i modnych zwrotów,
- skrótów niezrozumiałych dla odbiorcy.
- W razie potrzeby:
- wyjaśnij trudne pojęcia,
- podaj przykłady,
- używaj konkretnych nazw zamiast ogólników.
### 4. Składnia
- Twórz krótkie zdania (ok. 20 słów).
- Jedno zdanie = jedna myśl.
- Używaj zdań twierdzących.
- Pisz bezpośrednio do odbiorcy (np. „Ty”, „możesz”, „zrób”).
- Używaj strony czynnej zamiast biernej.
- Unikaj form bezosobowych i skomplikowanych konstrukcji.
- Ogranicz rzeczowniki odczasownikowe (zamieniaj je na czasowniki).
### 5. Styl
- Unikaj podwójnych przeczeń.
- Upraszczaj złożone konstrukcje.
- Zachowaj naturalny, jasny ton.
### 6. Końcowa kontrola
- Sprawdź, czy tekst jest:
- zrozumiały,
- poprawny językowo,
- logiczny i spójny.
### 7. W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.
---
### Tekst do uproszczenia:
<text>""",
    ),
    "step_by_step": (
        "Step by step - role-based, numbered editorial guidelines.",
        """Jesteś redaktorem odpowiedzialnym za wydawanie tekstów pisanych prostym językiem. Krok po kroku upraszczaj tekst kierując się poniższymi regułami:
1. Zachowaj prostotę - unikaj skomplikowanych i trudnych słów oraz zdań.
2. Używaj zrozumiałego języka - pamiętaj, że tekst ma być czytelny dla szerokiego grona odbiorców, nie tylko dla osób z wiedzą specjalistyczną z danej dziedziny.
3. Unikaj języka żargonowego czy specjalistycznego, jeśli czujesz, że nie jest on zrozumiały dla większości czytelników.
4. Unikaj zbytnio skomplikowanych zwrotów - stawiaj na klarowność i prostotę.
5. Utrzymuj spójność - trzymaj się jednego stylu pisania i konsekwentnie go stosuj.
6. Unikaj nadmiernego użycia skrótów czy akronimów – jeśli ich używasz, to upewnij się, że są one łatwe do zrozumienia dla wszystkich czytelników.
7. Przemyśl kolejność i strukturę informacji - rozmieszczenie treści powinno być logiczne i przemyślane.
8. Unikaj zbędnego wydłużania tekstu, pisz zwięźle i konkretnie.
9. W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.
---
### Tekst do uproszczenia:
<text>""",
    ),
}
# ── PLCC-inspired visual style ──────────────────────────────────────────────
# Mirrors the sdadas/plcc leaderboard: clean white background, a system
# sans-serif stack, a blue accent (#2c7be5), and flat tables with shaded
# (#f9fafd) headers, #ddd borders and faintly striped rows. Done entirely in
# CSS — a custom gr.themes.* would tint the component label chips blue, which
# is not part of the PLCC look.
#
# build_app() passes this string as ``css=`` to ``gr.Blocks`` and attaches the
# classes through ``elem_classes``: ``plain-table`` on the data tables,
# ``params-col`` on the ranking/category tables, ``filter-bar`` on the row of
# dropdowns. The ``nth-child(3)`` selectors hard-code the position of the
# Params column, so they must be updated if that column moves.
PLCC_CSS = """
.gradio-container {
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto,
"Helvetica Neue", Arial, sans-serif !important;
max-width: 1500px !important;
}
/* PLCC-style data tables */
.plain-table table { border: 1px solid #ddd !important; font-size: 0.9em; }
.plain-table thead th {
background: #f9fafd !important;
border-bottom: 2px solid #ddd !important;
color: #222 !important;
font-weight: 700 !important;
}
.plain-table tbody td { padding: 8px 10px !important; }
.plain-table tbody tr:nth-child(even) > td { background: rgba(0, 0, 0, 0.02) !important; }
.plain-table tbody tr:hover > td { background: rgba(44, 123, 229, 0.06) !important; }
/* Params column (3rd) — right-aligned, PLCC-style; cells muted grey, header black */
.params-col tbody td:nth-child(3),
.params-col thead th:nth-child(3) {
text-align: right !important;
white-space: nowrap;
}
.params-col tbody td:nth-child(3) { color: #999 !important; }
/* Filter bar — the grey rounded block holding the dropdowns */
.filter-bar {
background: #f9fafd;
border: 1px solid #ddd;
border-radius: 0.5rem;
padding: 10px 14px;
}
"""
# Colour palette for category bars.
# NOTE(review): the two trade-off scatters use the literals "#4C78A8" and
# "#E45756", which equal entries 0 and 5 of this list, rather than indexing
# into it — confirm where else the palette is consumed before reordering it.
_CAT_COLORS = ["#4C78A8", "#72B7B2", "#54A24B", "#EECA3B", "#B279A2", "#E45756", "#F58518"]
| def _filter_model_rows(df: pd.DataFrame, allowed: set[str]) -> pd.DataFrame: | |
| """Keep only rows whose ``Model`` is in ``allowed`` (ranks/scores untouched).""" | |
| if df.empty or "Model" not in df.columns: | |
| return df | |
| return df[df["Model"].isin(allowed)].reset_index(drop=True) | |
def load_rrf_views(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> tuple[pd.DataFrame, list[tuple[dict, pd.DataFrame]]]:
    """Build the final ranking and the per-category tables for the given filters.

    The text-category and prompt filters decide which data the category tables
    (and therefore the fused ranking) are computed from. The size-limit and
    model-type filters are applied afterwards, purely to hide rows: nothing is
    re-ranked, so a model that stays visible keeps the rank it had among all
    models.

    Returns:
        ``(final_df, category_data)`` where ``category_data`` is a list of
        ``(category, DataFrame)`` pairs in ``CATEGORIES`` order.
    """
    # 1. Rank over every model, honouring only text category / prompt.
    per_category: list[tuple[dict, pd.DataFrame]] = []
    for cat in CATEGORIES:
        per_category.append((cat, load_category_df(cat, text_category, prompt)))
    ranking = build_final_ranking_df(per_category)

    # 2. Work out which models survive the size / type filters.
    visible_models: set[str] = set()
    for record in _filtered_records(size_limit, model_type):
        visible_models.add(_model_label(record))

    # 3. Hide everything else without touching ranks or scores.
    visible_ranking = _filter_model_rows(ranking, visible_models)
    visible_categories = [
        (cat, _filter_model_rows(frame, visible_models)) for cat, frame in per_category
    ]
    return visible_ranking, visible_categories
def _tradeoff_figs(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> tuple[go.Figure, go.Figure]:
    """Return the QuestEval and NLI trade-off scatters for the given filters.

    Each builder may return ``None`` when it has nothing to plot; that case is
    replaced with a blank ``go.Figure()`` so callers always receive two figures.
    """
    filters = (text_category, prompt, size_limit, model_type)
    questeval_fig = build_tradeoff_scatter(*filters) or go.Figure()
    nli_fig = build_fog_nli_scatter(*filters) or go.Figure()
    return questeval_fig, nli_fig
def build_app() -> gr.Blocks:
    """Assemble the PLainBench Gradio UI and wire up its filter callbacks.

    All tables and figures are computed once, unfiltered, for the initial
    render. When data is present the layout is: intro text, a filter bar
    (text category, prompt, size limit, model type), tabs (final ranking, one
    tab per in-RRF category, trade-off plots, a hidden detailed-scores tab and,
    when available, the IFEval manual-vs-automatic comparison), followed by the
    metric documentation and the prompt templates. When no data is present only
    the intro and a "No data found" notice are shown.

    Changing any filter re-runs ``_refresh_rrf``, which returns new values for
    every component collected in ``rrf_outputs`` - in the same order in which
    they were appended.

    Returns:
        The constructed ``gr.Blocks`` application (not yet launched).
    """
    # Only ``read_orth_df`` (data-present check) and ``detail_df`` are used
    # here; the remaining names document the shape of the returned tuple.
    (
        read_orth_df, read_lemma_df,
        lex_orth_df, lex_lemma_df,
        similarity_df, questeval_df,
        markers_df, detail_df,
    ) = load_leaderboard_data()
    ifeval_cmp_df = load_ifeval_comparison_df()
    final_df, category_data = load_rrf_views(None, None)
    tc_choices = text_category_choices()
    pr_choices = prompt_choices()
    size_choices = _visible_size_limits()
    tradeoff_fig, fog_nli_fig = _tradeoff_figs(None, None)

    with gr.Blocks(title="PLainBench", css=PLCC_CSS) as app:
        gr.Markdown(INTRO)
        if read_orth_df.empty:
            gr.Markdown("*No data found. Upload scored anon JSON files to the `data/current/` directory.*")
        else:
            # Reactive output components, gathered in the order the change
            # handler returns them: final table, then one table per in-RRF
            # category, then the two trade-off scatters (and the IFEval table).
            rrf_outputs: list = []

            with gr.Row(elem_classes=["filter-bar"]):
                tc_dropdown = gr.Dropdown(
                    choices=tc_choices,
                    value="All",
                    label="Text category",
                    info="Filter the RRF rankings to one source-text category.",
                )
                pr_dropdown = gr.Dropdown(
                    choices=pr_choices,
                    value="All",
                    label="Simplification prompt",
                    info="Filter the RRF rankings to one simplification prompt.",
                )
                size_dropdown = gr.Dropdown(
                    choices=size_choices,
                    value="ALL",
                    label="Size limit",
                    info="Keep only models up to this many parameters.",
                )
                type_dropdown = gr.Dropdown(
                    choices=MODEL_TYPES,
                    value="ALL",
                    label="Model type",
                    info="Filter by open- vs closed-weights models.",
                )

            with gr.Tabs():
                # ── Final Ranking ──────────────────────────────────────────
                with gr.TabItem("Final Ranking"):
                    gr.Markdown(
                        # k is interpolated from RRF_K so the text cannot drift
                        # from the constant the fusion actually uses.
                        f"Final model ranking via **Reciprocal Rank Fusion** (k={RRF_K}) over per-category RRF scores. "
                        "Each category ranks models by its own RRF score; those ranks are then fused into a "
                        "single **Final RRF** score. Higher = better overall simplification. "
                        "The **PLCC** column shows the model's score on the external "
                        "[PLCC](https://huggingface.co/spaces/sdadas/plcc) Polish-language-competence "
                        "benchmark for reference only - it does not affect the ranking (blank where unavailable)."
                    )
                    final_table = gr.Dataframe(
                        value=final_df, interactive=False, wrap=True,
                        elem_classes=["plain-table", "params-col"],
                    )
                    gr.Markdown(N_NOTE)
                    rrf_outputs += [final_table]

                # ── RRF category tabs ──────────────────────────────────────
                for cat, cat_df in category_data:
                    if not cat.get("in_rrf", True):
                        continue
                    with gr.TabItem(cat["name"]):
                        gr.Markdown(cat["description"])
                        cat_table = gr.Dataframe(
                            value=cat_df, interactive=False, wrap=True,
                            elem_classes=["plain-table", "params-col"],
                        )
                        gr.Markdown(N_NOTE)
                        rrf_outputs += [cat_table]

                # ── Trade-off plots ────────────────────────────────────────
                with gr.TabItem("Trade-off"):
                    gr.Markdown(
                        "Complexity reduction (Gunning Fog orth Δ%) versus meaning preservation "
                        "(QuestEval F1), one point per model. Top-left is ideal: "
                        "greater complexity reduction **and** faithful to the original."
                    )
                    tradeoff_plot = gr.Plot(value=tradeoff_fig)
                    gr.Markdown(
                        "---\n"
                        "Gunning Fog orth reduction (Δ%) versus NLI F1. "
                        "Top-left is best: greater complexity reduction **and** strong NLI entailment."
                    )
                    fog_nli_plot = gr.Plot(value=fog_nli_fig)
                    rrf_outputs += [tradeoff_plot, fog_nli_plot]

                # Static tab (not part of rrf_outputs); created hidden.
                with gr.TabItem("Detailed scores", visible=False):
                    gr.Markdown(
                        "Average scores before and after simplification, plus absolute (Δ) "
                        "and percentage (Δ%) change - for all readability, lexical, and marker metrics."
                    )
                    gr.Dataframe(
                        value=detail_df, interactive=False, wrap=True,
                        elem_classes=["plain-table"],
                    )

                # ── IFEval: manual vs automatic ────────────────────────────
                if not ifeval_cmp_df.empty:
                    with gr.TabItem("IFEval manual vs auto"):
                        gr.Markdown(
                            "**Automatic** IFEval constraints are generated by an LLM; "
                            "**manual** constraints are hand-written gold rules, available for a "
                            "subset of the prompts. To isolate rule quality from sampling, the "
                            "comparison is restricted to the texts that carry **both** scores "
                            "(N = matched texts per model), so these automatic figures differ from "
                            "the full-sample IFEval used elsewhere.\n\n"
                            "**include** = fraction of *include* constraints satisfied, "
                            "**exclude** = fraction of *exclude* constraints satisfied (higher is "
                            "better for both). **Δ = manual − automatic** (on the matched texts): a "
                            "negative Δ means the automatic rules were easier to satisfy than the "
                            "hand-checked ones (more lenient automatic scoring). The **(all)** columns "
                            "show automatic IFEval over *every* text (the full-sample figure used "
                            "elsewhere). **Δ (man−auto all)** is manual minus that full-sample "
                            "automatic value - useful as a sanity check, but note the two cover "
                            "different text sets (matched subset vs. all texts), so **Δ (man−auto)** "
                            "is the rigorous like-for-like comparison."
                        )
                        ifeval_cmp_table = gr.Dataframe(
                            value=ifeval_cmp_df, interactive=False, wrap=True,
                            elem_classes=["plain-table"],
                        )
                        rrf_outputs.append(ifeval_cmp_table)

            # Metric documentation, shown below the results.
            gr.Markdown(METRICS_DOC)

            # Simplification prompts, documenting the "Simplification prompt"
            # filter values — shown below the metric documentation.
            gr.Markdown(
                "## Simplification prompts\n\n"
                "The five prompt templates every model is run with - these are the "
                "values of the **Simplification prompt** filter above. Each source "
                "text is simplified once per prompt, so they range from a bare "
                "one-line instruction to full plain-language guidelines. "
                "`<text>` marks where the source text is inserted."
            )
            for _name, (_desc, _body) in PROMPTS.items():
                with gr.Accordion(f"{_name} - {_desc}", open=False):
                    gr.Markdown(f"```\n{_body}\n```")

            # Recompute the RRF rankings whenever any filter changes.
            _filters = [tc_dropdown, pr_dropdown, size_dropdown, type_dropdown]

            def _refresh_rrf(
                text_category: str, prompt: str, size_limit: str, model_type: str
            ) -> list:
                """Return fresh values for ``rrf_outputs``, in matching order."""
                f_df, cat_data = load_rrf_views(text_category, prompt, size_limit, model_type)
                # Same in-RRF filter as the tab-building loop, so the number
                # and order of tables lines up with the registered outputs.
                updates: list = [
                    f_df,
                    *(df for cat, df in cat_data if cat.get("in_rrf", True)),
                    *_tradeoff_figs(text_category, prompt, size_limit, model_type),
                ]
                # The IFEval table exists only if the startup comparison had
                # rows, so the same startup frame decides whether to update it.
                if not ifeval_cmp_df.empty:
                    updates.append(
                        load_ifeval_comparison_df(text_category, prompt, size_limit, model_type)
                    )
                return updates

            for _dd in _filters:
                _dd.change(_refresh_rrf, inputs=_filters, outputs=rrf_outputs)

    return app
# Build the UI at import time so the Blocks instance is available as the
# module-level name ``app``.
app = build_app()

# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    app.launch()