Spaces:
Running
Running
Commit ·
3bd48fe
0
Parent(s):
Initial commit
Browse files- .gitattributes +35 -0
- README.md +14 -0
- app.py +1400 -0
- data/current/CYFRAGOVPL__Llama-PLLuM-70B-chat-2412_2026-06-15_095534_scored_anon.json +0 -0
- data/current/CYFRAGOVPL__Llama-PLLuM-70B-chat-2512_2026-06-12_082622_scored_anon.json +0 -0
- data/current/CYFRAGOVPL__Llama-PLLuM-8B-chat-2412_2026-06-02_091112_scored_anon.json +0 -0
- data/current/CYFRAGOVPL__Llama-PLLuM-8B-chat-2512_2026-06-02_121044_scored_anon.json +0 -0
- data/current/CYFRAGOVPL__PLLuM-12B-chat-2412_2026-06-02_141510_scored_anon.json +0 -0
- data/current/CYFRAGOVPL__PLLuM-12B-chat-2512_2026-06-02_195811_scored_anon.json +0 -0
- data/current/CYFRAGOVPL__PLLuM-12B-instruct-2512_2026-06-10_102424_scored_anon.json +0 -0
- data/current/CYFRAGOVPL__PLLuM-4B-chat-2512_2026-06-02_223411_scored_anon.json +0 -0
- data/current/deepseek__deepseek-v4-pro_reasoning-high_2026-05-31_094932_scored_anon.json +0 -0
- data/current/google__gemini-3.1-pro-preview_2026-06-11_121124_scored_anon.json +0 -0
- data/current/google__gemma-3-4b-it_reasoning-none_2026-06-08_110604_scored_anon.json +0 -0
- data/current/google__gemma-4-26b-a4b-it_reasoning-high_2026-05-31_223337_scored_anon.json +0 -0
- data/current/google__gemma-4-26b-a4b-it_reasoning-none_2026-06-01_020338_scored_anon.json +0 -0
- data/current/google__gemma-4-31b-it_reasoning-high_2026-05-31_124753_scored_anon.json +0 -0
- data/current/google__gemma-4-31b-it_reasoning-none_2026-05-31_200347_scored_anon.json +0 -0
- data/current/meta-llama__llama-3.1-70b-instruct_reasoning-none_2026-06-08_102826_scored_anon.json +0 -0
- data/current/meta-llama__llama-3.1-8b-instruct_reasoning-none_2026-06-08_100015_scored_anon.json +0 -0
- data/current/mistralai__ministral-8b-2512_2026-05-31_083128_scored_anon.json +0 -0
- data/current/mistralai__mistral-nemo_2026-05-31_084528_scored_anon.json +0 -0
- data/current/openai__gpt-oss-120b_2026-06-12_123249_scored_anon.json +0 -0
- data/current/openai__gpt-oss-20b_2026-06-12_133408_scored_anon.json +0 -0
- data/current/qwen__qwen3.5-35b-a3b_reasoning-high_2026-06-01_023022_scored_anon.json +0 -0
- data/current/speakleash__Bielik-11B-v3.0-Instruct_2026-06-01_112337_scored_anon.json +0 -0
- requirements.txt +2 -0
.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: PLainBench
|
| 3 |
+
emoji: ⚡
|
| 4 |
+
colorFrom: green
|
| 5 |
+
colorTo: yellow
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 6.12.0
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
license: apache-2.0
|
| 11 |
+
short_description: Benchmark for scoring LLMs in text simplification
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
|
@@ -0,0 +1,1400 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""PLainBench - Polish Text Simplification Leaderboard.
|
| 2 |
+
|
| 3 |
+
Reads scored anon JSON files from the data/current/ directory and displays a
|
| 4 |
+
leaderboard showing how well each LLM simplifies Polish texts, measured
|
| 5 |
+
by readability indices, difficulty markers, reference-based similarity
|
| 6 |
+
metrics, and a QuestEval-style QA consistency score.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import json
|
| 10 |
+
from functools import lru_cache
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
import gradio as gr
|
| 14 |
+
import pandas as pd
|
| 15 |
+
import plotly.graph_objects as go
|
| 16 |
+
|
| 17 |
+
DATA_DIR = Path(__file__).parent / "data" / "current"
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@lru_cache(maxsize=1)
def load_records() -> tuple[dict, ...]:
    """Read every ``*_scored_anon.json`` file under ``DATA_DIR`` exactly once.

    Only the ``metadata`` and ``summary`` sections of each file are retained;
    everything else in the parsed document is dropped. Because of the
    ``lru_cache`` decorator the files are parsed on the first call only and
    later calls get the same tuple back without touching the disk.

    Returns:
        One ``{"metadata": ..., "summary": ...}`` dict per file, ordered by
        sorted file path. An empty tuple when ``DATA_DIR`` does not exist.

    Raises:
        KeyError: If a file lacks a ``metadata`` or ``summary`` section.
    """
    if not DATA_DIR.exists():
        return ()

    trimmed: list[dict] = []
    for path in sorted(DATA_DIR.glob("*_scored_anon.json")):
        with open(path, encoding="utf-8") as handle:
            payload = json.load(handle)
        # Keep just the two sections the app reads.
        trimmed.append(
            {section: payload[section] for section in ("metadata", "summary")}
        )
    return tuple(trimmed)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# Model-level filters, mirroring the PLCC benchmark's "Size limit" / "Model type"
|
| 40 |
+
# quick-filters. Size options are *upper bounds* in billions of parameters.
|
| 41 |
+
SIZE_LIMITS = ["ALL", "500B", "100B", "50B", "30B", "20B", "15B", "10B", "5B"]
|
| 42 |
+
MODEL_TYPES = ["ALL", "open-weights", "closed-weights"]
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def _model_passes(meta: dict, size_limit: str, model_type: str) -> bool:
|
| 46 |
+
"""Whether a model's metadata satisfies the size-limit / model-type filters."""
|
| 47 |
+
if model_type and model_type != "ALL":
|
| 48 |
+
want = "open" if model_type == "open-weights" else "closed"
|
| 49 |
+
if meta.get("weights") != want:
|
| 50 |
+
return False
|
| 51 |
+
if size_limit and size_limit != "ALL":
|
| 52 |
+
cap = float(size_limit.rstrip("B"))
|
| 53 |
+
params = meta.get("total_params_b") or 0
|
| 54 |
+
# Unknown / unreported size (0) can't be placed under a cap, so exclude it.
|
| 55 |
+
if params <= 0 or params > cap:
|
| 56 |
+
return False
|
| 57 |
+
return True
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def _filtered_records(
    size_limit: str | None = None, model_type: str | None = None
) -> list[dict]:
    """Return the cached records whose model passes both filters.

    A missing or empty ``size_limit`` / ``model_type`` is treated as ``"ALL"``,
    i.e. that filter is not applied.
    """
    effective_size = size_limit if size_limit else "ALL"
    effective_type = model_type if model_type else "ALL"

    selected: list[dict] = []
    for record in load_records():
        if _model_passes(record["metadata"], effective_size, effective_type):
            selected.append(record)
    return selected
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def _visible_size_limits() -> list[str]:
    """Return the ``SIZE_LIMITS`` options that select distinct sets of models.

    Numeric caps are walked from smallest to largest. A cap is kept only when
    it matches at least one model and matches a different number of models
    than the cap just below it; a larger cap with the same count is redundant
    and is dropped. ``"ALL"`` is always included and always first. The result
    is derived from the loaded records on every call.
    """
    known_sizes = []
    for record in load_records():
        size = record["metadata"].get("total_params_b") or 0
        if size > 0:
            known_sizes.append(size)

    def cap_of(option: str) -> float:
        # "50B" -> 50.0
        return float(option.rstrip("B"))

    numeric_options = [opt for opt in SIZE_LIMITS if opt != "ALL"]
    numeric_options.sort(key=cap_of)

    distinct: set[str] = set()
    last_seen = -1
    for option in numeric_options:
        limit = cap_of(option)
        matched = len([size for size in known_sizes if size <= limit])
        if matched > 0 and matched != last_seen:
            distinct.add(option)
        last_seen = matched

    # Emit in the order SIZE_LIMITS declares them, after the leading "ALL".
    visible = ["ALL"]
    visible.extend(opt for opt in SIZE_LIMITS if opt != "ALL" and opt in distinct)
    return visible
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
READABILITY_ORTH_LABELS = {
|
| 99 |
+
"flesch_reading_ease_orth": "Flesch RE",
|
| 100 |
+
"flesch_kincaid_grade_orth": "Flesch-Kincaid",
|
| 101 |
+
"gunning_fog_orth": "Gunning Fog",
|
| 102 |
+
"ari_orth": "ARI",
|
| 103 |
+
"linsear_write_orth": "Linsear Write",
|
| 104 |
+
"smog_grade_orth": "SMOG",
|
| 105 |
+
"coleman_liau_orth": "Coleman-Liau",
|
| 106 |
+
"pisarek_orth": "Pisarek",
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
READABILITY_LEMMA_LABELS = {
|
| 110 |
+
"flesch_reading_ease_lemma": "Flesch RE",
|
| 111 |
+
"flesch_kincaid_grade_lemma": "Flesch-Kincaid",
|
| 112 |
+
"gunning_fog_lemma": "Gunning Fog",
|
| 113 |
+
"ari_lemma": "ARI",
|
| 114 |
+
"linsear_write_lemma": "Linsear Write",
|
| 115 |
+
"smog_grade_lemma": "SMOG",
|
| 116 |
+
"coleman_liau_lemma": "Coleman-Liau",
|
| 117 |
+
"pisarek_lemma": "Pisarek",
|
| 118 |
+
}
|
| 119 |
+
|
| 120 |
+
LEXICAL_ORTH_LABELS = {
|
| 121 |
+
"ttr_orth": "TTR",
|
| 122 |
+
"rttr_orth": "RTTR",
|
| 123 |
+
"cttr_orth": "CTTR",
|
| 124 |
+
"herdan_orth": "Herdan",
|
| 125 |
+
"summer_orth": "Summer",
|
| 126 |
+
"dugast_orth": "Dugast",
|
| 127 |
+
"maas_orth": "Maas",
|
| 128 |
+
"mtld_orth": "MTLD",
|
| 129 |
+
"mattr_orth": "MATTR",
|
| 130 |
+
}
|
| 131 |
+
|
| 132 |
+
LEXICAL_LEMMA_LABELS = {
|
| 133 |
+
"ttr_lemma": "TTR",
|
| 134 |
+
"rttr_lemma": "RTTR",
|
| 135 |
+
"cttr_lemma": "CTTR",
|
| 136 |
+
"herdan_lemma": "Herdan",
|
| 137 |
+
"summer_lemma": "Summer",
|
| 138 |
+
"dugast_lemma": "Dugast",
|
| 139 |
+
"maas_lemma": "Maas",
|
| 140 |
+
"mtld_lemma": "MTLD",
|
| 141 |
+
"mattr_lemma": "MATTR",
|
| 142 |
+
}
|
| 143 |
+
|
| 144 |
+
SIMILARITY_LABELS = {
|
| 145 |
+
"bert_score_precision": "BERTScore P",
|
| 146 |
+
"bert_score_recall": "BERTScore R",
|
| 147 |
+
"bert_score_f1": "BERTScore F1",
|
| 148 |
+
"bleu": "BLEU",
|
| 149 |
+
"chrf": "chrF",
|
| 150 |
+
"chrfpp": "chrF++",
|
| 151 |
+
"nli_precision": "NLI P",
|
| 152 |
+
"nli_recall": "NLI R",
|
| 153 |
+
"nli_f1": "NLI F1",
|
| 154 |
+
"rouge_1_precision": "ROUGE-1 P",
|
| 155 |
+
"rouge_1_recall": "ROUGE-1 R",
|
| 156 |
+
"rouge_1_f1": "ROUGE-1 F1",
|
| 157 |
+
"rouge_2_precision": "ROUGE-2 P",
|
| 158 |
+
"rouge_2_recall": "ROUGE-2 R",
|
| 159 |
+
"rouge_2_f1": "ROUGE-2 F1",
|
| 160 |
+
"rouge_l_precision": "ROUGE-L P",
|
| 161 |
+
"rouge_l_recall": "ROUGE-L R",
|
| 162 |
+
"rouge_l_f1": "ROUGE-L F1",
|
| 163 |
+
"wer": "WER",
|
| 164 |
+
"mer": "MER",
|
| 165 |
+
"wil": "WIL",
|
| 166 |
+
"ne_retention": "NE Retention",
|
| 167 |
+
}
|
| 168 |
+
|
| 169 |
+
MARKER_LABELS = {
|
| 170 |
+
# counts
|
| 171 |
+
"paragraph_count": "Paragraph count",
|
| 172 |
+
"sentence_count": "Sentence count",
|
| 173 |
+
"word_count": "Word count",
|
| 174 |
+
"named_entity_count": "Named entity count",
|
| 175 |
+
"difficult_word_count": "Difficult word count",
|
| 176 |
+
"difficult_word_count_orth": "Difficult word count (orth)",
|
| 177 |
+
# average lengths
|
| 178 |
+
"avg_word_syllables": "Avg word syllables",
|
| 179 |
+
"avg_sentence_length": "Avg sentence length",
|
| 180 |
+
"avg_paragraph_length": "Avg paragraph length",
|
| 181 |
+
# lexical difficulty
|
| 182 |
+
"named_entity_ratio": "Named entity ratio",
|
| 183 |
+
"difficult_word_ratio": "Difficult word ratio",
|
| 184 |
+
"difficult_word_ratio_orth": "Difficult word ratio (orth)",
|
| 185 |
+
# POS ratios
|
| 186 |
+
"noun_ratio": "Noun ratio",
|
| 187 |
+
"difficult_noun_ratio": "Difficult noun ratio",
|
| 188 |
+
"difficult_noun_ratio_orth": "Difficult noun ratio (orth)",
|
| 189 |
+
"verb_ratio": "Verb ratio",
|
| 190 |
+
"difficult_verb_ratio": "Difficult verb ratio",
|
| 191 |
+
"difficult_verb_ratio_orth": "Difficult verb ratio (orth)",
|
| 192 |
+
"adjective_ratio": "Adjective ratio",
|
| 193 |
+
"difficult_adjective_ratio": "Difficult adjective ratio",
|
| 194 |
+
"difficult_adjective_ratio_orth": "Difficult adjective ratio (orth)",
|
| 195 |
+
# POS-to-POS ratios
|
| 196 |
+
"noun_to_verb_ratio": "Noun/verb ratio",
|
| 197 |
+
"verbo_nominal_ratio": "Verbo-nominal ratio",
|
| 198 |
+
"adj_to_verb_ratio": "Adj/verb ratio",
|
| 199 |
+
"adj_to_noun_ratio": "Adj/noun ratio",
|
| 200 |
+
# morphological
|
| 201 |
+
"nie_prefix_ratio": "Nie-prefix ratio",
|
| 202 |
+
"participle_ratio": "Participle ratio",
|
| 203 |
+
"gerund_ratio": "Gerund ratio",
|
| 204 |
+
"osc_noun_ratio": "OSC noun ratio",
|
| 205 |
+
"impersonal_verb_ratio": "Impersonal verb ratio",
|
| 206 |
+
"genitive_noun_ratio": "Genitive noun ratio",
|
| 207 |
+
"avg_genitive_chain_length": "Avg genitive chain",
|
| 208 |
+
# syntactic
|
| 209 |
+
"sentence_length_variance": "Sentence length variance",
|
| 210 |
+
"mean_dependency_distance": "Mean dep. distance",
|
| 211 |
+
"subordination_index": "Subordination index",
|
| 212 |
+
}
|
| 213 |
+
|
| 214 |
+
QUESTEVAL_LABELS = {
|
| 215 |
+
"precision": "QuestEval P",
|
| 216 |
+
"recall": "QuestEval R",
|
| 217 |
+
"f1": "QuestEval F1",
|
| 218 |
+
"answerable_rate_forward": "Answerable (fwd)",
|
| 219 |
+
"answerable_rate_backward": "Answerable (bwd)",
|
| 220 |
+
}
|
| 221 |
+
|
| 222 |
+
RRF_K = 60
|
| 223 |
+
|
| 224 |
+
# Each entry: (source, key, label, ascending_rrf, in_rrf)
|
| 225 |
+
# source — "metrics" | "markers" → use avg_diff_pct (Δ%)
|
| 226 |
+
#           "similarity" | "questeval" | "ifeval" → use absolute value
|
| 227 |
+
# ascending_rrf — True = lower value is better (rank 1 = smallest)
|
| 228 |
+
# in_rrf — include this metric in category RRF computation
|
| 229 |
+
|
| 230 |
+
CATEGORIES: list[dict] = [
|
| 231 |
+
{
|
| 232 |
+
"name": "Readability",
|
| 233 |
+
"in_rrf": True,
|
| 234 |
+
"rrf_weight": 1,
|
| 235 |
+
"description": (
|
| 236 |
+
"Readability indices - **orth** (surface-form) variants. "
|
| 237 |
+
"Δ% = percentage change after simplification. "
|
| 238 |
+
"For **Flesch RE** positive Δ% is better; for all others negative Δ% is better. "
|
| 239 |
+
"**Flesch RE** rates reading ease from average sentence length and syllables per word (higher → easier, ~0–100). "
|
| 240 |
+
"**Gunning Fog** estimates the schooling years needed to follow the text - 0.4 × (words/sentence + 100 × complex-word share), "
|
| 241 |
+
"where complex words have many syllables (lower → easier). "
|
| 242 |
+
"**Coleman-Liau** grades difficulty from average letters per 100 words and sentences per 100 words (lower → easier). "
|
| 243 |
+
"IFEval exclude is the fraction of 'exclude' constraints satisfied by the simplified text (higher is better)."
|
| 244 |
+
),
|
| 245 |
+
"metrics": [
|
| 246 |
+
("metrics", "flesch_reading_ease_orth", "Flesch RE", False, True),
|
| 247 |
+
("metrics", "gunning_fog_orth", "Gunning Fog", True, True),
|
| 248 |
+
("metrics", "coleman_liau_orth", "Coleman-Liau", True, True),
|
| 249 |
+
("ifeval", "avg_exclude", "IFEval exclude", False, True),
|
| 250 |
+
],
|
| 251 |
+
},
|
| 252 |
+
{
|
| 253 |
+
"name": "Lexical Difficulty",
|
| 254 |
+
"in_rrf": True,
|
| 255 |
+
"rrf_weight": 1,
|
| 256 |
+
"description": (
|
| 257 |
+
"Word-level difficulty markers - **orth** variants where available. "
|
| 258 |
+
"Δ% = percentage change. Negative Δ% indicates reduced lexical difficulty. "
|
| 259 |
+
"**Avg word syllables** is the mean number of syllables per word (higher → longer, harder vocabulary). "
|
| 260 |
+
"**Difficult word ratio** is the share of long words (above a syllable threshold on the surface form, excluding named entities) "
|
| 261 |
+
"(higher → harder). "
|
| 262 |
+
"**Verb ratio** is verbs divided by alphabetic tokens; a more verbal, less nominal style is easier, so higher is better here. "
|
| 263 |
+
"**Difficult noun ratio** is the share of long nouns (above the syllable threshold, excluding named entities) among all words "
|
| 264 |
+
"(higher → more complex nominal vocabulary)."
|
| 265 |
+
),
|
| 266 |
+
"metrics": [
|
| 267 |
+
("markers", "avg_word_syllables", "Avg word syllables", True, True),
|
| 268 |
+
("markers", "difficult_word_ratio_orth", "Difficult word ratio", True, True),
|
| 269 |
+
("markers", "verb_ratio", "Verb ratio", False, True),
|
| 270 |
+
("markers", "difficult_noun_ratio_orth", "Difficult noun ratio", True, True),
|
| 271 |
+
],
|
| 272 |
+
},
|
| 273 |
+
{
|
| 274 |
+
"name": "Syntactic",
|
| 275 |
+
"in_rrf": True,
|
| 276 |
+
"rrf_weight": 1,
|
| 277 |
+
"description": (
|
| 278 |
+
"Sentence and clause structure complexity markers. "
|
| 279 |
+
"Δ% = percentage change. Negative Δ% generally indicates simpler syntax. "
|
| 280 |
+
"**Avg sentence length** is the mean number of words per sentence (higher → harder). "
|
| 281 |
+
"**Mean dep. distance** is the average linear distance between a token and its syntactic head, in tokens (lower → simpler structure). "
|
| 282 |
+
"**Subordination index** is the ratio of subordinate clauses to total clauses (higher → more embedded structure, harder)."
|
| 283 |
+
),
|
| 284 |
+
"metrics": [
|
| 285 |
+
("markers", "avg_sentence_length", "Avg sentence length", True, True),
|
| 286 |
+
("markers", "sentence_length_variance", "Sentence length var.", True, False),
|
| 287 |
+
("markers", "mean_dependency_distance", "Mean dep. distance", True, True),
|
| 288 |
+
("markers", "subordination_index", "Subordination index", True, True),
|
| 289 |
+
],
|
| 290 |
+
},
|
| 291 |
+
{
|
| 292 |
+
"name": "Morphological",
|
| 293 |
+
"in_rrf": True,
|
| 294 |
+
"rrf_weight": 1,
|
| 295 |
+
"description": (
|
| 296 |
+
"Polish-specific morphological complexity markers. "
|
| 297 |
+
"Δ% = percentage change. Negative Δ% indicates reduced morphological complexity. "
|
| 298 |
+
"**Adverbial participle ratio** is the share of adverbial participles (imiesłowy przysłówkowe / converbs, e.g. *czytając*, *przeczytawszy*) "
|
| 299 |
+
"among alphabetic tokens - a bookish, formal construction (higher → more complex). "
|
| 300 |
+
"**Gerund ratio** is the share of Polish verbal nouns (rzeczowniki odsłowne, e.g. *czytanie*) among words "
|
| 301 |
+
"(higher → more nominalised, formal). "
|
| 302 |
+
"**Impersonal verb ratio** is the share of impersonal verb forms among all verbs - impersonal modals (*należy*, *trzeba*, *można*), "
|
| 303 |
+
"passive -no/-to forms (*zrobiono*), reflexive-impersonal *się* (*mówi się*) and infinitives - typical of legal/administrative Polish "
|
| 304 |
+
"(higher → more impersonal, harder). "
|
| 305 |
+
"**Genitive noun ratio** is the share of genitive-case nouns among words - a hallmark of formal, bureaucratic Polish "
|
| 306 |
+
"(higher → harder). "
|
| 307 |
+
"**Avg genitive chain** is the mean length of runs of two or more consecutive genitive nouns (dopełniacz spiętrzony) "
|
| 308 |
+
"(higher → more genitive stacking, harder). "
|
| 309 |
+
"**Verbo-nominal ratio** is the share of light-verb + noun periphrases (*dokonać wpłaty*, *podjąć decyzję*) - a hallmark of "
|
| 310 |
+
"administrative Polish (higher → harder). "
|
| 311 |
+
"**OSC noun ratio** is the share of abstract *-ość* nouns (*możliwość*, *konieczność*) among nouns (higher → more abstract, harder)."
|
| 312 |
+
),
|
| 313 |
+
"metrics": [
|
| 314 |
+
("markers", "participle_ratio", "Participle ratio", True, False),
|
| 315 |
+
("markers", "adverbial_participle_ratio", "Adverbial participle ratio", True, True),
|
| 316 |
+
("markers", "gerund_ratio", "Gerund ratio", True, True),
|
| 317 |
+
("markers", "impersonal_verb_ratio", "Impersonal verb ratio", True, True),
|
| 318 |
+
("markers", "genitive_noun_ratio", "Genitive noun ratio", True, True),
|
| 319 |
+
("markers", "avg_genitive_chain_length", "Avg genitive chain", True, True),
|
| 320 |
+
("markers", "verbo_nominal_ratio", "Verbo-nominal ratio", True, True),
|
| 321 |
+
("markers", "osc_noun_ratio", "OSC noun ratio", True, True),
|
| 322 |
+
],
|
| 323 |
+
},
|
| 324 |
+
{
|
| 325 |
+
"name": "Meaning Preservation",
|
| 326 |
+
"in_rrf": True,
|
| 327 |
+
"rrf_weight": 4,
|
| 328 |
+
"description": (
|
| 329 |
+
"Semantic metrics that directly test whether the simplified text says the same thing as the original. "
|
| 330 |
+
"NLI checks bidirectional entailment; QuestEval checks information preservation via QA. "
|
| 331 |
+
"NE Retention measures what fraction of named entities from the original appear in the simplified text "
|
| 332 |
+
"(matched on lemmatised tokens and surface forms, so inflected forms and full-name → surname shortenings are handled). "
|
| 333 |
+
"IFEval include is the fraction of 'include' constraints satisfied by the simplified text. "
|
| 334 |
+
"Higher is better for all."
|
| 335 |
+
),
|
| 336 |
+
"metrics": [
|
| 337 |
+
("similarity", "nli_f1", "NLI F1", False, True),
|
| 338 |
+
("questeval", "f1", "QuestEval F1", False, True),
|
| 339 |
+
("similarity", "ne_retention", "NE Retention", False, True),
|
| 340 |
+
("ifeval", "avg_include", "IFEval include", False, True),
|
| 341 |
+
],
|
| 342 |
+
},
|
| 343 |
+
]
|
| 344 |
+
|
| 345 |
+
|
| 346 |
+
def _col_name(source: str, label: str) -> str:
|
| 347 |
+
"""Column name used in category DataFrames."""
|
| 348 |
+
if source in ("metrics", "markers"):
|
| 349 |
+
return f"{label} (Δ%)"
|
| 350 |
+
return label
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
def _model_label(data: dict) -> str:
|
| 354 |
+
"""Return a unique display name, appending reasoning effort when present.
|
| 355 |
+
|
| 356 |
+
The parameter size is shown separately (see :func:`_params_str`), in its
|
| 357 |
+
own column, mirroring the PLCC leaderboard layout.
|
| 358 |
+
"""
|
| 359 |
+
model = data["metadata"]["model"]
|
| 360 |
+
effort = (
|
| 361 |
+
data["metadata"]
|
| 362 |
+
.get("model_kwargs", {})
|
| 363 |
+
.get("extra_body", {})
|
| 364 |
+
.get("reasoning", {})
|
| 365 |
+
.get("effort")
|
| 366 |
+
)
|
| 367 |
+
if effort is not None:
|
| 368 |
+
return f"{model} [reasoning: {effort}]"
|
| 369 |
+
return model
|
| 370 |
+
|
| 371 |
+
|
| 372 |
+
def _params_str(params: float | None) -> str | None:
|
| 373 |
+
"""PLCC-style parameter label: ``1.6T`` / ``685B`` / ``8B`` (None if unknown)."""
|
| 374 |
+
p = params or 0
|
| 375 |
+
if p <= 0:
|
| 376 |
+
return None
|
| 377 |
+
return f"{p / 1000:g}T" if p >= 1000 else f"{p:g}B"
|
| 378 |
+
|
| 379 |
+
|
| 380 |
+
def _params_map() -> dict[str, str]:
    """Map each model label to its formatted parameter size.

    The size comes from ``metadata.total_params_b`` of every loaded record;
    records for which :func:`_params_str` returns nothing are left out.
    """
    return {
        _model_label(record): size
        for record in load_records()
        if (size := _params_str(record["metadata"].get("total_params_b")))
    }
|
| 388 |
+
|
| 389 |
+
|
| 390 |
+
def _metric_row(
|
| 391 |
+
label_map: dict,
|
| 392 |
+
summary_metrics: dict,
|
| 393 |
+
row: dict,
|
| 394 |
+
detail_row: dict,
|
| 395 |
+
*,
|
| 396 |
+
include_detail: bool = True,
|
| 397 |
+
) -> None:
|
| 398 |
+
"""Populate leaderboard row and detail row from a label→key map."""
|
| 399 |
+
for key, label in label_map.items():
|
| 400 |
+
vals = summary_metrics.get(key, {})
|
| 401 |
+
row[f"{label} (Δ)"] = vals.get("avg_diff")
|
| 402 |
+
row[f"{label} (Δ%)"] = vals.get("avg_diff_pct")
|
| 403 |
+
if include_detail:
|
| 404 |
+
detail_row[f"{label} before"] = vals.get("avg_before")
|
| 405 |
+
detail_row[f"{label} after"] = vals.get("avg_after")
|
| 406 |
+
detail_row[f"{label} (Δ)"] = vals.get("avg_diff")
|
| 407 |
+
detail_row[f"{label} (Δ%)"] = vals.get("avg_diff_pct")
|
| 408 |
+
|
| 409 |
+
|
| 410 |
+
def load_leaderboard_data() -> tuple[pd.DataFrame, ...]:
    """Load scored JSON files and build leaderboard DataFrames.

    Every record from ``load_records()`` contributes one row to each table.
    The delta tables (readability, lexical, markers) are filled through
    :func:`_metric_row`, which also accumulates the before/after/Δ/Δ% values
    in the shared detail row. The similarity and QuestEval tables copy their
    values straight from the summary. Numeric columns are rounded to four
    decimals.

    Returns:
        (readability_orth_df, readability_lemma_df,
         lexical_orth_df, lexical_lemma_df,
         similarity_df, questeval_df, markers_df, detail_df)

        When ``DATA_DIR`` does not exist, eight separate empty DataFrames
        are returned.
    """
    table_names = (
        "read_orth",
        "read_lemma",
        "lex_orth",
        "lex_lemma",
        "similarity",
        "questeval",
        "markers",
        "detail",
    )

    if not DATA_DIR.exists():
        # One fresh frame per slot: callers that mutate a frame in place
        # (e.g. ``insert``) must not affect the other seven.
        return tuple(pd.DataFrame() for _ in table_names)

    rows: dict[str, list[dict]] = {name: [] for name in table_names}

    for data in load_records():
        summary = data["summary"]
        base = {"Model": _model_label(data), "N": summary["n"]}
        metrics = summary["metrics"]
        markers = summary.get("markers", {})
        detail_row = dict(base)

        # Delta-style tables. Markers share the exact layout of the metric
        # tables, so they go through the same helper instead of a hand-copied
        # loop. Order matters: it fixes the column order of the detail table.
        for name, labels, values in (
            ("read_orth", READABILITY_ORTH_LABELS, metrics),
            ("read_lemma", READABILITY_LEMMA_LABELS, metrics),
            ("lex_orth", LEXICAL_ORTH_LABELS, metrics),
            ("lex_lemma", LEXICAL_LEMMA_LABELS, metrics),
            ("markers", MARKER_LABELS, markers),
        ):
            table_row = dict(base)
            _metric_row(labels, values, table_row, detail_row)
            rows[name].append(table_row)

        # Direct-value tables: the summary stores the score itself.
        for name, labels, values in (
            ("similarity", SIMILARITY_LABELS, summary.get("similarity", {})),
            ("questeval", QUESTEVAL_LABELS, summary.get("questeval", {})),
        ):
            table_row = dict(base)
            table_row.update((label, values.get(key)) for key, label in labels.items())
            rows[name].append(table_row)

        rows["detail"].append(detail_row)

    dfs = [pd.DataFrame(rows[name]) for name in table_names]
    for df in dfs:
        num_cols = df.select_dtypes(include="number").columns
        df[num_cols] = df[num_cols].round(4)

    return tuple(dfs)
|
| 488 |
+
|
| 489 |
+
|
| 490 |
+
@lru_cache(maxsize=1)
def _load_ifeval_records() -> tuple[tuple[str, tuple[tuple, ...]], ...]:
    """Read, once, the per-text IFEval records that have both score kinds.

    Each ``*_scored_anon.json`` file under ``DATA_DIR`` is parsed and its
    ``results`` array scanned. Only entries carrying a non-empty
    ``ifeval_manual`` *and* a non-empty ``ifeval`` object are kept, as

    ``(category, prompt_id, auto_include, auto_exclude, man_include,
    man_exclude)``

    tuples grouped per model label. Models without any matched entry are
    omitted. The result is memoised by ``lru_cache`` so later filter changes
    can re-aggregate without re-reading the files.
    """
    if not DATA_DIR.exists():
        return ()

    per_model: list[tuple[str, tuple[tuple, ...]]] = []
    for path in sorted(DATA_DIR.glob("*_scored_anon.json")):
        with open(path, encoding="utf-8") as handle:
            payload = json.load(handle)
        label = _model_label(payload)

        matched: list[tuple] = []
        for entry in payload["results"]:
            manual, automatic = entry.get("ifeval_manual"), entry.get("ifeval")
            if manual and automatic:
                matched.append(
                    (
                        entry.get("category"),
                        entry.get("prompt_id"),
                        automatic.get("include"),
                        automatic.get("exclude"),
                        manual.get("include"),
                        manual.get("exclude"),
                    )
                )

        if matched:
            per_model.append((label, tuple(matched)))

    return tuple(per_model)
|
| 523 |
+
|
| 524 |
+
|
| 525 |
+
def load_ifeval_comparison_df(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> pd.DataFrame:
    """Compare manual (gold) IFEval against automatic IFEval, per model.

    Only records carrying *both* an automatic and a manual score are
    averaged, i.e. the very same texts scored both ways. ``Δ`` columns are
    manual − automatic: a negative value means the automatic constraints were
    easier to satisfy than the hand-checked ones. The ``(all)`` columns hold
    the automatic averages taken from the summary bucket matching the same
    category/prompt filters, which covers every record rather than just the
    manually scored subset.

    Args:
        text_category: Source-text category to keep; ``None`` or ``"All"``
            disables the filter.
        prompt: Simplification prompt to keep; ``None`` or ``"All"``
            disables the filter.
        size_limit: Passed to ``_filtered_records`` to select models.
        model_type: Passed to ``_filtered_records`` to select models.

    Returns:
        One row per model that has matched records, sorted by ``Model``,
        with numeric columns rounded to four decimals. Empty when nothing
        matches.
    """
    tc = text_category if text_category not in (None, "All") else None
    pr = prompt if prompt not in (None, "All") else None

    # Models that pass the size / model-type filters, and their summaries
    # (needed for the "all records" automatic averages).
    allowed = {_model_label(d) for d in _filtered_records(size_limit, model_type)}
    summaries: dict = {}
    for data in load_records():
        label = _model_label(data)
        if label in allowed:
            summaries[label] = data["summary"]

    rows: list[dict] = []
    for model, recs in _load_ifeval_records():
        if model not in allowed:
            continue

        inc_auto = inc_manual = exc_auto = exc_manual = 0.0
        inc_n = exc_n = 0
        for cat, prompt_id, a_inc, a_exc, m_inc, m_exc in recs:
            if (tc and cat != tc) or (pr and prompt_id != pr):
                continue
            # A pair only counts when both sides scored that constraint kind.
            if m_inc is not None and a_inc is not None:
                inc_auto += a_inc
                inc_manual += m_inc
                inc_n += 1
            if m_exc is not None and a_exc is not None:
                exc_auto += a_exc
                exc_manual += m_exc
                exc_n += 1

        if inc_n == 0 and exc_n == 0:
            continue

        if inc_n:
            auto_inc, man_inc = inc_auto / inc_n, inc_manual / inc_n
        else:
            auto_inc = man_inc = None
        if exc_n:
            auto_exc, man_exc = exc_auto / exc_n, exc_manual / exc_n
        else:
            auto_exc = man_exc = None

        overall = _source_bucket(summaries.get(model, {}), "ifeval", tc, pr)
        all_inc = overall.get("avg_include")
        all_exc = overall.get("avg_exclude")

        delta_inc = (man_inc - auto_inc) if inc_n else None
        delta_inc_all = (man_inc - all_inc) if (inc_n and all_inc is not None) else None
        delta_exc = (man_exc - auto_exc) if exc_n else None
        delta_exc_all = (man_exc - all_exc) if (exc_n and all_exc is not None) else None

        rows.append(
            {
                "Model": model,
                "N": inc_n or exc_n,
                "Manual include": man_inc,
                "Manual exclude": man_exc,
                "Auto include": auto_inc,
                "Auto include (all)": all_inc,
                "Δ include (man−auto)": delta_inc,
                "Δ include (man−auto all)": delta_inc_all,
                "Auto exclude": auto_exc,
                "Auto exclude (all)": all_exc,
                "Δ exclude (man−auto)": delta_exc,
                "Δ exclude (man−auto all)": delta_exc_all,
            }
        )

    df = pd.DataFrame(rows)
    if df.empty:
        return df

    df = df.sort_values("Model").reset_index(drop=True)
    num_cols = df.select_dtypes(include="number").columns
    df[num_cols] = df[num_cols].round(4)
    return df
|
| 604 |
+
|
| 605 |
+
|
| 606 |
+
def text_category_choices() -> list[str]:
    """List the dropdown options for source-text categories.

    The options are ``"All"`` followed by the sorted union of the
    ``metrics_by_category`` keys found in every record's summary.
    """
    seen: set[str] = {
        name
        for record in load_records()
        for name in record["summary"].get("metrics_by_category", {})
    }
    return ["All", *sorted(seen)]
|
| 612 |
+
|
| 613 |
+
|
| 614 |
+
def prompt_choices() -> list[str]:
    """List the dropdown options for simplification prompts.

    The options are ``"All"`` followed by the sorted union of the
    ``metrics_by_prompt`` keys found in every record's summary.
    """
    seen: set[str] = {
        name
        for record in load_records()
        for name in record["summary"].get("metrics_by_prompt", {})
    }
    return ["All", *sorted(seen)]
|
| 620 |
+
|
| 621 |
+
|
| 622 |
+
def _source_bucket(s: dict, source: str, tc: str | None, prompt: str | None) -> dict:
|
| 623 |
+
"""Return the metric bucket for one source, filtered by text category and/or prompt.
|
| 624 |
+
|
| 625 |
+
Picks the overall summary when neither filter is set, the ``*_by_category`` /
|
| 626 |
+
``*_by_prompt`` bucket when one is set, and the ``*_by_category_prompt`` bucket
|
| 627 |
+
(keyed ``"CATEGORY/PROMPT"``) when both are set.
|
| 628 |
+
"""
|
| 629 |
+
if source in ("metrics", "markers", "similarity"):
|
| 630 |
+
if tc and prompt:
|
| 631 |
+
return s.get(f"{source}_by_category_prompt", {}).get(f"{tc}/{prompt}", {})
|
| 632 |
+
if tc:
|
| 633 |
+
return s.get(f"{source}_by_category", {}).get(tc, {})
|
| 634 |
+
if prompt:
|
| 635 |
+
return s.get(f"{source}_by_prompt", {}).get(prompt, {})
|
| 636 |
+
return s.get(source, {})
|
| 637 |
+
# questeval / ifeval keep their per-filter buckets nested under the source object
|
| 638 |
+
src = s.get(source, {})
|
| 639 |
+
if tc and prompt:
|
| 640 |
+
return src.get("by_category_prompt", {}).get(f"{tc}/{prompt}", {})
|
| 641 |
+
if tc:
|
| 642 |
+
return src.get("by_category", {}).get(tc, {})
|
| 643 |
+
if prompt:
|
| 644 |
+
return src.get("by_prompt", {}).get(prompt, {})
|
| 645 |
+
return src
|
| 646 |
+
|
| 647 |
+
|
| 648 |
+
def _bucket_n(s: dict, tc: str | None, prompt: str | None) -> int | None:
    """Return the sample count recorded for the selected filters.

    The ``questeval`` bucket is consulted first, then ``ifeval``; the first
    ``n`` that is not ``None`` wins. ``None`` is returned when neither bucket
    records a count.
    """
    counts = (
        _source_bucket(s, source, tc, prompt).get("n")
        for source in ("questeval", "ifeval")
    )
    return next((count for count in counts if count is not None), None)
|
| 655 |
+
|
| 656 |
+
|
| 657 |
+
def load_category_df(
    category: dict,
    text_category: str | None = None,
    prompt: str | None = None,
) -> pd.DataFrame:
    """Build the table of one metric category, including its RRF score.

    ``category["metrics"]`` holds ``(source, key, label, ascending, in_rrf)``
    tuples; only entries flagged ``in_rrf`` become columns. ``text_category``
    and ``prompt`` (each ignored when ``None`` or ``"All"``) switch the
    values to the matching per-category / per-prompt / combined bucket;
    otherwise the overall summary is used.

    Each metric column that has at least one value is ranked (ties share the
    lowest rank, missing values rank last) and contributes
    ``1 / (RRF_K + rank)`` to the model's ``RRF Score``. The table is sorted
    by that score, best first, and gains ``Rank`` and ``Params`` columns.

    This function scores every loaded model; the original notes state that
    size-limit / model-type filtering is applied afterwards (in
    ``load_rrf_views``) as a pure row filter, so it never changes a rank.
    """
    tc = text_category if text_category not in (None, "All") else None
    pr = prompt if prompt not in (None, "All") else None

    def rrf_columns():
        """Yield (source, key, column name, ascending) for the ranked metrics."""
        for source, key, label, ascending, in_rrf in category["metrics"]:
            if in_rrf:
                yield source, key, _col_name(source, label), ascending

    rows: list[dict] = []
    for data in load_records():
        summary = data["summary"]
        label = _model_label(data)
        if tc or pr:
            # Fall back to the overall count when the bucket has none.
            n = _bucket_n(summary, tc, pr) or summary["n"]
        else:
            n = summary["n"]
        row: dict = {"Model": label, "N": n}

        for source, key, col, _ascending in rrf_columns():
            bucket = _source_bucket(summary, source, tc, pr)
            if source in ("metrics", "markers"):
                value = bucket.get(key, {}).get("avg_diff_pct")
            else:
                # similarity, questeval, ifeval store the value directly
                value = bucket.get(key)
            row[col] = value

        rows.append(row)

    df = pd.DataFrame(rows)
    if df.empty:
        return df

    num_cols = df.select_dtypes(include="number").columns
    df[num_cols] = df[num_cols].round(4)

    score = pd.Series(0.0, index=df.index)
    for _source, _key, col, ascending in rrf_columns():
        if col not in df.columns or df[col].isna().all():
            continue
        positions = df[col].rank(ascending=ascending, method="min", na_option="bottom")
        score += 1.0 / (RRF_K + positions)

    df.insert(2, "RRF Score", score.round(4))
    df = df.sort_values("RRF Score", ascending=False).reset_index(drop=True)
    df.insert(0, "Rank", range(1, len(df) + 1))
    df.insert(2, "Params", df["Model"].map(_params_map()).fillna(""))
    return df
|
| 715 |
+
|
| 716 |
+
|
| 717 |
+
def _plcc_overall_map() -> dict[str, float]:
    """Map each model label to its external PLCC overall score.

    PLCC is a separate Polish-language-competence benchmark (sdadas/plcc).
    The value is taken as-is from ``metadata.plcc.overall``. Records with no
    ``plcc`` entry (missing or null) or no ``overall`` value are left out of
    the mapping.
    """
    scores: dict[str, float] = {}
    for record in load_records():
        overall = (record["metadata"].get("plcc") or {}).get("overall")
        if overall is None:
            continue
        scores[_model_label(record)] = overall
    return scores
|
| 732 |
+
|
| 733 |
+
|
| 734 |
+
def build_final_ranking_df(category_data: list[tuple[dict, pd.DataFrame]]) -> pd.DataFrame:
    """Fuse the per-category RRF scores into the overall ranking.

    Categories flagged ``in_rrf`` (the default) with a non-empty table are
    outer-merged on ``Model``. Within each category the models are ranked by
    their RRF score (1 = best, ties share the lowest rank, missing scores
    rank last); those ranks are shown in the category columns and combined
    as ``weight / (RRF_K + rank)`` into ``Final RRF``, the weight being the
    category's ``rrf_weight`` from ``CATEGORIES`` (default 1).

    The ``PLCC`` column carries the external PLCC benchmark score for
    reference and plays no part in the ranking. ``Params`` and ``N`` are
    looked up per model. An empty DataFrame is returned when no category
    qualifies.
    """
    per_category = [
        cat_df[["Model", "RRF Score"]].rename(columns={"RRF Score": cat["name"]})
        for cat, cat_df in category_data
        if cat.get("in_rrf", True) and not cat_df.empty
    ]
    if not per_category:
        return pd.DataFrame()

    merged = per_category[0]
    for sub in per_category[1:]:
        merged = merged.merge(sub, on="Model", how="outer")
    if merged.empty:
        return pd.DataFrame()

    # N (sample count) is identical across categories for a given model, so
    # the first table that carries it is enough.
    n_map: dict = next(
        (
            dict(zip(table["Model"], table["N"]))
            for _cat, table in category_data
            if not table.empty and {"Model", "N"} <= set(table.columns)
        ),
        {},
    )

    score_cols = [name for name in merged.columns if name != "Model"]
    weights = {cat["name"]: cat.get("rrf_weight", 1) for cat in CATEGORIES}

    fused = pd.Series(0.0, index=merged.index)
    ranks_by_category: dict[str, pd.Series] = {}
    for name in score_cols:
        position = (
            merged[name]
            .rank(ascending=False, method="min", na_option="bottom")
            .astype(int)
        )
        fused += weights.get(name, 1) / (RRF_K + position)
        ranks_by_category[name] = position

    out = merged[["Model"]].copy()
    out.insert(1, "Final RRF", fused.round(4))
    out.insert(2, "PLCC", out["Model"].map(_plcc_overall_map()).round(2))
    for name, position in ranks_by_category.items():
        out[name] = position

    out = out.sort_values("Final RRF", ascending=False).reset_index(drop=True)
    out.insert(0, "Rank", range(1, len(out) + 1))
    out.insert(2, "Params", out["Model"].map(_params_map()).fillna(""))
    out.insert(3, "N", out["Model"].map(n_map).astype("Int64"))
    return out
|
| 780 |
+
|
| 781 |
+
|
| 782 |
+
def build_tradeoff_scatter(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> go.Figure | None:
    """Plot Gunning Fog reduction against QuestEval F1, one marker per model.

    X: Gunning Fog orth Δ% (more negative = greater complexity reduction)
    Y: QuestEval F1 (higher = better meaning preservation)

    Models come from ``_filtered_records(size_limit, model_type)`` and the
    values from the buckets selected by ``text_category`` / ``prompt``
    (``None`` or ``"All"`` meaning no filter). Models lacking either value
    are skipped; ``None`` is returned when no model can be plotted. Dotted
    guide lines mark the midpoint of each axis' value range.
    """
    tc = text_category if text_category not in (None, "All") else None
    pr = prompt if prompt not in (None, "All") else None

    points = []
    for record in _filtered_records(size_limit, model_type):
        summary = record["summary"]
        label = _model_label(record)
        fog = _source_bucket(summary, "metrics", tc, pr).get("gunning_fog_orth", {})
        x = fog.get("avg_diff_pct")
        y = _source_bucket(summary, "questeval", tc, pr).get("f1")
        if x is not None and y is not None:
            points.append((label, x, y))

    if not points:
        return None

    models, xs, ys = zip(*points)

    trace = go.Scatter(
        x=xs,
        y=ys,
        mode="markers+text",
        text=models,
        textposition="top center",
        textfont={"size": 10},
        marker={"size": 12, "color": "#4C78A8", "line": {"width": 1, "color": "white"}},
        hovertemplate="<b>%{text}</b><br>Gunning Fog Δ%: %{x:.2f}<br>QuestEval F1: %{y:.3f}<extra></extra>",
    )
    fig = go.Figure()
    fig.add_trace(trace)

    # Guide lines through the centre of the observed ranges.
    x_mid = (min(xs) + max(xs)) / 2
    y_mid = (min(ys) + max(ys)) / 2
    fig.add_hline(y=y_mid, line_dash="dot", line_color="lightgray")
    fig.add_vline(x=x_mid, line_dash="dot", line_color="lightgray")

    fig.update_layout(
        title="Complexity reduction vs meaning preservation",
        xaxis_title="Gunning Fog orth Δ% (← easier text)",
        yaxis_title="QuestEval F1 (↑ meaning preserved)",
        height=560,
        margin={"l": 60, "r": 40, "t": 60, "b": 60},
        plot_bgcolor="white",
    )
    fig.update_xaxes(showgrid=True, gridcolor="#EEE", zeroline=True, zerolinecolor="#CCC")
    fig.update_yaxes(showgrid=True, gridcolor="#EEE")

    return fig
|
| 844 |
+
|
| 845 |
+
|
| 846 |
+
def build_fog_nli_scatter(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> go.Figure | None:
    """Plot Gunning Fog reduction against NLI F1, one marker per model.

    X: Gunning Fog orth Δ% (more negative = greater complexity reduction)
    Y: NLI F1 (higher = stronger entailment / meaning preserved)

    Models come from ``_filtered_records(size_limit, model_type)`` and the
    values from the buckets selected by ``text_category`` / ``prompt``
    (``None`` or ``"All"`` meaning no filter). Models lacking either value
    are skipped; ``None`` is returned when no model can be plotted. Dotted
    guide lines mark the midpoint of each axis' value range.
    """
    tc = text_category if text_category not in (None, "All") else None
    pr = prompt if prompt not in (None, "All") else None

    points = []
    for record in _filtered_records(size_limit, model_type):
        summary = record["summary"]
        label = _model_label(record)
        fog = _source_bucket(summary, "metrics", tc, pr).get("gunning_fog_orth", {})
        x = fog.get("avg_diff_pct")
        y = _source_bucket(summary, "similarity", tc, pr).get("nli_f1")
        if x is not None and y is not None:
            points.append((label, x, y))

    if not points:
        return None

    models, xs, ys = zip(*points)

    trace = go.Scatter(
        x=xs,
        y=ys,
        mode="markers+text",
        text=models,
        textposition="top center",
        textfont={"size": 10},
        marker={"size": 12, "color": "#E45756", "line": {"width": 1, "color": "white"}},
        hovertemplate="<b>%{text}</b><br>Gunning Fog Δ%: %{x:.2f}<br>NLI F1: %{y:.3f}<extra></extra>",
    )
    fig = go.Figure()
    fig.add_trace(trace)

    # Guide lines through the centre of the observed ranges.
    x_mid = (min(xs) + max(xs)) / 2
    y_mid = (min(ys) + max(ys)) / 2
    fig.add_hline(y=y_mid, line_dash="dot", line_color="lightgray")
    fig.add_vline(x=x_mid, line_dash="dot", line_color="lightgray")

    fig.update_layout(
        title="Complexity reduction vs NLI consistency",
        xaxis_title="Gunning Fog orth Δ% (← easier text)",
        yaxis_title="NLI F1 (↑ meaning preserved)",
        height=560,
        margin={"l": 60, "r": 40, "t": 60, "b": 60},
        plot_bgcolor="white",
    )
    fig.update_xaxes(showgrid=True, gridcolor="#EEE", zeroline=True, zerolinecolor="#CCC")
    fig.update_yaxes(showgrid=True, gridcolor="#EEE")

    return fig
|
| 908 |
+
|
| 909 |
+
|
| 910 |
+
INTRO = """\
|
| 911 |
+
# PLainBench - Polish Text Simplification Leaderboard
|
| 912 |
+
|
| 913 |
+
This benchmark evaluates how well LLMs simplify difficult Polish texts -
|
| 914 |
+
drawn from legal/administrative (BIP/GOV), finance, and science domains - while
|
| 915 |
+
preserving the original meaning. Each model simplifies 210 source texts under
|
| 916 |
+
5 simplification prompts (1050 outputs per model). Outputs are scored on
|
| 917 |
+
readability indices, fine-grained difficulty markers (lexical, syntactic,
|
| 918 |
+
morphological), meaning preservation (NLI entailment, QuestEval QA consistency,
|
| 919 |
+
named-entity retention), and instruction following (IFEval include/exclude).
|
| 920 |
+
The per-category scores are fused into an overall **Final RRF** ranking.
|
| 921 |
+
"""
|
| 922 |
+
|
| 923 |
+
METRICS_DOC = """\
|
| 924 |
+
## Metrics
|
| 925 |
+
|
| 926 |
+
### Readability indices
|
| 927 |
+
|
| 928 |
+
All indices are adapted for Polish syllable counting via `pyhyphen` (pl_PL
|
| 929 |
+
dictionary) and counted on surface (orthographic) word forms.
|
| 930 |
+
|
| 931 |
+
Δ is the absolute change (after − before); Δ% is the average percentage change
|
| 932 |
+
from the original text to the simplified text.
|
| 933 |
+
|
| 934 |
+
| Metric | Formula | Interpretation |
|
| 935 |
+
|---|---|---|
|
| 936 |
+
| **Flesch Reading Ease** | `206.835 − 1.015 × (words/sentences) − 84.6 × (syllables/words)` | Higher → easier text (0–100 typical range). **Desired Δ%: positive (+)** |
|
| 937 |
+
| **Gunning Fog** | `0.4 × [(words/sentences) + 100 × (complex_words/words)]` | School years needed (complex = ≥ 4 syllables). Lower → easier. **Desired Δ%: negative (−)** |
|
| 938 |
+
| **Coleman-Liau** | `0.0588 × L − 0.296 × S − 15.8` | Character-based grade level. Lower → easier. **Desired Δ%: negative (−)** |
|
| 939 |
+
|
| 940 |
+
### Difficulty markers
|
| 941 |
+
|
| 942 |
+
Fine-grained syntactic, morphological, and lexical features.
|
| 943 |
+
Δ is absolute change; Δ% is percentage change.
|
| 944 |
+
Difficult words are defined as not a named entity, ≥ 4 syllables, counted on the
|
| 945 |
+
surface (orthographic) form.
|
| 946 |
+
|
| 947 |
+
| Marker | Description | Desired Δ% |
|
| 948 |
+
|---|---|---|
|
| 949 |
+
| **Avg word syllables** | Mean syllable count per word | − (shorter words) |
|
| 950 |
+
| **Difficult word ratio (orth)** | Difficult words / all words (surface, excl. NEs) | − |
|
| 951 |
+
| **Difficult noun ratio (orth)** | Difficult nouns / all tokens (surface, excl. NEs) | − |
|
| 952 |
+
| **Verb ratio** | Verbs / all tokens | + (more verbal, less nominal) |
|
| 953 |
+
| **Avg sentence length** | Mean tokens per sentence | − (shorter sentences) |
|
| 954 |
+
| **Mean dep. distance** | Avg linear head-dependent distance (syntax complexity) | − (flatter syntax) |
|
| 955 |
+
| **Subordination index** | Subordinate clauses / total clauses | − |
|
| 956 |
+
| **Adverbial participle ratio** | Adverbial participles (converbs, e.g. *czytając*, *przeczytawszy*) / all tokens | − |
|
| 957 |
+
| **Gerund ratio** | Gerunds / all tokens | − |
|
| 958 |
+
| **Impersonal verb ratio** | Impersonal verb forms (modals *należy*/*trzeba*, -no/-to passives, reflexive *się*, infinitives) / all verbs | − |
|
| 959 |
+
| **Genitive noun ratio** | Nouns in genitive case / all tokens | − |
|
| 960 |
+
| **Avg genitive chain** | Mean length of consecutive genitive noun phrases | − |
|
| 961 |
+
| **Verbo-nominal ratio** | Light-verb + noun periphrases (*dokonać wpłaty*, *podjąć decyzję*); administrative style | − |
|
| 962 |
+
| **OSC noun ratio** | Abstract *-ość* nouns (*możliwość*, *konieczność*) / all nouns | − |
|
| 963 |
+
|
| 964 |
+
### Similarity metrics
|
| 965 |
+
|
| 966 |
+
Reference-based metrics comparing simplified text against the original.
|
| 967 |
+
|
| 968 |
+
| Metric | Description | Direction |
|
| 969 |
+
|---|---|---|
|
| 970 |
+
| **NLI P / R / F1** | NLI consistency via stella embeddings + mDeBERTa cross-encoder | Higher = stronger entailment |
|
| 971 |
+
| **NE Retention** | Fraction of named entities from the original kept in the simplified text | Higher = more entities preserved |
|
| 972 |
+
|
| 973 |
+
*Only **NLI F1** feeds the RRF score; P and R are shown for context.*
|
| 974 |
+
|
| 975 |
+
### QuestEval - QA consistency
|
| 976 |
+
|
| 977 |
+
| Metric | Description | Direction |
|
| 978 |
+
|---|---|---|
|
| 979 |
+
| **QuestEval P** | Backward precision - grounding of simplified claims | Higher = fewer hallucinations |
|
| 980 |
+
| **QuestEval R** | Forward recall - information preserved | Higher = less content dropped |
|
| 981 |
+
| **QuestEval F1** | Harmonic mean of P and R | Higher = better meaning preservation |
|
| 982 |
+
| **Answerable (fwd)** | Fraction of forward questions answerable | Higher = stays on-topic |
|
| 983 |
+
| **Answerable (bwd)** | Fraction of backward questions answerable | Higher = claims traceable to original |
|
| 984 |
+
|
| 985 |
+
*Only **QuestEval F1** feeds the RRF score; the other rows are shown for context.*
|
| 986 |
+
|
| 987 |
+
### IFEval - instruction following
|
| 988 |
+
|
| 989 |
+
| Metric | Description | Direction |
|
| 990 |
+
|---|---|---|
|
| 991 |
+
| **IFEval include** | Fraction of *include* constraints (terms the simplification must keep) satisfied | Higher = better |
|
| 992 |
+
| **IFEval exclude** | Fraction of *exclude* constraints (terms the simplification must avoid) satisfied | Higher = better |
|
| 993 |
+
"""
|
| 994 |
+
|
| 995 |
+
# Sample-count note shown under each table that carries an ``N`` column.
|
| 996 |
+
N_NOTE = "**N** = number of prompt × text evaluations per model."
|
| 997 |
+
|
| 998 |
+
# The five simplification prompts every model is run with. The keys match the
|
| 999 |
+
# "Simplification prompt" filter values (and the ``*_by_prompt`` summary
|
| 1000 |
+
# buckets); each value is ``(short description, user-message template)``, where
|
| 1001 |
+
# ``<text>`` marks where the source text is inserted. Kept in sync with
|
| 1002 |
+
# generation/prompting/instruction.py. Ordered from least to most detailed.
|
| 1003 |
+
PROMPTS: dict[str, tuple[str, str]] = {
|
| 1004 |
+
"mini": (
|
| 1005 |
+
"Minimal - a single-line instruction, no rules.",
|
| 1006 |
+
"Uprość podany tekst, w odpowiedzi podaj jedynie uproszczony tekst, "
|
| 1007 |
+
"bez dodatkowych komentarzy. Tekst do uproszczenia:\n\n<text>",
|
| 1008 |
+
),
|
| 1009 |
+
"compact": (
|
| 1010 |
+
"Compact - a short bulleted rule set.",
|
| 1011 |
+
"""Uprość poniższy tekst, tak aby był jasny, krótki i zrozumiały dla osoby bez wiedzy specjalistycznej.
|
| 1012 |
+
|
| 1013 |
+
Zasady:
|
| 1014 |
+
- Skup się na najważniejszych informacjach, usuń zbędne treści.
|
| 1015 |
+
- Uporządkuj tekst: najważniejsze na początku, podziel na krótkie akapity.
|
| 1016 |
+
- Używaj prostego, codziennego słownictwa; trudne pojęcia zastępuj lub krótko wyjaśniaj.
|
| 1017 |
+
- Twórz krótkie zdania (jedna myśl = jedno zdanie).
|
| 1018 |
+
- Pisz bezpośrednio do odbiorcy i używaj strony czynnej.
|
| 1019 |
+
- Unikaj żargonu, urzędowego stylu, skomplikowanych konstrukcji i podwójnych przeczeń.
|
| 1020 |
+
- Zachowaj poprawność językową i logiczną spójność.
|
| 1021 |
+
- W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.
|
| 1022 |
+
|
| 1023 |
+
---
|
| 1024 |
+
|
| 1025 |
+
### Tekst do uproszczenia:
|
| 1026 |
+
|
| 1027 |
+
<text>""",
|
| 1028 |
+
),
|
| 1029 |
+
"medium": (
|
| 1030 |
+
"Medium - moderately detailed rules with sub-points.",
|
| 1031 |
+
"""Uprość poniższy tekst tak, aby był zrozumiały dla osoby bez wiedzy specjalistycznej, zachowując jego sens.
|
| 1032 |
+
|
| 1033 |
+
### Zasady:
|
| 1034 |
+
- Skup się na celu tekstu i najważniejszych informacjach — usuń dygresje i zbędne treści.
|
| 1035 |
+
- Uporządkuj treść: zacznij od najważniejszych informacji, podziel tekst na krótkie akapity.
|
| 1036 |
+
- Stosuj proste i naturalne słownictwo:
|
| 1037 |
+
- zamieniaj trudne lub specjalistyczne wyrazy na prostsze,
|
| 1038 |
+
- jeśli trzeba — krótko je wyjaśnij lub podaj przykład.
|
| 1039 |
+
- Buduj krótkie zdania (jedna myśl = jedno zdanie, ok. 15–20 słów).
|
| 1040 |
+
- Pisz bezpośrednio do odbiorcy i używaj strony czynnej.
|
| 1041 |
+
- Unikaj:
|
| 1042 |
+
- żargonu, stylu urzędowego i zapożyczeń,
|
| 1043 |
+
- form bezosobowych i strony biernej (jeśli nie są konieczne),
|
| 1044 |
+
- nadmiaru rzeczowników odczasownikowych,
|
| 1045 |
+
- podwójnych przeczeń i zawiłych konstrukcji.
|
| 1046 |
+
- Zachowaj poprawność językową, spójność i logiczny układ tekstu.
|
| 1047 |
+
- W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.
|
| 1048 |
+
|
| 1049 |
+
---
|
| 1050 |
+
|
| 1051 |
+
### Tekst do uproszczenia:
|
| 1052 |
+
|
| 1053 |
+
<text>""",
|
| 1054 |
+
),
|
| 1055 |
+
"long": (
|
| 1056 |
+
"Long - full, sectioned plain-language guidelines.",
|
| 1057 |
+
"""Uprość poniższy tekst zgodnie z zasadami prostego języka.
|
| 1058 |
+
|
| 1059 |
+
### 1. Cel i odbiorca
|
| 1060 |
+
- Dostosuj tekst do przeciętnego odbiorcy (zakładaj brak wiedzy specjalistycznej).
|
| 1061 |
+
- Skup się na najważniejszych informacjach.
|
| 1062 |
+
|
| 1063 |
+
### 2. Struktura
|
| 1064 |
+
- Usuń informacje zbędne i poboczne.
|
| 1065 |
+
- Uporządkuj treść: najważniejsze informacje podaj na początku.
|
| 1066 |
+
- Podziel tekst na krótkie akapity (1 akapit = 1 myśl).
|
| 1067 |
+
- Jeśli tekst jest dłuższy, użyj nagłówków lub list.
|
| 1068 |
+
|
| 1069 |
+
### 3. Słownictwo
|
| 1070 |
+
- Zastępuj trudne słowa prostszymi.
|
| 1071 |
+
- Unikaj:
|
| 1072 |
+
- terminów specjalistycznych (chyba że je wyjaśnisz),
|
| 1073 |
+
- słów rzadkich, książkowych i urzędowych,
|
| 1074 |
+
- zapożyczeń i modnych zwrotów,
|
| 1075 |
+
- skrótów niezrozumiałych dla odbiorcy.
|
| 1076 |
+
- W razie potrzeby:
|
| 1077 |
+
- wyjaśnij trudne pojęcia,
|
| 1078 |
+
- podaj przykłady,
|
| 1079 |
+
- używaj konkretnych nazw zamiast ogólników.
|
| 1080 |
+
|
| 1081 |
+
### 4. Składnia
|
| 1082 |
+
- Twórz krótkie zdania (ok. 20 słów).
|
| 1083 |
+
- Jedno zdanie = jedna myśl.
|
| 1084 |
+
- Używaj zdań twierdzących.
|
| 1085 |
+
- Pisz bezpośrednio do odbiorcy (np. „Ty”, „możesz”, „zrób”).
|
| 1086 |
+
- Używaj strony czynnej zamiast biernej.
|
| 1087 |
+
- Unikaj form bezosobowych i skomplikowanych konstrukcji.
|
| 1088 |
+
- Ogranicz rzeczowniki odczasownikowe (zamieniaj je na czasowniki).
|
| 1089 |
+
|
| 1090 |
+
### 5. Styl
|
| 1091 |
+
- Unikaj podwójnych przeczeń.
|
| 1092 |
+
- Upraszczaj złożone konstrukcje.
|
| 1093 |
+
- Zachowaj naturalny, jasny ton.
|
| 1094 |
+
|
| 1095 |
+
### 6. Końcowa kontrola
|
| 1096 |
+
- Sprawdź, czy tekst jest:
|
| 1097 |
+
- zrozumiały,
|
| 1098 |
+
- poprawny językowo,
|
| 1099 |
+
- logiczny i spójny.
|
| 1100 |
+
|
| 1101 |
+
### 7. W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.
|
| 1102 |
+
|
| 1103 |
+
---
|
| 1104 |
+
|
| 1105 |
+
### Tekst do uproszczenia:
|
| 1106 |
+
|
| 1107 |
+
<text>""",
|
| 1108 |
+
),
|
| 1109 |
+
"step_by_step": (
|
| 1110 |
+
"Step by step - role-based, numbered editorial guidelines.",
|
| 1111 |
+
"""Jesteś redaktorem odpowiedzialnym za wydawanie tekstów pisanych prostym językiem. Krok po kroku upraszczaj tekst kierując się poniższymi regułami:
|
| 1112 |
+
|
| 1113 |
+
1. Zachowaj prostotę - unikaj skomplikowanych i trudnych słów oraz zdań.
|
| 1114 |
+
2. Używaj zrozumiałego języka - pamiętaj, że tekst ma być czytelny dla szerokiego grona odbiorców, nie tylko dla osób z wiedzą specjalistyczną z danej dziedziny.
|
| 1115 |
+
3. Unikaj języka żargonowego czy specjalistycznego, jeśli czujesz, że nie jest on zrozumiały dla większości czytelników.
|
| 1116 |
+
4. Unikaj zbytnio skomplikowanych zwrotów - stawiaj na klarowność i prostotę.
|
| 1117 |
+
5. Utrzymuj spójność - trzymaj się jednego stylu pisania i konsekwentnie go stosuj.
|
| 1118 |
+
6. Unikaj nadmiernego użycia skrótów czy akronimów – jeśli ich używasz, to upewnij się, że są one łatwe do zrozumienia dla wszystkich czytelników.
|
| 1119 |
+
7. Przemyśl kolejność i strukturę informacji - rozmieszczenie treści powinno być logiczne i przemyślane.
|
| 1120 |
+
8. Unikaj zbędnego wydłużania tekstu, pisz zwięźle i konkretnie.
|
| 1121 |
+
9. W odpowiedzi podaj jedynie uproszczony tekst, bez dodatkowych komentarzy.
|
| 1122 |
+
|
| 1123 |
+
---
|
| 1124 |
+
|
| 1125 |
+
### Tekst do uproszczenia:
|
| 1126 |
+
|
| 1127 |
+
<text>""",
|
| 1128 |
+
),
|
| 1129 |
+
}
|
| 1130 |
+
|
| 1131 |
+
# ── PLCC-inspired visual style ──────────────────────────────────────────────
|
| 1132 |
+
# Mirrors the sdadas/plcc leaderboard: clean white background, a system
|
| 1133 |
+
# sans-serif stack, a blue accent (#2c7be5), and flat tables with shaded
|
| 1134 |
+
# (#f9fafd) headers, #ddd borders and faintly striped rows. Done entirely in
|
| 1135 |
+
# CSS — a custom gr.themes.* would tint the component label chips blue, which
|
| 1136 |
+
# is not part of the PLCC look.
|
| 1137 |
+
PLCC_CSS = """
|
| 1138 |
+
.gradio-container {
|
| 1139 |
+
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto,
|
| 1140 |
+
"Helvetica Neue", Arial, sans-serif !important;
|
| 1141 |
+
max-width: 1500px !important;
|
| 1142 |
+
}
|
| 1143 |
+
/* PLCC-style data tables */
|
| 1144 |
+
.plain-table table { border: 1px solid #ddd !important; font-size: 0.9em; }
|
| 1145 |
+
.plain-table thead th {
|
| 1146 |
+
background: #f9fafd !important;
|
| 1147 |
+
border-bottom: 2px solid #ddd !important;
|
| 1148 |
+
color: #222 !important;
|
| 1149 |
+
font-weight: 700 !important;
|
| 1150 |
+
}
|
| 1151 |
+
.plain-table tbody td { padding: 8px 10px !important; }
|
| 1152 |
+
.plain-table tbody tr:nth-child(even) > td { background: rgba(0, 0, 0, 0.02) !important; }
|
| 1153 |
+
.plain-table tbody tr:hover > td { background: rgba(44, 123, 229, 0.06) !important; }
|
| 1154 |
+
/* Params column (3rd) — right-aligned, PLCC-style; cells muted grey, header black */
|
| 1155 |
+
.params-col tbody td:nth-child(3),
|
| 1156 |
+
.params-col thead th:nth-child(3) {
|
| 1157 |
+
text-align: right !important;
|
| 1158 |
+
white-space: nowrap;
|
| 1159 |
+
}
|
| 1160 |
+
.params-col tbody td:nth-child(3) { color: #999 !important; }
|
| 1161 |
+
/* Filter bar — the grey rounded block holding the dropdowns */
|
| 1162 |
+
.filter-bar {
|
| 1163 |
+
background: #f9fafd;
|
| 1164 |
+
border: 1px solid #ddd;
|
| 1165 |
+
border-radius: 0.5rem;
|
| 1166 |
+
padding: 10px 14px;
|
| 1167 |
+
}
|
| 1168 |
+
"""
|
| 1169 |
+
|
| 1170 |
+
# Colour palette for category bars
|
| 1171 |
+
_CAT_COLORS = ["#4C78A8", "#72B7B2", "#54A24B", "#EECA3B", "#B279A2", "#E45756", "#F58518"]
|
| 1172 |
+
|
| 1173 |
+
|
| 1174 |
+
def _filter_model_rows(df: pd.DataFrame, allowed: set[str]) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``Model`` value appears in ``allowed``.

    A frame that is empty, or that has no ``Model`` column, is handed back
    unchanged (the very same object). Otherwise the surviving rows are
    re-indexed from zero; no other column value is modified.
    """
    has_model_column = "Model" in df.columns
    if not has_model_column or df.empty:
        return df
    keep_mask = df["Model"].isin(allowed)
    return df.loc[keep_mask].reset_index(drop=True)
|
| 1179 |
+
|
| 1180 |
+
|
| 1181 |
+
def load_rrf_views(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> tuple[pd.DataFrame, list[tuple[dict, pd.DataFrame]]]:
    """Build the final-ranking table and the per-category tables for a filter selection.

    The work happens in two stages:

    1. Every category table is loaded with only the ``text_category`` and
       ``prompt`` filters, and the final ranking is built from those
       unfiltered-by-model tables.
    2. ``size_limit`` and ``model_type`` are turned into a set of permitted
       model labels, and rows for any other model are dropped from the final
       table and from each category table. Nothing is re-ranked at this
       stage, so the remaining rows keep the values computed in stage 1.

    Returns:
        ``(final_df, category_data)`` where ``category_data`` is a list of
        ``(category, DataFrame)`` pairs in ``CATEGORIES`` order.
    """
    # Stage 1: tables and ranking computed before any model is hidden.
    full_tables = []
    for cat in CATEGORIES:
        full_tables.append((cat, load_category_df(cat, text_category, prompt)))
    ranking = build_final_ranking_df(full_tables)

    # Stage 2: pure row filtering by model label.
    visible_models = set()
    for record in _filtered_records(size_limit, model_type):
        visible_models.add(_model_label(record))

    ranking_view = _filter_model_rows(ranking, visible_models)
    category_views = [
        (cat, _filter_model_rows(table, visible_models)) for cat, table in full_tables
    ]
    return ranking_view, category_views
|
| 1203 |
+
|
| 1204 |
+
|
| 1205 |
+
def _tradeoff_figs(
    text_category: str | None = None,
    prompt: str | None = None,
    size_limit: str | None = None,
    model_type: str | None = None,
) -> tuple[go.Figure, go.Figure]:
    """Return the pair of trade-off scatter figures for the given filter selection.

    The first element comes from ``build_tradeoff_scatter`` and the second from
    ``build_fog_nli_scatter``; both receive the same four filter arguments.
    Whenever a builder hands back a falsy result, a blank ``go.Figure()`` is
    substituted so the caller always gets two figure objects.
    """
    selection = (text_category, prompt, size_limit, model_type)
    tradeoff = build_tradeoff_scatter(*selection) or go.Figure()
    fog_nli = build_fog_nli_scatter(*selection) or go.Figure()
    return tradeoff, fog_nli
|
| 1216 |
+
|
| 1217 |
+
|
| 1218 |
+
def build_app() -> gr.Blocks:
    """Assemble the leaderboard UI and return it as a ``gr.Blocks`` app.

    Loads the unfiltered tables and figures once, then lays out (in order) the
    intro text, the four filter dropdowns, the result tabs, the metric
    documentation and one accordion per simplification prompt. Each dropdown's
    ``change`` event is wired to a handler that recomputes every component
    collected in ``rrf_outputs``. When the loaded data is empty, only the
    intro and a "no data" notice are rendered.

    The app is only constructed here; it is not launched.
    """
    # Only ``read_orth_df`` and ``detail_df`` are used below; the other names
    # are bound just to unpack the loader's 8-tuple.
    (
        read_orth_df, read_lemma_df,
        lex_orth_df, lex_lemma_df,
        similarity_df, questeval_df,
        markers_df, detail_df,
    ) = load_leaderboard_data()

    # Initial (unfiltered) values shown before the user touches any dropdown.
    ifeval_cmp_df = load_ifeval_comparison_df()
    final_df, category_data = load_rrf_views(None, None)
    tc_choices = text_category_choices()
    pr_choices = prompt_choices()
    size_choices = _visible_size_limits()
    tradeoff_fig, fog_nli_fig = _tradeoff_figs(None, None)

    with gr.Blocks(title="PLainBench", css=PLCC_CSS) as app:
        gr.Markdown(INTRO)

        if read_orth_df.empty:
            gr.Markdown("*No data found. Upload scored anon JSON files to the `data/current/` directory.*")
        else:
            # Reactive output components, gathered in the order the change
            # handler returns them: final table, then one table per in-RRF
            # category, then the two trade-off scatters (and the IFEval table).
            rrf_outputs: list = []

            with gr.Row(elem_classes=["filter-bar"]):
                tc_dropdown = gr.Dropdown(
                    choices=tc_choices,
                    value="All",
                    label="Text category",
                    info="Filter the RRF rankings to one source-text category.",
                )
                pr_dropdown = gr.Dropdown(
                    choices=pr_choices,
                    value="All",
                    label="Simplification prompt",
                    info="Filter the RRF rankings to one simplification prompt.",
                )
                size_dropdown = gr.Dropdown(
                    choices=size_choices,
                    value="ALL",
                    label="Size limit",
                    info="Keep only models up to this many parameters.",
                )
                type_dropdown = gr.Dropdown(
                    choices=MODEL_TYPES,
                    value="ALL",
                    label="Model type",
                    info="Filter by open- vs closed-weights models.",
                )

            with gr.Tabs():

                # ── Final Ranking ──────────────────────────────────────────
                with gr.TabItem("Final Ranking"):
                    gr.Markdown(
                        "Final model ranking via **Reciprocal Rank Fusion** (k=60) over per-category RRF scores. "
                        "Each category ranks models by its own RRF score; those ranks are then fused into a "
                        "single **Final RRF** score. Higher = better overall simplification. "
                        "The **PLCC** column shows the model's score on the external "
                        "[PLCC](https://huggingface.co/spaces/sdadas/plcc) Polish-language-competence "
                        "benchmark for reference only - it does not affect the ranking (blank where unavailable)."
                    )
                    final_table = gr.Dataframe(
                        value=final_df, interactive=False, wrap=True,
                        elem_classes=["plain-table", "params-col"],
                    )
                    gr.Markdown(N_NOTE)
                    rrf_outputs += [final_table]

                # ── RRF category tabs ──────────────────────────────────────
                for cat, cat_df in category_data:
                    # Categories flagged ``in_rrf=False`` get no tab (and no output slot).
                    if not cat.get("in_rrf", True):
                        continue
                    with gr.TabItem(cat["name"]):
                        gr.Markdown(cat["description"])
                        cat_table = gr.Dataframe(
                            value=cat_df, interactive=False, wrap=True,
                            elem_classes=["plain-table", "params-col"],
                        )
                        gr.Markdown(N_NOTE)
                        rrf_outputs += [cat_table]

                # ── Trade-off plots ────────────────────────────────────────
                with gr.TabItem("Trade-off"):
                    gr.Markdown(
                        "Complexity reduction (Gunning Fog orth Δ%) versus meaning preservation "
                        "(QuestEval F1), one point per model. Top-left is ideal: "
                        "greater complexity reduction **and** faithful to the original."
                    )
                    tradeoff_plot = gr.Plot(value=tradeoff_fig)
                    gr.Markdown(
                        "---\n"
                        "Gunning Fog orth reduction (Δ%) versus NLI F1. "
                        "Top-left is best: greater complexity reduction **and** strong NLI entailment."
                    )
                    fog_nli_plot = gr.Plot(value=fog_nli_fig)
                    rrf_outputs += [tradeoff_plot, fog_nli_plot]

                # Created hidden (``visible=False``) and not part of ``rrf_outputs``,
                # so the filters never update this table.
                with gr.TabItem("Detailed scores", visible=False):
                    gr.Markdown(
                        "Average scores before and after simplification, plus absolute (Δ) "
                        "and percentage (Δ%) change - for all readability, lexical, and marker metrics."
                    )
                    gr.Dataframe(
                        value=detail_df, interactive=False, wrap=True,
                        elem_classes=["plain-table"],
                    )

                # ── IFEval: manual vs automatic ────────────────────────────
                if not ifeval_cmp_df.empty:
                    with gr.TabItem("IFEval manual vs auto"):
                        gr.Markdown(
                            "**Automatic** IFEval constraints are generated by an LLM; "
                            "**manual** constraints are hand-written gold rules, available for a "
                            "subset of the prompts. To isolate rule quality from sampling, the "
                            "comparison is restricted to the texts that carry **both** scores "
                            "(N = matched texts per model), so these automatic figures differ from "
                            "the full-sample IFEval used elsewhere.\n\n"
                            "**include** = fraction of *include* constraints satisfied, "
                            "**exclude** = fraction of *exclude* constraints satisfied (higher is "
                            "better for both). **Δ = manual − automatic** (on the matched texts): a "
                            "negative Δ means the automatic rules were easier to satisfy than the "
                            "hand-checked ones (more lenient automatic scoring). The **(all)** columns "
                            "show automatic IFEval over *every* text (the full-sample figure used "
                            "elsewhere). **Δ (man−auto all)** is manual minus that full-sample "
                            "automatic value - useful as a sanity check, but note the two cover "
                            "different text sets (matched subset vs. all texts), so **Δ (man−auto)** "
                            "is the rigorous like-for-like comparison."
                        )
                        ifeval_cmp_table = gr.Dataframe(
                            value=ifeval_cmp_df, interactive=False, wrap=True,
                            elem_classes=["plain-table"],
                        )
                        rrf_outputs.append(ifeval_cmp_table)

            # Metric documentation, shown below the results.
            gr.Markdown(METRICS_DOC)

            # Simplification prompts, documenting the "Simplification prompt"
            # filter values — shown below the metric documentation.
            gr.Markdown(
                "## Simplification prompts\n\n"
                "The five prompt templates every model is run with - these are the "
                "values of the **Simplification prompt** filter above. Each source "
                "text is simplified once per prompt, so they range from a bare "
                "one-line instruction to full plain-language guidelines. "
                "`<text>` marks where the source text is inserted."
            )
            for _name, (_desc, _body) in PROMPTS.items():
                with gr.Accordion(f"{_name} - {_desc}", open=False):
                    # Fenced so the template is rendered verbatim rather than as Markdown.
                    gr.Markdown(f"```\n{_body}\n```")

            # Recompute the RRF rankings whenever any filter changes.
            _filters = [tc_dropdown, pr_dropdown, size_dropdown, type_dropdown]

            def _refresh_rrf(
                text_category: str, prompt: str, size_limit: str, model_type: str
            ) -> list:
                """Return fresh values for every component in ``rrf_outputs``, in the same order."""
                f_df, cat_data = load_rrf_views(text_category, prompt, size_limit, model_type)
                updates: list = [f_df]
                for cat, df in cat_data:
                    # Same skip rule as the tab-building loop, so positions line up.
                    if not cat.get("in_rrf", True):
                        continue
                    updates += [df]
                updates += list(_tradeoff_figs(text_category, prompt, size_limit, model_type))
                # Mirrors the build-time check: the IFEval table is an output
                # only if its tab was created.
                if not ifeval_cmp_df.empty:
                    updates.append(
                        load_ifeval_comparison_df(text_category, prompt, size_limit, model_type)
                    )
                return updates

            # Every dropdown triggers the same handler, which reads all four filters.
            for _dd in _filters:
                _dd.change(_refresh_rrf, inputs=_filters, outputs=rrf_outputs)

    return app
|
| 1395 |
+
|
| 1396 |
+
|
| 1397 |
+
# Build the UI at import time so ``app`` exists as a module-level name
# (presumably what the hosting runtime imports — TODO confirm).
app = build_app()

# Start the Gradio server only when this file is run directly as a script.
if __name__ == "__main__":
    app.launch()
|
data/current/CYFRAGOVPL__Llama-PLLuM-70B-chat-2412_2026-06-15_095534_scored_anon.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/current/CYFRAGOVPL__Llama-PLLuM-70B-chat-2512_2026-06-12_082622_scored_anon.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/current/CYFRAGOVPL__Llama-PLLuM-8B-chat-2412_2026-06-02_091112_scored_anon.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/current/CYFRAGOVPL__Llama-PLLuM-8B-chat-2512_2026-06-02_121044_scored_anon.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/current/CYFRAGOVPL__PLLuM-12B-chat-2412_2026-06-02_141510_scored_anon.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/current/CYFRAGOVPL__PLLuM-12B-chat-2512_2026-06-02_195811_scored_anon.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/current/CYFRAGOVPL__PLLuM-12B-instruct-2512_2026-06-10_102424_scored_anon.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/current/CYFRAGOVPL__PLLuM-4B-chat-2512_2026-06-02_223411_scored_anon.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/current/deepseek__deepseek-v4-pro_reasoning-high_2026-05-31_094932_scored_anon.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/current/google__gemini-3.1-pro-preview_2026-06-11_121124_scored_anon.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/current/google__gemma-3-4b-it_reasoning-none_2026-06-08_110604_scored_anon.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/current/google__gemma-4-26b-a4b-it_reasoning-high_2026-05-31_223337_scored_anon.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/current/google__gemma-4-26b-a4b-it_reasoning-none_2026-06-01_020338_scored_anon.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/current/google__gemma-4-31b-it_reasoning-high_2026-05-31_124753_scored_anon.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/current/google__gemma-4-31b-it_reasoning-none_2026-05-31_200347_scored_anon.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/current/meta-llama__llama-3.1-70b-instruct_reasoning-none_2026-06-08_102826_scored_anon.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/current/meta-llama__llama-3.1-8b-instruct_reasoning-none_2026-06-08_100015_scored_anon.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/current/mistralai__ministral-8b-2512_2026-05-31_083128_scored_anon.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/current/mistralai__mistral-nemo_2026-05-31_084528_scored_anon.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/current/openai__gpt-oss-120b_2026-06-12_123249_scored_anon.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/current/openai__gpt-oss-20b_2026-06-12_133408_scored_anon.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/current/qwen__qwen3.5-35b-a3b_reasoning-high_2026-06-01_023022_scored_anon.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/current/speakleash__Bielik-11B-v3.0-Instruct_2026-06-01_112337_scored_anon.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
requirements.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pandas
|
| 2 |
+
plotly
|