Spaces:
Running
Running
Commit ·
d473371
0
Parent(s):
Initial commit
Browse files- .gitattributes +35 -0
- .gitignore +13 -0
- .pre-commit-config.yaml +53 -0
- Makefile +13 -0
- README.md +48 -0
- app.py +180 -0
- d3LLM_Code/aup_utils.py +63 -0
- d3LLM_Code/data_dream.yaml +136 -0
- d3LLM_Code/data_dream_coder.yaml +52 -0
- d3LLM_Code/data_llada.yaml +117 -0
- pyproject.toml +13 -0
- requirements.txt +4 -0
- src/display/css_html_js.py +224 -0
- src/display/formatting.py +2 -0
- src/display/utils.py +2 -0
- src/display/visualization.py +250 -0
- src/envs.py +1 -0
- src/leaderboard/read_evals.py +133 -0
- src/populate.py +1 -0
- src/submission/check_validity.py +1 -0
- src/submission/submit.py +1 -0
.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
auto_evals/
|
| 2 |
+
venv/
|
| 3 |
+
__pycache__/
|
| 4 |
+
.env
|
| 5 |
+
.ipynb_checkpoints
|
| 6 |
+
*ipynb
|
| 7 |
+
.vscode/
|
| 8 |
+
|
| 9 |
+
eval-queue/
|
| 10 |
+
eval-results/
|
| 11 |
+
eval-queue-bk/
|
| 12 |
+
eval-results-bk/
|
| 13 |
+
logs/
|
.pre-commit-config.yaml
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
default_language_version:
|
| 16 |
+
python: python3
|
| 17 |
+
|
| 18 |
+
ci:
|
| 19 |
+
autofix_prs: true
|
| 20 |
+
autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
|
| 21 |
+
autoupdate_schedule: quarterly
|
| 22 |
+
|
| 23 |
+
repos:
|
| 24 |
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
| 25 |
+
rev: v4.3.0
|
| 26 |
+
hooks:
|
| 27 |
+
- id: check-yaml
|
| 28 |
+
- id: check-case-conflict
|
| 29 |
+
- id: detect-private-key
|
| 30 |
+
- id: check-added-large-files
|
| 31 |
+
args: ['--maxkb=1000']
|
| 32 |
+
- id: requirements-txt-fixer
|
| 33 |
+
- id: end-of-file-fixer
|
| 34 |
+
- id: trailing-whitespace
|
| 35 |
+
|
| 36 |
+
- repo: https://github.com/PyCQA/isort
|
| 37 |
+
rev: 5.12.0
|
| 38 |
+
hooks:
|
| 39 |
+
- id: isort
|
| 40 |
+
name: Format imports
|
| 41 |
+
|
| 42 |
+
- repo: https://github.com/psf/black
|
| 43 |
+
rev: 22.12.0
|
| 44 |
+
hooks:
|
| 45 |
+
- id: black
|
| 46 |
+
name: Format code
|
| 47 |
+
additional_dependencies: ['click==8.0.2']
|
| 48 |
+
|
| 49 |
+
- repo: https://github.com/charliermarsh/ruff-pre-commit
|
| 50 |
+
# Ruff version.
|
| 51 |
+
rev: 'v0.0.267'
|
| 52 |
+
hooks:
|
| 53 |
+
- id: ruff
|
Makefile
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.PHONY: style format
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
style:
|
| 5 |
+
python -m black --line-length 119 .
|
| 6 |
+
python -m isort .
|
| 7 |
+
ruff check --fix .
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
quality:
|
| 11 |
+
python -m black --check --line-length 119 .
|
| 12 |
+
python -m isort --check-only .
|
| 13 |
+
ruff check .
|
README.md
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: dLLM Leaderboard
|
| 3 |
+
emoji: 🏆
|
| 4 |
+
colorFrom: purple
|
| 5 |
+
colorTo: indigo
|
| 6 |
+
sdk: gradio
|
| 7 |
+
app_file: app.py
|
| 8 |
+
pinned: true
|
| 9 |
+
license: apache-2.0
|
| 10 |
+
short_description: A leaderboard of Diffusion Large Language Models (dLLMs)
|
| 11 |
+
sdk_version: 5.43.1
|
| 12 |
+
tags:
|
| 13 |
+
- leaderboard
|
| 14 |
+
---
|
| 15 |
+
|
| 16 |
+
# Start the configuration
|
| 17 |
+
|
| 18 |
+
Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
|
| 19 |
+
|
| 20 |
+
Results files should have the following format and be stored as json files:
|
| 21 |
+
```json
|
| 22 |
+
{
|
| 23 |
+
"config": {
|
| 24 |
+
"model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
|
| 25 |
+
"model_name": "path of the model on the hub: org/model",
|
| 26 |
+
"model_sha": "revision on the hub",
|
| 27 |
+
},
|
| 28 |
+
"results": {
|
| 29 |
+
"task_name": {
|
| 30 |
+
"metric_name": score,
|
| 31 |
+
},
|
| 32 |
+
"task_name2": {
|
| 33 |
+
"metric_name": score,
|
| 34 |
+
}
|
| 35 |
+
}
|
| 36 |
+
}
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
Request files are created automatically by this tool.
|
| 40 |
+
|
| 41 |
+
If you encounter problem on the space, don't hesitate to restart it to remove the create eval-queue, eval-queue-bk, eval-results and eval-results-bk created folder.
|
| 42 |
+
|
| 43 |
+
# Code logic for more complex edits
|
| 44 |
+
|
| 45 |
+
You'll find
|
| 46 |
+
- the main table' columns names and properties in `src/display/utils.py`
|
| 47 |
+
- the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
|
| 48 |
+
- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
|
app.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from src.leaderboard.read_evals import get_leaderboard_df, get_tasks, get_raw_data
|
| 4 |
+
from src.display.visualization import create_radar_chart, create_group_bar_chart, create_aup_curve_chart
|
| 5 |
+
from src.display.css_html_js import custom_css, sort_table_js, get_foundation_class
|
| 6 |
+
|
| 7 |
+
CITATION_HTML = """
|
| 8 |
+
<div style="max-width: 800px; margin: 30px auto 0 auto; padding: 20px; background: #f8f7ff; border-radius: 12px; border-left: 4px solid #5a3d8a;">
|
| 9 |
+
<p style="margin: 0 0 12px 0; color: #5a3d8a; font-weight: 600;">📝 If you find this Leaderboard useful for your research, please star <a href="https://github.com/hao-ai-lab/d3llm" target="_blank" style="color: #5a3d8a;">our GitHub repo</a> and cite our work:</p>
|
| 10 |
+
<pre style="background: #fff; padding: 15px; border-radius: 8px; overflow-x: auto; font-size: 12px; margin: 0; color: #333; white-space: pre-wrap; word-wrap: break-word;">@article{preprint'25:d3llm,
|
| 11 |
+
author = {Yu-Yang Qian and Junda Su and Lanxiang Hu and Peiyuan Zhang and Zhijie Deng and Peng Zhao and Hao Zhang},
|
| 12 |
+
title = {d3LLM: Ultra-Fast Diffusion LLM using Pseudo-Trajectory Distillation},
|
| 13 |
+
journal = {ArXiv preprint},
|
| 14 |
+
volume = {to appear},
|
| 15 |
+
note = {\\url{https://github.com/hao-ai-lab/d3LLM} [Accessed: 2025-12-11]},
|
| 16 |
+
year = {2025}
|
| 17 |
+
}</pre>
|
| 18 |
+
</div>
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
def create_leaderboard_html(df, tasks):
|
| 22 |
+
"""Generate HTML table for detailed results."""
|
| 23 |
+
rows_html = ""
|
| 24 |
+
for rank, (_, row) in enumerate(df.iterrows(), 1):
|
| 25 |
+
medal = f'<span class="top-medal">{["🥇", "🥈", "🥉"][rank-1]}</span>' if rank <= 3 else str(rank)
|
| 26 |
+
|
| 27 |
+
# Method with link
|
| 28 |
+
method = row['Method']
|
| 29 |
+
link = row.get('Link', '')
|
| 30 |
+
method_html = f'<a href="{link}" target="_blank">{method}</a>' if link else method
|
| 31 |
+
|
| 32 |
+
# Type badge
|
| 33 |
+
type_val = row.get('Type', '?')
|
| 34 |
+
type_display = 'dLLM' if type_val == 'dLLM' else type_val
|
| 35 |
+
type_class = 'ar' if type_val == 'AR' else 'dllm'
|
| 36 |
+
|
| 37 |
+
# Foundation badge
|
| 38 |
+
foundation = row.get('Foundation', '?')
|
| 39 |
+
foundation_class = get_foundation_class(foundation)
|
| 40 |
+
|
| 41 |
+
# Build cells for each task
|
| 42 |
+
task_cells = ""
|
| 43 |
+
for task in tasks:
|
| 44 |
+
aup = row.get(f'{task}_AUP')
|
| 45 |
+
tpf = row.get(f'{task}_TPF')
|
| 46 |
+
acc = row.get(f'{task}_Acc')
|
| 47 |
+
if pd.notna(aup):
|
| 48 |
+
task_cells += f'''<td>
|
| 49 |
+
<span class="aup-score">{aup:.1f}</span>
|
| 50 |
+
<span class="sub-metrics">TPF:{tpf:.2f} Acc:{acc:.1f}</span>
|
| 51 |
+
</td>'''
|
| 52 |
+
else:
|
| 53 |
+
task_cells += '<td><span class="aup-score">-</span></td>'
|
| 54 |
+
|
| 55 |
+
# Avg AUP
|
| 56 |
+
avg_aup = row.get('Avg_AUP', 0)
|
| 57 |
+
|
| 58 |
+
rows_html += f'''<tr>
|
| 59 |
+
<td class="rank-cell"><span class="rank-medal">{medal}</span></td>
|
| 60 |
+
<td class="method-cell">{method_html}</td>
|
| 61 |
+
<td class="type-cell"><span class="type-badge {type_class}">{type_display}</span></td>
|
| 62 |
+
<td class="foundation-cell"><span class="foundation-badge {foundation_class}">{foundation}</span></td>
|
| 63 |
+
{task_cells}
|
| 64 |
+
<td class="avg-cell"><span class="aup-score">{avg_aup:.1f}</span></td>
|
| 65 |
+
</tr>'''
|
| 66 |
+
|
| 67 |
+
task_headers = ''.join(f'<th>{t}</th>' for t in tasks)
|
| 68 |
+
|
| 69 |
+
return f'''
|
| 70 |
+
{sort_table_js}
|
| 71 |
+
<div class="table-wrapper">
|
| 72 |
+
<table class="leaderboard-table">
|
| 73 |
+
<thead><tr>
|
| 74 |
+
<th>Rank</th><th>Method</th><th>Type</th><th>Foundation Model</th>
|
| 75 |
+
{task_headers}
|
| 76 |
+
<th>Avg AUP</th>
|
| 77 |
+
</tr></thead>
|
| 78 |
+
<tbody>{rows_html}</tbody>
|
| 79 |
+
</table>
|
| 80 |
+
</div>
|
| 81 |
+
'''
|
| 82 |
+
|
| 83 |
+
def update_charts(top_n):
|
| 84 |
+
df, tasks, raw_data = get_leaderboard_df(), get_tasks(), get_raw_data()
|
| 85 |
+
return create_radar_chart(df, tasks, top_n), create_group_bar_chart(df, tasks, top_n), create_aup_curve_chart(raw_data, tasks, df, top_n)
|
| 86 |
+
|
| 87 |
+
# Load data
|
| 88 |
+
df, tasks, raw_data = get_leaderboard_df(), get_tasks(), get_raw_data()
|
| 89 |
+
default_top_n = min(15, len(df))
|
| 90 |
+
|
| 91 |
+
with gr.Blocks(css=custom_css, title="dLLM Leaderboard", fill_height=False) as demo:
|
| 92 |
+
gr.HTML('''
|
| 93 |
+
<div class="welcome-banner">
|
| 94 |
+
<h2>🫧 Welcome to dLLM Leaderboard! 🏆</h2>
|
| 95 |
+
<p>Benchmarking various Diffusion Large Language Models (dLLMs) with <i><a href="https://hao-ai-lab.github.io/blogs/text-diffusion/" target="_blank" style="color: inherit; text-decoration: underline;">AUP (Accuracy Under Parallelism)</a></i>, considering both accuracy and parallelism.</p>
|
| 96 |
+
</div>
|
| 97 |
+
''')
|
| 98 |
+
|
| 99 |
+
with gr.Tabs():
|
| 100 |
+
with gr.TabItem("📊 Leaderboard"):
|
| 101 |
+
with gr.Row():
|
| 102 |
+
top_n_slider = gr.Slider(minimum=3, maximum=len(df), value=default_top_n, step=1,
|
| 103 |
+
label="Number of Top Methods to Display")
|
| 104 |
+
|
| 105 |
+
with gr.Row():
|
| 106 |
+
radar_plot = gr.Plot(value=create_radar_chart(df, tasks, default_top_n))
|
| 107 |
+
with gr.Row():
|
| 108 |
+
bar_plot = gr.Plot(value=create_group_bar_chart(df, tasks, default_top_n))
|
| 109 |
+
with gr.Row():
|
| 110 |
+
curve_plot = gr.Plot(value=create_aup_curve_chart(raw_data, tasks, df, default_top_n))
|
| 111 |
+
|
| 112 |
+
top_n_slider.change(fn=update_charts, inputs=[top_n_slider], outputs=[radar_plot, bar_plot, curve_plot])
|
| 113 |
+
|
| 114 |
+
gr.Markdown("### 🏆 Detailed Leaderboard")
|
| 115 |
+
gr.HTML(create_leaderboard_html(df, tasks))
|
| 116 |
+
gr.HTML(CITATION_HTML)
|
| 117 |
+
|
| 118 |
+
with gr.TabItem("📤 Submit Result"):
|
| 119 |
+
gr.HTML("""
|
| 120 |
+
<div class="content-wrapper">
|
| 121 |
+
<div style="max-width: 800px; margin: 0 auto; padding: 20px; box-sizing: border-box;">
|
| 122 |
+
<h2>Submit Your Results</h2>
|
| 123 |
+
<p>We welcome contributions to the dLLM Leaderboard! To submit your method's results:</p>
|
| 124 |
+
|
| 125 |
+
<h3>Step 1: Evaluate Your Method</h3>
|
| 126 |
+
<p>Follow the evaluation protocol in the <a href="https://github.com/hao-ai-lab/d3LLM" target="_blank">d3LLM repository</a>.
|
| 127 |
+
Refer to the <code>eval_scripts</code> folder for benchmark evaluation scripts, and <code>AUP_leaderboard</code> folder for AUP calculation utilities.</p>
|
| 128 |
+
|
| 129 |
+
<h3>Step 2: Prepare Your Evaluation Results</h3>
|
| 130 |
+
<p>Add your results to the appropriate YAML file following this format:</p>
|
| 131 |
+
<pre style="background: #f5f5f5; padding: 15px; border-radius: 8px; overflow-x: auto; white-space: pre-wrap; word-wrap: break-word;">_meta:
|
| 132 |
+
YourMethod:
|
| 133 |
+
type: dLLM # or AR
|
| 134 |
+
foundation: YourFoundation
|
| 135 |
+
link: https://link/to/your/method
|
| 136 |
+
|
| 137 |
+
TaskName:
|
| 138 |
+
YourMethod:
|
| 139 |
+
- [rho_1, accuracy_1] # (parallelism, accuracy) pairs
|
| 140 |
+
- [rho_2, accuracy_2]</pre>
|
| 141 |
+
|
| 142 |
+
<h3>Step 3: Submit a Pull Request</h3>
|
| 143 |
+
<ol>
|
| 144 |
+
<li>Fork the repository</li>
|
| 145 |
+
<li>Add your results to the YAML files</li>
|
| 146 |
+
<li>Submit a PR with your method name, description, and evaluation details</li>
|
| 147 |
+
</ol>
|
| 148 |
+
|
| 149 |
+
<p><strong>Questions?</strong> Open an issue on <a href="https://github.com/hao-ai-lab/d3LLM/issues" target="_blank">GitHub</a>.</p>
|
| 150 |
+
</div>
|
| 151 |
+
</div>
|
| 152 |
+
""" + CITATION_HTML)
|
| 153 |
+
|
| 154 |
+
with gr.TabItem("ℹ️ About"):
|
| 155 |
+
gr.HTML("""
|
| 156 |
+
<div class="content-wrapper">
|
| 157 |
+
<div style="max-width: 800px; margin: 0 auto; padding: 20px; box-sizing: border-box;">
|
| 158 |
+
<h2>About dLLM Leaderboard</h2>
|
| 159 |
+
<p>This leaderboard evaluates <strong>Diffusion Large Language Models (dLLMs)</strong> using the <strong>AUP (Accuracy Under Parallelism)</strong> metric.</p>
|
| 160 |
+
|
| 161 |
+
<h3>Metrics</h3>
|
| 162 |
+
<ul>
|
| 163 |
+
<li><strong>AUP</strong>: Primary metric - measures efficiency-accuracy trade-off (higher is better)</li>
|
| 164 |
+
<li><strong>TPF</strong>: Tokens Per Forward - parallelism level achieved</li>
|
| 165 |
+
<li><strong>Acc</strong>: Accuracy at maximum parallelism</li>
|
| 166 |
+
</ul>
|
| 167 |
+
|
| 168 |
+
<h3>Benchmarks</h3>
|
| 169 |
+
<p>GSM8K-CoT, MATH, HumanEval, MBPP, Long-GSM8K</p>
|
| 170 |
+
|
| 171 |
+
<h3>References</h3>
|
| 172 |
+
<p>
|
| 173 |
+
GitHub Code Repo: <a href="https://github.com/hao-ai-lab/d3LLM" target="_blank">https://github.com/hao-ai-lab/d3LLM</a><br>
|
| 174 |
+
Blog: <a href="https://hao-ai-lab.github.io/blogs/text-diffusion/" target="_blank">https://hao-ai-lab.github.io/blogs/text-diffusion/</a>
|
| 175 |
+
</p>
|
| 176 |
+
</div>
|
| 177 |
+
</div>
|
| 178 |
+
""" + CITATION_HTML)
|
| 179 |
+
|
| 180 |
+
demo.launch()
|
d3LLM_Code/aup_utils.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AUP (Accuracy Under Parallelism) measure for parallel decoders
|
| 2 |
+
# See paper for detailed definition and motivation
|
| 3 |
+
import math
|
| 4 |
+
|
| 5 |
+
def weight_function(y: float, y_max: float, alpha: float = 3.0) -> float:
|
| 6 |
+
"""Quality-weighting function W(y) = min(exp(-alpha * (1 - y/y_max)), 1)"""
|
| 7 |
+
return min(math.exp(-alpha * (1 - y / y_max)), 1.0)
|
| 8 |
+
|
| 9 |
+
def get_aup(rho: list[float], y: list[float], y_max: float, alpha: float = 3.0, y_min_offset: float = 5.0, is_print: bool = False) -> float:
|
| 10 |
+
"""
|
| 11 |
+
Calculate the Accuracy Under Parallelism (AUP) of parallelism-accuracy pairs.
|
| 12 |
+
|
| 13 |
+
Args:
|
| 14 |
+
rho: list of parallelism values (TPF, tokens per forward)
|
| 15 |
+
y: list of accuracy values in [0, 100] (percentage)
|
| 16 |
+
y_max: maximum accuracy across all methods (for normalization)
|
| 17 |
+
alpha: penalty factor for accuracy degradation (default: 3.0)
|
| 18 |
+
y_min_offset: minimum accuracy threshold offset (default: 5.0, i.e., 5%)
|
| 19 |
+
|
| 20 |
+
Returns:
|
| 21 |
+
AUP score (scalar value)
|
| 22 |
+
"""
|
| 23 |
+
assert len(rho) == len(y), "rho and y must have the same length"
|
| 24 |
+
assert len(rho) > 0, "rho and y must not be empty"
|
| 25 |
+
assert all(r > 0 for r in rho), "all rho must be positive"
|
| 26 |
+
|
| 27 |
+
# Check if y values are in [0, 100] range
|
| 28 |
+
if any(acc < 1.0 for acc in y):
|
| 29 |
+
print("\033[91mWarning: Detected accuracy values < 1.0. Please check if accuracy should be in percentage (0-100) instead of (0-1).\033[0m")
|
| 30 |
+
|
| 31 |
+
# Sort by rho
|
| 32 |
+
sorted_pairs = sorted(zip(rho, y), key=lambda x: x[0])
|
| 33 |
+
sorted_rho, sorted_y = zip(*sorted_pairs)
|
| 34 |
+
sorted_rho, sorted_y = list(sorted_rho), list(sorted_y)
|
| 35 |
+
|
| 36 |
+
# Filter by y_min threshold (y_1 - y_min_offset)
|
| 37 |
+
y_1 = sorted_y[0]
|
| 38 |
+
assert y_1 - sorted_y[-1] <= y_min_offset, f"Accuracy degradation is too large: minimum accuracy should be at least {y_min_offset:.2f} lower than the maximum accuracy. Max Acc: {y_1}, min Acc: {sorted_y[-1]}"
|
| 39 |
+
y_min = y_1 - y_min_offset
|
| 40 |
+
filtered_pairs = [(r, acc) for r, acc in zip(sorted_rho, sorted_y) if acc >= y_min]
|
| 41 |
+
assert len(filtered_pairs) > 0, f"No valid pairs after filtering with y_min={y_min}"
|
| 42 |
+
|
| 43 |
+
filtered_rho, filtered_y = zip(*filtered_pairs)
|
| 44 |
+
filtered_rho, filtered_y = list(filtered_rho), list(filtered_y)
|
| 45 |
+
|
| 46 |
+
# Calculate AUP: first term + trapezoidal sum
|
| 47 |
+
aup = filtered_rho[0] * filtered_y[0]
|
| 48 |
+
formula_parts = [f"{filtered_rho[0]:.2f} * {filtered_y[0]:.2f}"]
|
| 49 |
+
|
| 50 |
+
for i in range(1, len(filtered_rho)):
|
| 51 |
+
y_i = filtered_y[i]
|
| 52 |
+
y_prev = filtered_y[i-1]
|
| 53 |
+
w_i = weight_function(y_i, y_max, alpha)
|
| 54 |
+
w_prev = weight_function(y_prev, y_max, alpha)
|
| 55 |
+
term = 0.5 * (filtered_rho[i] - filtered_rho[i-1]) * (y_i * w_i + y_prev * w_prev)
|
| 56 |
+
aup += term
|
| 57 |
+
formula_parts.append(f"({filtered_rho[i]:.2f}-{filtered_rho[i-1]:.2f}) * ({y_i:.2f} * {w_i:.4f} + {y_prev:.2f} * {w_prev:.4f})")
|
| 58 |
+
|
| 59 |
+
if is_print:
|
| 60 |
+
formula = f" AUP = " + " + ".join(formula_parts) + f" = {aup:.2f}"
|
| 61 |
+
print(formula)
|
| 62 |
+
|
| 63 |
+
return aup
|
d3LLM_Code/data_dream.yaml
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AUP Benchmark Data
|
| 2 |
+
# Format: task -> method -> list of (rho, accuracy) pairs
|
| 3 |
+
# rho: parallelism (tokens per forward)
|
| 4 |
+
# accuracy: model accuracy (0-1 scale)
|
| 5 |
+
|
| 6 |
+
# Model metadata: type (AR/dLLM), foundation model, link
|
| 7 |
+
_meta:
|
| 8 |
+
Qwen-2.5-7B-it:
|
| 9 |
+
type: AR
|
| 10 |
+
foundation: Qwen-2.5-7B-it
|
| 11 |
+
link: https://huggingface.co/Qwen/Qwen2.5-7B-Instruct
|
| 12 |
+
EAGLE-3:
|
| 13 |
+
type: AR
|
| 14 |
+
foundation: Llama-3.1-8B-it
|
| 15 |
+
link: https://github.com/SafeAILab/EAGLE
|
| 16 |
+
Dream:
|
| 17 |
+
type: dLLM
|
| 18 |
+
foundation: Dream-v0-it-7B
|
| 19 |
+
link: https://github.com/DreamLM/Dream
|
| 20 |
+
Fast-dLLM-Dream:
|
| 21 |
+
type: dLLM
|
| 22 |
+
foundation: Dream-v0-it-7B
|
| 23 |
+
link: https://github.com/NVlabs/Fast-dLLM
|
| 24 |
+
Fast-dLLM-v2:
|
| 25 |
+
type: dLLM
|
| 26 |
+
foundation: Qwen-2.5-7B-it
|
| 27 |
+
link: https://github.com/NVlabs/Fast-dLLM/tree/main/v2
|
| 28 |
+
dParallel-Dream:
|
| 29 |
+
type: dLLM
|
| 30 |
+
foundation: Dream-v0-it-7B
|
| 31 |
+
link: https://github.com/czg1225/dParallel
|
| 32 |
+
d3LLM-Dream:
|
| 33 |
+
type: dLLM
|
| 34 |
+
foundation: Dream-v0-it-7B
|
| 35 |
+
link: https://github.com/hao-ai-lab/d3llm
|
| 36 |
+
|
| 37 |
+
GSM8K-CoT:
|
| 38 |
+
Qwen-2.5-7B-it:
|
| 39 |
+
- [1.0, 74.1]
|
| 40 |
+
EAGLE-3:
|
| 41 |
+
- [1.0, 76.57]
|
| 42 |
+
- [5.12, 76.57]
|
| 43 |
+
Dream:
|
| 44 |
+
- [1.0, 83.94]
|
| 45 |
+
Fast-dLLM-Dream:
|
| 46 |
+
- [1.0, 83.68]
|
| 47 |
+
- [1.44, 79.0]
|
| 48 |
+
Fast-dLLM-v2:
|
| 49 |
+
- [1.0, 82.82]
|
| 50 |
+
- [2.21, 81.48]
|
| 51 |
+
dParallel-Dream:
|
| 52 |
+
- [1.0, 83.8]
|
| 53 |
+
- [3.02, 82.12]
|
| 54 |
+
d3LLM-Dream:
|
| 55 |
+
- [1.0, 83.47]
|
| 56 |
+
- [4.94, 81.36]
|
| 57 |
+
MATH:
|
| 58 |
+
Qwen-2.5-7B-it:
|
| 59 |
+
- [1.0, 41.15]
|
| 60 |
+
EAGLE-3:
|
| 61 |
+
- [1.0, 39.80]
|
| 62 |
+
- [5.72, 39.80]
|
| 63 |
+
Dream:
|
| 64 |
+
- [1.0, 39.63]
|
| 65 |
+
Fast-dLLM-Dream:
|
| 66 |
+
- [1.0, 39.53]
|
| 67 |
+
- [1.78, 38.3]
|
| 68 |
+
Fast-dLLM-v2:
|
| 69 |
+
- [1.0, 49.92]
|
| 70 |
+
- [2.61, 48.74]
|
| 71 |
+
dParallel-Dream:
|
| 72 |
+
- [1.0, 39.06]
|
| 73 |
+
- [2.94, 38.72]
|
| 74 |
+
d3LLM-Dream:
|
| 75 |
+
- [1.0, 39.38]
|
| 76 |
+
- [3.92, 38.21]
|
| 77 |
+
MBPP-Instruct:
|
| 78 |
+
Qwen-2.5-7B-it:
|
| 79 |
+
- [1.0, 63.8]
|
| 80 |
+
EAGLE-3:
|
| 81 |
+
- [1.0, 60.20]
|
| 82 |
+
- [5.69, 60.20]
|
| 83 |
+
Dream:
|
| 84 |
+
- [1.0, 57.2]
|
| 85 |
+
Fast-dLLM-Dream:
|
| 86 |
+
- [1.0, 56.38]
|
| 87 |
+
- [1.2, 53.2]
|
| 88 |
+
Fast-dLLM-v2:
|
| 89 |
+
- [1.0, 61.23]
|
| 90 |
+
- [2.04, 59.12]
|
| 91 |
+
dParallel-Dream:
|
| 92 |
+
- [1.0, 57.8]
|
| 93 |
+
- [2.24, 55.4]
|
| 94 |
+
d3LLM-Dream:
|
| 95 |
+
- [1.0, 58.8]
|
| 96 |
+
- [2.96, 55.60]
|
| 97 |
+
HumanEval-Instruct:
|
| 98 |
+
Qwen-2.5-7B-it:
|
| 99 |
+
- [1.0, 72.25]
|
| 100 |
+
EAGLE-3:
|
| 101 |
+
- [1.0, 67.58]
|
| 102 |
+
- [5.98, 67.58]
|
| 103 |
+
Dream:
|
| 104 |
+
- [1.0, 55.2]
|
| 105 |
+
Fast-dLLM-Dream:
|
| 106 |
+
- [1.0, 54.86]
|
| 107 |
+
- [1.33, 54.27]
|
| 108 |
+
Fast-dLLM-v2:
|
| 109 |
+
- [1.0, 63.2]
|
| 110 |
+
- [2.58, 61.7]
|
| 111 |
+
dParallel-Dream:
|
| 112 |
+
- [1.0, 56.08]
|
| 113 |
+
- [2.57, 54.27]
|
| 114 |
+
d3LLM-Dream:
|
| 115 |
+
- [1.0, 58.86]
|
| 116 |
+
- [3.20, 57.10]
|
| 117 |
+
Long-GSM8K:
|
| 118 |
+
Qwen-2.5-7B-it:
|
| 119 |
+
- [1.0, 82.56]
|
| 120 |
+
EAGLE-3:
|
| 121 |
+
- [1.0, 80.52]
|
| 122 |
+
- [5.57, 80.52]
|
| 123 |
+
Dream:
|
| 124 |
+
- [1.0, 78.95]
|
| 125 |
+
Fast-dLLM-Dream:
|
| 126 |
+
- [1.0, 78.83]
|
| 127 |
+
- [1.79, 76.57]
|
| 128 |
+
Fast-dLLM-v2:
|
| 129 |
+
- [1.0, 82.34]
|
| 130 |
+
- [2.58, 80.97]
|
| 131 |
+
dParallel-Dream:
|
| 132 |
+
- [1.0, 81.27]
|
| 133 |
+
- [3.49, 78.56]
|
| 134 |
+
d3LLM-Dream:
|
| 135 |
+
- [1.0, 81.2]
|
| 136 |
+
- [4.80, 77.18]
|
d3LLM_Code/data_dream_coder.yaml
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AUP Benchmark Data
|
| 2 |
+
# Format: task -> method -> list of (rho, accuracy) pairs
|
| 3 |
+
# rho: parallelism (tokens per forward)
|
| 4 |
+
# accuracy: model accuracy (0-1 scale)
|
| 5 |
+
|
| 6 |
+
# Model metadata: type (AR/dLLM), foundation model, link
|
| 7 |
+
_meta:
|
| 8 |
+
Qwen2.5-Coder-7B-it:
|
| 9 |
+
type: AR
|
| 10 |
+
foundation: Qwen2.5-Coder-7B-it
|
| 11 |
+
link: https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct
|
| 12 |
+
Dream-Coder-7B:
|
| 13 |
+
type: dLLM
|
| 14 |
+
foundation: Dream-Coder-v0-it-7B
|
| 15 |
+
link: https://github.com/DreamLM/Dream-Coder
|
| 16 |
+
d3LLM-Coder-7B:
|
| 17 |
+
type: dLLM
|
| 18 |
+
foundation: Dream-Coder-v0-it-7B
|
| 19 |
+
link: https://github.com/hao-ai-lab/d3llm
|
| 20 |
+
|
| 21 |
+
HumanEval:
|
| 22 |
+
Qwen2.5-Coder-7B-it:
|
| 23 |
+
- [1.0, 86.6]
|
| 24 |
+
Dream-Coder-7B:
|
| 25 |
+
- [1.0, 82.9]
|
| 26 |
+
d3LLM-Coder-7B:
|
| 27 |
+
- [1.0, 82.4]
|
| 28 |
+
- [2.88, 79.7]
|
| 29 |
+
HumanEval+:
|
| 30 |
+
Qwen2.5-Coder-7B-it:
|
| 31 |
+
- [1.0, 82.3]
|
| 32 |
+
Dream-Coder-7B:
|
| 33 |
+
- [1.0, 76.8]
|
| 34 |
+
d3LLM-Coder-7B:
|
| 35 |
+
- [1.0, 74.4]
|
| 36 |
+
- [2.88, 71.3]
|
| 37 |
+
MBPP:
|
| 38 |
+
Qwen2.5-Coder-7B-it:
|
| 39 |
+
- [1.0, 83.5]
|
| 40 |
+
Dream-Coder-7B:
|
| 41 |
+
- [1.0, 79.9]
|
| 42 |
+
d3LLM-Coder-7B:
|
| 43 |
+
- [1.0, 80.10]
|
| 44 |
+
- [2.5, 80.00]
|
| 45 |
+
MBPP+:
|
| 46 |
+
Qwen2.5-Coder-7B-it:
|
| 47 |
+
- [1.0, 70.1]
|
| 48 |
+
Dream-Coder-7B:
|
| 49 |
+
- [1.0, 68.8]
|
| 50 |
+
d3LLM-Coder-7B:
|
| 51 |
+
- [1.0, 69.6]
|
| 52 |
+
- [2.5, 69.3]
|
d3LLM_Code/data_llada.yaml
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AUP Benchmark Data
|
| 2 |
+
# Format: task -> method -> list of (rho, accuracy) pairs
|
| 3 |
+
# rho: parallelism (tokens per forward)
|
| 4 |
+
# accuracy: model accuracy (0-1 scale)
|
| 5 |
+
|
| 6 |
+
# Model metadata: type (AR/dLLM), foundation model, link
|
| 7 |
+
_meta:
|
| 8 |
+
Qwen-2.5-7B-it:
|
| 9 |
+
type: AR
|
| 10 |
+
foundation: Qwen-2.5-7B-it
|
| 11 |
+
link: https://huggingface.co/Qwen/Qwen2.5-7B-Instruct
|
| 12 |
+
LLaDA:
|
| 13 |
+
type: dLLM
|
| 14 |
+
foundation: LLaDA-8B-it
|
| 15 |
+
link: https://github.com/ML-GSAI/LLaDA
|
| 16 |
+
Fast-dLLM-LLaDA:
|
| 17 |
+
type: dLLM
|
| 18 |
+
foundation: LLaDA-8B-it
|
| 19 |
+
link: https://github.com/NVlabs/Fast-dLLM
|
| 20 |
+
D2F-LLaDA:
|
| 21 |
+
type: dLLM
|
| 22 |
+
foundation: LLaDA-8B-it
|
| 23 |
+
link: https://github.com/zhijie-group/Discrete-Diffusion-Forcing
|
| 24 |
+
dParallel-LLaDA:
|
| 25 |
+
type: dLLM
|
| 26 |
+
foundation: LLaDA-8B-it
|
| 27 |
+
link: https://github.com/czg1225/dParallel
|
| 28 |
+
d3LLM-LLaDA:
|
| 29 |
+
type: dLLM
|
| 30 |
+
foundation: LLaDA-8B-it
|
| 31 |
+
link: https://github.com/hao-ai-lab/d3llm
|
| 32 |
+
|
| 33 |
+
GSM8K-CoT:
|
| 34 |
+
Qwen-2.5-7B-it:
|
| 35 |
+
- [1.0, 74.1]
|
| 36 |
+
LLaDA:
|
| 37 |
+
- [1.0, 72.55]
|
| 38 |
+
Fast-dLLM-LLaDA:
|
| 39 |
+
- [1.0, 74.79]
|
| 40 |
+
- [2.77, 74.68]
|
| 41 |
+
D2F-LLaDA:
|
| 42 |
+
- [1.0, 74.98]
|
| 43 |
+
- [2.88, 74.39]
|
| 44 |
+
dParallel-LLaDA:
|
| 45 |
+
- [1.0, 74.0]
|
| 46 |
+
- [5.14, 72.63]
|
| 47 |
+
d3LLM-LLaDA:
|
| 48 |
+
- [1.0, 74.02]
|
| 49 |
+
- [9.11, 73.09]
|
| 50 |
+
MATH:
|
| 51 |
+
Qwen-2.5-7B-it:
|
| 52 |
+
- [1.0, 41.15]
|
| 53 |
+
LLaDA:
|
| 54 |
+
- [1.0, 32.2]
|
| 55 |
+
Fast-dLLM-LLaDA:
|
| 56 |
+
- [1.0, 32.1]
|
| 57 |
+
- [1.97, 30.82]
|
| 58 |
+
D2F-LLaDA:
|
| 59 |
+
- [1.0, 29.1]
|
| 60 |
+
- [2.66, 28.94]
|
| 61 |
+
dParallel-LLaDA:
|
| 62 |
+
- [1.0, 32.0]
|
| 63 |
+
- [3.17, 30.18]
|
| 64 |
+
d3LLM-LLaDA:
|
| 65 |
+
- [1.0, 32.76]
|
| 66 |
+
- [5.74, 30.36]
|
| 67 |
+
MBPP:
|
| 68 |
+
Qwen-2.5-7B-it:
|
| 69 |
+
- [1.0, 63.6]
|
| 70 |
+
LLaDA:
|
| 71 |
+
- [1.0, 41.72]
|
| 72 |
+
Fast-dLLM-LLaDA:
|
| 73 |
+
- [1.0, 41.58]
|
| 74 |
+
- [2.13, 38.6]
|
| 75 |
+
D2F-LLaDA:
|
| 76 |
+
- [1.0, 39.10]
|
| 77 |
+
- [2.13, 39.00]
|
| 78 |
+
dParallel-LLaDA:
|
| 79 |
+
- [1.0, 41.62]
|
| 80 |
+
- [2.35, 40.0]
|
| 81 |
+
d3LLM-LLaDA:
|
| 82 |
+
- [1.0, 42.0]
|
| 83 |
+
- [4.21, 40.60]
|
| 84 |
+
HumanEval:
|
| 85 |
+
Qwen-2.5-7B-it:
|
| 86 |
+
- [1.0, 67.73]
|
| 87 |
+
LLaDA:
|
| 88 |
+
- [1.0, 38.28]
|
| 89 |
+
Fast-dLLM-LLaDA:
|
| 90 |
+
- [1.0, 38.16]
|
| 91 |
+
- [2.56, 37.8]
|
| 92 |
+
D2F-LLaDA:
|
| 93 |
+
- [1.0, 41.02]
|
| 94 |
+
- [2.69, 40.64]
|
| 95 |
+
dParallel-LLaDA:
|
| 96 |
+
- [1.0, 39.68]
|
| 97 |
+
- [4.93, 39.02]
|
| 98 |
+
d3LLM-LLaDA:
|
| 99 |
+
- [1.0, 39.8]
|
| 100 |
+
- [5.95, 39.63]
|
| 101 |
+
Long-GSM8K:
|
| 102 |
+
Qwen-2.5-7B-it:
|
| 103 |
+
- [1.0, 82.56]
|
| 104 |
+
LLaDA:
|
| 105 |
+
- [1.0, 78.58]
|
| 106 |
+
Fast-dLLM-LLaDA:
|
| 107 |
+
- [1.0, 78.45]
|
| 108 |
+
- [2.45, 78.01]
|
| 109 |
+
D2F-LLaDA:
|
| 110 |
+
- [1.0, 76.00]
|
| 111 |
+
- [2.7, 75.66]
|
| 112 |
+
dParallel-LLaDA:
|
| 113 |
+
- [1.0, 79.15]
|
| 114 |
+
- [4.49, 76.65]
|
| 115 |
+
d3LLM-LLaDA:
|
| 116 |
+
- [1.0, 78.32]
|
| 117 |
+
- [6.95, 74.22]
|
pyproject.toml
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[tool.ruff]
|
| 2 |
+
# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
|
| 3 |
+
select = ["E", "F"]
|
| 4 |
+
ignore = ["E501"] # line too long (black is taking care of this)
|
| 5 |
+
line-length = 119
|
| 6 |
+
fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
|
| 7 |
+
|
| 8 |
+
[tool.isort]
|
| 9 |
+
profile = "black"
|
| 10 |
+
line_length = 119
|
| 11 |
+
|
| 12 |
+
[tool.black]
|
| 13 |
+
line-length = 119
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio
|
| 2 |
+
pandas
|
| 3 |
+
plotly
|
| 4 |
+
pyyaml
|
src/display/css_html_js.py
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
custom_css = """
|
| 2 |
+
/* Reset any Gradio overflow restrictions */
|
| 3 |
+
* { box-sizing: border-box; }
|
| 4 |
+
|
| 5 |
+
.markdown-text { font-size: 16px !important; }
|
| 6 |
+
|
| 7 |
+
.welcome-banner {
|
| 8 |
+
background: linear-gradient(135deg, #a8b5f7 0%, #c9a8f7 100%);
|
| 9 |
+
color: #333;
|
| 10 |
+
padding: 25px;
|
| 11 |
+
border-radius: 12px;
|
| 12 |
+
margin: 20px 10px;
|
| 13 |
+
text-align: center;
|
| 14 |
+
box-shadow: 0 4px 15px rgba(168, 181, 247, 0.3);
|
| 15 |
+
}
|
| 16 |
+
.welcome-banner h2 { margin: 0 0 10px 0; font-size: 1.5em; color: #333; }
|
| 17 |
+
.welcome-banner p { margin: 0; font-size: 1.1em; color: #444; }
|
| 18 |
+
|
| 19 |
+
/* Modern, clean leaderboard table - no border */
|
| 20 |
+
.leaderboard-table {
|
| 21 |
+
width: 100%;
|
| 22 |
+
min-width: 30px;
|
| 23 |
+
border-collapse: collapse;
|
| 24 |
+
background: #fff;
|
| 25 |
+
border-radius: 12px;
|
| 26 |
+
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
|
| 27 |
+
}
|
| 28 |
+
.leaderboard-table thead {
|
| 29 |
+
background: linear-gradient(135deg, #a8b5f7 0%, #c9a8f7 100%);
|
| 30 |
+
}
|
| 31 |
+
.leaderboard-table th {
|
| 32 |
+
padding: 16px 10px;
|
| 33 |
+
text-align: center;
|
| 34 |
+
font-weight: 700;
|
| 35 |
+
font-size: 14px;
|
| 36 |
+
cursor: pointer;
|
| 37 |
+
user-select: none;
|
| 38 |
+
border: none;
|
| 39 |
+
color: #4a3a6e;
|
| 40 |
+
}
|
| 41 |
+
.leaderboard-table th:hover { background: rgba(255,255,255,0.2); }
|
| 42 |
+
.leaderboard-table th.sortable::after { content: ' ↕'; opacity: 0.5; font-size: 11px; }
|
| 43 |
+
.leaderboard-table th.sort-asc::after { content: ' ↑'; opacity: 1; }
|
| 44 |
+
.leaderboard-table th.sort-desc::after { content: ' ↓'; opacity: 1; }
|
| 45 |
+
|
| 46 |
+
.leaderboard-table td {
|
| 47 |
+
padding: 12px 8px;
|
| 48 |
+
text-align: center;
|
| 49 |
+
font-size: 12px;
|
| 50 |
+
color: #333;
|
| 51 |
+
border-bottom: 1px solid #f0f0f0;
|
| 52 |
+
border-left: none;
|
| 53 |
+
border-right: none;
|
| 54 |
+
}
|
| 55 |
+
.leaderboard-table tbody tr:hover { background-color: #fafafa; }
|
| 56 |
+
.leaderboard-table tbody tr:last-child td { border-bottom: none; }
|
| 57 |
+
|
| 58 |
+
.aup-score { font-size: 17px; font-weight: 700; color: #333; display: block; }
|
| 59 |
+
.sub-metrics { font-size: 9px; color: #999; display: block; margin-top: 2px; }
|
| 60 |
+
|
| 61 |
+
.method-cell {
|
| 62 |
+
text-align: center !important;
|
| 63 |
+
font-weight: 600;
|
| 64 |
+
min-width: 100px;
|
| 65 |
+
}
|
| 66 |
+
.method-cell a {
|
| 67 |
+
color: #5a3d8a;
|
| 68 |
+
text-decoration: none;
|
| 69 |
+
transition: color 0.2s;
|
| 70 |
+
font-size: 15px;
|
| 71 |
+
}
|
| 72 |
+
.method-cell a:hover { color: #3d2760; text-decoration: underline; }
|
| 73 |
+
|
| 74 |
+
.rank-cell { font-size: 16px; }
|
| 75 |
+
.type-cell { min-width: 45px; font-size: 9px; }
|
| 76 |
+
.foundation-cell { min-width: 60px; font-size: 9px; }
|
| 77 |
+
.avg-cell { background-color: #f8f7ff; }
|
| 78 |
+
.avg-cell .aup-score { font-size: 18px; color: #5a3d8a; font-weight: 700; }
|
| 79 |
+
|
| 80 |
+
/* Type badges - rounded pill style */
|
| 81 |
+
.type-badge {
|
| 82 |
+
display: inline-block;
|
| 83 |
+
padding: 2px 6px;
|
| 84 |
+
border-radius: 10px;
|
| 85 |
+
font-size: 8px;
|
| 86 |
+
font-weight: 600;
|
| 87 |
+
}
|
| 88 |
+
.type-badge.ar { background-color: #B93413; color: #fff; }
|
| 89 |
+
.type-badge.dllm { background-color: #193D3A; color: #fff; }
|
| 90 |
+
|
| 91 |
+
/* Foundation badges - low saturation colors, pill style */
|
| 92 |
+
.foundation-badge {
|
| 93 |
+
display: inline-block;
|
| 94 |
+
padding: 2px 6px;
|
| 95 |
+
border-radius: 10px;
|
| 96 |
+
font-size: 8px;
|
| 97 |
+
font-weight: 500;
|
| 98 |
+
}
|
| 99 |
+
.foundation-badge.f0 { background-color: #e8e4f0; color: #5a4a6e; }
|
| 100 |
+
.foundation-badge.f1 { background-color: #e4ecf0; color: #4a5a6e; }
|
| 101 |
+
.foundation-badge.f2 { background-color: #e4f0e8; color: #4a6e5a; }
|
| 102 |
+
.foundation-badge.f3 { background-color: #f0e8e4; color: #6e5a4a; }
|
| 103 |
+
.foundation-badge.f4 { background-color: #f0e4ec; color: #6e4a5a; }
|
| 104 |
+
.foundation-badge.f5 { background-color: #ecf0e4; color: #5a6e4a; }
|
| 105 |
+
.foundation-badge.f6 { background-color: #e4e8f0; color: #4a5a6e; }
|
| 106 |
+
.foundation-badge.f7 { background-color: #f0ece4; color: #6e5a4a; }
|
| 107 |
+
.foundation-badge.f8 { background-color: #e8f0ec; color: #4a6e5e; }
|
| 108 |
+
.foundation-badge.f9 { background-color: #f0e4e4; color: #6e4a4a; }
|
| 109 |
+
|
| 110 |
+
.rank-medal { font-size: 16px; }
|
| 111 |
+
.rank-medal .top-medal { font-size: 24px; }
|
| 112 |
+
.tab-buttons button { font-size: 16px; }
|
| 113 |
+
.tip-text { font-size: 12px; color: #888; font-style: italic; margin: 8px 0 15px 0; }
|
| 114 |
+
|
| 115 |
+
/* Container and layout - force overflow to work */
|
| 116 |
+
html, body { overflow-x: auto !important; }
|
| 117 |
+
.gradio-container {
|
| 118 |
+
max-width: 1400px !important;
|
| 119 |
+
margin: auto !important;
|
| 120 |
+
padding: 0 5px !important;
|
| 121 |
+
}
|
| 122 |
+
.tabs, .tabitem, .tabitem > div, #component-0, .contain, .block, .wrap, .prose {
|
| 123 |
+
width: 100% !important;
|
| 124 |
+
height: auto !important;
|
| 125 |
+
min-height: auto !important;
|
| 126 |
+
max-height: none !important;
|
| 127 |
+
}
|
| 128 |
+
.tabitem[style*="display: none"] { display: none !important; }
|
| 129 |
+
|
| 130 |
+
/* Table wrapper - critical for horizontal scroll */
|
| 131 |
+
.table-wrapper {
|
| 132 |
+
width: 100%;
|
| 133 |
+
overflow-x: scroll !important;
|
| 134 |
+
overflow-y: visible;
|
| 135 |
+
-webkit-overflow-scrolling: touch;
|
| 136 |
+
display: block;
|
| 137 |
+
padding-bottom: 15px;
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
+
/* Content wrapper */
|
| 141 |
+
.content-wrapper {
|
| 142 |
+
width: 100%;
|
| 143 |
+
max-width: 100%;
|
| 144 |
+
overflow-x: auto;
|
| 145 |
+
-webkit-overflow-scrolling: touch;
|
| 146 |
+
box-sizing: border-box;
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
/* Responsive font sizes for smaller screens */
|
| 150 |
+
@media (max-width: 1000px) {
|
| 151 |
+
.welcome-banner { padding: 20px 15px; margin: 15px 5px; }
|
| 152 |
+
.welcome-banner h2 { font-size: 1.3em; }
|
| 153 |
+
.welcome-banner p { font-size: 1em; }
|
| 154 |
+
}
|
| 155 |
+
|
| 156 |
+
@media (max-width: 600px) {
|
| 157 |
+
.welcome-banner { padding: 15px 10px; margin: 10px 5px; }
|
| 158 |
+
.welcome-banner h2 { font-size: 1.1em; }
|
| 159 |
+
.welcome-banner p { font-size: 0.9em; }
|
| 160 |
+
}
|
| 161 |
+
"""
|
| 162 |
+
|
| 163 |
+
sort_table_js = """
|
| 164 |
+
<script>
|
| 165 |
+
(function() {
|
| 166 |
+
function initSort() {
|
| 167 |
+
const table = document.querySelector('.leaderboard-table');
|
| 168 |
+
if (!table) { setTimeout(initSort, 100); return; }
|
| 169 |
+
|
| 170 |
+
const headers = table.querySelectorAll('th');
|
| 171 |
+
let currentSort = { col: -1, dir: 'desc' };
|
| 172 |
+
|
| 173 |
+
headers.forEach((th, idx) => {
|
| 174 |
+
if (idx < 4) return;
|
| 175 |
+
th.classList.add('sortable');
|
| 176 |
+
th.onclick = function() { sortTable(idx); };
|
| 177 |
+
});
|
| 178 |
+
|
| 179 |
+
function sortTable(colIdx) {
|
| 180 |
+
const tbody = table.querySelector('tbody');
|
| 181 |
+
const rows = Array.from(tbody.querySelectorAll('tr'));
|
| 182 |
+
const dir = (currentSort.col === colIdx && currentSort.dir === 'desc') ? 'asc' : 'desc';
|
| 183 |
+
currentSort = { col: colIdx, dir };
|
| 184 |
+
|
| 185 |
+
headers.forEach((h, i) => {
|
| 186 |
+
h.classList.remove('sort-asc', 'sort-desc');
|
| 187 |
+
if (i === colIdx) h.classList.add('sort-' + dir);
|
| 188 |
+
});
|
| 189 |
+
|
| 190 |
+
rows.sort((a, b) => {
|
| 191 |
+
const aEl = a.cells[colIdx].querySelector('.aup-score');
|
| 192 |
+
const bEl = b.cells[colIdx].querySelector('.aup-score');
|
| 193 |
+
const aVal = parseFloat(aEl ? aEl.textContent : '0') || 0;
|
| 194 |
+
const bVal = parseFloat(bEl ? bEl.textContent : '0') || 0;
|
| 195 |
+
return dir === 'desc' ? bVal - aVal : aVal - bVal;
|
| 196 |
+
});
|
| 197 |
+
|
| 198 |
+
rows.forEach((row, i) => {
|
| 199 |
+
const rankCell = row.cells[0];
|
| 200 |
+
const medal = i < 3 ? ['🥇', '🥈', '🥉'][i] : (i + 1);
|
| 201 |
+
rankCell.innerHTML = '<span class="rank-medal">' + medal + '</span>';
|
| 202 |
+
tbody.appendChild(row);
|
| 203 |
+
});
|
| 204 |
+
}
|
| 205 |
+
}
|
| 206 |
+
if (document.readyState === 'loading') {
|
| 207 |
+
document.addEventListener('DOMContentLoaded', initSort);
|
| 208 |
+
} else {
|
| 209 |
+
initSort();
|
| 210 |
+
}
|
| 211 |
+
})();
|
| 212 |
+
</script>
|
| 213 |
+
"""
|
| 214 |
+
|
| 215 |
+
# Foundation model to badge class mapping
FOUNDATION_COLORS = {}
_foundation_idx = 0


def get_foundation_class(foundation):
    """Return a stable CSS badge class ("f0".."f9") for a foundation model name.

    Classes are handed out in first-seen order and cycle through 10 color
    slots, so repeated calls with the same name always return the same class.
    """
    global _foundation_idx, FOUNDATION_COLORS
    if foundation in FOUNDATION_COLORS:
        return FOUNDATION_COLORS[foundation]
    badge = f"f{_foundation_idx % 10}"
    FOUNDATION_COLORS[foundation] = badge
    _foundation_idx += 1
    return badge
|
src/display/formatting.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Formatting utilities for display
|
| 2 |
+
# Currently not used - keeping for potential future extensions
|
src/display/utils.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Utility functions for display formatting
|
| 2 |
+
# Currently not used - keeping for potential future extensions
|
src/display/visualization.py
ADDED
|
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import plotly.graph_objects as go
|
| 2 |
+
from plotly.subplots import make_subplots
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
# 30 distinct colors - assigned by Avg AUP rank
COLOR_PALETTE = [
    "#E91E63", "#4A90E2", "#00BFA5", "#FF6B35", "#8E24AA",
    "#4CAF50", "#FF4081", "#303F9F", "#FFD166", "#00796B",
    "#C2185B", "#7B1FA2", "#26A69A", "#1A4C7C", "#FF8C42",
    "#009688", "#673AB7", "#F44336", "#3F51B5", "#795548",
    "#607D8B", "#9C27B0", "#2196F3", "#CDDC39", "#FF9800",
    "#00BCD4", "#E64A19", "#5D4037", "#455A64", "#AD1457",
]


def get_model_colors(df):
    """Map each method to a fixed palette color, ordered by descending Avg AUP.

    The palette wraps around when there are more than 30 methods.
    """
    ranking = df.sort_values("Avg_AUP", ascending=False)["Method"]
    palette_size = len(COLOR_PALETTE)
    return {method: COLOR_PALETTE[idx % palette_size] for idx, method in enumerate(ranking)}
|
| 19 |
+
|
| 20 |
+
def get_model_ranks(df):
    """Rank methods by Avg AUP: 1 for the best, N for the worst."""
    ordered = df.sort_values("Avg_AUP", ascending=False)["Method"].tolist()
    return dict(zip(ordered, range(1, len(ordered) + 1)))
|
| 24 |
+
|
| 25 |
+
def hex_to_rgba(hex_color, alpha=0.25):
    """Convert a '#RRGGBB' hex string into a plotly 'rgba(r,g,b,a)' string."""
    digits = hex_color.lstrip('#')
    r = int(digits[0:2], 16)
    g = int(digits[2:4], 16)
    b = int(digits[4:6], 16)
    return f'rgba({r},{g},{b},{alpha})'
|
| 29 |
+
|
| 30 |
+
def create_radar_chart(df, tasks, top_n=15):
    """Create radar chart for top N methods showing original AUP scores (independent axes).

    Args:
        df: leaderboard DataFrame sorted by Avg_AUP descending, with
            per-task "<task>_AUP" columns and an "Avg_AUP" column.
        tasks: ordered list of task names.
        top_n: number of top-ranked methods to plot.

    Returns:
        A plotly Figure; each axis is min-max normalized over the top-N
        methods (shape only), while hover text shows the original AUP values.
    """
    df_top = df.head(top_n).copy()
    model_colors = get_model_colors(df)
    model_ranks = get_model_ranks(df)

    all_cols = [f"{t}_AUP" for t in tasks] + ["Avg_AUP"]
    categories = [t.replace("-", "\n") for t in tasks] + ["Avg\nAUP"]

    # Compute min/max per column for normalization (for radar display only)
    col_stats = {}
    for col in all_cols:
        vals = df_top[col].dropna().astype(float)
        col_stats[col] = {'min': vals.min() if len(vals) > 0 else 0,
                          'max': vals.max() if len(vals) > 0 else 100}

    fig = go.Figure()

    for _, row in df_top.iterrows():
        method = row["Method"]
        rank = model_ranks.get(method, 0)
        color = model_colors.get(method, "#808080")
        display_name = f"#{rank} {method}"

        # Original AUP values for hover display.
        # Missing tasks arrive as None or NaN (pandas converts the None
        # placeholders in a numeric column to NaN); coerce both to 0.
        # The previous `row.get(col, 0) or 0` let NaN through because NaN is
        # truthy, which poisoned the normalization below; this also matches
        # the explicit NaN check used in create_group_bar_chart.
        original_vals = []
        for col in all_cols:
            val = row.get(col, 0)
            if val is None or (isinstance(val, float) and val != val):
                val = 0
            original_vals.append(val)

        # Normalized values for radar shape (0-100 scale per axis)
        normalized = []
        for col, val in zip(all_cols, original_vals):
            stats = col_stats[col]
            range_val = stats['max'] - stats['min']
            if range_val > 0:
                norm = ((val - stats['min']) / range_val) * 80 + 10  # Scale to 10-90
            else:
                # Degenerate axis (all methods equal): park everyone mid-axis.
                norm = 50
            normalized.append(norm)

        # Custom hover text showing original AUP scores
        hover_texts = [f"<b>{display_name}</b><br>{cat}: <b>{val:.1f}</b>"
                       for cat, val in zip(categories, original_vals)]

        # Close the polygon by repeating the first point.
        fig.add_trace(go.Scatterpolar(
            r=normalized + [normalized[0]],
            theta=categories + [categories[0]],
            mode='lines+markers', fill='toself', name=display_name,
            line=dict(color=color, width=2), marker=dict(color=color, size=6),
            fillcolor=hex_to_rgba(color, 0.15), opacity=0.9,
            text=hover_texts + [hover_texts[0]],
            hovertemplate='%{text}<extra></extra>'
        ))

    fig.update_layout(
        height=600, margin=dict(l=100, r=250, t=80, b=60),
        title=dict(text=f"🎯 Top {top_n} Methods: AUP Scores in Radar Chart", x=0.5, font=dict(size=18)),
        # Radial tick labels hidden: values are normalized per axis and would mislead.
        polar=dict(radialaxis=dict(visible=True, range=[0, 100], tickfont=dict(size=11),
                                   tickvals=[], showticklabels=False)),
        legend=dict(font=dict(size=12), x=1.05, y=1, bgcolor='rgba(255,255,255,0.95)',
                    bordercolor='#ddd', borderwidth=1, title=dict(text="Methods (sorted by Avg AUP)", font=dict(size=13))),
        hoverlabel=dict(bgcolor="white", font_size=14, font_family="Arial", bordercolor="#333")
    )
    return fig
|
| 93 |
+
|
| 94 |
+
def create_group_bar_chart(df, tasks, top_n=15):
    """Create grouped bar chart with Avg AUP included and rank numbers.

    Args:
        df: leaderboard DataFrame sorted by Avg_AUP descending, with
            per-task "<task>_AUP" columns and an "Avg_AUP" column.
        tasks: ordered list of task names (x-axis groups, plus "Avg AUP").
        top_n: number of top-ranked methods to plot.

    Returns:
        A plotly Figure with one Bar trace per method.
    """
    df_top = df.head(top_n).copy()
    methods = df_top["Method"].tolist()
    model_colors = get_model_colors(df)
    model_ranks = get_model_ranks(df)

    all_benchmarks = tasks + ["Avg_AUP"]
    fig = go.Figure()

    for method in methods:
        row = df_top[df_top["Method"] == method].iloc[0]
        color = model_colors.get(method, "#808080")
        rank = model_ranks.get(method, 0)
        display_name = f"#{rank} {method}"

        # Collect only benchmarks this method has a value for.
        y_vals, x_vals = [], []
        for bench in all_benchmarks:
            aup = row.get("Avg_AUP") if bench == "Avg_AUP" else row.get(f"{bench}_AUP")
            # Skip None and NaN (aup != aup is the NaN self-inequality test).
            if aup is not None and not (isinstance(aup, float) and aup != aup):
                y_vals.append(aup)
                x_vals.append("Avg AUP" if bench == "Avg_AUP" else bench)

        if y_vals:
            fig.add_trace(go.Bar(
                name=display_name, x=x_vals, y=y_vals, marker_color=color,
                # Double braces keep %{x}/%{y} literal inside the f-string.
                hovertemplate=f"<b>{display_name}</b><br>%{{x}}: %{{y:.1f}}<extra></extra>"
            ))

    fig.update_layout(
        height=550, margin=dict(l=60, r=250, t=80, b=100),
        title=dict(text=f"📊 Top {top_n} Methods: AUP Scores in Bar Chart", x=0.5, font=dict(size=18)),
        # title=dict(text=f"📊 Top {top_n} Methods: AUP Scores of Different Benchmarks", x=0.5, font=dict(size=18)),
        xaxis_title="Benchmark", yaxis_title="AUP Score",
        barmode='group', bargap=0.2, bargroupgap=0.05,
        legend=dict(font=dict(size=11), x=1.02, y=1, bgcolor='rgba(255,255,255,0.95)',
                    bordercolor='#ddd', borderwidth=1, title=dict(text="Methods (sorted by Avg AUP)", font=dict(size=12))),
        hoverlabel=dict(bgcolor="white", font_size=14, font_family="Arial")
    )
    return fig
|
| 134 |
+
|
| 135 |
+
def create_aup_curve_chart(raw_data, tasks, df, top_n=15):
    """Create 2x3 subplot grid of AUP curves with quadratic fitting (same as plot_lines.py).

    Args:
        raw_data: {task: {method: [(rho, acc), ...]}} raw measurement points.
        tasks: ordered list of task names (first 5 subplots).
        df: leaderboard DataFrame sorted by Avg_AUP descending.
        top_n: number of top-ranked methods to plot.

    Returns:
        A plotly Figure: one subplot per task plus an "Average" subplot at (2, 3).
    """
    df_top = df.head(top_n).copy()
    model_colors = get_model_colors(df)
    model_ranks = get_model_ranks(df)
    methods_to_show = set(df_top["Method"].tolist())

    # Build per-task data: {task: {method: [(rho, y), ...]}}
    task_data = {t: {} for t in tasks}
    for task in tasks:
        for method, pairs in raw_data.get(task, {}).items():
            if method in methods_to_show:
                task_data[task][method] = [(p[0], p[1]) for p in pairs]

    # Compute average data: average TPF and Acc by index across tasks (all tasks have same length)
    avg_data = {}
    for method in methods_to_show:
        task_points = [task_data.get(t, {}).get(method, []) for t in tasks]
        task_points = [p for p in task_points if p]  # filter empty
        if not task_points:
            continue
        n_points = len(task_points[0])
        # NOTE(review): x uses np.mean over the tasks actually present, but y
        # divides by a hard-coded 5 — these disagree when a method is missing
        # some task. Presumably intentional (penalize missing tasks, matching
        # AVG_AUP_DIVISOR), but confirm.
        avg_data[method] = [
            (np.mean([tp[i][0] for tp in task_points]), sum(tp[i][1] for tp in task_points) / 5)
            for i in range(n_points)
        ]

    # 6 subplots: 5 tasks + 1 Average at (2,3)
    titles = tasks + ["Average"]
    fig = make_subplots(rows=2, cols=3, subplot_titles=titles,
                        horizontal_spacing=0.08, vertical_spacing=0.15)

    # Track which methods have been added to legend
    legend_added = set()

    def get_pos(idx):
        # Map a linear subplot index to (row, col) in the 2x3 grid.
        if idx < 3:
            return (1, idx + 1)
        return (2, idx - 2)  # idx=3->(2,1), idx=4->(2,2), idx=5->(2,3)

    # Helper to draw curve for a given subplot
    def draw_curve(pairs, method, row, col):
        nonlocal legend_added
        if not pairs:
            return
        color = model_colors.get(method, "#808080")
        rank = model_ranks.get(method, 0)
        display_name = f"#{rank} {method}"
        # Each method appears once in the legend even across 6 subplots.
        show_legend = method not in legend_added
        if show_legend:
            legend_added.add(method)

        rho, y = zip(*sorted(pairs, key=lambda x: x[0]))
        rho, y = np.array(rho), np.array(y)

        # Generate smooth curve (quadratic fitting, same as plot_lines.py)
        if len(rho) >= 3:
            z = np.polyfit(rho, y, 2)
            p = np.poly1d(z)
            x_smooth = np.linspace(rho.min(), rho.max(), 300)
            y_smooth = p(x_smooth)
        elif len(rho) == 2:
            # Two points: synthesize a parabola through them with zero slope
            # at the first point (mimics the fitted-curve look).
            x_smooth = np.linspace(rho.min(), rho.max(), 300)
            if rho[1] != rho[0]:
                a = (y[1] - y[0]) / ((rho[1] - rho[0]) ** 2)
                y_smooth = a * (x_smooth - rho[0]) ** 2 + y[0]
            else:
                y_smooth = np.linspace(y[0], y[1], 300)
        else:
            # Single point: plot as-is.
            x_smooth, y_smooth = rho, y

        # Add fitted curve
        fig.add_trace(go.Scatter(
            x=x_smooth, y=y_smooth, mode='lines', name=display_name,
            line=dict(color=color, width=2.5), opacity=0.85,
            showlegend=show_legend, legendgroup=method,
            hoverinfo='skip'
        ), row=row, col=col)

        # Add markers at original data points
        fig.add_trace(go.Scatter(
            x=rho, y=y, mode='markers', name=display_name,
            marker=dict(color='white', size=8, line=dict(color=color, width=2)),
            showlegend=False, legendgroup=method,
            hovertemplate=f"<b>{display_name}</b><br>TPF: %{{x:.2f}}<br>Acc: %{{y:.1f}}<extra></extra>"
        ), row=row, col=col)

    # Draw 5 task subplots
    for idx, task in enumerate(tasks):
        row, col = get_pos(idx)
        data = task_data.get(task, {})
        for method in df_top["Method"].tolist():
            if method in data:
                draw_curve(data[method], method, row, col)

    # Draw Average subplot at (2, 3)
    for method in df_top["Method"].tolist():
        if method in avg_data:
            draw_curve(avg_data[method], method, 2, 3)

    fig.update_layout(
        height=550, margin=dict(l=60, r=250, t=80, b=60),
        title=dict(text=f"📈 Top {top_n} Methods: Accuracy-Parallelism Curves", x=0.5, font=dict(size=18)),
        legend=dict(font=dict(size=11), x=1.02, y=1, bgcolor='rgba(255,255,255,0.95)',
                    bordercolor='#ddd', borderwidth=1, title=dict(text="Methods (sorted by Avg AUP)", font=dict(size=12)),
                    tracegroupgap=1, itemsizing='constant'),
        hoverlabel=dict(bgcolor="white", font_size=14, font_family="Arial")
    )

    # Update axes labels for 6 subplots
    for idx in range(6):
        row, col = get_pos(idx)
        fig.update_xaxes(title_text="TPF (Tokens per Forward)" if idx >= 3 else "", row=row, col=col)
        fig.update_yaxes(title_text="Acc (%)" if col == 1 else "", row=row, col=col)

    return fig
|
src/envs.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Environment configuration - not used in current implementation
|
src/leaderboard/read_evals.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import math
|
| 2 |
+
import yaml
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
# AUP calculation (from d3LLM_Code/aup_utils.py)
|
| 7 |
+
def weight_function(y: float, y_max: float, alpha: float = 3.0) -> float:
    """Quality-weighting function W(y) = min(exp(-alpha * (1 - y/y_max)), 1).

    Returns 1.0 when y >= y_max and decays exponentially as y falls below y_max.
    """
    shortfall = 1 - y / y_max
    return min(math.exp(-alpha * shortfall), 1.0)
|
| 10 |
+
|
| 11 |
+
def get_aup(rho: list, y: list, y_max: float, alpha: float = 3.0, y_min_offset: float = 5.0) -> float:
    """Calculate AUP (Accuracy Under Parallelism) score.

    Args:
        rho: parallelism values (TPF, tokens per forward), one per point.
        y: accuracy values paired with ``rho``.
        y_max: best accuracy for this task (reference for the quality weight).
        alpha: decay rate of the quality weight; larger punishes accuracy drops harder.
        y_min_offset: points whose accuracy falls more than this below the
            first (lowest-parallelism) point are discarded.

    Returns:
        Quality-weighted area under the accuracy-vs-parallelism curve,
        computed with the trapezoidal rule over the filtered points.
    """
    if len(rho) == 0:
        return 0.0
    # Order points by increasing parallelism.
    sorted_pairs = sorted(zip(rho, y), key=lambda x: x[0])
    sorted_rho, sorted_y = zip(*sorted_pairs)
    sorted_rho, sorted_y = list(sorted_rho), list(sorted_y)

    # Accuracy floor is anchored to the first (lowest-rho) point.
    y_1 = sorted_y[0]
    y_min = y_1 - y_min_offset
    filtered_pairs = [(r, acc) for r, acc in zip(sorted_rho, sorted_y) if acc >= y_min]
    if len(filtered_pairs) == 0:
        # Defensive fallback; unreachable for non-negative offsets because the
        # first point always satisfies acc >= y_1 - y_min_offset.
        return sorted_rho[0] * sorted_y[0]

    filtered_rho, filtered_y = zip(*filtered_pairs)
    filtered_rho, filtered_y = list(filtered_rho), list(filtered_y)

    # Rectangle up to the first point, then quality-weighted trapezoids.
    aup = filtered_rho[0] * filtered_y[0]
    for i in range(1, len(filtered_rho)):
        y_i, y_prev = filtered_y[i], filtered_y[i-1]
        w_i = weight_function(y_i, y_max, alpha)
        w_prev = weight_function(y_prev, y_max, alpha)
        aup += 0.5 * (filtered_rho[i] - filtered_rho[i-1]) * (y_i * w_i + y_prev * w_prev)
    return aup
|
| 35 |
+
|
| 36 |
+
# YAML result files live next to the original d3LLM evaluation code.
DATA_DIR = Path(__file__).parent.parent.parent / "d3LLM_Code"
DATA_FILES = ["data_dream.yaml", "data_llada.yaml", "data_dream_coder.yaml"]

# Merge HumanEval-Instruct -> HumanEval, MBPP-Instruct -> MBPP; exclude HumanEval+, MBPP+
TASK_MERGE = {"HumanEval-Instruct": "HumanEval", "MBPP-Instruct": "MBPP"}
TASK_EXCLUDE = {"HumanEval+", "MBPP+"}
# Display/ordering of tasks on the leaderboard.
TASK_ORDER = ["GSM8K-CoT", "MATH", "MBPP", "HumanEval", "Long-GSM8K"]
# Fixed divisor for the Avg_AUP column: methods missing a task contribute 0
# for it and are divided by 5 anyway (see compute_leaderboard).
AVG_AUP_DIVISOR = 5
|
| 44 |
+
|
| 45 |
+
def load_yaml_data():
    """Load YAML files separately, compute y_max per file/task, then merge.

    Returns:
        all_results: {method: {task: (aup, tpf, acc)}} rounded summary metrics.
        all_meta: merged per-method metadata from each file's ``_meta`` block.
        ordered_tasks: tasks present in the data, in TASK_ORDER order.
        raw_data: {task: {method: [(rho, acc), ...]}} raw points for plotting.
    """
    all_results = {}  # {method: {task: (aup, tpf, acc)}}
    all_meta = {}
    all_tasks = set()
    raw_data = {}  # {task: {method: [(rho, y), ...]}} for curve plotting

    for filename in DATA_FILES:
        filepath = DATA_DIR / filename
        if not filepath.exists():
            # Missing data files are skipped silently.
            continue
        with open(filepath, 'r') as f:
            data = yaml.safe_load(f)

        # '_meta' holds per-method metadata (type/foundation/link), not task data.
        meta = data.pop('_meta', {})
        all_meta.update(meta)

        # Compute y_max per task WITHIN this file only (as per main.py)
        file_tasks = {k: v for k, v in data.items() if k not in TASK_EXCLUDE}
        y_max_per_task = {}
        for task, methods in file_tasks.items():
            y_max_per_task[task] = max(y for pairs in methods.values() for _, y in pairs)

        # Calculate AUP for each method/task in this file
        for task, methods in file_tasks.items():
            target_task = TASK_MERGE.get(task, task)
            all_tasks.add(target_task)
            y_max = y_max_per_task[task]

            # Store raw data for curve plotting
            if target_task not in raw_data:
                raw_data[target_task] = {}

            for method, pairs in methods.items():
                if method not in all_results:
                    all_results[method] = {}

                rho_list = [p[0] for p in pairs]
                y_list = [p[1] for p in pairs]
                aup = get_aup(rho_list, y_list, y_max)
                # TPF is the highest parallelism measured; acc is the accuracy
                # at that highest-TPF point.
                tpf = max(rho_list)
                acc = pairs[0][1] if len(pairs) == 1 else [p[1] for p in pairs if p[0] == max(rho_list)][0]
                # If the same method/task appears in a later file, it
                # overwrites the earlier entry (plain dict assignment).
                all_results[method][target_task] = (round(aup, 1), round(tpf, 2), round(acc, 1))
                raw_data[target_task][method] = pairs

    # Return tasks in specified order
    ordered_tasks = [t for t in TASK_ORDER if t in all_tasks]
    return all_results, all_meta, ordered_tasks, raw_data
|
| 93 |
+
|
| 94 |
+
def compute_leaderboard():
    """Compute leaderboard DataFrame from YAML data.

    Returns:
        (df, tasks, raw_data) where ``df`` has one row per method — metadata
        columns plus per-task "<task>_AUP/_TPF/_Acc" and "Avg_AUP" — sorted by
        Avg_AUP descending; ``tasks`` and ``raw_data`` are passed through from
        load_yaml_data().
    """
    results_dict, meta, tasks, raw_data = load_yaml_data()

    results = []
    for method in sorted(results_dict.keys()):
        method_meta = meta.get(method, {})
        row = {
            "Method": method,
            "Type": method_meta.get("type", "?"),
            "Foundation": method_meta.get("foundation", "?"),
            "Link": method_meta.get("link", ""),
        }

        aup_sum = 0.0
        for task in tasks:
            if task in results_dict[method]:
                aup, tpf, acc = results_dict[method][task]
                row[f"{task}_AUP"], row[f"{task}_TPF"], row[f"{task}_Acc"] = aup, tpf, acc
                aup_sum += aup
            else:
                # Missing task: columns stay None (pandas renders NaN) and the
                # task contributes 0 to the average.
                row[f"{task}_AUP"] = row[f"{task}_TPF"] = row[f"{task}_Acc"] = None

        # Fixed divisor (5): methods with missing tasks are penalized, not excused.
        row["Avg_AUP"] = round(aup_sum / AVG_AUP_DIVISOR, 1)
        results.append(row)

    df = pd.DataFrame(results).sort_values("Avg_AUP", ascending=False).reset_index(drop=True)
    return df, tasks, raw_data
|
| 122 |
+
|
| 123 |
+
def get_leaderboard_df():
    """Convenience wrapper: return only the leaderboard DataFrame."""
    return compute_leaderboard()[0]
|
| 126 |
+
|
| 127 |
+
def get_tasks():
    """Convenience wrapper: return only the ordered task list."""
    return compute_leaderboard()[1]
|
| 130 |
+
|
| 131 |
+
def get_raw_data():
    """Convenience wrapper: return only the raw per-task measurement points."""
    return compute_leaderboard()[2]
|
src/populate.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Population utilities - not used in current implementation
|
src/submission/check_validity.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Submission validation - not used in current implementation
|
src/submission/submit.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Submission handling - not used in current implementation
|