d3LLM-Data-LLaDA commited on
Commit
d473371
·
0 Parent(s):

Initial commit

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ auto_evals/
2
+ venv/
3
+ __pycache__/
4
+ .env
5
+ .ipynb_checkpoints
6
+ *ipynb
7
+ .vscode/
8
+
9
+ eval-queue/
10
+ eval-results/
11
+ eval-queue-bk/
12
+ eval-results-bk/
13
+ logs/
.pre-commit-config.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ default_language_version:
16
+ python: python3
17
+
18
+ ci:
19
+ autofix_prs: true
20
+ autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
21
+ autoupdate_schedule: quarterly
22
+
23
+ repos:
24
+ - repo: https://github.com/pre-commit/pre-commit-hooks
25
+ rev: v4.3.0
26
+ hooks:
27
+ - id: check-yaml
28
+ - id: check-case-conflict
29
+ - id: detect-private-key
30
+ - id: check-added-large-files
31
+ args: ['--maxkb=1000']
32
+ - id: requirements-txt-fixer
33
+ - id: end-of-file-fixer
34
+ - id: trailing-whitespace
35
+
36
+ - repo: https://github.com/PyCQA/isort
37
+ rev: 5.12.0
38
+ hooks:
39
+ - id: isort
40
+ name: Format imports
41
+
42
+ - repo: https://github.com/psf/black
43
+ rev: 22.12.0
44
+ hooks:
45
+ - id: black
46
+ name: Format code
47
+ additional_dependencies: ['click==8.0.2']
48
+
49
+ - repo: https://github.com/charliermarsh/ruff-pre-commit
50
+ # Ruff version.
51
+ rev: 'v0.0.267'
52
+ hooks:
53
+ - id: ruff
Makefile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .PHONY: style format
2
+
3
+
4
+ style:
5
+ python -m black --line-length 119 .
6
+ python -m isort .
7
+ ruff check --fix .
8
+
9
+
10
+ quality:
11
+ python -m black --check --line-length 119 .
12
+ python -m isort --check-only .
13
+ ruff check .
README.md ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: dLLM Leaderboard
3
+ emoji: 🏆
4
+ colorFrom: purple
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ app_file: app.py
8
+ pinned: true
9
+ license: apache-2.0
10
+ short_description: A leaderboard of Diffusion Large Language Models (dLLMs)
11
+ sdk_version: 5.43.1
12
+ tags:
13
+ - leaderboard
14
+ ---
15
+
16
+ # Start the configuration
17
+
18
+ Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
19
+
20
+ Results files should have the following format and be stored as json files:
21
+ ```json
22
+ {
23
+ "config": {
24
+ "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
25
+ "model_name": "path of the model on the hub: org/model",
26
+ "model_sha": "revision on the hub",
27
+ },
28
+ "results": {
29
+ "task_name": {
30
+ "metric_name": score,
31
+ },
32
+ "task_name2": {
33
+ "metric_name": score,
34
+ }
35
+ }
36
+ }
37
+ ```
38
+
39
+ Request files are created automatically by this tool.
40
+
41
+ If you encounter a problem on the space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results and eval-results-bk folders.
42
+
43
+ # Code logic for more complex edits
44
+
45
+ You'll find
46
+ - the main table's column names and properties in `src/display/utils.py`
47
+ - the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
48
+ - the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
app.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ from src.leaderboard.read_evals import get_leaderboard_df, get_tasks, get_raw_data
4
+ from src.display.visualization import create_radar_chart, create_group_bar_chart, create_aup_curve_chart
5
+ from src.display.css_html_js import custom_css, sort_table_js, get_foundation_class
6
+
7
+ CITATION_HTML = """
8
+ <div style="max-width: 800px; margin: 30px auto 0 auto; padding: 20px; background: #f8f7ff; border-radius: 12px; border-left: 4px solid #5a3d8a;">
9
+ <p style="margin: 0 0 12px 0; color: #5a3d8a; font-weight: 600;">📝 If you find this Leaderboard useful for your research, please star <a href="https://github.com/hao-ai-lab/d3llm" target="_blank" style="color: #5a3d8a;">our GitHub repo</a> and cite our work:</p>
10
+ <pre style="background: #fff; padding: 15px; border-radius: 8px; overflow-x: auto; font-size: 12px; margin: 0; color: #333; white-space: pre-wrap; word-wrap: break-word;">@article{preprint'25:d3llm,
11
+ author = {Yu-Yang Qian and Junda Su and Lanxiang Hu and Peiyuan Zhang and Zhijie Deng and Peng Zhao and Hao Zhang},
12
+ title = {d3LLM: Ultra-Fast Diffusion LLM using Pseudo-Trajectory Distillation},
13
+ journal = {ArXiv preprint},
14
+ volume = {to appear},
15
+ note = {\\url{https://github.com/hao-ai-lab/d3LLM} [Accessed: 2025-12-11]},
16
+ year = {2025}
17
+ }</pre>
18
+ </div>
19
+ """
20
+
21
def create_leaderboard_html(df, tasks):
    """Generate the static HTML leaderboard table for detailed results.

    Args:
        df: results DataFrame; reads columns 'Method', 'Avg_AUP' and, per task,
            '<task>_AUP' / '<task>_TPF' / '<task>_Acc', plus optional
            'Link', 'Type', 'Foundation'.
            # assumes rows arrive already ordered by rank — TODO confirm against get_leaderboard_df
        tasks: iterable of task names; one <td> column group is emitted per task.

    Returns:
        HTML string: the client-side sort script followed by the full
        <table> wrapped in a horizontally scrollable div.
    """
    rows_html = ""
    for rank, (_, row) in enumerate(df.iterrows(), 1):
        # Top-3 rows show medal emojis instead of the numeric rank
        medal = f'<span class="top-medal">{["🥇", "🥈", "🥉"][rank-1]}</span>' if rank <= 3 else str(rank)

        # Method with link (plain text when no link is provided)
        method = row['Method']
        link = row.get('Link', '')
        method_html = f'<a href="{link}" target="_blank">{method}</a>' if link else method

        # Type badge: anything other than 'AR' is styled with the dLLM class
        type_val = row.get('Type', '?')
        type_display = 'dLLM' if type_val == 'dLLM' else type_val
        type_class = 'ar' if type_val == 'AR' else 'dllm'

        # Foundation badge (CSS class assigned per distinct foundation name)
        foundation = row.get('Foundation', '?')
        foundation_class = get_foundation_class(foundation)

        # Build cells for each task; missing AUP renders as a dash
        task_cells = ""
        for task in tasks:
            aup = row.get(f'{task}_AUP')
            tpf = row.get(f'{task}_TPF')
            acc = row.get(f'{task}_Acc')
            if pd.notna(aup):
                task_cells += f'''<td>
                <span class="aup-score">{aup:.1f}</span>
                <span class="sub-metrics">TPF:{tpf:.2f} Acc:{acc:.1f}</span>
                </td>'''
            else:
                task_cells += '<td><span class="aup-score">-</span></td>'

        # Avg AUP (defaults to 0 so the :.1f format below never fails)
        avg_aup = row.get('Avg_AUP', 0)

        rows_html += f'''<tr>
        <td class="rank-cell"><span class="rank-medal">{medal}</span></td>
        <td class="method-cell">{method_html}</td>
        <td class="type-cell"><span class="type-badge {type_class}">{type_display}</span></td>
        <td class="foundation-cell"><span class="foundation-badge {foundation_class}">{foundation}</span></td>
        {task_cells}
        <td class="avg-cell"><span class="aup-score">{avg_aup:.1f}</span></td>
        </tr>'''

    task_headers = ''.join(f'<th>{t}</th>' for t in tasks)

    # sort_table_js is inlined so column-header sorting works inside gr.HTML
    return f'''
    {sort_table_js}
    <div class="table-wrapper">
    <table class="leaderboard-table">
    <thead><tr>
    <th>Rank</th><th>Method</th><th>Type</th><th>Foundation Model</th>
    {task_headers}
    <th>Avg AUP</th>
    </tr></thead>
    <tbody>{rows_html}</tbody>
    </table>
    </div>
    '''
82
+
83
def update_charts(top_n):
    """Re-render the three dashboard charts for the current top-N selection."""
    leaderboard = get_leaderboard_df()
    task_list = get_tasks()
    raw = get_raw_data()
    radar = create_radar_chart(leaderboard, task_list, top_n)
    bars = create_group_bar_chart(leaderboard, task_list, top_n)
    curves = create_aup_curve_chart(raw, task_list, leaderboard, top_n)
    return radar, bars, curves
86
+
87
+ # Load data
88
+ df, tasks, raw_data = get_leaderboard_df(), get_tasks(), get_raw_data()
89
+ default_top_n = min(15, len(df))
90
+
91
+ with gr.Blocks(css=custom_css, title="dLLM Leaderboard", fill_height=False) as demo:
92
+ gr.HTML('''
93
+ <div class="welcome-banner">
94
+ <h2>🫧 Welcome to dLLM Leaderboard! 🏆</h2>
95
+ <p>Benchmarking various Diffusion Large Language Models (dLLMs) with <i><a href="https://hao-ai-lab.github.io/blogs/text-diffusion/" target="_blank" style="color: inherit; text-decoration: underline;">AUP (Accuracy Under Parallelism)</a></i>, considering both accuracy and parallelism.</p>
96
+ </div>
97
+ ''')
98
+
99
+ with gr.Tabs():
100
+ with gr.TabItem("📊 Leaderboard"):
101
+ with gr.Row():
102
+ top_n_slider = gr.Slider(minimum=3, maximum=len(df), value=default_top_n, step=1,
103
+ label="Number of Top Methods to Display")
104
+
105
+ with gr.Row():
106
+ radar_plot = gr.Plot(value=create_radar_chart(df, tasks, default_top_n))
107
+ with gr.Row():
108
+ bar_plot = gr.Plot(value=create_group_bar_chart(df, tasks, default_top_n))
109
+ with gr.Row():
110
+ curve_plot = gr.Plot(value=create_aup_curve_chart(raw_data, tasks, df, default_top_n))
111
+
112
+ top_n_slider.change(fn=update_charts, inputs=[top_n_slider], outputs=[radar_plot, bar_plot, curve_plot])
113
+
114
+ gr.Markdown("### 🏆 Detailed Leaderboard")
115
+ gr.HTML(create_leaderboard_html(df, tasks))
116
+ gr.HTML(CITATION_HTML)
117
+
118
+ with gr.TabItem("📤 Submit Result"):
119
+ gr.HTML("""
120
+ <div class="content-wrapper">
121
+ <div style="max-width: 800px; margin: 0 auto; padding: 20px; box-sizing: border-box;">
122
+ <h2>Submit Your Results</h2>
123
+ <p>We welcome contributions to the dLLM Leaderboard! To submit your method's results:</p>
124
+
125
+ <h3>Step 1: Evaluate Your Method</h3>
126
+ <p>Follow the evaluation protocol in the <a href="https://github.com/hao-ai-lab/d3LLM" target="_blank">d3LLM repository</a>.
127
+ Refer to the <code>eval_scripts</code> folder for benchmark evaluation scripts, and <code>AUP_leaderboard</code> folder for AUP calculation utilities.</p>
128
+
129
+ <h3>Step 2: Prepare Your Evaluation Results</h3>
130
+ <p>Add your results to the appropriate YAML file following this format:</p>
131
+ <pre style="background: #f5f5f5; padding: 15px; border-radius: 8px; overflow-x: auto; white-space: pre-wrap; word-wrap: break-word;">_meta:
132
+ YourMethod:
133
+ type: dLLM # or AR
134
+ foundation: YourFoundation
135
+ link: https://link/to/your/method
136
+
137
+ TaskName:
138
+ YourMethod:
139
+ - [rho_1, accuracy_1] # (parallelism, accuracy) pairs
140
+ - [rho_2, accuracy_2]</pre>
141
+
142
+ <h3>Step 3: Submit a Pull Request</h3>
143
+ <ol>
144
+ <li>Fork the repository</li>
145
+ <li>Add your results to the YAML files</li>
146
+ <li>Submit a PR with your method name, description, and evaluation details</li>
147
+ </ol>
148
+
149
+ <p><strong>Questions?</strong> Open an issue on <a href="https://github.com/hao-ai-lab/d3LLM/issues" target="_blank">GitHub</a>.</p>
150
+ </div>
151
+ </div>
152
+ """ + CITATION_HTML)
153
+
154
+ with gr.TabItem("ℹ️ About"):
155
+ gr.HTML("""
156
+ <div class="content-wrapper">
157
+ <div style="max-width: 800px; margin: 0 auto; padding: 20px; box-sizing: border-box;">
158
+ <h2>About dLLM Leaderboard</h2>
159
+ <p>This leaderboard evaluates <strong>Diffusion Large Language Models (dLLMs)</strong> using the <strong>AUP (Accuracy Under Parallelism)</strong> metric.</p>
160
+
161
+ <h3>Metrics</h3>
162
+ <ul>
163
+ <li><strong>AUP</strong>: Primary metric - measures efficiency-accuracy trade-off (higher is better)</li>
164
+ <li><strong>TPF</strong>: Tokens Per Forward - parallelism level achieved</li>
165
+ <li><strong>Acc</strong>: Accuracy at maximum parallelism</li>
166
+ </ul>
167
+
168
+ <h3>Benchmarks</h3>
169
+ <p>GSM8K-CoT, MATH, HumanEval, MBPP, Long-GSM8K</p>
170
+
171
+ <h3>References</h3>
172
+ <p>
173
+ GitHub Code Repo: <a href="https://github.com/hao-ai-lab/d3LLM" target="_blank">https://github.com/hao-ai-lab/d3LLM</a><br>
174
+ Blog: <a href="https://hao-ai-lab.github.io/blogs/text-diffusion/" target="_blank">https://hao-ai-lab.github.io/blogs/text-diffusion/</a>
175
+ </p>
176
+ </div>
177
+ </div>
178
+ """ + CITATION_HTML)
179
+
180
+ demo.launch()
d3LLM_Code/aup_utils.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AUP (Accuracy Under Parallelism) measure for parallel decoders
2
+ # See paper for detailed definition and motivation
3
+ import math
4
+
5
def weight_function(y: float, y_max: float, alpha: float = 3.0) -> float:
    """Quality-weighting function W(y) = min(exp(-alpha * (1 - y/y_max)), 1).

    Equals 1.0 when y >= y_max and decays exponentially as y falls below y_max.
    """
    return min(math.exp(-alpha * (1 - y / y_max)), 1.0)

def get_aup(rho: list[float], y: list[float], y_max: float, alpha: float = 3.0, y_min_offset: float = 5.0, is_print: bool = False) -> float:
    """
    Calculate the Accuracy Under Parallelism (AUP) of parallelism-accuracy pairs.

    AUP is the first point's rho * accuracy plus a quality-weighted trapezoidal
    area over the sorted (rho, accuracy) curve, so methods that keep accuracy
    high at high parallelism score higher.

    Args:
        rho: list of parallelism values (TPF, tokens per forward)
        y: list of accuracy values in [0, 100] (percentage)
        y_max: maximum accuracy across all methods (for normalization)
        alpha: penalty factor for accuracy degradation (default: 3.0)
        y_min_offset: minimum accuracy threshold offset (default: 5.0, i.e., 5%)
        is_print: if True, print the expanded AUP formula for debugging

    Returns:
        AUP score (scalar value)

    Raises:
        AssertionError: on mismatched/empty inputs, non-positive rho, or when
            the accuracy at the highest parallelism drops more than
            y_min_offset below the accuracy at the lowest parallelism.
    """
    assert len(rho) == len(y), "rho and y must have the same length"
    assert len(rho) > 0, "rho and y must not be empty"
    assert all(r > 0 for r in rho), "all rho must be positive"

    # Check if y values are in [0, 100] range
    if any(acc < 1.0 for acc in y):
        print("\033[91mWarning: Detected accuracy values < 1.0. Please check if accuracy should be in percentage (0-100) instead of (0-1).\033[0m")

    # Sort by rho (ascending parallelism)
    sorted_pairs = sorted(zip(rho, y), key=lambda x: x[0])
    sorted_rho, sorted_y = zip(*sorted_pairs)
    sorted_rho, sorted_y = list(sorted_rho), list(sorted_y)

    # Filter by y_min threshold (y_1 - y_min_offset), where y_1 is the
    # accuracy at the LOWEST parallelism (the quality baseline).
    y_1 = sorted_y[0]
    # Fixed error message: the check enforces "at MOST y_min_offset lower",
    # and the compared values are the baseline (lowest-rho) and final
    # (highest-rho) accuracies, which need not be the curve's max/min.
    assert y_1 - sorted_y[-1] <= y_min_offset, f"Accuracy degradation is too large: accuracy at the highest parallelism should be at most {y_min_offset:.2f} lower than the accuracy at the lowest parallelism. Baseline Acc: {y_1}, final Acc: {sorted_y[-1]}"
    y_min = y_1 - y_min_offset
    filtered_pairs = [(r, acc) for r, acc in zip(sorted_rho, sorted_y) if acc >= y_min]
    assert len(filtered_pairs) > 0, f"No valid pairs after filtering with y_min={y_min}"

    filtered_rho, filtered_y = zip(*filtered_pairs)
    filtered_rho, filtered_y = list(filtered_rho), list(filtered_y)

    # Calculate AUP: first term + quality-weighted trapezoidal sum
    aup = filtered_rho[0] * filtered_y[0]
    formula_parts = [f"{filtered_rho[0]:.2f} * {filtered_y[0]:.2f}"]

    for i in range(1, len(filtered_rho)):
        y_i = filtered_y[i]
        y_prev = filtered_y[i-1]
        w_i = weight_function(y_i, y_max, alpha)
        w_prev = weight_function(y_prev, y_max, alpha)
        # Trapezoid over [rho_{i-1}, rho_i] with quality-weighted heights
        term = 0.5 * (filtered_rho[i] - filtered_rho[i-1]) * (y_i * w_i + y_prev * w_prev)
        aup += term
        formula_parts.append(f"({filtered_rho[i]:.2f}-{filtered_rho[i-1]:.2f}) * ({y_i:.2f} * {w_i:.4f} + {y_prev:.2f} * {w_prev:.4f})")

    if is_print:
        formula = " AUP = " + " + ".join(formula_parts) + f" = {aup:.2f}"
        print(formula)

    return aup
d3LLM_Code/data_dream.yaml ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AUP Benchmark Data
2
+ # Format: task -> method -> list of (rho, accuracy) pairs
3
+ # rho: parallelism (tokens per forward)
4
+ # accuracy: model accuracy in percent (0-100 scale)
5
+
6
+ # Model metadata: type (AR/dLLM), foundation model, link
7
+ _meta:
8
+ Qwen-2.5-7B-it:
9
+ type: AR
10
+ foundation: Qwen-2.5-7B-it
11
+ link: https://huggingface.co/Qwen/Qwen2.5-7B-Instruct
12
+ EAGLE-3:
13
+ type: AR
14
+ foundation: Llama-3.1-8B-it
15
+ link: https://github.com/SafeAILab/EAGLE
16
+ Dream:
17
+ type: dLLM
18
+ foundation: Dream-v0-it-7B
19
+ link: https://github.com/DreamLM/Dream
20
+ Fast-dLLM-Dream:
21
+ type: dLLM
22
+ foundation: Dream-v0-it-7B
23
+ link: https://github.com/NVlabs/Fast-dLLM
24
+ Fast-dLLM-v2:
25
+ type: dLLM
26
+ foundation: Qwen-2.5-7B-it
27
+ link: https://github.com/NVlabs/Fast-dLLM/tree/main/v2
28
+ dParallel-Dream:
29
+ type: dLLM
30
+ foundation: Dream-v0-it-7B
31
+ link: https://github.com/czg1225/dParallel
32
+ d3LLM-Dream:
33
+ type: dLLM
34
+ foundation: Dream-v0-it-7B
35
+ link: https://github.com/hao-ai-lab/d3llm
36
+
37
+ GSM8K-CoT:
38
+ Qwen-2.5-7B-it:
39
+ - [1.0, 74.1]
40
+ EAGLE-3:
41
+ - [1.0, 76.57]
42
+ - [5.12, 76.57]
43
+ Dream:
44
+ - [1.0, 83.94]
45
+ Fast-dLLM-Dream:
46
+ - [1.0, 83.68]
47
+ - [1.44, 79.0]
48
+ Fast-dLLM-v2:
49
+ - [1.0, 82.82]
50
+ - [2.21, 81.48]
51
+ dParallel-Dream:
52
+ - [1.0, 83.8]
53
+ - [3.02, 82.12]
54
+ d3LLM-Dream:
55
+ - [1.0, 83.47]
56
+ - [4.94, 81.36]
57
+ MATH:
58
+ Qwen-2.5-7B-it:
59
+ - [1.0, 41.15]
60
+ EAGLE-3:
61
+ - [1.0, 39.80]
62
+ - [5.72, 39.80]
63
+ Dream:
64
+ - [1.0, 39.63]
65
+ Fast-dLLM-Dream:
66
+ - [1.0, 39.53]
67
+ - [1.78, 38.3]
68
+ Fast-dLLM-v2:
69
+ - [1.0, 49.92]
70
+ - [2.61, 48.74]
71
+ dParallel-Dream:
72
+ - [1.0, 39.06]
73
+ - [2.94, 38.72]
74
+ d3LLM-Dream:
75
+ - [1.0, 39.38]
76
+ - [3.92, 38.21]
77
+ MBPP-Instruct:
78
+ Qwen-2.5-7B-it:
79
+ - [1.0, 63.8]
80
+ EAGLE-3:
81
+ - [1.0, 60.20]
82
+ - [5.69, 60.20]
83
+ Dream:
84
+ - [1.0, 57.2]
85
+ Fast-dLLM-Dream:
86
+ - [1.0, 56.38]
87
+ - [1.2, 53.2]
88
+ Fast-dLLM-v2:
89
+ - [1.0, 61.23]
90
+ - [2.04, 59.12]
91
+ dParallel-Dream:
92
+ - [1.0, 57.8]
93
+ - [2.24, 55.4]
94
+ d3LLM-Dream:
95
+ - [1.0, 58.8]
96
+ - [2.96, 55.60]
97
+ HumanEval-Instruct:
98
+ Qwen-2.5-7B-it:
99
+ - [1.0, 72.25]
100
+ EAGLE-3:
101
+ - [1.0, 67.58]
102
+ - [5.98, 67.58]
103
+ Dream:
104
+ - [1.0, 55.2]
105
+ Fast-dLLM-Dream:
106
+ - [1.0, 54.86]
107
+ - [1.33, 54.27]
108
+ Fast-dLLM-v2:
109
+ - [1.0, 63.2]
110
+ - [2.58, 61.7]
111
+ dParallel-Dream:
112
+ - [1.0, 56.08]
113
+ - [2.57, 54.27]
114
+ d3LLM-Dream:
115
+ - [1.0, 58.86]
116
+ - [3.20, 57.10]
117
+ Long-GSM8K:
118
+ Qwen-2.5-7B-it:
119
+ - [1.0, 82.56]
120
+ EAGLE-3:
121
+ - [1.0, 80.52]
122
+ - [5.57, 80.52]
123
+ Dream:
124
+ - [1.0, 78.95]
125
+ Fast-dLLM-Dream:
126
+ - [1.0, 78.83]
127
+ - [1.79, 76.57]
128
+ Fast-dLLM-v2:
129
+ - [1.0, 82.34]
130
+ - [2.58, 80.97]
131
+ dParallel-Dream:
132
+ - [1.0, 81.27]
133
+ - [3.49, 78.56]
134
+ d3LLM-Dream:
135
+ - [1.0, 81.2]
136
+ - [4.80, 77.18]
d3LLM_Code/data_dream_coder.yaml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AUP Benchmark Data
2
+ # Format: task -> method -> list of (rho, accuracy) pairs
3
+ # rho: parallelism (tokens per forward)
4
+ # accuracy: model accuracy in percent (0-100 scale)
5
+
6
+ # Model metadata: type (AR/dLLM), foundation model, link
7
+ _meta:
8
+ Qwen2.5-Coder-7B-it:
9
+ type: AR
10
+ foundation: Qwen2.5-Coder-7B-it
11
+ link: https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct
12
+ Dream-Coder-7B:
13
+ type: dLLM
14
+ foundation: Dream-Coder-v0-it-7B
15
+ link: https://github.com/DreamLM/Dream-Coder
16
+ d3LLM-Coder-7B:
17
+ type: dLLM
18
+ foundation: Dream-Coder-v0-it-7B
19
+ link: https://github.com/hao-ai-lab/d3llm
20
+
21
+ HumanEval:
22
+ Qwen2.5-Coder-7B-it:
23
+ - [1.0, 86.6]
24
+ Dream-Coder-7B:
25
+ - [1.0, 82.9]
26
+ d3LLM-Coder-7B:
27
+ - [1.0, 82.4]
28
+ - [2.88, 79.7]
29
+ HumanEval+:
30
+ Qwen2.5-Coder-7B-it:
31
+ - [1.0, 82.3]
32
+ Dream-Coder-7B:
33
+ - [1.0, 76.8]
34
+ d3LLM-Coder-7B:
35
+ - [1.0, 74.4]
36
+ - [2.88, 71.3]
37
+ MBPP:
38
+ Qwen2.5-Coder-7B-it:
39
+ - [1.0, 83.5]
40
+ Dream-Coder-7B:
41
+ - [1.0, 79.9]
42
+ d3LLM-Coder-7B:
43
+ - [1.0, 80.10]
44
+ - [2.5, 80.00]
45
+ MBPP+:
46
+ Qwen2.5-Coder-7B-it:
47
+ - [1.0, 70.1]
48
+ Dream-Coder-7B:
49
+ - [1.0, 68.8]
50
+ d3LLM-Coder-7B:
51
+ - [1.0, 69.6]
52
+ - [2.5, 69.3]
d3LLM_Code/data_llada.yaml ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AUP Benchmark Data
2
+ # Format: task -> method -> list of (rho, accuracy) pairs
3
+ # rho: parallelism (tokens per forward)
4
+ # accuracy: model accuracy in percent (0-100 scale)
5
+
6
+ # Model metadata: type (AR/dLLM), foundation model, link
7
+ _meta:
8
+ Qwen-2.5-7B-it:
9
+ type: AR
10
+ foundation: Qwen-2.5-7B-it
11
+ link: https://huggingface.co/Qwen/Qwen2.5-7B-Instruct
12
+ LLaDA:
13
+ type: dLLM
14
+ foundation: LLaDA-8B-it
15
+ link: https://github.com/ML-GSAI/LLaDA
16
+ Fast-dLLM-LLaDA:
17
+ type: dLLM
18
+ foundation: LLaDA-8B-it
19
+ link: https://github.com/NVlabs/Fast-dLLM
20
+ D2F-LLaDA:
21
+ type: dLLM
22
+ foundation: LLaDA-8B-it
23
+ link: https://github.com/zhijie-group/Discrete-Diffusion-Forcing
24
+ dParallel-LLaDA:
25
+ type: dLLM
26
+ foundation: LLaDA-8B-it
27
+ link: https://github.com/czg1225/dParallel
28
+ d3LLM-LLaDA:
29
+ type: dLLM
30
+ foundation: LLaDA-8B-it
31
+ link: https://github.com/hao-ai-lab/d3llm
32
+
33
+ GSM8K-CoT:
34
+ Qwen-2.5-7B-it:
35
+ - [1.0, 74.1]
36
+ LLaDA:
37
+ - [1.0, 72.55]
38
+ Fast-dLLM-LLaDA:
39
+ - [1.0, 74.79]
40
+ - [2.77, 74.68]
41
+ D2F-LLaDA:
42
+ - [1.0, 74.98]
43
+ - [2.88, 74.39]
44
+ dParallel-LLaDA:
45
+ - [1.0, 74.0]
46
+ - [5.14, 72.63]
47
+ d3LLM-LLaDA:
48
+ - [1.0, 74.02]
49
+ - [9.11, 73.09]
50
+ MATH:
51
+ Qwen-2.5-7B-it:
52
+ - [1.0, 41.15]
53
+ LLaDA:
54
+ - [1.0, 32.2]
55
+ Fast-dLLM-LLaDA:
56
+ - [1.0, 32.1]
57
+ - [1.97, 30.82]
58
+ D2F-LLaDA:
59
+ - [1.0, 29.1]
60
+ - [2.66, 28.94]
61
+ dParallel-LLaDA:
62
+ - [1.0, 32.0]
63
+ - [3.17, 30.18]
64
+ d3LLM-LLaDA:
65
+ - [1.0, 32.76]
66
+ - [5.74, 30.36]
67
+ MBPP:
68
+ Qwen-2.5-7B-it:
69
+ - [1.0, 63.6]
70
+ LLaDA:
71
+ - [1.0, 41.72]
72
+ Fast-dLLM-LLaDA:
73
+ - [1.0, 41.58]
74
+ - [2.13, 38.6]
75
+ D2F-LLaDA:
76
+ - [1.0, 39.10]
77
+ - [2.13, 39.00]
78
+ dParallel-LLaDA:
79
+ - [1.0, 41.62]
80
+ - [2.35, 40.0]
81
+ d3LLM-LLaDA:
82
+ - [1.0, 42.0]
83
+ - [4.21, 40.60]
84
+ HumanEval:
85
+ Qwen-2.5-7B-it:
86
+ - [1.0, 67.73]
87
+ LLaDA:
88
+ - [1.0, 38.28]
89
+ Fast-dLLM-LLaDA:
90
+ - [1.0, 38.16]
91
+ - [2.56, 37.8]
92
+ D2F-LLaDA:
93
+ - [1.0, 41.02]
94
+ - [2.69, 40.64]
95
+ dParallel-LLaDA:
96
+ - [1.0, 39.68]
97
+ - [4.93, 39.02]
98
+ d3LLM-LLaDA:
99
+ - [1.0, 39.8]
100
+ - [5.95, 39.63]
101
+ Long-GSM8K:
102
+ Qwen-2.5-7B-it:
103
+ - [1.0, 82.56]
104
+ LLaDA:
105
+ - [1.0, 78.58]
106
+ Fast-dLLM-LLaDA:
107
+ - [1.0, 78.45]
108
+ - [2.45, 78.01]
109
+ D2F-LLaDA:
110
+ - [1.0, 76.00]
111
+ - [2.7, 75.66]
112
+ dParallel-LLaDA:
113
+ - [1.0, 79.15]
114
+ - [4.49, 76.65]
115
+ d3LLM-LLaDA:
116
+ - [1.0, 78.32]
117
+ - [6.95, 74.22]
pyproject.toml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.ruff]
2
+ # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
3
+ select = ["E", "F"]
4
+ ignore = ["E501"] # line too long (black is taking care of this)
5
+ line-length = 119
6
+ fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
7
+
8
+ [tool.isort]
9
+ profile = "black"
10
+ line_length = 119
11
+
12
+ [tool.black]
13
+ line-length = 119
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio
2
+ pandas
3
+ plotly
4
+ pyyaml
src/display/css_html_js.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ custom_css = """
2
+ /* Reset any Gradio overflow restrictions */
3
+ * { box-sizing: border-box; }
4
+
5
+ .markdown-text { font-size: 16px !important; }
6
+
7
+ .welcome-banner {
8
+ background: linear-gradient(135deg, #a8b5f7 0%, #c9a8f7 100%);
9
+ color: #333;
10
+ padding: 25px;
11
+ border-radius: 12px;
12
+ margin: 20px 10px;
13
+ text-align: center;
14
+ box-shadow: 0 4px 15px rgba(168, 181, 247, 0.3);
15
+ }
16
+ .welcome-banner h2 { margin: 0 0 10px 0; font-size: 1.5em; color: #333; }
17
+ .welcome-banner p { margin: 0; font-size: 1.1em; color: #444; }
18
+
19
+ /* Modern, clean leaderboard table - no border */
20
+ .leaderboard-table {
21
+ width: 100%;
22
+ min-width: 30px;
23
+ border-collapse: collapse;
24
+ background: #fff;
25
+ border-radius: 12px;
26
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
27
+ }
28
+ .leaderboard-table thead {
29
+ background: linear-gradient(135deg, #a8b5f7 0%, #c9a8f7 100%);
30
+ }
31
+ .leaderboard-table th {
32
+ padding: 16px 10px;
33
+ text-align: center;
34
+ font-weight: 700;
35
+ font-size: 14px;
36
+ cursor: pointer;
37
+ user-select: none;
38
+ border: none;
39
+ color: #4a3a6e;
40
+ }
41
+ .leaderboard-table th:hover { background: rgba(255,255,255,0.2); }
42
+ .leaderboard-table th.sortable::after { content: ' ↕'; opacity: 0.5; font-size: 11px; }
43
+ .leaderboard-table th.sort-asc::after { content: ' ↑'; opacity: 1; }
44
+ .leaderboard-table th.sort-desc::after { content: ' ↓'; opacity: 1; }
45
+
46
+ .leaderboard-table td {
47
+ padding: 12px 8px;
48
+ text-align: center;
49
+ font-size: 12px;
50
+ color: #333;
51
+ border-bottom: 1px solid #f0f0f0;
52
+ border-left: none;
53
+ border-right: none;
54
+ }
55
+ .leaderboard-table tbody tr:hover { background-color: #fafafa; }
56
+ .leaderboard-table tbody tr:last-child td { border-bottom: none; }
57
+
58
+ .aup-score { font-size: 17px; font-weight: 700; color: #333; display: block; }
59
+ .sub-metrics { font-size: 9px; color: #999; display: block; margin-top: 2px; }
60
+
61
+ .method-cell {
62
+ text-align: center !important;
63
+ font-weight: 600;
64
+ min-width: 100px;
65
+ }
66
+ .method-cell a {
67
+ color: #5a3d8a;
68
+ text-decoration: none;
69
+ transition: color 0.2s;
70
+ font-size: 15px;
71
+ }
72
+ .method-cell a:hover { color: #3d2760; text-decoration: underline; }
73
+
74
+ .rank-cell { font-size: 16px; }
75
+ .type-cell { min-width: 45px; font-size: 9px; }
76
+ .foundation-cell { min-width: 60px; font-size: 9px; }
77
+ .avg-cell { background-color: #f8f7ff; }
78
+ .avg-cell .aup-score { font-size: 18px; color: #5a3d8a; font-weight: 700; }
79
+
80
+ /* Type badges - rounded pill style */
81
+ .type-badge {
82
+ display: inline-block;
83
+ padding: 2px 6px;
84
+ border-radius: 10px;
85
+ font-size: 8px;
86
+ font-weight: 600;
87
+ }
88
+ .type-badge.ar { background-color: #B93413; color: #fff; }
89
+ .type-badge.dllm { background-color: #193D3A; color: #fff; }
90
+
91
+ /* Foundation badges - low saturation colors, pill style */
92
+ .foundation-badge {
93
+ display: inline-block;
94
+ padding: 2px 6px;
95
+ border-radius: 10px;
96
+ font-size: 8px;
97
+ font-weight: 500;
98
+ }
99
+ .foundation-badge.f0 { background-color: #e8e4f0; color: #5a4a6e; }
100
+ .foundation-badge.f1 { background-color: #e4ecf0; color: #4a5a6e; }
101
+ .foundation-badge.f2 { background-color: #e4f0e8; color: #4a6e5a; }
102
+ .foundation-badge.f3 { background-color: #f0e8e4; color: #6e5a4a; }
103
+ .foundation-badge.f4 { background-color: #f0e4ec; color: #6e4a5a; }
104
+ .foundation-badge.f5 { background-color: #ecf0e4; color: #5a6e4a; }
105
+ .foundation-badge.f6 { background-color: #e4e8f0; color: #4a5a6e; }
106
+ .foundation-badge.f7 { background-color: #f0ece4; color: #6e5a4a; }
107
+ .foundation-badge.f8 { background-color: #e8f0ec; color: #4a6e5e; }
108
+ .foundation-badge.f9 { background-color: #f0e4e4; color: #6e4a4a; }
109
+
110
+ .rank-medal { font-size: 16px; }
111
+ .rank-medal .top-medal { font-size: 24px; }
112
+ .tab-buttons button { font-size: 16px; }
113
+ .tip-text { font-size: 12px; color: #888; font-style: italic; margin: 8px 0 15px 0; }
114
+
115
+ /* Container and layout - force overflow to work */
116
+ html, body { overflow-x: auto !important; }
117
+ .gradio-container {
118
+ max-width: 1400px !important;
119
+ margin: auto !important;
120
+ padding: 0 5px !important;
121
+ }
122
+ .tabs, .tabitem, .tabitem > div, #component-0, .contain, .block, .wrap, .prose {
123
+ width: 100% !important;
124
+ height: auto !important;
125
+ min-height: auto !important;
126
+ max-height: none !important;
127
+ }
128
+ .tabitem[style*="display: none"] { display: none !important; }
129
+
130
+ /* Table wrapper - critical for horizontal scroll */
131
+ .table-wrapper {
132
+ width: 100%;
133
+ overflow-x: scroll !important;
134
+ overflow-y: visible;
135
+ -webkit-overflow-scrolling: touch;
136
+ display: block;
137
+ padding-bottom: 15px;
138
+ }
139
+
140
+ /* Content wrapper */
141
+ .content-wrapper {
142
+ width: 100%;
143
+ max-width: 100%;
144
+ overflow-x: auto;
145
+ -webkit-overflow-scrolling: touch;
146
+ box-sizing: border-box;
147
+ }
148
+
149
+ /* Responsive font sizes for smaller screens */
150
+ @media (max-width: 1000px) {
151
+ .welcome-banner { padding: 20px 15px; margin: 15px 5px; }
152
+ .welcome-banner h2 { font-size: 1.3em; }
153
+ .welcome-banner p { font-size: 1em; }
154
+ }
155
+
156
+ @media (max-width: 600px) {
157
+ .welcome-banner { padding: 15px 10px; margin: 10px 5px; }
158
+ .welcome-banner h2 { font-size: 1.1em; }
159
+ .welcome-banner p { font-size: 0.9em; }
160
+ }
161
+ """
162
+
163
+ sort_table_js = """
164
+ <script>
165
+ (function() {
166
+ function initSort() {
167
+ const table = document.querySelector('.leaderboard-table');
168
+ if (!table) { setTimeout(initSort, 100); return; }
169
+
170
+ const headers = table.querySelectorAll('th');
171
+ let currentSort = { col: -1, dir: 'desc' };
172
+
173
+ headers.forEach((th, idx) => {
174
+ if (idx < 4) return;
175
+ th.classList.add('sortable');
176
+ th.onclick = function() { sortTable(idx); };
177
+ });
178
+
179
+ function sortTable(colIdx) {
180
+ const tbody = table.querySelector('tbody');
181
+ const rows = Array.from(tbody.querySelectorAll('tr'));
182
+ const dir = (currentSort.col === colIdx && currentSort.dir === 'desc') ? 'asc' : 'desc';
183
+ currentSort = { col: colIdx, dir };
184
+
185
+ headers.forEach((h, i) => {
186
+ h.classList.remove('sort-asc', 'sort-desc');
187
+ if (i === colIdx) h.classList.add('sort-' + dir);
188
+ });
189
+
190
+ rows.sort((a, b) => {
191
+ const aEl = a.cells[colIdx].querySelector('.aup-score');
192
+ const bEl = b.cells[colIdx].querySelector('.aup-score');
193
+ const aVal = parseFloat(aEl ? aEl.textContent : '0') || 0;
194
+ const bVal = parseFloat(bEl ? bEl.textContent : '0') || 0;
195
+ return dir === 'desc' ? bVal - aVal : aVal - bVal;
196
+ });
197
+
198
+ rows.forEach((row, i) => {
199
+ const rankCell = row.cells[0];
200
+ const medal = i < 3 ? ['🥇', '🥈', '🥉'][i] : (i + 1);
201
+ rankCell.innerHTML = '<span class="rank-medal">' + medal + '</span>';
202
+ tbody.appendChild(row);
203
+ });
204
+ }
205
+ }
206
+ if (document.readyState === 'loading') {
207
+ document.addEventListener('DOMContentLoaded', initSort);
208
+ } else {
209
+ initSort();
210
+ }
211
+ })();
212
+ </script>
213
+ """
214
+
215
# Foundation model to badge class mapping: each distinct foundation name gets
# one of ten CSS badge classes ("f0".."f9"), assigned in first-seen order and
# cycling back to "f0" after ten entries.
FOUNDATION_COLORS = {}

def get_foundation_class(foundation):
    """Return a stable CSS badge class ("f0".."f9") for *foundation*.

    Repeat lookups of the same name always return the same class. The next
    free index is simply the number of foundations seen so far, so the
    previous module-level `_foundation_idx` counter (and the `global`
    statement it required) is redundant and has been removed.
    """
    if foundation not in FOUNDATION_COLORS:
        FOUNDATION_COLORS[foundation] = f"f{len(FOUNDATION_COLORS) % 10}"
    return FOUNDATION_COLORS[foundation]
src/display/formatting.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Formatting utilities for display
2
+ # Currently not used - keeping for potential future extensions
src/display/utils.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Utility functions for display formatting
2
+ # Currently not used - keeping for potential future extensions
src/display/visualization.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import plotly.graph_objects as go
2
+ from plotly.subplots import make_subplots
3
+ import numpy as np
4
+
5
# 30 distinct colors - assigned by Avg AUP rank
COLOR_PALETTE = [
    "#E91E63", "#4A90E2", "#00BFA5", "#FF6B35", "#8E24AA",
    "#4CAF50", "#FF4081", "#303F9F", "#FFD166", "#00796B",
    "#C2185B", "#7B1FA2", "#26A69A", "#1A4C7C", "#FF8C42",
    "#009688", "#673AB7", "#F44336", "#3F51B5", "#795548",
    "#607D8B", "#9C27B0", "#2196F3", "#CDDC39", "#FF9800",
    "#00BCD4", "#E64A19", "#5D4037", "#455A64", "#AD1457",
]

def get_model_colors(df):
    """Map each method name to a palette color by Avg AUP rank (best first).

    Colors wrap around when there are more methods than palette entries.
    """
    ranked = df.sort_values("Avg_AUP", ascending=False)["Method"].tolist()
    n_colors = len(COLOR_PALETTE)
    return {name: COLOR_PALETTE[pos % n_colors] for pos, name in enumerate(ranked)}
19
+
20
def get_model_ranks(df):
    """Map each method name to its 1-based rank by Avg AUP (descending)."""
    ordered = df.sort_values("Avg_AUP", ascending=False)["Method"].tolist()
    return dict(zip(ordered, range(1, len(ordered) + 1)))
24
+
25
def hex_to_rgba(hex_color, alpha=0.25):
    """Convert '#RRGGBB' (leading '#' optional) to a CSS 'rgba(r,g,b,a)' string."""
    digits = hex_color.lstrip('#')
    channels = [int(digits[i:i + 2], 16) for i in (0, 2, 4)]
    return f'rgba({channels[0]},{channels[1]},{channels[2]},{alpha})'
29
+
30
def create_radar_chart(df, tasks, top_n=15):
    """Create radar chart for top N methods showing original AUP scores (independent axes).

    The first `top_n` rows of `df` are plotted (df is expected to be sorted by
    "Avg_AUP" descending). Each axis is min-max normalized over the displayed
    methods purely for the radar shape; hover text reports the raw AUP values.

    Args:
        df: leaderboard DataFrame with "Method", per-task "<task>_AUP" and
            "Avg_AUP" columns.
        tasks: ordered task names.
        top_n: number of leading rows to plot.

    Returns:
        plotly.graph_objects.Figure
    """
    df_top = df.head(top_n).copy()
    model_colors = get_model_colors(df)
    model_ranks = get_model_ranks(df)

    # One axis per task plus a trailing "Avg AUP" axis; '-' becomes a line break
    # so long task names wrap around the polar labels.
    all_cols = [f"{t}_AUP" for t in tasks] + ["Avg_AUP"]
    categories = [t.replace("-", "\n") for t in tasks] + ["Avg\nAUP"]

    # Compute min/max per column for normalization (for radar display only)
    col_stats = {}
    for col in all_cols:
        vals = df_top[col].dropna().astype(float)
        col_stats[col] = {'min': vals.min() if len(vals) > 0 else 0,
                          'max': vals.max() if len(vals) > 0 else 100}

    fig = go.Figure()

    for _, row in df_top.iterrows():
        method = row["Method"]
        rank = model_ranks.get(method, 0)
        color = model_colors.get(method, "#808080")
        display_name = f"#{rank} {method}"

        # Original AUP values for hover display.
        # NOTE(review): `or 0` maps None/0 to 0 but NaN is truthy and passes
        # through — a NaN cell would propagate into the normalized radius;
        # confirm upstream stores None (not NaN) for missing scores.
        original_vals = [row.get(col, 0) or 0 for col in all_cols]

        # Normalized values for radar shape (0-100 scale per axis)
        normalized = []
        for col, val in zip(all_cols, original_vals):
            stats = col_stats[col]
            range_val = stats['max'] - stats['min']
            if range_val > 0:
                norm = ((val - stats['min']) / range_val) * 80 + 10  # Scale to 10-90
            else:
                norm = 50  # degenerate axis (all methods equal): park at mid-radius
            normalized.append(norm)

        # Custom hover text showing original AUP scores
        hover_texts = [f"<b>{display_name}</b><br>{cat}: <b>{val:.1f}</b>"
                       for cat, val in zip(categories, original_vals)]

        # Repeat the first point at the end to close the polygon.
        fig.add_trace(go.Scatterpolar(
            r=normalized + [normalized[0]],
            theta=categories + [categories[0]],
            mode='lines+markers', fill='toself', name=display_name,
            line=dict(color=color, width=2), marker=dict(color=color, size=6),
            fillcolor=hex_to_rgba(color, 0.15), opacity=0.9,
            text=hover_texts + [hover_texts[0]],
            hovertemplate='%{text}<extra></extra>'
        ))

    fig.update_layout(
        height=600, margin=dict(l=100, r=250, t=80, b=60),
        title=dict(text=f"🎯 Top {top_n} Methods: AUP Scores in Radar Chart", x=0.5, font=dict(size=18)),
        # Radial tick labels hidden: per-axis normalization makes them meaningless.
        polar=dict(radialaxis=dict(visible=True, range=[0, 100], tickfont=dict(size=11),
                   tickvals=[], showticklabels=False)),
        legend=dict(font=dict(size=12), x=1.05, y=1, bgcolor='rgba(255,255,255,0.95)',
                    bordercolor='#ddd', borderwidth=1, title=dict(text="Methods (sorted by Avg AUP)", font=dict(size=13))),
        hoverlabel=dict(bgcolor="white", font_size=14, font_family="Arial", bordercolor="#333")
    )
    return fig
93
+
94
def create_group_bar_chart(df, tasks, top_n=15):
    """Create grouped bar chart with Avg AUP included and rank numbers.

    One x-axis group per benchmark (tasks plus "Avg AUP"); one colored bar per
    method, colored/ranked by overall Avg AUP. Missing scores (None or NaN)
    are skipped rather than drawn as zero-height bars.
    """
    df_top = df.head(top_n).copy()
    methods = df_top["Method"].tolist()
    model_colors = get_model_colors(df)
    model_ranks = get_model_ranks(df)

    all_benchmarks = tasks + ["Avg_AUP"]
    fig = go.Figure()

    for method in methods:
        row = df_top[df_top["Method"] == method].iloc[0]
        color = model_colors.get(method, "#808080")
        rank = model_ranks.get(method, 0)
        display_name = f"#{rank} {method}"

        y_vals, x_vals = [], []
        for bench in all_benchmarks:
            aup = row.get("Avg_AUP") if bench == "Avg_AUP" else row.get(f"{bench}_AUP")
            # `aup != aup` is the NaN self-inequality test: filters both None and NaN.
            if aup is not None and not (isinstance(aup, float) and aup != aup):
                y_vals.append(aup)
                x_vals.append("Avg AUP" if bench == "Avg_AUP" else bench)

        if y_vals:  # add a trace only if the method has at least one score
            fig.add_trace(go.Bar(
                name=display_name, x=x_vals, y=y_vals, marker_color=color,
                hovertemplate=f"<b>{display_name}</b><br>%{{x}}: %{{y:.1f}}<extra></extra>"
            ))

    fig.update_layout(
        height=550, margin=dict(l=60, r=250, t=80, b=100),
        title=dict(text=f"📊 Top {top_n} Methods: AUP Scores in Bar Chart", x=0.5, font=dict(size=18)),
        xaxis_title="Benchmark", yaxis_title="AUP Score",
        barmode='group', bargap=0.2, bargroupgap=0.05,
        legend=dict(font=dict(size=11), x=1.02, y=1, bgcolor='rgba(255,255,255,0.95)',
                    bordercolor='#ddd', borderwidth=1, title=dict(text="Methods (sorted by Avg AUP)", font=dict(size=12))),
        hoverlabel=dict(bgcolor="white", font_size=14, font_family="Arial")
    )
    return fig
134
+
135
def create_aup_curve_chart(raw_data, tasks, df, top_n=15):
    """Create 2x3 subplot grid of AUP curves with quadratic fitting (same as plot_lines.py).

    Args:
        raw_data: {task: {method: [(rho, y), ...]}} with rho = TPF
            (tokens per forward) and y = accuracy.
        tasks: ordered task names (one subplot each); the sixth panel is a
            point-wise average across tasks.
        df: leaderboard DataFrame sorted by "Avg_AUP" descending.
        top_n: number of leading methods of `df` to draw.

    Returns:
        plotly.graph_objects.Figure
    """
    df_top = df.head(top_n).copy()
    model_colors = get_model_colors(df)
    model_ranks = get_model_ranks(df)
    methods_to_show = set(df_top["Method"].tolist())

    # Build per-task data: {task: {method: [(rho, y), ...]}}
    task_data = {t: {} for t in tasks}
    for task in tasks:
        for method, pairs in raw_data.get(task, {}).items():
            if method in methods_to_show:
                task_data[task][method] = [(p[0], p[1]) for p in pairs]

    # Compute average data: average TPF and Acc by index across tasks (all tasks have same length)
    avg_data = {}
    for method in methods_to_show:
        task_points = [task_data.get(t, {}).get(method, []) for t in tasks]
        task_points = [p for p in task_points if p]  # filter empty
        if not task_points:
            continue
        n_points = len(task_points[0])
        # NOTE(review): the accuracy average divides by a hard-coded 5 (expected
        # task count) rather than len(task_points), so a method missing a task is
        # penalized; presumably mirrors AVG_AUP_DIVISOR — confirm intentional.
        avg_data[method] = [
            (np.mean([tp[i][0] for tp in task_points]), sum(tp[i][1] for tp in task_points) / 5)
            for i in range(n_points)
        ]

    # 6 subplots: 5 tasks + 1 Average at (2,3)
    titles = tasks + ["Average"]
    fig = make_subplots(rows=2, cols=3, subplot_titles=titles,
                        horizontal_spacing=0.08, vertical_spacing=0.15)

    # Track which methods have been added to legend
    legend_added = set()

    def get_pos(idx):
        # Map a flat subplot index (0-5) to (row, col) in the 2x3 grid.
        if idx < 3:
            return (1, idx + 1)
        return (2, idx - 2)  # idx=3->(2,1), idx=4->(2,2), idx=5->(2,3)

    # Helper to draw curve for a given subplot
    def draw_curve(pairs, method, row, col):
        nonlocal legend_added
        if not pairs:
            return
        color = model_colors.get(method, "#808080")
        rank = model_ranks.get(method, 0)
        display_name = f"#{rank} {method}"
        show_legend = method not in legend_added  # one legend entry per method
        if show_legend:
            legend_added.add(method)

        rho, y = zip(*sorted(pairs, key=lambda x: x[0]))
        rho, y = np.array(rho), np.array(y)

        # Generate smooth curve (quadratic fitting, same as plot_lines.py)
        if len(rho) >= 3:
            z = np.polyfit(rho, y, 2)
            p = np.poly1d(z)
            x_smooth = np.linspace(rho.min(), rho.max(), 300)
            y_smooth = p(x_smooth)
        elif len(rho) == 2:
            # Two points: draw a parabola with its vertex at the first point.
            x_smooth = np.linspace(rho.min(), rho.max(), 300)
            if rho[1] != rho[0]:
                a = (y[1] - y[0]) / ((rho[1] - rho[0]) ** 2)
                y_smooth = a * (x_smooth - rho[0]) ** 2 + y[0]
            else:
                y_smooth = np.linspace(y[0], y[1], 300)
        else:
            # A single point: nothing to fit, plot as-is.
            x_smooth, y_smooth = rho, y

        # Add fitted curve
        fig.add_trace(go.Scatter(
            x=x_smooth, y=y_smooth, mode='lines', name=display_name,
            line=dict(color=color, width=2.5), opacity=0.85,
            showlegend=show_legend, legendgroup=method,
            hoverinfo='skip'
        ), row=row, col=col)

        # Add markers at original data points
        fig.add_trace(go.Scatter(
            x=rho, y=y, mode='markers', name=display_name,
            marker=dict(color='white', size=8, line=dict(color=color, width=2)),
            showlegend=False, legendgroup=method,
            hovertemplate=f"<b>{display_name}</b><br>TPF: %{{x:.2f}}<br>Acc: %{{y:.1f}}<extra></extra>"
        ), row=row, col=col)

    # Draw 5 task subplots
    for idx, task in enumerate(tasks):
        row, col = get_pos(idx)
        data = task_data.get(task, {})
        for method in df_top["Method"].tolist():
            if method in data:
                draw_curve(data[method], method, row, col)

    # Draw Average subplot at (2, 3)
    for method in df_top["Method"].tolist():
        if method in avg_data:
            draw_curve(avg_data[method], method, 2, 3)

    fig.update_layout(
        height=550, margin=dict(l=60, r=250, t=80, b=60),
        title=dict(text=f"📈 Top {top_n} Methods: Accuracy-Parallelism Curves", x=0.5, font=dict(size=18)),
        legend=dict(font=dict(size=11), x=1.02, y=1, bgcolor='rgba(255,255,255,0.95)',
                    bordercolor='#ddd', borderwidth=1, title=dict(text="Methods (sorted by Avg AUP)", font=dict(size=12)),
                    tracegroupgap=1, itemsizing='constant'),
        hoverlabel=dict(bgcolor="white", font_size=14, font_family="Arial")
    )

    # Update axes labels for 6 subplots (x label on bottom row, y label on col 1)
    for idx in range(6):
        row, col = get_pos(idx)
        fig.update_xaxes(title_text="TPF (Tokens per Forward)" if idx >= 3 else "", row=row, col=col)
        fig.update_yaxes(title_text="Acc (%)" if col == 1 else "", row=row, col=col)

    return fig
src/envs.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Environment configuration - not used in current implementation
src/leaderboard/read_evals.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import yaml
3
+ import pandas as pd
4
+ from pathlib import Path
5
+
6
+ # AUP calculation (from d3LLM_Code/aup_utils.py)
7
# AUP calculation (from d3LLM_Code/aup_utils.py)
def weight_function(y: float, y_max: float, alpha: float = 3.0) -> float:
    """Quality-weighting function W(y) = min(exp(-alpha * (1 - y/y_max)), 1).

    Equals 1 when y >= y_max and decays exponentially as y drops below y_max.
    """
    decay = math.exp(-alpha * (1.0 - y / y_max))
    return decay if decay < 1.0 else 1.0

def get_aup(rho: list, y: list, y_max: float, alpha: float = 3.0, y_min_offset: float = 5.0) -> float:
    """Calculate AUP (Accuracy Under Parallelism) score.

    Sorts the (rho, y) curve by rho, drops points whose accuracy falls more
    than `y_min_offset` below the accuracy at the smallest rho, then
    integrates the quality-weighted accuracy over rho with the trapezoid
    rule, seeded with rho_0 * y_0. Returns 0.0 for an empty curve.
    """
    if not rho:
        return 0.0

    points = sorted(zip(rho, y), key=lambda p: p[0])
    base_acc = points[0][1]
    floor = base_acc - y_min_offset
    kept = [p for p in points if p[1] >= floor]
    if not kept:
        # Defensive fallback; unreachable in practice since the first point
        # always satisfies base_acc >= base_acc - y_min_offset.
        return points[0][0] * points[0][1]

    score = kept[0][0] * kept[0][1]
    for (r_prev, a_prev), (r_cur, a_cur) in zip(kept, kept[1:]):
        w_prev = weight_function(a_prev, y_max, alpha)
        w_cur = weight_function(a_cur, y_max, alpha)
        score += 0.5 * (r_cur - r_prev) * (a_cur * w_cur + a_prev * w_prev)
    return score
35
+
36
# Results live in the sibling d3LLM_Code directory, one YAML file per
# foundation-model family.
DATA_DIR = Path(__file__).parent.parent.parent / "d3LLM_Code"
DATA_FILES = ["data_dream.yaml", "data_llada.yaml", "data_dream_coder.yaml"]

# Merge HumanEval-Instruct -> HumanEval, MBPP-Instruct -> MBPP; exclude HumanEval+, MBPP+
TASK_MERGE = {"HumanEval-Instruct": "HumanEval", "MBPP-Instruct": "MBPP"}
TASK_EXCLUDE = {"HumanEval+", "MBPP+"}
TASK_ORDER = ["GSM8K-CoT", "MATH", "MBPP", "HumanEval", "Long-GSM8K"]
# Fixed divisor for the average: methods are averaged over 5 tasks regardless
# of how many they actually report (missing tasks effectively count as 0).
AVG_AUP_DIVISOR = 5

def load_yaml_data():
    """Load YAML files separately, compute y_max per file/task, then merge.

    Returns:
        all_results: {method: {task: (aup, tpf, acc)}} with rounded values.
        all_meta: merged per-method metadata from each file's '_meta' section.
        ordered_tasks: tasks actually present, in TASK_ORDER order.
        raw_data: {task: {method: [(rho, y), ...]}} for curve plotting.
    """
    all_results = {}  # {method: {task: (aup, tpf, acc)}}
    all_meta = {}
    all_tasks = set()
    raw_data = {}  # {task: {method: [(rho, y), ...]}} for curve plotting

    for filename in DATA_FILES:
        filepath = DATA_DIR / filename
        if not filepath.exists():
            continue  # missing data files are skipped silently
        with open(filepath, 'r') as f:
            data = yaml.safe_load(f)

        # '_meta' carries per-method metadata (type/foundation/link), not results.
        meta = data.pop('_meta', {})
        all_meta.update(meta)

        # Compute y_max per task WITHIN this file only (as per main.py)
        file_tasks = {k: v for k, v in data.items() if k not in TASK_EXCLUDE}
        y_max_per_task = {}
        for task, methods in file_tasks.items():
            y_max_per_task[task] = max(y for pairs in methods.values() for _, y in pairs)

        # Calculate AUP for each method/task in this file
        for task, methods in file_tasks.items():
            target_task = TASK_MERGE.get(task, task)
            all_tasks.add(target_task)
            y_max = y_max_per_task[task]

            # Store raw data for curve plotting
            if target_task not in raw_data:
                raw_data[target_task] = {}

            for method, pairs in methods.items():
                if method not in all_results:
                    all_results[method] = {}

                rho_list = [p[0] for p in pairs]
                y_list = [p[1] for p in pairs]
                aup = get_aup(rho_list, y_list, y_max)
                tpf = max(rho_list)  # highest tokens-per-forward measured
                # Accuracy at the highest-rho point (or the single point).
                acc = pairs[0][1] if len(pairs) == 1 else [p[1] for p in pairs if p[0] == max(rho_list)][0]
                # NOTE(review): if the same method/target_task appears in a later
                # file (or collides via TASK_MERGE), the earlier entry is silently
                # overwritten — confirm the YAML files are disjoint per method.
                all_results[method][target_task] = (round(aup, 1), round(tpf, 2), round(acc, 1))
                raw_data[target_task][method] = pairs

    # Return tasks in specified order
    ordered_tasks = [t for t in TASK_ORDER if t in all_tasks]
    return all_results, all_meta, ordered_tasks, raw_data
93
+
94
def compute_leaderboard():
    """Compute leaderboard DataFrame from YAML data.

    Returns:
        (df, tasks, raw_data) where df holds one row per method with metadata,
        per-task AUP/TPF/Acc columns (None when a task is missing), and
        "Avg_AUP" (sum of task AUPs over AVG_AUP_DIVISOR), sorted by
        "Avg_AUP" descending.
    """
    results_dict, meta, tasks, raw_data = load_yaml_data()

    records = []
    for method in sorted(results_dict):
        info = meta.get(method, {})
        record = {
            "Method": method,
            "Type": info.get("type", "?"),
            "Foundation": info.get("foundation", "?"),
            "Link": info.get("link", ""),
        }

        total_aup = 0.0
        per_method = results_dict[method]
        for task in tasks:
            scores = per_method.get(task)
            if scores is None:
                record[f"{task}_AUP"] = record[f"{task}_TPF"] = record[f"{task}_Acc"] = None
            else:
                aup, tpf, acc = scores
                record[f"{task}_AUP"] = aup
                record[f"{task}_TPF"] = tpf
                record[f"{task}_Acc"] = acc
                total_aup += aup

        # Missing tasks contribute 0 to the average (fixed divisor of 5).
        record["Avg_AUP"] = round(total_aup / AVG_AUP_DIVISOR, 1)
        records.append(record)

    df = pd.DataFrame(records).sort_values("Avg_AUP", ascending=False).reset_index(drop=True)
    return df, tasks, raw_data
122
+
123
def get_leaderboard_df():
    """Convenience accessor: the leaderboard DataFrame only."""
    frame, _tasks, _raw = compute_leaderboard()
    return frame

def get_tasks():
    """Convenience accessor: the ordered task list only."""
    _frame, task_list, _raw = compute_leaderboard()
    return task_list

def get_raw_data():
    """Convenience accessor: the raw per-task curve data only."""
    _frame, _tasks, curve_data = compute_leaderboard()
    return curve_data
src/populate.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Population utilities - not used in current implementation
src/submission/check_validity.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Submission validation - not used in current implementation
src/submission/submit.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Submission handling - not used in current implementation