Spaces:
Running
Running
Commit ·
d473371
0
Parent(s):
Initial commit
Browse files- .gitattributes +35 -0
- .gitignore +13 -0
- .pre-commit-config.yaml +53 -0
- Makefile +13 -0
- README.md +48 -0
- app.py +180 -0
- d3LLM_Code/aup_utils.py +63 -0
- d3LLM_Code/data_dream.yaml +136 -0
- d3LLM_Code/data_dream_coder.yaml +52 -0
- d3LLM_Code/data_llada.yaml +117 -0
- pyproject.toml +13 -0
- requirements.txt +4 -0
- src/display/css_html_js.py +224 -0
- src/display/formatting.py +2 -0
- src/display/utils.py +2 -0
- src/display/visualization.py +250 -0
- src/envs.py +1 -0
- src/leaderboard/read_evals.py +133 -0
- src/populate.py +1 -0
- src/submission/check_validity.py +1 -0
- src/submission/submit.py +1 -0
.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
auto_evals/
|
| 2 |
+
venv/
|
| 3 |
+
__pycache__/
|
| 4 |
+
.env
|
| 5 |
+
.ipynb_checkpoints
|
| 6 |
+
*ipynb
|
| 7 |
+
.vscode/
|
| 8 |
+
|
| 9 |
+
eval-queue/
|
| 10 |
+
eval-results/
|
| 11 |
+
eval-queue-bk/
|
| 12 |
+
eval-results-bk/
|
| 13 |
+
logs/
|
.pre-commit-config.yaml
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
default_language_version:
|
| 16 |
+
python: python3
|
| 17 |
+
|
| 18 |
+
ci:
|
| 19 |
+
autofix_prs: true
|
| 20 |
+
autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
|
| 21 |
+
autoupdate_schedule: quarterly
|
| 22 |
+
|
| 23 |
+
repos:
|
| 24 |
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
| 25 |
+
rev: v4.3.0
|
| 26 |
+
hooks:
|
| 27 |
+
- id: check-yaml
|
| 28 |
+
- id: check-case-conflict
|
| 29 |
+
- id: detect-private-key
|
| 30 |
+
- id: check-added-large-files
|
| 31 |
+
args: ['--maxkb=1000']
|
| 32 |
+
- id: requirements-txt-fixer
|
| 33 |
+
- id: end-of-file-fixer
|
| 34 |
+
- id: trailing-whitespace
|
| 35 |
+
|
| 36 |
+
- repo: https://github.com/PyCQA/isort
|
| 37 |
+
rev: 5.12.0
|
| 38 |
+
hooks:
|
| 39 |
+
- id: isort
|
| 40 |
+
name: Format imports
|
| 41 |
+
|
| 42 |
+
- repo: https://github.com/psf/black
|
| 43 |
+
rev: 22.12.0
|
| 44 |
+
hooks:
|
| 45 |
+
- id: black
|
| 46 |
+
name: Format code
|
| 47 |
+
additional_dependencies: ['click==8.0.2']
|
| 48 |
+
|
| 49 |
+
- repo: https://github.com/charliermarsh/ruff-pre-commit
|
| 50 |
+
# Ruff version.
|
| 51 |
+
rev: 'v0.0.267'
|
| 52 |
+
hooks:
|
| 53 |
+
- id: ruff
|
Makefile
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.PHONY: style format
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
style:
|
| 5 |
+
python -m black --line-length 119 .
|
| 6 |
+
python -m isort .
|
| 7 |
+
ruff check --fix .
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
quality:
|
| 11 |
+
python -m black --check --line-length 119 .
|
| 12 |
+
python -m isort --check-only .
|
| 13 |
+
ruff check .
|
README.md
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: dLLM Leaderboard
|
| 3 |
+
emoji: 🏆
|
| 4 |
+
colorFrom: purple
|
| 5 |
+
colorTo: indigo
|
| 6 |
+
sdk: gradio
|
| 7 |
+
app_file: app.py
|
| 8 |
+
pinned: true
|
| 9 |
+
license: apache-2.0
|
| 10 |
+
short_description: A leaderboard of Diffusion Large Language Models (dLLMs)
|
| 11 |
+
sdk_version: 5.43.1
|
| 12 |
+
tags:
|
| 13 |
+
- leaderboard
|
| 14 |
+
---
|
| 15 |
+
|
| 16 |
+
# Start the configuration
|
| 17 |
+
|
| 18 |
+
Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
|
| 19 |
+
|
| 20 |
+
Results files should have the following format and be stored as json files:
|
| 21 |
+
```json
|
| 22 |
+
{
|
| 23 |
+
"config": {
|
| 24 |
+
"model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
|
| 25 |
+
"model_name": "path of the model on the hub: org/model",
|
| 26 |
+
"model_sha": "revision on the hub",
|
| 27 |
+
},
|
| 28 |
+
"results": {
|
| 29 |
+
"task_name": {
|
| 30 |
+
"metric_name": score,
|
| 31 |
+
},
|
| 32 |
+
"task_name2": {
|
| 33 |
+
"metric_name": score,
|
| 34 |
+
}
|
| 35 |
+
}
|
| 36 |
+
}
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
Request files are created automatically by this tool.
|
| 40 |
+
|
| 41 |
+
If you encounter problem on the space, don't hesitate to restart it to remove the create eval-queue, eval-queue-bk, eval-results and eval-results-bk created folder.
|
| 42 |
+
|
| 43 |
+
# Code logic for more complex edits
|
| 44 |
+
|
| 45 |
+
You'll find
|
| 46 |
+
- the main table' columns names and properties in `src/display/utils.py`
|
| 47 |
+
- the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
|
| 48 |
+
- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
|
app.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from src.leaderboard.read_evals import get_leaderboard_df, get_tasks, get_raw_data
|
| 4 |
+
from src.display.visualization import create_radar_chart, create_group_bar_chart, create_aup_curve_chart
|
| 5 |
+
from src.display.css_html_js import custom_css, sort_table_js, get_foundation_class
|
| 6 |
+
|
| 7 |
+
CITATION_HTML = """
|
| 8 |
+
<div style="max-width: 800px; margin: 30px auto 0 auto; padding: 20px; background: #f8f7ff; border-radius: 12px; border-left: 4px solid #5a3d8a;">
|
| 9 |
+
<p style="margin: 0 0 12px 0; color: #5a3d8a; font-weight: 600;">📝 If you find this Leaderboard useful for your research, please star <a href="https://github.com/hao-ai-lab/d3llm" target="_blank" style="color: #5a3d8a;">our GitHub repo</a> and cite our work:</p>
|
| 10 |
+
<pre style="background: #fff; padding: 15px; border-radius: 8px; overflow-x: auto; font-size: 12px; margin: 0; color: #333; white-space: pre-wrap; word-wrap: break-word;">@article{preprint'25:d3llm,
|
| 11 |
+
author = {Yu-Yang Qian and Junda Su and Lanxiang Hu and Peiyuan Zhang and Zhijie Deng and Peng Zhao and Hao Zhang},
|
| 12 |
+
title = {d3LLM: Ultra-Fast Diffusion LLM using Pseudo-Trajectory Distillation},
|
| 13 |
+
journal = {ArXiv preprint},
|
| 14 |
+
volume = {to appear},
|
| 15 |
+
note = {\\url{https://github.com/hao-ai-lab/d3LLM} [Accessed: 2025-12-11]},
|
| 16 |
+
year = {2025}
|
| 17 |
+
}</pre>
|
| 18 |
+
</div>
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
def create_leaderboard_html(df, tasks):
|
| 22 |
+
"""Generate HTML table for detailed results."""
|
| 23 |
+
rows_html = ""
|
| 24 |
+
for rank, (_, row) in enumerate(df.iterrows(), 1):
|
| 25 |
+
medal = f'<span class="top-medal">{["🥇", "🥈", "🥉"][rank-1]}</span>' if rank <= 3 else str(rank)
|
| 26 |
+
|
| 27 |
+
# Method with link
|
| 28 |
+
method = row['Method']
|
| 29 |
+
link = row.get('Link', '')
|
| 30 |
+
method_html = f'<a href="{link}" target="_blank">{method}</a>' if link else method
|
| 31 |
+
|
| 32 |
+
# Type badge
|
| 33 |
+
type_val = row.get('Type', '?')
|
| 34 |
+
type_display = 'dLLM' if type_val == 'dLLM' else type_val
|
| 35 |
+
type_class = 'ar' if type_val == 'AR' else 'dllm'
|
| 36 |
+
|
| 37 |
+
# Foundation badge
|
| 38 |
+
foundation = row.get('Foundation', '?')
|
| 39 |
+
foundation_class = get_foundation_class(foundation)
|
| 40 |
+
|
| 41 |
+
# Build cells for each task
|
| 42 |
+
task_cells = ""
|
| 43 |
+
for task in tasks:
|
| 44 |
+
aup = row.get(f'{task}_AUP')
|
| 45 |
+
tpf = row.get(f'{task}_TPF')
|
| 46 |
+
acc = row.get(f'{task}_Acc')
|
| 47 |
+
if pd.notna(aup):
|
| 48 |
+
task_cells += f'''<td>
|
| 49 |
+
<span class="aup-score">{aup:.1f}</span>
|
| 50 |
+
<span class="sub-metrics">TPF:{tpf:.2f} Acc:{acc:.1f}</span>
|
| 51 |
+
</td>'''
|
| 52 |
+
else:
|
| 53 |
+
task_cells += '<td><span class="aup-score">-</span></td>'
|
| 54 |
+
|
| 55 |
+
# Avg AUP
|
| 56 |
+
avg_aup = row.get('Avg_AUP', 0)
|
| 57 |
+
|
| 58 |
+
rows_html += f'''<tr>
|
| 59 |
+
<td class="rank-cell"><span class="rank-medal">{medal}</span></td>
|
| 60 |
+
<td class="method-cell">{method_html}</td>
|
| 61 |
+
<td class="type-cell"><span class="type-badge {type_class}">{type_display}</span></td>
|
| 62 |
+
<td class="foundation-cell"><span class="foundation-badge {foundation_class}">{foundation}</span></td>
|
| 63 |
+
{task_cells}
|
| 64 |
+
<td class="avg-cell"><span class="aup-score">{avg_aup:.1f}</span></td>
|
| 65 |
+
</tr>'''
|
| 66 |
+
|
| 67 |
+
task_headers = ''.join(f'<th>{t}</th>' for t in tasks)
|
| 68 |
+
|
| 69 |
+
return f'''
|
| 70 |
+
{sort_table_js}
|
| 71 |
+
<div class="table-wrapper">
|
| 72 |
+
<table class="leaderboard-table">
|
| 73 |
+
<thead><tr>
|
| 74 |
+
<th>Rank</th><th>Method</th><th>Type</th><th>Foundation Model</th>
|
| 75 |
+
{task_headers}
|
| 76 |
+
<th>Avg AUP</th>
|
| 77 |
+
</tr></thead>
|
| 78 |
+
<tbody>{rows_html}</tbody>
|
| 79 |
+
</table>
|
| 80 |
+
</div>
|
| 81 |
+
'''
|
| 82 |
+
|
| 83 |
+
def update_charts(top_n):
|
| 84 |
+
df, tasks, raw_data = get_leaderboard_df(), get_tasks(), get_raw_data()
|
| 85 |
+
return create_radar_chart(df, tasks, top_n), create_group_bar_chart(df, tasks, top_n), create_aup_curve_chart(raw_data, tasks, df, top_n)
|
| 86 |
+
|
| 87 |
+
# Load data
|
| 88 |
+
df, tasks, raw_data = get_leaderboard_df(), get_tasks(), get_raw_data()
|
| 89 |
+
default_top_n = min(15, len(df))
|
| 90 |
+
|
| 91 |
+
with gr.Blocks(css=custom_css, title="dLLM Leaderboard", fill_height=False) as demo:
|
| 92 |
+
gr.HTML('''
|
| 93 |
+
<div class="welcome-banner">
|
| 94 |
+
<h2>🫧 Welcome to dLLM Leaderboard! 🏆</h2>
|
| 95 |
+
<p>Benchmarking various Diffusion Large Language Models (dLLMs) with <i><a href="https://hao-ai-lab.github.io/blogs/text-diffusion/" target="_blank" style="color: inherit; text-decoration: underline;">AUP (Accuracy Under Parallelism)</a></i>, considering both accuracy and parallelism.</p>
|
| 96 |
+
</div>
|
| 97 |
+
''')
|
| 98 |
+
|
| 99 |
+
with gr.Tabs():
|
| 100 |
+
with gr.TabItem("📊 Leaderboard"):
|
| 101 |
+
with gr.Row():
|
| 102 |
+
top_n_slider = gr.Slider(minimum=3, maximum=len(df), value=default_top_n, step=1,
|
| 103 |
+
label="Number of Top Methods to Display")
|
| 104 |
+
|
| 105 |
+
with gr.Row():
|
| 106 |
+
radar_plot = gr.Plot(value=create_radar_chart(df, tasks, default_top_n))
|
| 107 |
+
with gr.Row():
|
| 108 |
+
bar_plot = gr.Plot(value=create_group_bar_chart(df, tasks, default_top_n))
|
| 109 |
+
with gr.Row():
|
| 110 |
+
curve_plot = gr.Plot(value=create_aup_curve_chart(raw_data, tasks, df, default_top_n))
|
| 111 |
+
|
| 112 |
+
top_n_slider.change(fn=update_charts, inputs=[top_n_slider], outputs=[radar_plot, bar_plot, curve_plot])
|
| 113 |
+
|
| 114 |
+
gr.Markdown("### 🏆 Detailed Leaderboard")
|
| 115 |
+
gr.HTML(create_leaderboard_html(df, tasks))
|
| 116 |
+
gr.HTML(CITATION_HTML)
|
| 117 |
+
|
| 118 |
+
with gr.TabItem("📤 Submit Result"):
|
| 119 |
+
gr.HTML("""
|
| 120 |
+
<div class="content-wrapper">
|
| 121 |
+
<div style="max-width: 800px; margin: 0 auto; padding: 20px; box-sizing: border-box;">
|
| 122 |
+
<h2>Submit Your Results</h2>
|
| 123 |
+
<p>We welcome contributions to the dLLM Leaderboard! To submit your method's results:</p>
|
| 124 |
+
|
| 125 |
+
<h3>Step 1: Evaluate Your Method</h3>
|
| 126 |
+
<p>Follow the evaluation protocol in the <a href="https://github.com/hao-ai-lab/d3LLM" target="_blank">d3LLM repository</a>.
|
| 127 |
+
Refer to the <code>eval_scripts</code> folder for benchmark evaluation scripts, and <code>AUP_leaderboard</code> folder for AUP calculation utilities.</p>
|
| 128 |
+
|
| 129 |
+
<h3>Step 2: Prepare Your Evaluation Results</h3>
|
| 130 |
+
<p>Add your results to the appropriate YAML file following this format:</p>
|
| 131 |
+
<pre style="background: #f5f5f5; padding: 15px; border-radius: 8px; overflow-x: auto; white-space: pre-wrap; word-wrap: break-word;">_meta:
|
| 132 |
+
YourMethod:
|
| 133 |
+
type: dLLM # or AR
|
| 134 |
+
foundation: YourFoundation
|
| 135 |
+
link: https://link/to/your/method
|
| 136 |
+
|
| 137 |
+
TaskName:
|
| 138 |
+
YourMethod:
|
| 139 |
+
- [rho_1, accuracy_1] # (parallelism, accuracy) pairs
|
| 140 |
+
- [rho_2, accuracy_2]</pre>
|
| 141 |
+
|
| 142 |
+
<h3>Step 3: Submit a Pull Request</h3>
|
| 143 |
+
<ol>
|
| 144 |
+
<li>Fork the repository</li>
|
| 145 |
+
<li>Add your results to the YAML files</li>
|
| 146 |
+
<li>Submit a PR with your method name, description, and evaluation details</li>
|
| 147 |
+
</ol>
|
| 148 |
+
|
| 149 |
+
<p><strong>Questions?</strong> Open an issue on <a href="https://github.com/hao-ai-lab/d3LLM/issues" target="_blank">GitHub</a>.</p>
|
| 150 |
+
</div>
|
| 151 |
+
</div>
|
| 152 |
+
""" + CITATION_HTML)
|
| 153 |
+
|
| 154 |
+
with gr.TabItem("ℹ️ About"):
|
| 155 |
+
gr.HTML("""
|
| 156 |
+
<div class="content-wrapper">
|
| 157 |
+
<div style="max-width: 800px; margin: 0 auto; padding: 20px; box-sizing: border-box;">
|
| 158 |
+
<h2>About dLLM Leaderboard</h2>
|
| 159 |
+
<p>This leaderboard evaluates <strong>Diffusion Large Language Models (dLLMs)</strong> using the <strong>AUP (Accuracy Under Parallelism)</strong> metric.</p>
|
| 160 |
+
|
| 161 |
+
<h3>Metrics</h3>
|
| 162 |
+
<ul>
|
| 163 |
+
<li><strong>AUP</strong>: Primary metric - measures efficiency-accuracy trade-off (higher is better)</li>
|
| 164 |
+
<li><strong>TPF</strong>: Tokens Per Forward - parallelism level achieved</li>
|
| 165 |
+
<li><strong>Acc</strong>: Accuracy at maximum parallelism</li>
|
| 166 |
+
</ul>
|
| 167 |
+
|
| 168 |
+
<h3>Benchmarks</h3>
|
| 169 |
+
<p>GSM8K-CoT, MATH, HumanEval, MBPP, Long-GSM8K</p>
|
| 170 |
+
|
| 171 |
+
<h3>References</h3>
|
| 172 |
+
<p>
|
| 173 |
+
GitHub Code Repo: <a href="https://github.com/hao-ai-lab/d3LLM" target="_blank">https://github.com/hao-ai-lab/d3LLM</a><br>
|
| 174 |
+
Blog: <a href="https://hao-ai-lab.github.io/blogs/text-diffusion/" target="_blank">https://hao-ai-lab.github.io/blogs/text-diffusion/</a>
|
| 175 |
+
</p>
|
| 176 |
+
</div>
|
| 177 |
+
</div>
|
| 178 |
+
""" + CITATION_HTML)
|
| 179 |
+
|
| 180 |
+
demo.launch()
|
d3LLM_Code/aup_utils.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AUP (Accuracy Under Parallelism) measure for parallel decoders
|
| 2 |
+
# See paper for detailed definition and motivation
|
| 3 |
+
import math
|
| 4 |
+
|
| 5 |
+
def weight_function(y: float, y_max: float, alpha: float = 3.0) -> float:
|
| 6 |
+
"""Quality-weighting function W(y) = min(exp(-alpha * (1 - y/y_max)), 1)"""
|
| 7 |
+
return min(math.exp(-alpha * (1 - y / y_max)), 1.0)
|
| 8 |
+
|
| 9 |
+
def get_aup(rho: list[float], y: list[float], y_max: float, alpha: float = 3.0, y_min_offset: float = 5.0, is_print: bool = False) -> float:
|
| 10 |
+
"""
|
| 11 |
+
Calculate the Accuracy Under Parallelism (AUP) of parallelism-accuracy pairs.
|
| 12 |
+
|
| 13 |
+
Args:
|
| 14 |
+
rho: list of parallelism values (TPF, tokens per forward)
|
| 15 |
+
y: list of accuracy values in [0, 100] (percentage)
|
| 16 |
+
y_max: maximum accuracy across all methods (for normalization)
|
| 17 |
+
alpha: penalty factor for accuracy degradation (default: 3.0)
|
| 18 |
+
y_min_offset: minimum accuracy threshold offset (default: 5.0, i.e., 5%)
|
| 19 |
+
|
| 20 |
+
Returns:
|
| 21 |
+
AUP score (scalar value)
|
| 22 |
+
"""
|
| 23 |
+
assert len(rho) == len(y), "rho and y must have the same length"
|
| 24 |
+
assert len(rho) > 0, "rho and y must not be empty"
|
| 25 |
+
assert all(r > 0 for r in rho), "all rho must be positive"
|
| 26 |
+
|
| 27 |
+
# Check if y values are in [0, 100] range
|
| 28 |
+
if any(acc < 1.0 for acc in y):
|
| 29 |
+
print("\033[91mWarning: Detected accuracy values < 1.0. Please check if accuracy should be in percentage (0-100) instead of (0-1).\033[0m")
|
| 30 |
+
|
| 31 |
+
# Sort by rho
|
| 32 |
+
sorted_pairs = sorted(zip(rho, y), key=lambda x: x[0])
|
| 33 |
+
sorted_rho, sorted_y = zip(*sorted_pairs)
|
| 34 |
+
sorted_rho, sorted_y = list(sorted_rho), list(sorted_y)
|
| 35 |
+
|
| 36 |
+
# Filter by y_min threshold (y_1 - y_min_offset)
|
| 37 |
+
y_1 = sorted_y[0]
|
| 38 |
+
assert y_1 - sorted_y[-1] <= y_min_offset, f"Accuracy degradation is too large: minimum accuracy should be at least {y_min_offset:.2f} lower than the maximum accuracy. Max Acc: {y_1}, min Acc: {sorted_y[-1]}"
|
| 39 |
+
y_min = y_1 - y_min_offset
|
| 40 |
+
filtered_pairs = [(r, acc) for r, acc in zip(sorted_rho, sorted_y) if acc >= y_min]
|
| 41 |
+
assert len(filtered_pairs) > 0, f"No valid pairs after filtering with y_min={y_min}"
|
| 42 |
+
|
| 43 |
+
filtered_rho, filtered_y = zip(*filtered_pairs)
|
| 44 |
+
filtered_rho, filtered_y = list(filtered_rho), list(filtered_y)
|
| 45 |
+
|
| 46 |
+
# Calculate AUP: first term + trapezoidal sum
|
| 47 |
+
aup = filtered_rho[0] * filtered_y[0]
|
| 48 |
+
formula_parts = [f"{filtered_rho[0]:.2f} * {filtered_y[0]:.2f}"]
|
| 49 |
+
|
| 50 |
+
for i in range(1, len(filtered_rho)):
|
| 51 |
+
y_i = filtered_y[i]
|
| 52 |
+
y_prev = filtered_y[i-1]
|
| 53 |
+
w_i = weight_function(y_i, y_max, alpha)
|
| 54 |
+
w_prev = weight_function(y_prev, y_max, alpha)
|
| 55 |
+
term = 0.5 * (filtered_rho[i] - filtered_rho[i-1]) * (y_i * w_i + y_prev * w_prev)
|
| 56 |
+
aup += term
|
| 57 |
+
formula_parts.append(f"({filtered_rho[i]:.2f}-{filtered_rho[i-1]:.2f}) * ({y_i:.2f} * {w_i:.4f} + {y_prev:.2f} * {w_prev:.4f})")
|
| 58 |
+
|
| 59 |
+
if is_print:
|
| 60 |
+
formula = f" AUP = " + " + ".join(formula_parts) + f" = {aup:.2f}"
|
| 61 |
+
print(formula)
|
| 62 |
+
|
| 63 |
+
return aup
|
d3LLM_Code/data_dream.yaml
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AUP Benchmark Data
|
| 2 |
+
# Format: task -> method -> list of (rho, accuracy) pairs
|
| 3 |
+
# rho: parallelism (tokens per forward)
|
| 4 |
+
# accuracy: model accuracy (0-1 scale)
|
| 5 |
+
|
| 6 |
+
# Model metadata: type (AR/dLLM), foundation model, link
|
| 7 |
+
_meta:
|
| 8 |
+
Qwen-2.5-7B-it:
|
| 9 |
+
type: AR
|
| 10 |
+
foundation: Qwen-2.5-7B-it
|
| 11 |
+
link: https://huggingface.co/Qwen/Qwen2.5-7B-Instruct
|
| 12 |
+
EAGLE-3:
|
| 13 |
+
type: AR
|
| 14 |
+
foundation: Llama-3.1-8B-it
|
| 15 |
+
link: https://github.com/SafeAILab/EAGLE
|
| 16 |
+
Dream:
|
| 17 |
+
type: dLLM
|
| 18 |
+
foundation: Dream-v0-it-7B
|
| 19 |
+
link: https://github.com/DreamLM/Dream
|
| 20 |
+
Fast-dLLM-Dream:
|
| 21 |
+
type: dLLM
|
| 22 |
+
foundation: Dream-v0-it-7B
|
| 23 |
+
link: https://github.com/NVlabs/Fast-dLLM
|
| 24 |
+
Fast-dLLM-v2:
|
| 25 |
+
type: dLLM
|
| 26 |
+
foundation: Qwen-2.5-7B-it
|
| 27 |
+
link: https://github.com/NVlabs/Fast-dLLM/tree/main/v2
|
| 28 |
+
dParallel-Dream:
|
| 29 |
+
type: dLLM
|
| 30 |
+
foundation: Dream-v0-it-7B
|
| 31 |
+
link: https://github.com/czg1225/dParallel
|
| 32 |
+
d3LLM-Dream:
|
| 33 |
+
type: dLLM
|
| 34 |
+
foundation: Dream-v0-it-7B
|
| 35 |
+
link: https://github.com/hao-ai-lab/d3llm
|
| 36 |
+
|
| 37 |
+
GSM8K-CoT:
|
| 38 |
+
Qwen-2.5-7B-it:
|
| 39 |
+
- [1.0, 74.1]
|
| 40 |
+
EAGLE-3:
|
| 41 |
+
- [1.0, 76.57]
|
| 42 |
+
- [5.12, 76.57]
|
| 43 |
+
Dream:
|
| 44 |
+
- [1.0, 83.94]
|
| 45 |
+
Fast-dLLM-Dream:
|
| 46 |
+
- [1.0, 83.68]
|
| 47 |
+
- [1.44, 79.0]
|
| 48 |
+
Fast-dLLM-v2:
|
| 49 |
+
- [1.0, 82.82]
|
| 50 |
+
- [2.21, 81.48]
|
| 51 |
+
dParallel-Dream:
|
| 52 |
+
- [1.0, 83.8]
|
| 53 |
+
- [3.02, 82.12]
|
| 54 |
+
d3LLM-Dream:
|
| 55 |
+
- [1.0, 83.47]
|
| 56 |
+
- [4.94, 81.36]
|
| 57 |
+
MATH:
|
| 58 |
+
Qwen-2.5-7B-it:
|
| 59 |
+
- [1.0, 41.15]
|
| 60 |
+
EAGLE-3:
|
| 61 |
+
- [1.0, 39.80]
|
| 62 |
+
- [5.72, 39.80]
|
| 63 |
+
Dream:
|
| 64 |
+
- [1.0, 39.63]
|
| 65 |
+
Fast-dLLM-Dream:
|
| 66 |
+
- [1.0, 39.53]
|
| 67 |
+
- [1.78, 38.3]
|
| 68 |
+
Fast-dLLM-v2:
|
| 69 |
+
- [1.0, 49.92]
|
| 70 |
+
- [2.61, 48.74]
|
| 71 |
+
dParallel-Dream:
|
| 72 |
+
- [1.0, 39.06]
|
| 73 |
+
- [2.94, 38.72]
|
| 74 |
+
d3LLM-Dream:
|
| 75 |
+
- [1.0, 39.38]
|
| 76 |
+
- [3.92, 38.21]
|
| 77 |
+
MBPP-Instruct:
|
| 78 |
+
Qwen-2.5-7B-it:
|
| 79 |
+
- [1.0, 63.8]
|
| 80 |
+
EAGLE-3:
|
| 81 |
+
- [1.0, 60.20]
|
| 82 |
+
- [5.69, 60.20]
|
| 83 |
+
Dream:
|
| 84 |
+
- [1.0, 57.2]
|
| 85 |
+
Fast-dLLM-Dream:
|
| 86 |
+
- [1.0, 56.38]
|
| 87 |
+
- [1.2, 53.2]
|
| 88 |
+
Fast-dLLM-v2:
|
| 89 |
+
- [1.0, 61.23]
|
| 90 |
+
- [2.04, 59.12]
|
| 91 |
+
dParallel-Dream:
|
| 92 |
+
- [1.0, 57.8]
|
| 93 |
+
- [2.24, 55.4]
|
| 94 |
+
d3LLM-Dream:
|
| 95 |
+
- [1.0, 58.8]
|
| 96 |
+
- [2.96, 55.60]
|
| 97 |
+
HumanEval-Instruct:
|
| 98 |
+
Qwen-2.5-7B-it:
|
| 99 |
+
- [1.0, 72.25]
|
| 100 |
+
EAGLE-3:
|
| 101 |
+
- [1.0, 67.58]
|
| 102 |
+
- [5.98, 67.58]
|
| 103 |
+
Dream:
|
| 104 |
+
- [1.0, 55.2]
|
| 105 |
+
Fast-dLLM-Dream:
|
| 106 |
+
- [1.0, 54.86]
|
| 107 |
+
- [1.33, 54.27]
|
| 108 |
+
Fast-dLLM-v2:
|
| 109 |
+
- [1.0, 63.2]
|
| 110 |
+
- [2.58, 61.7]
|
| 111 |
+
dParallel-Dream:
|
| 112 |
+
- [1.0, 56.08]
|
| 113 |
+
- [2.57, 54.27]
|
| 114 |
+
d3LLM-Dream:
|
| 115 |
+
- [1.0, 58.86]
|
| 116 |
+
- [3.20, 57.10]
|
| 117 |
+
Long-GSM8K:
|
| 118 |
+
Qwen-2.5-7B-it:
|
| 119 |
+
- [1.0, 82.56]
|
| 120 |
+
EAGLE-3:
|
| 121 |
+
- [1.0, 80.52]
|
| 122 |
+
- [5.57, 80.52]
|
| 123 |
+
Dream:
|
| 124 |
+
- [1.0, 78.95]
|
| 125 |
+
Fast-dLLM-Dream:
|
| 126 |
+
- [1.0, 78.83]
|
| 127 |
+
- [1.79, 76.57]
|
| 128 |
+
Fast-dLLM-v2:
|
| 129 |
+
- [1.0, 82.34]
|
| 130 |
+
- [2.58, 80.97]
|
| 131 |
+
dParallel-Dream:
|
| 132 |
+
- [1.0, 81.27]
|
| 133 |
+
- [3.49, 78.56]
|
| 134 |
+
d3LLM-Dream:
|
| 135 |
+
- [1.0, 81.2]
|
| 136 |
+
- [4.80, 77.18]
|
d3LLM_Code/data_dream_coder.yaml
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AUP Benchmark Data
|
| 2 |
+
# Format: task -> method -> list of (rho, accuracy) pairs
|
| 3 |
+
# rho: parallelism (tokens per forward)
|
| 4 |
+
# accuracy: model accuracy (0-1 scale)
|
| 5 |
+
|
| 6 |
+
# Model metadata: type (AR/dLLM), foundation model, link
|
| 7 |
+
_meta:
|
| 8 |
+
Qwen2.5-Coder-7B-it:
|
| 9 |
+
type: AR
|
| 10 |
+
foundation: Qwen2.5-Coder-7B-it
|
| 11 |
+
link: https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct
|
| 12 |
+
Dream-Coder-7B:
|
| 13 |
+
type: dLLM
|
| 14 |
+
foundation: Dream-Coder-v0-it-7B
|
| 15 |
+
link: https://github.com/DreamLM/Dream-Coder
|
| 16 |
+
d3LLM-Coder-7B:
|
| 17 |
+
type: dLLM
|
| 18 |
+
foundation: Dream-Coder-v0-it-7B
|
| 19 |
+
link: https://github.com/hao-ai-lab/d3llm
|
| 20 |
+
|
| 21 |
+
HumanEval:
|
| 22 |
+
Qwen2.5-Coder-7B-it:
|
| 23 |
+
- [1.0, 86.6]
|
| 24 |
+
Dream-Coder-7B:
|
| 25 |
+
- [1.0, 82.9]
|
| 26 |
+
d3LLM-Coder-7B:
|
| 27 |
+
- [1.0, 82.4]
|
| 28 |
+
- [2.88, 79.7]
|
| 29 |
+
HumanEval+:
|
| 30 |
+
Qwen2.5-Coder-7B-it:
|
| 31 |
+
- [1.0, 82.3]
|
| 32 |
+
Dream-Coder-7B:
|
| 33 |
+
- [1.0, 76.8]
|
| 34 |
+
d3LLM-Coder-7B:
|
| 35 |
+
- [1.0, 74.4]
|
| 36 |
+
- [2.88, 71.3]
|
| 37 |
+
MBPP:
|
| 38 |
+
Qwen2.5-Coder-7B-it:
|
| 39 |
+
- [1.0, 83.5]
|
| 40 |
+
Dream-Coder-7B:
|
| 41 |
+
- [1.0, 79.9]
|
| 42 |
+
d3LLM-Coder-7B:
|
| 43 |
+
- [1.0, 80.10]
|
| 44 |
+
- [2.5, 80.00]
|
| 45 |
+
MBPP+:
|
| 46 |
+
Qwen2.5-Coder-7B-it:
|
| 47 |
+
- [1.0, 70.1]
|
| 48 |
+
Dream-Coder-7B:
|
| 49 |
+
- [1.0, 68.8]
|
| 50 |
+
d3LLM-Coder-7B:
|
| 51 |
+
- [1.0, 69.6]
|
| 52 |
+
- [2.5, 69.3]
|
d3LLM_Code/data_llada.yaml
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AUP Benchmark Data
|
| 2 |
+
# Format: task -> method -> list of (rho, accuracy) pairs
|
| 3 |
+
# rho: parallelism (tokens per forward)
|
| 4 |
+
# accuracy: model accuracy (0-1 scale)
|
| 5 |
+
|
| 6 |
+
# Model metadata: type (AR/dLLM), foundation model, link
|
| 7 |
+
_meta:
|
| 8 |
+
Qwen-2.5-7B-it:
|
| 9 |
+
type: AR
|
| 10 |
+
foundation: Qwen-2.5-7B-it
|
| 11 |
+
link: https://huggingface.co/Qwen/Qwen2.5-7B-Instruct
|
| 12 |
+
LLaDA:
|
| 13 |
+
type: dLLM
|
| 14 |
+
foundation: LLaDA-8B-it
|
| 15 |
+
link: https://github.com/ML-GSAI/LLaDA
|
| 16 |
+
Fast-dLLM-LLaDA:
|
| 17 |
+
type: dLLM
|
| 18 |
+
foundation: LLaDA-8B-it
|
| 19 |
+
link: https://github.com/NVlabs/Fast-dLLM
|
| 20 |
+
D2F-LLaDA:
|
| 21 |
+
type: dLLM
|
| 22 |
+
foundation: LLaDA-8B-it
|
| 23 |
+
link: https://github.com/zhijie-group/Discrete-Diffusion-Forcing
|
| 24 |
+
dParallel-LLaDA:
|
| 25 |
+
type: dLLM
|
| 26 |
+
foundation: LLaDA-8B-it
|
| 27 |
+
link: https://github.com/czg1225/dParallel
|
| 28 |
+
d3LLM-LLaDA:
|
| 29 |
+
type: dLLM
|
| 30 |
+
foundation: LLaDA-8B-it
|
| 31 |
+
link: https://github.com/hao-ai-lab/d3llm
|
| 32 |
+
|
| 33 |
+
GSM8K-CoT:
|
| 34 |
+
Qwen-2.5-7B-it:
|
| 35 |
+
- [1.0, 74.1]
|
| 36 |
+
LLaDA:
|
| 37 |
+
- [1.0, 72.55]
|
| 38 |
+
Fast-dLLM-LLaDA:
|
| 39 |
+
- [1.0, 74.79]
|
| 40 |
+
- [2.77, 74.68]
|
| 41 |
+
D2F-LLaDA:
|
| 42 |
+
- [1.0, 74.98]
|
| 43 |
+
- [2.88, 74.39]
|
| 44 |
+
dParallel-LLaDA:
|
| 45 |
+
- [1.0, 74.0]
|
| 46 |
+
- [5.14, 72.63]
|
| 47 |
+
d3LLM-LLaDA:
|
| 48 |
+
- [1.0, 74.02]
|
| 49 |
+
- [9.11, 73.09]
|
| 50 |
+
MATH:
|
| 51 |
+
Qwen-2.5-7B-it:
|
| 52 |
+
- [1.0, 41.15]
|
| 53 |
+
LLaDA:
|
| 54 |
+
- [1.0, 32.2]
|
| 55 |
+
Fast-dLLM-LLaDA:
|
| 56 |
+
- [1.0, 32.1]
|
| 57 |
+
- [1.97, 30.82]
|
| 58 |
+
D2F-LLaDA:
|
| 59 |
+
- [1.0, 29.1]
|
| 60 |
+
- [2.66, 28.94]
|
| 61 |
+
dParallel-LLaDA:
|
| 62 |
+
- [1.0, 32.0]
|
| 63 |
+
- [3.17, 30.18]
|
| 64 |
+
d3LLM-LLaDA:
|
| 65 |
+
- [1.0, 32.76]
|
| 66 |
+
- [5.74, 30.36]
|
| 67 |
+
MBPP:
|
| 68 |
+
Qwen-2.5-7B-it:
|
| 69 |
+
- [1.0, 63.6]
|
| 70 |
+
LLaDA:
|
| 71 |
+
- [1.0, 41.72]
|
| 72 |
+
Fast-dLLM-LLaDA:
|
| 73 |
+
- [1.0, 41.58]
|
| 74 |
+
- [2.13, 38.6]
|
| 75 |
+
D2F-LLaDA:
|
| 76 |
+
- [1.0, 39.10]
|
| 77 |
+
- [2.13, 39.00]
|
| 78 |
+
dParallel-LLaDA:
|
| 79 |
+
- [1.0, 41.62]
|
| 80 |
+
- [2.35, 40.0]
|
| 81 |
+
d3LLM-LLaDA:
|
| 82 |
+
- [1.0, 42.0]
|
| 83 |
+
- [4.21, 40.60]
|
| 84 |
+
HumanEval:
|
| 85 |
+
Qwen-2.5-7B-it:
|
| 86 |
+
- [1.0, 67.73]
|
| 87 |
+
LLaDA:
|
| 88 |
+
- [1.0, 38.28]
|
| 89 |
+
Fast-dLLM-LLaDA:
|
| 90 |
+
- [1.0, 38.16]
|
| 91 |
+
- [2.56, 37.8]
|
| 92 |
+
D2F-LLaDA:
|
| 93 |
+
- [1.0, 41.02]
|
| 94 |
+
- [2.69, 40.64]
|
| 95 |
+
dParallel-LLaDA:
|
| 96 |
+
- [1.0, 39.68]
|
| 97 |
+
- [4.93, 39.02]
|
| 98 |
+
d3LLM-LLaDA:
|
| 99 |
+
- [1.0, 39.8]
|
| 100 |
+
- [5.95, 39.63]
|
| 101 |
+
Long-GSM8K:
|
| 102 |
+
Qwen-2.5-7B-it:
|
| 103 |
+
- [1.0, 82.56]
|
| 104 |
+
LLaDA:
|
| 105 |
+
- [1.0, 78.58]
|
| 106 |
+
Fast-dLLM-LLaDA:
|
| 107 |
+
- [1.0, 78.45]
|
| 108 |
+
- [2.45, 78.01]
|
| 109 |
+
D2F-LLaDA:
|
| 110 |
+
- [1.0, 76.00]
|
| 111 |
+
- [2.7, 75.66]
|
| 112 |
+
dParallel-LLaDA:
|
| 113 |
+
- [1.0, 79.15]
|
| 114 |
+
- [4.49, 76.65]
|
| 115 |
+
d3LLM-LLaDA:
|
| 116 |
+
- [1.0, 78.32]
|
| 117 |
+
- [6.95, 74.22]
|
pyproject.toml
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[tool.ruff]
|
| 2 |
+
# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
|
| 3 |
+
select = ["E", "F"]
|
| 4 |
+
ignore = ["E501"] # line too long (black is taking care of this)
|
| 5 |
+
line-length = 119
|
| 6 |
+
fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
|
| 7 |
+
|
| 8 |
+
[tool.isort]
|
| 9 |
+
profile = "black"
|
| 10 |
+
line_length = 119
|
| 11 |
+
|
| 12 |
+
[tool.black]
|
| 13 |
+
line-length = 119
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio
|
| 2 |
+
pandas
|
| 3 |
+
plotly
|
| 4 |
+
pyyaml
|
src/display/css_html_js.py
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
custom_css = """
|
| 2 |
+
/* Reset any Gradio overflow restrictions */
|
| 3 |
+
* { box-sizing: border-box; }
|
| 4 |
+
|
| 5 |
+
.markdown-text { font-size: 16px !important; }
|
| 6 |
+
|
| 7 |
+
.welcome-banner {
|
| 8 |
+
background: linear-gradient(135deg, #a8b5f7 0%, #c9a8f7 100%);
|
| 9 |
+
color: #333;
|
| 10 |
+
padding: 25px;
|
| 11 |
+
border-radius: 12px;
|
| 12 |
+
margin: 20px 10px;
|
| 13 |
+
text-align: center;
|
| 14 |
+
box-shadow: 0 4px 15px rgba(168, 181, 247, 0.3);
|
| 15 |
+
}
|
| 16 |
+
.welcome-banner h2 { margin: 0 0 10px 0; font-size: 1.5em; color: #333; }
|
| 17 |
+
.welcome-banner p { margin: 0; font-size: 1.1em; color: #444; }
|
| 18 |
+
|
| 19 |
+
/* Modern, clean leaderboard table - no border */
|
| 20 |
+
.leaderboard-table {
|
| 21 |
+
width: 100%;
|
| 22 |
+
min-width: 30px;
|
| 23 |
+
border-collapse: collapse;
|
| 24 |
+
background: #fff;
|
| 25 |
+
border-radius: 12px;
|
| 26 |
+
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
|
| 27 |
+
}
|
| 28 |
+
.leaderboard-table thead {
|
| 29 |
+
background: linear-gradient(135deg, #a8b5f7 0%, #c9a8f7 100%);
|
| 30 |
+
}
|
| 31 |
+
.leaderboard-table th {
|
| 32 |
+
padding: 16px 10px;
|
| 33 |
+
text-align: center;
|
| 34 |
+
font-weight: 700;
|
| 35 |
+
font-size: 14px;
|
| 36 |
+
cursor: pointer;
|
| 37 |
+
user-select: none;
|
| 38 |
+
border: none;
|
| 39 |
+
color: #4a3a6e;
|
| 40 |
+
}
|
| 41 |
+
.leaderboard-table th:hover { background: rgba(255,255,255,0.2); }
|
| 42 |
+
.leaderboard-table th.sortable::after { content: ' ↕'; opacity: 0.5; font-size: 11px; }
|
| 43 |
+
.leaderboard-table th.sort-asc::after { content: ' ↑'; opacity: 1; }
|
| 44 |
+
.leaderboard-table th.sort-desc::after { content: ' ↓'; opacity: 1; }
|
| 45 |
+
|
| 46 |
+
.leaderboard-table td {
|
| 47 |
+
padding: 12px 8px;
|
| 48 |
+
text-align: center;
|
| 49 |
+
font-size: 12px;
|
| 50 |
+
color: #333;
|
| 51 |
+
border-bottom: 1px solid #f0f0f0;
|
| 52 |
+
border-left: none;
|
| 53 |
+
border-right: none;
|
| 54 |
+
}
|
| 55 |
+
.leaderboard-table tbody tr:hover { background-color: #fafafa; }
|
| 56 |
+
.leaderboard-table tbody tr:last-child td { border-bottom: none; }
|
| 57 |
+
|
| 58 |
+
.aup-score { font-size: 17px; font-weight: 700; color: #333; display: block; }
|
| 59 |
+
.sub-metrics { font-size: 9px; color: #999; display: block; margin-top: 2px; }
|
| 60 |
+
|
| 61 |
+
.method-cell {
|
| 62 |
+
text-align: center !important;
|
| 63 |
+
font-weight: 600;
|
| 64 |
+
min-width: 100px;
|
| 65 |
+
}
|
| 66 |
+
.method-cell a {
|
| 67 |
+
color: #5a3d8a;
|
| 68 |
+
text-decoration: none;
|
| 69 |
+
transition: color 0.2s;
|
| 70 |
+
font-size: 15px;
|
| 71 |
+
}
|
| 72 |
+
.method-cell a:hover { color: #3d2760; text-decoration: underline; }
|
| 73 |
+
|
| 74 |
+
.rank-cell { font-size: 16px; }
|
| 75 |
+
.type-cell { min-width: 45px; font-size: 9px; }
|
| 76 |
+
.foundation-cell { min-width: 60px; font-size: 9px; }
|
| 77 |
+
.avg-cell { background-color: #f8f7ff; }
|
| 78 |
+
.avg-cell .aup-score { font-size: 18px; color: #5a3d8a; font-weight: 700; }
|
| 79 |
+
|
| 80 |
+
/* Type badges - rounded pill style */
|
| 81 |
+
.type-badge {
|
| 82 |
+
display: inline-block;
|
| 83 |
+
padding: 2px 6px;
|
| 84 |
+
border-radius: 10px;
|
| 85 |
+
font-size: 8px;
|
| 86 |
+
font-weight: 600;
|
| 87 |
+
}
|
| 88 |
+
.type-badge.ar { background-color: #B93413; color: #fff; }
|
| 89 |
+
.type-badge.dllm { background-color: #193D3A; color: #fff; }
|
| 90 |
+
|
| 91 |
+
/* Foundation badges - low saturation colors, pill style */
|
| 92 |
+
.foundation-badge {
|
| 93 |
+
display: inline-block;
|
| 94 |
+
padding: 2px 6px;
|
| 95 |
+
border-radius: 10px;
|
| 96 |
+
font-size: 8px;
|
| 97 |
+
font-weight: 500;
|
| 98 |
+
}
|
| 99 |
+
.foundation-badge.f0 { background-color: #e8e4f0; color: #5a4a6e; }
|
| 100 |
+
.foundation-badge.f1 { background-color: #e4ecf0; color: #4a5a6e; }
|
| 101 |
+
.foundation-badge.f2 { background-color: #e4f0e8; color: #4a6e5a; }
|
| 102 |
+
.foundation-badge.f3 { background-color: #f0e8e4; color: #6e5a4a; }
|
| 103 |
+
.foundation-badge.f4 { background-color: #f0e4ec; color: #6e4a5a; }
|
| 104 |
+
.foundation-badge.f5 { background-color: #ecf0e4; color: #5a6e4a; }
|
| 105 |
+
.foundation-badge.f6 { background-color: #e4e8f0; color: #4a5a6e; }
|
| 106 |
+
.foundation-badge.f7 { background-color: #f0ece4; color: #6e5a4a; }
|
| 107 |
+
.foundation-badge.f8 { background-color: #e8f0ec; color: #4a6e5e; }
|
| 108 |
+
.foundation-badge.f9 { background-color: #f0e4e4; color: #6e4a4a; }
|
| 109 |
+
|
| 110 |
+
.rank-medal { font-size: 16px; }
|
| 111 |
+
.rank-medal .top-medal { font-size: 24px; }
|
| 112 |
+
.tab-buttons button { font-size: 16px; }
|
| 113 |
+
.tip-text { font-size: 12px; color: #888; font-style: italic; margin: 8px 0 15px 0; }
|
| 114 |
+
|
| 115 |
+
/* Container and layout - force overflow to work */
|
| 116 |
+
html, body { overflow-x: auto !important; }
|
| 117 |
+
.gradio-container {
|
| 118 |
+
max-width: 1400px !important;
|
| 119 |
+
margin: auto !important;
|
| 120 |
+
padding: 0 5px !important;
|
| 121 |
+
}
|
| 122 |
+
.tabs, .tabitem, .tabitem > div, #component-0, .contain, .block, .wrap, .prose {
|
| 123 |
+
width: 100% !important;
|
| 124 |
+
height: auto !important;
|
| 125 |
+
min-height: auto !important;
|
| 126 |
+
max-height: none !important;
|
| 127 |
+
}
|
| 128 |
+
.tabitem[style*="display: none"] { display: none !important; }
|
| 129 |
+
|
| 130 |
+
/* Table wrapper - critical for horizontal scroll */
|
| 131 |
+
.table-wrapper {
|
| 132 |
+
width: 100%;
|
| 133 |
+
overflow-x: scroll !important;
|
| 134 |
+
overflow-y: visible;
|
| 135 |
+
-webkit-overflow-scrolling: touch;
|
| 136 |
+
display: block;
|
| 137 |
+
padding-bottom: 15px;
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
+
/* Content wrapper */
|
| 141 |
+
.content-wrapper {
|
| 142 |
+
width: 100%;
|
| 143 |
+
max-width: 100%;
|
| 144 |
+
overflow-x: auto;
|
| 145 |
+
-webkit-overflow-scrolling: touch;
|
| 146 |
+
box-sizing: border-box;
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
/* Responsive font sizes for smaller screens */
|
| 150 |
+
@media (max-width: 1000px) {
|
| 151 |
+
.welcome-banner { padding: 20px 15px; margin: 15px 5px; }
|
| 152 |
+
.welcome-banner h2 { font-size: 1.3em; }
|
| 153 |
+
.welcome-banner p { font-size: 1em; }
|
| 154 |
+
}
|
| 155 |
+
|
| 156 |
+
@media (max-width: 600px) {
|
| 157 |
+
.welcome-banner { padding: 15px 10px; margin: 10px 5px; }
|
| 158 |
+
.welcome-banner h2 { font-size: 1.1em; }
|
| 159 |
+
.welcome-banner p { font-size: 0.9em; }
|
| 160 |
+
}
|
| 161 |
+
"""
|
| 162 |
+
|
| 163 |
+
sort_table_js = """
|
| 164 |
+
<script>
|
| 165 |
+
(function() {
|
| 166 |
+
function initSort() {
|
| 167 |
+
const table = document.querySelector('.leaderboard-table');
|
| 168 |
+
if (!table) { setTimeout(initSort, 100); return; }
|
| 169 |
+
|
| 170 |
+
const headers = table.querySelectorAll('th');
|
| 171 |
+
let currentSort = { col: -1, dir: 'desc' };
|
| 172 |
+
|
| 173 |
+
headers.forEach((th, idx) => {
|
| 174 |
+
if (idx < 4) return;
|
| 175 |
+
th.classList.add('sortable');
|
| 176 |
+
th.onclick = function() { sortTable(idx); };
|
| 177 |
+
});
|
| 178 |
+
|
| 179 |
+
function sortTable(colIdx) {
|
| 180 |
+
const tbody = table.querySelector('tbody');
|
| 181 |
+
const rows = Array.from(tbody.querySelectorAll('tr'));
|
| 182 |
+
const dir = (currentSort.col === colIdx && currentSort.dir === 'desc') ? 'asc' : 'desc';
|
| 183 |
+
currentSort = { col: colIdx, dir };
|
| 184 |
+
|
| 185 |
+
headers.forEach((h, i) => {
|
| 186 |
+
h.classList.remove('sort-asc', 'sort-desc');
|
| 187 |
+
if (i === colIdx) h.classList.add('sort-' + dir);
|
| 188 |
+
});
|
| 189 |
+
|
| 190 |
+
rows.sort((a, b) => {
|
| 191 |
+
const aEl = a.cells[colIdx].querySelector('.aup-score');
|
| 192 |
+
const bEl = b.cells[colIdx].querySelector('.aup-score');
|
| 193 |
+
const aVal = parseFloat(aEl ? aEl.textContent : '0') || 0;
|
| 194 |
+
const bVal = parseFloat(bEl ? bEl.textContent : '0') || 0;
|
| 195 |
+
return dir === 'desc' ? bVal - aVal : aVal - bVal;
|
| 196 |
+
});
|
| 197 |
+
|
| 198 |
+
rows.forEach((row, i) => {
|
| 199 |
+
const rankCell = row.cells[0];
|
| 200 |
+
const medal = i < 3 ? ['🥇', '🥈', '🥉'][i] : (i + 1);
|
| 201 |
+
rankCell.innerHTML = '<span class="rank-medal">' + medal + '</span>';
|
| 202 |
+
tbody.appendChild(row);
|
| 203 |
+
});
|
| 204 |
+
}
|
| 205 |
+
}
|
| 206 |
+
if (document.readyState === 'loading') {
|
| 207 |
+
document.addEventListener('DOMContentLoaded', initSort);
|
| 208 |
+
} else {
|
| 209 |
+
initSort();
|
| 210 |
+
}
|
| 211 |
+
})();
|
| 212 |
+
</script>
|
| 213 |
+
"""
|
| 214 |
+
|
| 215 |
+
# Foundation model to badge class mapping
FOUNDATION_COLORS = {}
_foundation_idx = 0


def get_foundation_class(foundation):
    """Return a stable CSS badge class ("f0".."f9") for a foundation model name.

    Classes are handed out in first-seen order and cycle through 10 color
    slots, so repeated calls with the same name always return the same class.
    """
    global _foundation_idx, FOUNDATION_COLORS
    if foundation in FOUNDATION_COLORS:
        return FOUNDATION_COLORS[foundation]
    badge = f"f{_foundation_idx % 10}"
    FOUNDATION_COLORS[foundation] = badge
    _foundation_idx += 1
    return badge
|
src/display/formatting.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Formatting utilities for display
|
| 2 |
+
# Currently not used - keeping for potential future extensions
|
src/display/utils.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Utility functions for display formatting
|
| 2 |
+
# Currently not used - keeping for potential future extensions
|
src/display/visualization.py
ADDED
|
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import plotly.graph_objects as go
|
| 2 |
+
from plotly.subplots import make_subplots
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
# 30 distinct colors - assigned by Avg AUP rank
COLOR_PALETTE = [
    "#E91E63", "#4A90E2", "#00BFA5", "#FF6B35", "#8E24AA",
    "#4CAF50", "#FF4081", "#303F9F", "#FFD166", "#00796B",
    "#C2185B", "#7B1FA2", "#26A69A", "#1A4C7C", "#FF8C42",
    "#009688", "#673AB7", "#F44336", "#3F51B5", "#795548",
    "#607D8B", "#9C27B0", "#2196F3", "#CDDC39", "#FF9800",
    "#00BCD4", "#E64A19", "#5D4037", "#455A64", "#AD1457",
]


def get_model_colors(df):
    """Map each method to a fixed palette color, ordered by descending Avg AUP.

    The palette wraps around when there are more than 30 methods.
    """
    ranking = df.sort_values("Avg_AUP", ascending=False)["Method"]
    palette_size = len(COLOR_PALETTE)
    return {method: COLOR_PALETTE[idx % palette_size] for idx, method in enumerate(ranking)}
|
| 19 |
+
|
| 20 |
+
def get_model_ranks(df):
    """Rank methods by Avg AUP: 1 for the best, N for the worst."""
    ordered = df.sort_values("Avg_AUP", ascending=False)["Method"].tolist()
    return dict(zip(ordered, range(1, len(ordered) + 1)))
|
| 24 |
+
|
| 25 |
+
def hex_to_rgba(hex_color, alpha=0.25):
    """Convert a '#RRGGBB' hex string into a plotly 'rgba(r,g,b,a)' string."""
    digits = hex_color.lstrip('#')
    r = int(digits[0:2], 16)
    g = int(digits[2:4], 16)
    b = int(digits[4:6], 16)
    return f'rgba({r},{g},{b},{alpha})'
|
| 29 |
+
|
| 30 |
+
def create_radar_chart(df, tasks, top_n=15):
    """Create radar chart for top N methods showing original AUP scores (independent axes).

    Args:
        df: leaderboard DataFrame sorted by Avg_AUP descending, with
            per-task "<task>_AUP" columns and an "Avg_AUP" column.
        tasks: ordered list of task names.
        top_n: number of top-ranked methods to plot.

    Returns:
        A plotly Figure; each axis is min-max normalized over the top-N
        methods (shape only), while hover text shows the original AUP values.
    """
    df_top = df.head(top_n).copy()
    model_colors = get_model_colors(df)
    model_ranks = get_model_ranks(df)

    all_cols = [f"{t}_AUP" for t in tasks] + ["Avg_AUP"]
    categories = [t.replace("-", "\n") for t in tasks] + ["Avg\nAUP"]

    # Compute min/max per column for normalization (for radar display only)
    col_stats = {}
    for col in all_cols:
        vals = df_top[col].dropna().astype(float)
        col_stats[col] = {'min': vals.min() if len(vals) > 0 else 0,
                          'max': vals.max() if len(vals) > 0 else 100}

    fig = go.Figure()

    for _, row in df_top.iterrows():
        method = row["Method"]
        rank = model_ranks.get(method, 0)
        color = model_colors.get(method, "#808080")
        display_name = f"#{rank} {method}"

        # Original AUP values for hover display.
        # Missing tasks arrive as None or NaN (pandas converts the None
        # placeholders in a numeric column to NaN); coerce both to 0.
        # The previous `row.get(col, 0) or 0` let NaN through because NaN is
        # truthy, which poisoned the normalization below; this also matches
        # the explicit NaN check used in create_group_bar_chart.
        original_vals = []
        for col in all_cols:
            val = row.get(col, 0)
            if val is None or (isinstance(val, float) and val != val):
                val = 0
            original_vals.append(val)

        # Normalized values for radar shape (0-100 scale per axis)
        normalized = []
        for col, val in zip(all_cols, original_vals):
            stats = col_stats[col]
            range_val = stats['max'] - stats['min']
            if range_val > 0:
                norm = ((val - stats['min']) / range_val) * 80 + 10  # Scale to 10-90
            else:
                # Degenerate axis (all methods equal): park everyone mid-axis.
                norm = 50
            normalized.append(norm)

        # Custom hover text showing original AUP scores
        hover_texts = [f"<b>{display_name}</b><br>{cat}: <b>{val:.1f}</b>"
                       for cat, val in zip(categories, original_vals)]

        # Close the polygon by repeating the first point.
        fig.add_trace(go.Scatterpolar(
            r=normalized + [normalized[0]],
            theta=categories + [categories[0]],
            mode='lines+markers', fill='toself', name=display_name,
            line=dict(color=color, width=2), marker=dict(color=color, size=6),
            fillcolor=hex_to_rgba(color, 0.15), opacity=0.9,
            text=hover_texts + [hover_texts[0]],
            hovertemplate='%{text}<extra></extra>'
        ))

    fig.update_layout(
        height=600, margin=dict(l=100, r=250, t=80, b=60),
        title=dict(text=f"🎯 Top {top_n} Methods: AUP Scores in Radar Chart", x=0.5, font=dict(size=18)),
        # Radial tick labels hidden: values are normalized per axis and would mislead.
        polar=dict(radialaxis=dict(visible=True, range=[0, 100], tickfont=dict(size=11),
                                   tickvals=[], showticklabels=False)),
        legend=dict(font=dict(size=12), x=1.05, y=1, bgcolor='rgba(255,255,255,0.95)',
                    bordercolor='#ddd', borderwidth=1, title=dict(text="Methods (sorted by Avg AUP)", font=dict(size=13))),
        hoverlabel=dict(bgcolor="white", font_size=14, font_family="Arial", bordercolor="#333")
    )
    return fig
|
| 93 |
+
|
| 94 |
+
def create_group_bar_chart(df, tasks, top_n=15):
    """Create grouped bar chart with Avg AUP included and rank numbers.

    Args:
        df: leaderboard DataFrame sorted by Avg_AUP descending, with
            per-task "<task>_AUP" columns and an "Avg_AUP" column.
        tasks: ordered list of task names (x-axis groups, plus "Avg AUP").
        top_n: number of top-ranked methods to plot.

    Returns:
        A plotly Figure with one Bar trace per method.
    """
    df_top = df.head(top_n).copy()
    methods = df_top["Method"].tolist()
    model_colors = get_model_colors(df)
    model_ranks = get_model_ranks(df)

    all_benchmarks = tasks + ["Avg_AUP"]
    fig = go.Figure()

    for method in methods:
        row = df_top[df_top["Method"] == method].iloc[0]
        color = model_colors.get(method, "#808080")
        rank = model_ranks.get(method, 0)
        display_name = f"#{rank} {method}"

        # Collect only benchmarks this method has a value for.
        y_vals, x_vals = [], []
        for bench in all_benchmarks:
            aup = row.get("Avg_AUP") if bench == "Avg_AUP" else row.get(f"{bench}_AUP")
            # Skip None and NaN (aup != aup is the NaN self-inequality test).
            if aup is not None and not (isinstance(aup, float) and aup != aup):
                y_vals.append(aup)
                x_vals.append("Avg AUP" if bench == "Avg_AUP" else bench)

        if y_vals:
            fig.add_trace(go.Bar(
                name=display_name, x=x_vals, y=y_vals, marker_color=color,
                # Double braces keep %{x}/%{y} literal inside the f-string.
                hovertemplate=f"<b>{display_name}</b><br>%{{x}}: %{{y:.1f}}<extra></extra>"
            ))

    fig.update_layout(
        height=550, margin=dict(l=60, r=250, t=80, b=100),
        title=dict(text=f"📊 Top {top_n} Methods: AUP Scores in Bar Chart", x=0.5, font=dict(size=18)),
        # title=dict(text=f"📊 Top {top_n} Methods: AUP Scores of Different Benchmarks", x=0.5, font=dict(size=18)),
        xaxis_title="Benchmark", yaxis_title="AUP Score",
        barmode='group', bargap=0.2, bargroupgap=0.05,
        legend=dict(font=dict(size=11), x=1.02, y=1, bgcolor='rgba(255,255,255,0.95)',
                    bordercolor='#ddd', borderwidth=1, title=dict(text="Methods (sorted by Avg AUP)", font=dict(size=12))),
        hoverlabel=dict(bgcolor="white", font_size=14, font_family="Arial")
    )
    return fig
|
| 134 |
+
|
| 135 |
+
def create_aup_curve_chart(raw_data, tasks, df, top_n=15):
    """Create 2x3 subplot grid of AUP curves with quadratic fitting (same as plot_lines.py).

    Args:
        raw_data: {task: {method: [(rho, acc), ...]}} raw measurement points.
        tasks: ordered list of task names (first 5 subplots).
        df: leaderboard DataFrame sorted by Avg_AUP descending.
        top_n: number of top-ranked methods to plot.

    Returns:
        A plotly Figure: one subplot per task plus an "Average" subplot at (2, 3).
    """
    df_top = df.head(top_n).copy()
    model_colors = get_model_colors(df)
    model_ranks = get_model_ranks(df)
    methods_to_show = set(df_top["Method"].tolist())

    # Build per-task data: {task: {method: [(rho, y), ...]}}
    task_data = {t: {} for t in tasks}
    for task in tasks:
        for method, pairs in raw_data.get(task, {}).items():
            if method in methods_to_show:
                task_data[task][method] = [(p[0], p[1]) for p in pairs]

    # Compute average data: average TPF and Acc by index across tasks (all tasks have same length)
    avg_data = {}
    for method in methods_to_show:
        task_points = [task_data.get(t, {}).get(method, []) for t in tasks]
        task_points = [p for p in task_points if p]  # filter empty
        if not task_points:
            continue
        n_points = len(task_points[0])
        # NOTE(review): x uses np.mean over the tasks actually present, but y
        # divides by a hard-coded 5 — these disagree when a method is missing
        # some task. Presumably intentional (penalize missing tasks, matching
        # AVG_AUP_DIVISOR), but confirm.
        avg_data[method] = [
            (np.mean([tp[i][0] for tp in task_points]), sum(tp[i][1] for tp in task_points) / 5)
            for i in range(n_points)
        ]

    # 6 subplots: 5 tasks + 1 Average at (2,3)
    titles = tasks + ["Average"]
    fig = make_subplots(rows=2, cols=3, subplot_titles=titles,
                        horizontal_spacing=0.08, vertical_spacing=0.15)

    # Track which methods have been added to legend
    legend_added = set()

    def get_pos(idx):
        # Map a linear subplot index to (row, col) in the 2x3 grid.
        if idx < 3:
            return (1, idx + 1)
        return (2, idx - 2)  # idx=3->(2,1), idx=4->(2,2), idx=5->(2,3)

    # Helper to draw curve for a given subplot
    def draw_curve(pairs, method, row, col):
        nonlocal legend_added
        if not pairs:
            return
        color = model_colors.get(method, "#808080")
        rank = model_ranks.get(method, 0)
        display_name = f"#{rank} {method}"
        # Each method appears once in the legend even across 6 subplots.
        show_legend = method not in legend_added
        if show_legend:
            legend_added.add(method)

        rho, y = zip(*sorted(pairs, key=lambda x: x[0]))
        rho, y = np.array(rho), np.array(y)

        # Generate smooth curve (quadratic fitting, same as plot_lines.py)
        if len(rho) >= 3:
            z = np.polyfit(rho, y, 2)
            p = np.poly1d(z)
            x_smooth = np.linspace(rho.min(), rho.max(), 300)
            y_smooth = p(x_smooth)
        elif len(rho) == 2:
            # Two points: synthesize a parabola through them with zero slope
            # at the first point (mimics the fitted-curve look).
            x_smooth = np.linspace(rho.min(), rho.max(), 300)
            if rho[1] != rho[0]:
                a = (y[1] - y[0]) / ((rho[1] - rho[0]) ** 2)
                y_smooth = a * (x_smooth - rho[0]) ** 2 + y[0]
            else:
                y_smooth = np.linspace(y[0], y[1], 300)
        else:
            # Single point: plot as-is.
            x_smooth, y_smooth = rho, y

        # Add fitted curve
        fig.add_trace(go.Scatter(
            x=x_smooth, y=y_smooth, mode='lines', name=display_name,
            line=dict(color=color, width=2.5), opacity=0.85,
            showlegend=show_legend, legendgroup=method,
            hoverinfo='skip'
        ), row=row, col=col)

        # Add markers at original data points
        fig.add_trace(go.Scatter(
            x=rho, y=y, mode='markers', name=display_name,
            marker=dict(color='white', size=8, line=dict(color=color, width=2)),
            showlegend=False, legendgroup=method,
            hovertemplate=f"<b>{display_name}</b><br>TPF: %{{x:.2f}}<br>Acc: %{{y:.1f}}<extra></extra>"
        ), row=row, col=col)

    # Draw 5 task subplots
    for idx, task in enumerate(tasks):
        row, col = get_pos(idx)
        data = task_data.get(task, {})
        for method in df_top["Method"].tolist():
            if method in data:
                draw_curve(data[method], method, row, col)

    # Draw Average subplot at (2, 3)
    for method in df_top["Method"].tolist():
        if method in avg_data:
            draw_curve(avg_data[method], method, 2, 3)

    fig.update_layout(
        height=550, margin=dict(l=60, r=250, t=80, b=60),
        title=dict(text=f"📈 Top {top_n} Methods: Accuracy-Parallelism Curves", x=0.5, font=dict(size=18)),
        legend=dict(font=dict(size=11), x=1.02, y=1, bgcolor='rgba(255,255,255,0.95)',
                    bordercolor='#ddd', borderwidth=1, title=dict(text="Methods (sorted by Avg AUP)", font=dict(size=12)),
                    tracegroupgap=1, itemsizing='constant'),
        hoverlabel=dict(bgcolor="white", font_size=14, font_family="Arial")
    )

    # Update axes labels for 6 subplots
    for idx in range(6):
        row, col = get_pos(idx)
        fig.update_xaxes(title_text="TPF (Tokens per Forward)" if idx >= 3 else "", row=row, col=col)
        fig.update_yaxes(title_text="Acc (%)" if col == 1 else "", row=row, col=col)

    return fig
|
src/envs.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Environment configuration - not used in current implementation
|
src/leaderboard/read_evals.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import math
|
| 2 |
+
import yaml
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
# AUP calculation (from d3LLM_Code/aup_utils.py)
|
| 7 |
+
def weight_function(y: float, y_max: float, alpha: float = 3.0) -> float:
    """Quality-weighting function W(y) = min(exp(-alpha * (1 - y/y_max)), 1).

    Returns 1.0 when y >= y_max and decays exponentially as y falls below y_max.
    """
    shortfall = 1 - y / y_max
    return min(math.exp(-alpha * shortfall), 1.0)
|
| 10 |
+
|
| 11 |
+
def get_aup(rho: list, y: list, y_max: float, alpha: float = 3.0, y_min_offset: float = 5.0) -> float:
    """Calculate AUP (Accuracy Under Parallelism) score.

    Args:
        rho: parallelism values (TPF, tokens per forward), one per point.
        y: accuracy values paired with ``rho``.
        y_max: best accuracy for this task (reference for the quality weight).
        alpha: decay rate of the quality weight; larger punishes accuracy drops harder.
        y_min_offset: points whose accuracy falls more than this below the
            first (lowest-parallelism) point are discarded.

    Returns:
        Quality-weighted area under the accuracy-vs-parallelism curve,
        computed with the trapezoidal rule over the filtered points.
    """
    if len(rho) == 0:
        return 0.0
    # Order points by increasing parallelism.
    sorted_pairs = sorted(zip(rho, y), key=lambda x: x[0])
    sorted_rho, sorted_y = zip(*sorted_pairs)
    sorted_rho, sorted_y = list(sorted_rho), list(sorted_y)

    # Accuracy floor is anchored to the first (lowest-rho) point.
    y_1 = sorted_y[0]
    y_min = y_1 - y_min_offset
    filtered_pairs = [(r, acc) for r, acc in zip(sorted_rho, sorted_y) if acc >= y_min]
    if len(filtered_pairs) == 0:
        # Defensive fallback; unreachable for non-negative offsets because the
        # first point always satisfies acc >= y_1 - y_min_offset.
        return sorted_rho[0] * sorted_y[0]

    filtered_rho, filtered_y = zip(*filtered_pairs)
    filtered_rho, filtered_y = list(filtered_rho), list(filtered_y)

    # Rectangle up to the first point, then quality-weighted trapezoids.
    aup = filtered_rho[0] * filtered_y[0]
    for i in range(1, len(filtered_rho)):
        y_i, y_prev = filtered_y[i], filtered_y[i-1]
        w_i = weight_function(y_i, y_max, alpha)
        w_prev = weight_function(y_prev, y_max, alpha)
        aup += 0.5 * (filtered_rho[i] - filtered_rho[i-1]) * (y_i * w_i + y_prev * w_prev)
    return aup
|
| 35 |
+
|
| 36 |
+
# YAML result files live next to the original d3LLM evaluation code.
DATA_DIR = Path(__file__).parent.parent.parent / "d3LLM_Code"
DATA_FILES = ["data_dream.yaml", "data_llada.yaml", "data_dream_coder.yaml"]

# Merge HumanEval-Instruct -> HumanEval, MBPP-Instruct -> MBPP; exclude HumanEval+, MBPP+
TASK_MERGE = {"HumanEval-Instruct": "HumanEval", "MBPP-Instruct": "MBPP"}
TASK_EXCLUDE = {"HumanEval+", "MBPP+"}
# Display/ordering of tasks on the leaderboard.
TASK_ORDER = ["GSM8K-CoT", "MATH", "MBPP", "HumanEval", "Long-GSM8K"]
# Fixed divisor for the Avg_AUP column: methods missing a task contribute 0
# for it and are divided by 5 anyway (see compute_leaderboard).
AVG_AUP_DIVISOR = 5
|
| 44 |
+
|
| 45 |
+
def load_yaml_data():
    """Load YAML files separately, compute y_max per file/task, then merge.

    Returns:
        all_results: {method: {task: (aup, tpf, acc)}} rounded summary metrics.
        all_meta: merged per-method metadata from each file's ``_meta`` block.
        ordered_tasks: tasks present in the data, in TASK_ORDER order.
        raw_data: {task: {method: [(rho, acc), ...]}} raw points for plotting.
    """
    all_results = {}  # {method: {task: (aup, tpf, acc)}}
    all_meta = {}
    all_tasks = set()
    raw_data = {}  # {task: {method: [(rho, y), ...]}} for curve plotting

    for filename in DATA_FILES:
        filepath = DATA_DIR / filename
        if not filepath.exists():
            # Missing data files are skipped silently.
            continue
        with open(filepath, 'r') as f:
            data = yaml.safe_load(f)

        # '_meta' holds per-method metadata (type/foundation/link), not task data.
        meta = data.pop('_meta', {})
        all_meta.update(meta)

        # Compute y_max per task WITHIN this file only (as per main.py)
        file_tasks = {k: v for k, v in data.items() if k not in TASK_EXCLUDE}
        y_max_per_task = {}
        for task, methods in file_tasks.items():
            y_max_per_task[task] = max(y for pairs in methods.values() for _, y in pairs)

        # Calculate AUP for each method/task in this file
        for task, methods in file_tasks.items():
            target_task = TASK_MERGE.get(task, task)
            all_tasks.add(target_task)
            y_max = y_max_per_task[task]

            # Store raw data for curve plotting
            if target_task not in raw_data:
                raw_data[target_task] = {}

            for method, pairs in methods.items():
                if method not in all_results:
                    all_results[method] = {}

                rho_list = [p[0] for p in pairs]
                y_list = [p[1] for p in pairs]
                aup = get_aup(rho_list, y_list, y_max)
                # TPF is the highest parallelism measured; acc is the accuracy
                # at that highest-TPF point.
                tpf = max(rho_list)
                acc = pairs[0][1] if len(pairs) == 1 else [p[1] for p in pairs if p[0] == max(rho_list)][0]
                # If the same method/task appears in a later file, it
                # overwrites the earlier entry (plain dict assignment).
                all_results[method][target_task] = (round(aup, 1), round(tpf, 2), round(acc, 1))
                raw_data[target_task][method] = pairs

    # Return tasks in specified order
    ordered_tasks = [t for t in TASK_ORDER if t in all_tasks]
    return all_results, all_meta, ordered_tasks, raw_data
|
| 93 |
+
|
| 94 |
+
def compute_leaderboard():
    """Compute leaderboard DataFrame from YAML data.

    Returns:
        (df, tasks, raw_data) where ``df`` has one row per method — metadata
        columns plus per-task "<task>_AUP/_TPF/_Acc" and "Avg_AUP" — sorted by
        Avg_AUP descending; ``tasks`` and ``raw_data`` are passed through from
        load_yaml_data().
    """
    results_dict, meta, tasks, raw_data = load_yaml_data()

    results = []
    for method in sorted(results_dict.keys()):
        method_meta = meta.get(method, {})
        row = {
            "Method": method,
            "Type": method_meta.get("type", "?"),
            "Foundation": method_meta.get("foundation", "?"),
            "Link": method_meta.get("link", ""),
        }

        aup_sum = 0.0
        for task in tasks:
            if task in results_dict[method]:
                aup, tpf, acc = results_dict[method][task]
                row[f"{task}_AUP"], row[f"{task}_TPF"], row[f"{task}_Acc"] = aup, tpf, acc
                aup_sum += aup
            else:
                # Missing task: columns stay None (pandas renders NaN) and the
                # task contributes 0 to the average.
                row[f"{task}_AUP"] = row[f"{task}_TPF"] = row[f"{task}_Acc"] = None

        # Fixed divisor (5): methods with missing tasks are penalized, not excused.
        row["Avg_AUP"] = round(aup_sum / AVG_AUP_DIVISOR, 1)
        results.append(row)

    df = pd.DataFrame(results).sort_values("Avg_AUP", ascending=False).reset_index(drop=True)
    return df, tasks, raw_data
|
| 122 |
+
|
| 123 |
+
def get_leaderboard_df():
    """Convenience wrapper: return only the leaderboard DataFrame."""
    return compute_leaderboard()[0]
|
| 126 |
+
|
| 127 |
+
def get_tasks():
    """Convenience wrapper: return only the ordered task list."""
    return compute_leaderboard()[1]
|
| 130 |
+
|
| 131 |
+
def get_raw_data():
    """Convenience wrapper: return only the raw per-task measurement points."""
    return compute_leaderboard()[2]
|
src/populate.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Population utilities - not used in current implementation
|
src/submission/check_validity.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Submission validation - not used in current implementation
|
src/submission/submit.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Submission handling - not used in current implementation
|