"""Gradio app for the dLLM (Diffusion LLM) AUP leaderboard.

Builds a three-tab Gradio Blocks UI: an interactive leaderboard (radar /
bar / AUP-curve plots plus a sortable HTML table), a submission guide, and
an about page.

NOTE(review): this file was recovered from a whitespace-collapsed copy in
which the HTML *tags* inside string literals had been stripped (only the
text content survived). All markup below — tag names, CSS class names,
table layout — is a reconstruction and must be verified against
``src/display/css_html_js.custom_css`` and the rendered page before
shipping.
"""

import gradio as gr
import pandas as pd

from src.leaderboard.read_evals import get_leaderboard_df, get_tasks, get_raw_data
from src.display.visualization import (
    create_radar_chart,
    create_group_bar_chart,
    create_aup_curve_chart,
)
from src.display.css_html_js import custom_css, sort_table_js, get_foundation_class

# BibTeX citation block appended to every tab.
CITATION_HTML = """
<div class="citation-block">
  <p>📝 If you find this Leaderboard useful for your research, please star our GitHub repo and cite our work:</p>
  <pre><code>@article{preprint'25:d3llm,
  author  = {Yu-Yang Qian and Junda Su and Lanxiang Hu and Peiyuan Zhang and Zhijie Deng and Peng Zhao and Hao Zhang},
  title   = {d3LLM: Ultra-Fast Diffusion LLM using Pseudo-Trajectory Distillation},
  journal = {ArXiv preprint},
  volume  = {to appear},
  note    = {\\url{https://github.com/hao-ai-lab/d3LLM} [Accessed: 2025-12-11]},
  year    = {2025}
}</code></pre>
</div>
"""


def create_leaderboard_html(df, tasks):
    """Generate the sortable HTML table for detailed results.

    Args:
        df: leaderboard DataFrame; one row per method. Expected columns:
            ``Method``, ``Link``, ``Type``, ``Foundation``, ``Avg_AUP`` and,
            per task ``t``, ``{t}_AUP`` / ``{t}_TPF`` / ``{t}_Acc``.
        tasks: ordered list of task names used to build the per-task columns.

    Returns:
        str: the table markup, prefixed with ``sort_table_js`` so column
        sorting works client-side.
    """
    rows_html = ""
    # df is assumed pre-sorted by rank; enumerate from 1 for medal display.
    for rank, (_, row) in enumerate(df.iterrows(), 1):
        medal = f'{["đŸĨ‡", "đŸĨˆ", "đŸĨ‰"][rank-1]}' if rank <= 3 else str(rank)
        # Method name, hyperlinked when a Link is provided.
        method = row['Method']
        link = row.get('Link', '')
        # NOTE(review): anchor markup reconstructed — original tags were lost.
        method_html = f'<a href="{link}" target="_blank">{method}</a>' if link else method
        # Type badge (AR vs dLLM).
        type_val = row.get('Type', '?')
        type_display = 'dLLM' if type_val == 'dLLM' else type_val
        type_class = 'ar' if type_val == 'AR' else 'dllm'
        # Foundation-model badge; CSS class resolved by project helper.
        foundation = row.get('Foundation', '?')
        foundation_class = get_foundation_class(foundation)
        # Build one cell per task: AUP headline plus TPF/Acc sub-lines.
        task_cells = ""
        for task in tasks:
            aup = row.get(f'{task}_AUP')
            tpf = row.get(f'{task}_TPF')
            acc = row.get(f'{task}_Acc')
            if pd.notna(aup):
                task_cells += f'''<td>
                    <div class="aup-value">{aup:.1f}</div>
                    <div class="sub-metric">TPF:{tpf:.2f}</div>
                    <div class="sub-metric">Acc:{acc:.1f}</div>
                </td>'''
            else:
                task_cells += '<td>-</td>'
        # Avg AUP (headline ranking metric).
        avg_aup = row.get('Avg_AUP', 0)
        rows_html += f'''<tr>
            <td>{medal}</td>
            <td>{method_html}</td>
            <td><span class="badge badge-{type_class}">{type_display}</span></td>
            <td><span class="badge {foundation_class}">{foundation}</span></td>
            {task_cells}
            <td>{avg_aup:.1f}</td>
        </tr>'''
    task_headers = ''.join(f'<th>{t}</th>' for t in tasks)
    return f'''{sort_table_js}
    <table class="leaderboard-table">
        <thead>
            <tr>
                <th>Rank</th>
                <th>Method</th>
                <th>Type</th>
                <th>Foundation Model</th>
                {task_headers}
                <th>Avg AUP</th>
            </tr>
        </thead>
        <tbody>
            {rows_html}
        </tbody>
    </table>'''


def update_charts(top_n):
    """Re-render the three plots when the top-N slider changes.

    Reloads the leaderboard data on every call so newly added results are
    picked up without restarting the app.

    Args:
        top_n: number of top-ranked methods to display.

    Returns:
        tuple: (radar chart, grouped bar chart, AUP curve chart) figures.
    """
    df, tasks, raw_data = get_leaderboard_df(), get_tasks(), get_raw_data()
    return (
        create_radar_chart(df, tasks, top_n),
        create_group_bar_chart(df, tasks, top_n),
        create_aup_curve_chart(raw_data, tasks, df, top_n),
    )


# Load data once at startup for the initial render.
df, tasks, raw_data = get_leaderboard_df(), get_tasks(), get_raw_data()
default_top_n = min(15, len(df))

with gr.Blocks(css=custom_css, title="dLLM Leaderboard", fill_height=False) as demo:
    gr.HTML('''<div class="header">
        <h1>đŸĢ§ Welcome to dLLM Leaderboard! 🏆</h1>
        <p>Benchmarking various Diffusion Large Language Models (dLLMs) with
        AUP (Accuracy Under Parallelism), considering both accuracy and
        parallelism.</p>
    </div>''')

    with gr.Tabs():
        with gr.TabItem("📊 Leaderboard"):
            with gr.Row():
                top_n_slider = gr.Slider(
                    minimum=3,
                    maximum=len(df),
                    value=default_top_n,
                    step=1,
                    label="Number of Top Methods to Display",
                )
            with gr.Row():
                radar_plot = gr.Plot(value=create_radar_chart(df, tasks, default_top_n))
            with gr.Row():
                bar_plot = gr.Plot(value=create_group_bar_chart(df, tasks, default_top_n))
            with gr.Row():
                curve_plot = gr.Plot(value=create_aup_curve_chart(raw_data, tasks, df, default_top_n))
            top_n_slider.change(
                fn=update_charts,
                inputs=[top_n_slider],
                outputs=[radar_plot, bar_plot, curve_plot],
            )
            gr.Markdown("### 🏆 Detailed Leaderboard")
            gr.HTML(create_leaderboard_html(df, tasks))
            gr.HTML(CITATION_HTML)

        with gr.TabItem("📤 Submit Result"):
            gr.HTML("""<div class="submit-info">
                <h2>Submit Your Results</h2>
                <p>We welcome contributions to the dLLM Leaderboard! To submit your method's results:</p>
                <h3>Step 1: Evaluate Your Method</h3>
                <p>Follow the evaluation protocol in the
                <a href="https://github.com/hao-ai-lab/d3LLM" target="_blank">d3LLM repository</a>.
                Refer to the <code>eval_scripts</code> folder for benchmark evaluation scripts,
                and <code>AUP_leaderboard</code> folder for AUP calculation utilities.</p>
                <h3>Step 2: Prepare Your Evaluation Results</h3>
                <p>Add your results to the appropriate YAML file following this format:</p>
                <pre><code>_meta:
  YourMethod:
    type: dLLM  # or AR
    foundation: YourFoundation
    link: https://link/to/your/method

TaskName:
  YourMethod:
  - [rho_1, accuracy_1]  # (parallelism, accuracy) pairs
  - [rho_2, accuracy_2]</code></pre>
                <h3>Step 3: Submit a Pull Request</h3>
                <ol>
                    <li>Fork the repository</li>
                    <li>Add your results to the YAML files</li>
                    <li>Submit a PR with your method name, description, and evaluation details</li>
                </ol>
                <p>Questions? Open an issue on GitHub.</p>
            </div>""" + CITATION_HTML)

        with gr.TabItem("â„šī¸ About"):
            # NOTE(review): the "Metrics" section body was lost in the
            # recovered copy — restore its text from the upstream repo.
            gr.HTML("""<div class="about-info">
                <h2>About dLLM Leaderboard</h2>
                <p>This leaderboard evaluates Diffusion Large Language Models (dLLMs)
                using the AUP (Accuracy Under Parallelism) metric.</p>
                <h3>Metrics</h3>
                <h3>Benchmarks</h3>
                <p>GSM8K-CoT, MATH, HumanEval, MBPP, Long-GSM8K</p>
                <h3>References</h3>
                <p>GitHub Code Repo:
                <a href="https://github.com/hao-ai-lab/d3LLM" target="_blank">https://github.com/hao-ai-lab/d3LLM</a><br>
                Blog:
                <a href="https://hao-ai-lab.github.io/blogs/text-diffusion/" target="_blank">https://hao-ai-lab.github.io/blogs/text-diffusion/</a></p>
            </div>""" + CITATION_HTML)

demo.launch()