Spaces:

d3LLM
/

dLLM_Leaderboard

Running

File size: 9,027 Bytes

d473371

import html

import gradio as gr
import pandas as pd

from src.leaderboard.read_evals import get_leaderboard_df, get_tasks, get_raw_data
from src.display.visualization import create_radar_chart, create_group_bar_chart, create_aup_curve_chart
from src.display.css_html_js import custom_css, sort_table_js, get_foundation_class

# HTML card asking readers to star the GitHub repo and cite the work, with the
# BibTeX entry in a <pre> block. It is rendered under the leaderboard table and
# appended to the HTML of the "Submit Result" and "About" tabs.
# The string is not raw, so the doubled backslash in the BibTeX ``note`` field
# is emitted as a single backslash (``\url{...}``).
CITATION_HTML = """
<div style="max-width: 800px; margin: 30px auto 0 auto; padding: 20px; background: #f8f7ff; border-radius: 12px; border-left: 4px solid #5a3d8a;">
    <p style="margin: 0 0 12px 0; color: #5a3d8a; font-weight: 600;">📝 If you find this Leaderboard useful for your research, please star <a href="https://github.com/hao-ai-lab/d3llm" target="_blank" style="color: #5a3d8a;">our GitHub repo</a> and cite our work:</p>
    <pre style="background: #fff; padding: 15px; border-radius: 8px; overflow-x: auto; font-size: 12px; margin: 0; color: #333; white-space: pre-wrap; word-wrap: break-word;">@article{preprint'25:d3llm,
  author  = {Yu-Yang Qian and Junda Su and Lanxiang Hu and Peiyuan Zhang and Zhijie Deng and Peng Zhao and Hao Zhang},
  title   = {d3LLM: Ultra-Fast Diffusion LLM using Pseudo-Trajectory Distillation},
  journal = {ArXiv preprint},
  volume  = {to appear},
  note    = {\\url{https://github.com/hao-ai-lab/d3LLM} [Accessed: 2025-12-11]},
  year    = {2025}
}</pre>
</div>
"""

def _format_metric(value, spec):
    """Format ``value`` with format spec ``spec``; return '-' if it is None/NaN."""
    return format(value, spec) if pd.notna(value) else '-'


def create_leaderboard_html(df, tasks):
    """Generate the HTML table for the detailed leaderboard results.

    Rows are emitted in the order they appear in ``df``; the rank shown is the
    1-based position of the row, and the first three ranks get a medal emoji.
    No sorting is done here.

    Args:
        df: DataFrame with one row per method. The columns read are
            ``Method`` (required), and optionally ``Link``, ``Type``,
            ``Foundation``, ``Avg_AUP`` and, for every task ``t`` in
            ``tasks``, ``{t}_AUP``, ``{t}_TPF`` and ``{t}_Acc``.
        tasks: Iterable of task names; each becomes one column of the table.

    Returns:
        A string containing ``sort_table_js`` followed by the ``<table>``
        markup wrapped in a ``div.table-wrapper``.

    Missing values (absent column, ``None`` or NaN) are rendered as ``-``
    (``?`` for type/foundation) instead of raising or printing ``nan``, and a
    missing link yields a plain-text method name. All data-derived text is
    HTML-escaped before being interpolated into the markup.
    """
    medals = ("🥇", "🥈", "🥉")
    # Materialize once: ``tasks`` is iterated for every row and for the header.
    tasks = list(tasks)

    body_rows = []
    for rank, (_, row) in enumerate(df.iterrows(), 1):
        if rank <= len(medals):
            medal = f'<span class="top-medal">{medals[rank - 1]}</span>'
        else:
            medal = str(rank)

        # Method with link. A NaN link cell is truthy, so test for a real,
        # non-blank string rather than relying on truthiness.
        method = html.escape(str(row['Method']))
        link = row.get('Link', '')
        if isinstance(link, str) and link.strip():
            method_html = f'<a href="{html.escape(link)}" target="_blank">{method}</a>'
        else:
            method_html = method

        # Type badge: everything that is not exactly 'AR' uses the dLLM style.
        type_val = row.get('Type', '?')
        if pd.isna(type_val):
            type_val = '?'
        type_class = 'ar' if type_val == 'AR' else 'dllm'

        # Foundation badge
        foundation = row.get('Foundation', '?')
        if pd.isna(foundation):
            foundation = '?'
        foundation_class = get_foundation_class(foundation)

        # Build cells for each task
        cells = []
        for task in tasks:
            aup = row.get(f'{task}_AUP')
            if pd.notna(aup):
                tpf = _format_metric(row.get(f'{task}_TPF'), '.2f')
                acc = _format_metric(row.get(f'{task}_Acc'), '.1f')
                cells.append(f'''<td>
                    <span class="aup-score">{aup:.1f}</span>
                    <span class="sub-metrics">TPF:{tpf} Acc:{acc}</span>
                </td>''')
            else:
                cells.append('<td><span class="aup-score">-</span></td>')
        task_cells = ''.join(cells)

        # Avg AUP
        avg_aup = _format_metric(row.get('Avg_AUP', 0), '.1f')

        body_rows.append(f'''<tr>
            <td class="rank-cell"><span class="rank-medal">{medal}</span></td>
            <td class="method-cell">{method_html}</td>
            <td class="type-cell"><span class="type-badge {type_class}">{html.escape(str(type_val))}</span></td>
            <td class="foundation-cell"><span class="foundation-badge {foundation_class}">{html.escape(str(foundation))}</span></td>
            {task_cells}
            <td class="avg-cell"><span class="aup-score">{avg_aup}</span></td>
        </tr>''')

    rows_html = ''.join(body_rows)
    task_headers = ''.join(f'<th>{html.escape(str(t))}</th>' for t in tasks)

    return f'''
    {sort_table_js}
    <div class="table-wrapper">
        <table class="leaderboard-table">
            <thead><tr>
                <th>Rank</th><th>Method</th><th>Type</th><th>Foundation Model</th>
                {task_headers}
                <th>Avg AUP</th>
            </tr></thead>
            <tbody>{rows_html}</tbody>
        </table>
    </div>
    '''

def update_charts(top_n):
    """Build the radar, grouped-bar and AUP-curve charts for ``top_n`` methods.

    Fetches the leaderboard frame, the task list and the raw data through
    their getters on every call, then returns the three figures as a tuple
    in the order (radar, bar, curve).
    """
    leaderboard = get_leaderboard_df()
    task_names = get_tasks()
    raw = get_raw_data()

    radar = create_radar_chart(leaderboard, task_names, top_n)
    bars = create_group_bar_chart(leaderboard, task_names, top_n)
    curves = create_aup_curve_chart(raw, task_names, leaderboard, top_n)
    return radar, bars, curves

# Load the leaderboard frame, task list and raw data used to build the
# initial page, then pick the starting slider value (at most 15 methods).
df = get_leaderboard_df()
tasks = get_tasks()
raw_data = get_raw_data()
default_top_n = min(len(df), 15)

# Assemble the Gradio UI: a welcome banner followed by three tabs
# (Leaderboard, Submit Result, About).
with gr.Blocks(css=custom_css, title="dLLM Leaderboard", fill_height=False) as demo:
    gr.HTML('''
        <div class="welcome-banner">
            <h2>🫧 Welcome to dLLM Leaderboard! 🏆</h2>
            <p>Benchmarking various Diffusion Large Language Models (dLLMs) with <i><a href="https://hao-ai-lab.github.io/blogs/text-diffusion/" target="_blank" style="color: inherit; text-decoration: underline;">AUP (Accuracy Under Parallelism)</a></i>, considering both accuracy and parallelism.</p>
        </div>
    ''')
    
    with gr.Tabs():
        with gr.TabItem("📊 Leaderboard"):
            # Slider choosing how many top methods the three charts display.
            # NOTE(review): minimum is fixed at 3 while maximum is len(df), so
            # this presumably assumes at least 3 methods are loaded — confirm.
            with gr.Row():
                top_n_slider = gr.Slider(minimum=3, maximum=len(df), value=default_top_n, step=1,
                                        label="Number of Top Methods to Display")
            
            # Initial charts, rendered from the data loaded at module import
            # with the default slider value.
            with gr.Row():
                radar_plot = gr.Plot(value=create_radar_chart(df, tasks, default_top_n))
            with gr.Row():
                bar_plot = gr.Plot(value=create_group_bar_chart(df, tasks, default_top_n))
            with gr.Row():
                curve_plot = gr.Plot(value=create_aup_curve_chart(raw_data, tasks, df, default_top_n))
            
            # Rebuild all three charts whenever the slider value changes.
            top_n_slider.change(fn=update_charts, inputs=[top_n_slider], outputs=[radar_plot, bar_plot, curve_plot])
            
            # The detailed table is built once here from the module-level
            # df/tasks; it is not among the slider's outputs, so the slider
            # does not change it.
            gr.Markdown("### 🏆 Detailed Leaderboard")
            gr.HTML(create_leaderboard_html(df, tasks))
            gr.HTML(CITATION_HTML)
        
        # Static submission instructions, followed by the citation card.
        with gr.TabItem("📤 Submit Result"):
            gr.HTML("""
            <div class="content-wrapper">
                <div style="max-width: 800px; margin: 0 auto; padding: 20px; box-sizing: border-box;">
                    <h2>Submit Your Results</h2>
                    <p>We welcome contributions to the dLLM Leaderboard! To submit your method's results:</p>
                    
                    <h3>Step 1: Evaluate Your Method</h3>
                    <p>Follow the evaluation protocol in the <a href="https://github.com/hao-ai-lab/d3LLM" target="_blank">d3LLM repository</a>. 
                    Refer to the <code>eval_scripts</code> folder for benchmark evaluation scripts, and <code>AUP_leaderboard</code> folder for AUP calculation utilities.</p>
                    
                    <h3>Step 2: Prepare Your Evaluation Results</h3>
                    <p>Add your results to the appropriate YAML file following this format:</p>
                    <pre style="background: #f5f5f5; padding: 15px; border-radius: 8px; overflow-x: auto; white-space: pre-wrap; word-wrap: break-word;">_meta:
  YourMethod:
    type: dLLM  # or AR
    foundation: YourFoundation
    link: https://link/to/your/method

TaskName:
  YourMethod:
  - [rho_1, accuracy_1]  # (parallelism, accuracy) pairs
  - [rho_2, accuracy_2]</pre>
                    
                    <h3>Step 3: Submit a Pull Request</h3>
                    <ol>
                        <li>Fork the repository</li>
                        <li>Add your results to the YAML files</li>
                        <li>Submit a PR with your method name, description, and evaluation details</li>
                    </ol>
                    
                    <p><strong>Questions?</strong> Open an issue on <a href="https://github.com/hao-ai-lab/d3LLM/issues" target="_blank">GitHub</a>.</p>
                </div>
            </div>
            """ + CITATION_HTML)
        
        # Static description of the metrics and benchmarks, followed by the
        # citation card.
        with gr.TabItem("ℹ️ About"):
            gr.HTML("""
            <div class="content-wrapper">
                <div style="max-width: 800px; margin: 0 auto; padding: 20px; box-sizing: border-box;">
                    <h2>About dLLM Leaderboard</h2>
                    <p>This leaderboard evaluates <strong>Diffusion Large Language Models (dLLMs)</strong> using the <strong>AUP (Accuracy Under Parallelism)</strong> metric.</p>
                    
                    <h3>Metrics</h3>
                    <ul>
                        <li><strong>AUP</strong>: Primary metric - measures efficiency-accuracy trade-off (higher is better)</li>
                        <li><strong>TPF</strong>: Tokens Per Forward - parallelism level achieved</li>
                        <li><strong>Acc</strong>: Accuracy at maximum parallelism</li>
                    </ul>
                    
                    <h3>Benchmarks</h3>
                    <p>GSM8K-CoT, MATH, HumanEval, MBPP, Long-GSM8K</p>
                    
                    <h3>References</h3>
                    <p>
                        GitHub Code Repo: <a href="https://github.com/hao-ai-lab/d3LLM" target="_blank">https://github.com/hao-ai-lab/d3LLM</a><br>
                        Blog: <a href="https://hao-ai-lab.github.io/blogs/text-diffusion/" target="_blank">https://hao-ai-lab.github.io/blogs/text-diffusion/</a>
                    </p>
                </div>
            </div>
            """ + CITATION_HTML)

# Launch the app with Gradio's default settings; this runs at module top level.
demo.launch()