dike-leaderboard

Sleeping

File size: 10,756 Bytes

fc291fb

import gradio as gr
import pandas as pd


def model_hyperlink_md(link: str, name: str) -> str:
    """Return a Markdown inline link that displays *name* and targets *link*.

    Both values are interpolated as given; nothing is escaped or validated.
    """
    return "[{}]({})".format(name, link)


def make_clickable_and_drop_links(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* whose "Model" cells are Markdown links.

    Each "Model" value is combined with the "Links" value of the same row via
    ``model_hyperlink_md``; the "Links" column is then removed. The frame
    passed in is not modified.

    Args:
        df: Frame holding at least a "Model" and a "Links" column.

    Returns:
        A new frame without "Links" whose "Model" column holds Markdown links.

    Raises:
        ValueError: If *df* has no "Links" column.
        KeyError: If *df* has no "Model" column.
    """
    if "Links" not in df.columns:
        raise ValueError("CSV must include a 'Links' column.")
    df = df.copy()
    # Build the column from the two source columns directly rather than with a
    # row-wise ``DataFrame.apply``: it avoids creating a Series per row, and it
    # also works for a frame with zero rows, where (depending on the pandas
    # version) ``apply(..., axis=1)`` may return a DataFrame that cannot be
    # assigned to a single column.
    df["Model"] = [
        model_hyperlink_md(link, name)
        for link, name in zip(df["Links"], df["Model"])
    ]
    return df.drop(columns=["Links"])


def datatypes_with_markdown(df: pd.DataFrame):
    """Return one datatype label per column of *df*, in column order.

    The column named exactly "Model" is labelled "markdown"; every other
    column is labelled "str".
    """
    labels = []
    for column in df.columns:
        if column == "Model":
            labels.append("markdown")
        else:
            labels.append("str")
    return labels


# ---------- load data ----------
# The leaderboard results are read from two CSV files given as relative paths:
# one for base models, one for instruction-tuned models.
BASE_CSV = "code_eval_board.csv"
INSTRUCT_CSV = "eval_instruct_lms.csv"

# Raw frames as stored on disk (they still contain the "Links" column).
base_df_raw, inst_df_raw = (
    pd.read_csv(csv_path) for csv_path in (BASE_CSV, INSTRUCT_CSV)
)

# Display frames: "Model" is turned into a Markdown link, "Links" is dropped.
base_df, inst_df = (
    make_clickable_and_drop_links(raw_frame)
    for raw_frame in (base_df_raw, inst_df_raw)
)

# Per-column datatype labels matching the display frames.
base_dtypes, inst_dtypes = (
    datatypes_with_markdown(display_frame)
    for display_frame in (base_df, inst_df)
)

# ---------- css ----------
# Extra stylesheet passed to gr.Blocks below. It sets the font stack of the
# Gradio container and restyles links inside the two leaderboard tables
# (elem_id "base-table" and "inst-table"): dotted underline by default, solid
# underline and a darker blue on hover.
custom_css = """
.gradio-container {font-family: Inter, system-ui, -apple-system, Segoe UI, Roboto, sans-serif;}
#base-table a, #inst-table a {
  color: #2a7ae2 !important;
  text-decoration: underline dotted !important;
  text-underline-offset: 3px;
}
#base-table a:hover, #inst-table a:hover {
  color: #1e5bbf !important;
  text-decoration: underline solid !important;
}
"""

# ---------- app ----------
# Root Blocks container of the UI; every component created inside the
# `with demo:` block below is attached to it. custom_css is supplied as the
# page stylesheet.
demo = gr.Blocks(css=custom_css)

with demo:
    # ---------- HEADER ----------
    # Static banner shown above the tabs: logo image, title, tagline, and a
    # subtitle paragraph linking to the Open LLM Leaderboard, the Optimum
    # Leaderboard and the Diké project site. The logo is loaded from an
    # external URL at render time.
    gr.HTML(
        """<div id='header' style='text-align:center; margin-top:16px;'>
            <div id='title-row'
                 style='display:flex; align-items:center; justify-content:center; gap:16px; flex-wrap:wrap;'>
                <img src='https://legendaryladieshub.com/wp-content/uploads/2023/12/Dike_Greek-goddess-of-justice-and-moral-order_by-LLH-300x300.jpeg'
                     alt='Diké' width='80'
                     style='border-radius:50%; object-fit:cover; box-shadow:0 0 8px rgba(0,0,0,0.4); background:transparent;'>
                <div style='display:flex; flex-direction:column; align-items:center; text-align:center;'>
                    <h1 style='font-size:30px; margin:0; font-weight:650;'>Open Diké Leaderboard</h1>
                    <p style='font-size:18px; margin:4px 0; color:#6c7a89;'>
                        Bias and Fairness in Compressed LLMs
                    </p>
                </div>
            </div>

            <p id='subtitle' 
               style='font-size:14px; color:#8a9aad; margin-top:12px; 
                      max-width:1000px; margin-left:auto; margin-right:auto; 
                      line-height:1.6; text-align:justify;'>
                Inspired by 
                <a href='https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/' 
                   target='_blank' 
                   style='color:#5a8dee; text-decoration:none; font-weight:500;'>
                   🤗 Open LLM Leaderboard
                </a> and 
                <a href='https://huggingface.co/spaces/optimum/llm-perf-leaderboard' 
                   target='_blank' 
                   style='color:#5a8dee; text-decoration:none; font-weight:500;'>
                   Optimum Leaderboard 🏋️
                </a>, we compare the performance of compressed LLMs across 
                <b>fairness</b>, <b>toxicity</b>, <b>ethics</b>, and <b>safety</b> benchmarks. The leaderboard is released as part of the 
                <a href='https://www.anr-dike.fr/' target='_blank'
                   style='color:#5a8dee; text-decoration:none; font-weight:500;'>⚖️ Diké Project</a>.
            </p>
        </div>"""
    )

    # ---------- TABS ----------
    with gr.Tabs():
        # TAB 1: Base LLMs
        with gr.TabItem("🟢 Base LLMs Evaluation"):
            with gr.Row():
                base_search = gr.Textbox(placeholder="🔍 Search base models...", show_label=False)

            def base_search_fn(q):
                """Filter the base-model table by a case-insensitive substring.

                Args:
                    q: Text from the search box; may be empty or None.

                Returns:
                    ``base_df`` unchanged when the query is blank, otherwise
                    the rows whose "Model" cell (the Markdown link text,
                    which includes the URL) contains the query.
                """
                query = q.strip() if q else ""
                if not query:
                    return base_df
                # regex=False: the query is free-form user text, so characters
                # such as "(", "+" or "." must match literally instead of
                # being compiled as a regular expression (an invalid pattern
                # would otherwise raise and break the search).
                mask = base_df["Model"].str.contains(query, case=False, regex=False)
                return base_df[mask]

            base_table = gr.Dataframe(
                value=base_df,
                datatype=base_dtypes,
                interactive=False,
                sortable=True,
                elem_id="base-table",
            )
            # The search runs when the user submits the textbox (presses Enter).
            base_search.submit(base_search_fn, base_search, base_table)

        # TAB 2: Instruction-tuned LLMs
        with gr.TabItem("🔶 Instruction-tuned LLMs Evaluation"):
            with gr.Row():
                inst_search = gr.Textbox(placeholder="🔍 Search instruction-tuned models...", show_label=False)

            def inst_search_fn(q):
                """Filter the instruction-tuned table by a case-insensitive substring.

                Args:
                    q: Text from the search box; may be empty or None.

                Returns:
                    ``inst_df`` unchanged when the query is blank, otherwise
                    the rows whose "Model" cell (the Markdown link text,
                    which includes the URL) contains the query.
                """
                query = q.strip() if q else ""
                if not query:
                    return inst_df
                # regex=False: the query is free-form user text, so characters
                # such as "(", "+" or "." must match literally instead of
                # being compiled as a regular expression (an invalid pattern
                # would otherwise raise and break the search).
                mask = inst_df["Model"].str.contains(query, case=False, regex=False)
                return inst_df[mask]

            inst_table = gr.Dataframe(
                value=inst_df,
                datatype=inst_dtypes,
                interactive=False,
                sortable=True,
                elem_id="inst-table",
            )
            # The search runs when the user submits the textbox (presses Enter).
            inst_search.submit(inst_search_fn, inst_search, inst_table)

        # TAB 3: About
        # Static description of every benchmark shown in the tables, each with
        # a link to its dataset and the metric(s) reported for it.
        with gr.TabItem("📘 About"):
            gr.HTML("""
            <div style='max-width:900px; margin:0 auto; text-align:justify; color:#4b5563; line-height:1.6;'>
              <h3 style='text-align:center;'>📊 Benchmarks and Metrics Overview</h3>
              <p>
                The Diké Leaderboard evaluates the impact of <b>quantization</b> and <b>compression</b> 
                on <b>bias, fairness, ethics, and safety</b> of large language models (LLMs).
                Each benchmark measures a specific social or ethical aspect of model behavior.
              </p>

              <ul style='list-style-type: " "; padding-left: 1em;'>

                <li><b>Perplexity (PPL)</b> - General measure of model fluency and language modeling quality, 
                evaluated on <a href='https://huggingface.co/datasets/Salesforce/wikitext' target='_blank' style='color:#2563eb;'>WikiText-2</a>. 
                Lower values indicate better language modeling performance.</li>

                <li><b>HellaSwag</b> - Measures general reasoning and commonsense knowledge via multiple-choice story completion.
                Dataset: <a href='https://huggingface.co/datasets/Rowan/hellaswag' target='_blank' style='color:#2563eb;'>HellaSwag</a>. 
                Metric: Accuracy.</li>

                <li><b>BBQ (Bias Benchmark for QA)</b> - Evaluates bias in ambiguous and disambiguated 
                question-answering contexts across 11 protected categories 
                (<a href='https://github.com/nyu-mll/BBQ' target='_blank' style='color:#2563eb;'>BBQ dataset</a>).  
                Metrics: Accuracy, Bias (ambiguous), Bias (disambiguated).</li>

                <li><b>CrowS-Pairs</b> - Minimal stereotype pairs.
                (<a href='https://huggingface.co/datasets/nyu-mll/crows_pairs' target='_blank' style='color:#2563eb;'>CrowS-Pairs dataset</a>).
                Metric: % of stereotyped continuations.</li>

                <li><b>HolisticBias</b> - 13 demographic axes with sentiment prompts
                (<a href='https://huggingface.co/datasets/fairnlp/holistic-bias' target='_blank' style='color:#2563eb;'>HolisticBias dataset</a>).  
                Metric: Sentiment skew across identity descriptors.</li>

                <li><b>SoFA (Social Fairness Dataset)</b> - 1.49M bias probes covering religion, gender, race, and disability 
                (<a href='https://huggingface.co/datasets/copenlu/sofa' target='_blank' style='color:#2563eb;'>SoFA dataset</a>).  
                Metric: Variance of log-perplexity across identity groups.</li>

                <li><b>StereoSet</b> - Triplet format (stereotype, anti-stereotype, unrelated) 
                across gender, race, religion, profession 
                (<a href='https://github.com/moinnadeem/StereoSet' target='_blank' style='color:#2563eb;'>StereoSet dataset</a>).  
                Metric: Stereotype Score, Language Modeling Score.</li>

                <li><b>ETHICS</b> - Morality judgments across five ethical principles; 
                we use the <i>Commonsense Morality</i> subset 
                (<a href='https://huggingface.co/datasets/hendrycks/ethics' target='_blank' style='color:#2563eb;'>ETHICS dataset</a>).  
                Metric: Accuracy.</li>

                <li><b>Moral Stories</b> - First-person scenarios for moral vs. immoral action selection  
                (<a href='https://huggingface.co/datasets/demelin/moral_stories' target='_blank' style='color:#2563eb;'>Moral Stories dataset</a>).  
                Metrics: Moral preference Accuracy, Refusal rate.</li>

                <li><b>Histoires Morales</b> - French extension of Moral Stories for cross-lingual ethics evaluation.  
                (<a href='https://huggingface.co/datasets/LabHC/histoires_morales' target='_blank' style='color:#2563eb;'>Histoires Morales dataset</a>).  
                Metric: Accuracy, Refusal rate.</li> 

                <li><b>RealToxicityPrompts</b> - Measures generation toxicity given neutral prompts 
                (<a href='https://huggingface.co/datasets/allenai/real-toxicity-prompts' target='_blank' style='color:#2563eb;'>RealToxicityPrompts</a>).  
                Metric: Average toxicity probability.</li>

                <li><b>HarmBench</b> - Evaluates safety by measuring model responses to harmful or unethical prompts 
                (<a href='https://huggingface.co/datasets/walledai/HarmBench' target='_blank' style='color:#2563eb;'>HarmBench</a>).  
                Metric: Unsafe response rate.</li>

              </ul>

              <p style='margin-top:1.5em;'>
                All evaluations are implemented via the 
                <a href='https://github.com/EleutherAI/lm-evaluation-harness' 
                   target='_blank' style='color:#5a8dee;'>LM Evaluation Harness</a> 
                and follow consistent zero-shot protocols.
              </p>
            </div>
            """)

    # ---------- FOOTER ----------
    # Static usage notes rendered below the tabs (column sorting, clickable
    # model names) plus a link back to the Diké project site.
    gr.HTML(
        """
        <div style='text-align:center; margin-top:30px; font-size:14px; color:#777;'>
          <b>Notes</b><br>
          • Click column headers to sort ascending/descending<br>
          • Model names are clickable links to Hugging Face pages<br><br>
          Part of the <a href='https://www.anr-dike.fr/' target='_blank' style='color:#5a8dee;'>⚖️ Diké Project</a>.
        </div>
        """
    )
# Start serving the app, bound to all network interfaces ("0.0.0.0") on
# port 7860. This runs at module level, i.e. whenever the file is executed.
demo.launch(server_name="0.0.0.0", server_port=7860)