# Spaces file view (status: Sleeping) - file size: 10,756 bytes - revision fc291fb
import gradio as gr
import pandas as pd
def model_hyperlink_md(link: str, name: str) -> str:
    """Return a Markdown hyperlink ``[name](link)`` for a model.

    Args:
        link: Target URL. Empty CSV cells arrive from pandas as NaN (a float),
            so non-string or blank values are accepted and treated as
            "no link available".
        name: Text to display for the link.

    Returns:
        ``"[name](link)"`` when ``link`` is a non-blank string; otherwise just
        the name as plain text, so a missing link never renders as a bogus
        ``[name](nan)`` hyperlink.
    """
    if not isinstance(link, str) or not link.strip():
        return f"{name}"
    return f"[{name}]({link})"
def make_clickable_and_drop_links(df: pd.DataFrame) -> pd.DataFrame:
    """Turn the "Model" column into Markdown links and drop "Links".

    Args:
        df: Leaderboard table with a "Model" column (display names) and a
            "Links" column (target URLs).

    Returns:
        A copy of ``df`` in which each "Model" cell is the result of
        ``model_hyperlink_md(link, name)`` and the "Links" column is removed.
        The input frame is not modified.

    Raises:
        ValueError: If the "Links" or "Model" column is missing.
    """
    if "Links" not in df.columns:
        raise ValueError("CSV must include a 'Links' column.")
    if "Model" not in df.columns:
        raise ValueError("CSV must include a 'Model' column.")

    df = df.copy()
    # Pair the two columns directly instead of a row-wise apply: on a frame
    # with no rows, apply(axis=1) does not produce a Series, so the column
    # assignment would fail. A list comprehension handles zero rows naturally.
    df["Model"] = [
        model_hyperlink_md(link, name)
        for link, name in zip(df["Links"], df["Model"])
    ]
    return df.drop(columns=["Links"])
def datatypes_with_markdown(df: pd.DataFrame):
    """Build the per-column datatype list for displaying ``df``.

    The column named "Model" is tagged "markdown"; every other column is
    tagged "str". The list follows the order of ``df.columns``.
    """
    kinds = []
    for column in df.columns:
        if column == "Model":
            kinds.append("markdown")
        else:
            kinds.append("str")
    return kinds
# ---------- load data ----------
# CSV files backing the two leaderboard tabs (paths are relative to the
# working directory the app is started from).
BASE_CSV = "code_eval_board.csv"
INSTRUCT_CSV = "eval_instruct_lms.csv"

# Read both raw tables first, then derive the display tables (linked model
# names, no "Links" column), then the matching per-column datatype lists.
base_df_raw, inst_df_raw = (pd.read_csv(path) for path in (BASE_CSV, INSTRUCT_CSV))
base_df, inst_df = (
    make_clickable_and_drop_links(raw) for raw in (base_df_raw, inst_df_raw)
)
base_dtypes, inst_dtypes = (
    datatypes_with_markdown(table) for table in (base_df, inst_df)
)
# ---------- css ----------
# Stylesheet passed to gr.Blocks(css=...) when the app is built below.
# It sets the font stack on the Gradio container and styles the links inside
# the elements with ids "base-table" and "inst-table" (the elem_id values of
# the two leaderboard tables): blue with a dotted underline by default, a
# darker blue with a solid underline on hover.
custom_css = """
.gradio-container {font-family: Inter, system-ui, -apple-system, Segoe UI, Roboto, sans-serif;}
#base-table a, #inst-table a {
color: #2a7ae2 !important;
text-decoration: underline dotted !important;
text-underline-offset: 3px;
}
#base-table a:hover, #inst-table a:hover {
color: #1e5bbf !important;
text-decoration: underline solid !important;
}
"""
# ---------- app ----------
def _filter_by_model(df: pd.DataFrame, query: str) -> pd.DataFrame:
    """Return the rows of ``df`` whose "Model" cell contains ``query``.

    A blank or whitespace-only query returns ``df`` unchanged. Otherwise the
    stripped query is matched as a case-insensitive *literal* substring:
    ``regex=False`` keeps user input such as "(" or "c++" from being compiled
    as a regular expression (which would raise), and ``na=False`` treats
    missing cells as non-matching so the mask is always boolean.
    """
    if not query or not query.strip():
        return df
    mask = df["Model"].str.contains(
        query.strip(), case=False, regex=False, na=False
    )
    return df[mask]


# The HTML literals below are kept flush-left so their contents stay exactly
# as they were; only the surrounding Python is indented.
demo = gr.Blocks(css=custom_css)
with demo:
    # ---------- HEADER ----------
    gr.HTML(
        """<div id='header' style='text-align:center; margin-top:16px;'>
<div id='title-row'
style='display:flex; align-items:center; justify-content:center; gap:16px; flex-wrap:wrap;'>
<img src='https://legendaryladieshub.com/wp-content/uploads/2023/12/Dike_Greek-goddess-of-justice-and-moral-order_by-LLH-300x300.jpeg'
alt='Diké' width='80'
style='border-radius:50%; object-fit:cover; box-shadow:0 0 8px rgba(0,0,0,0.4); background:transparent;'>
<div style='display:flex; flex-direction:column; align-items:center; text-align:center;'>
<h1 style='font-size:30px; margin:0; font-weight:650;'>Open Diké Leaderboard</h1>
<p style='font-size:18px; margin:4px 0; color:#6c7a89;'>
Bias and Fairness in Compressed LLMs
</p>
</div>
</div>
<p id='subtitle'
style='font-size:14px; color:#8a9aad; margin-top:12px;
max-width:1000px; margin-left:auto; margin-right:auto;
line-height:1.6; text-align:justify;'>
Inspired by
<a href='https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/'
target='_blank'
style='color:#5a8dee; text-decoration:none; font-weight:500;'>
🤗 Open LLM Leaderboard
</a> and
<a href='https://huggingface.co/spaces/optimum/llm-perf-leaderboard'
target='_blank'
style='color:#5a8dee; text-decoration:none; font-weight:500;'>
Optimum Leaderboard 🏋️
</a>, we compare the performance of compressed LLMs across
<b>fairness</b>, <b>toxicity</b>, <b>ethics</b>, and <b>safety</b> benchmarks. The leaderboard is released as part of the
<a href='https://www.anr-dike.fr/' target='_blank'
style='color:#5a8dee; text-decoration:none; font-weight:500;'>⚖️ Diké Project</a>.
</p>
</div>"""
    )

    # ---------- TABS ----------
    with gr.Tabs():
        # TAB 1: Base LLMs
        with gr.TabItem("🟢 Base LLMs Evaluation"):
            # NOTE(review): only the search box is placed in the Row and the
            # table sits below it - confirm against the intended layout.
            with gr.Row():
                base_search = gr.Textbox(placeholder="🔍 Search base models...", show_label=False)

            def base_search_fn(q):
                """Filter the base-model table by the submitted search text."""
                return _filter_by_model(base_df, q)

            base_table = gr.Dataframe(
                value=base_df,
                datatype=base_dtypes,
                interactive=False,
                sortable=True,
                elem_id="base-table",
            )
            # The filter runs when the search box is submitted; the result
            # replaces the table contents.
            base_search.submit(base_search_fn, base_search, base_table)

        # TAB 2: Instruction-tuned LLMs
        with gr.TabItem("🔶 Instruction-tuned LLMs Evaluation"):
            with gr.Row():
                inst_search = gr.Textbox(placeholder="🔍 Search instruction-tuned models...", show_label=False)

            def inst_search_fn(q):
                """Filter the instruction-tuned table by the submitted search text."""
                return _filter_by_model(inst_df, q)

            inst_table = gr.Dataframe(
                value=inst_df,
                datatype=inst_dtypes,
                interactive=False,
                sortable=True,
                elem_id="inst-table",
            )
            inst_search.submit(inst_search_fn, inst_search, inst_table)

        # ---------- TAB 3: About ----------
        with gr.TabItem("📘 About"):
            gr.HTML("""
<div style='max-width:900px; margin:0 auto; text-align:justify; color:#4b5563; line-height:1.6;'>
<h3 style='text-align:center;'>📊 Benchmarks and Metrics Overview</h3>
<p>
The Diké Leaderboard evaluates the impact of <b>quantization</b> and <b>compression</b>
on <b>bias, fairness, ethics, and safety</b> of large language models (LLMs).
Each benchmark measures a specific social or ethical aspect of model behavior.
</p>
<ul style='list-style-type: " "; padding-left: 1em;'>
<li><b>Perplexity (PPL)</b> - General measure of model fluency and language modeling quality,
evaluated on <a href='https://huggingface.co/datasets/Salesforce/wikitext' target='_blank' style='color:#2563eb;'>WikiText-2</a>.
Lower values indicate better language modeling performance.</li>
<li><b>HellaSwag</b> - Measures general reasoning and commonsense knowledge via multiple-choice story completion.
Dataset: <a href='https://huggingface.co/datasets/Rowan/hellaswag' target='_blank' style='color:#2563eb;'>HellaSwag</a>.
Metric: Accuracy.</li>
<li><b>BBQ (Bias Benchmark for QA)</b> - Evaluates bias in ambiguous and disambiguated
question-answering contexts across 11 protected categories
(<a href='https://github.com/nyu-mll/BBQ' target='_blank' style='color:#2563eb;'>BBQ dataset</a>).
Metrics: Accuracy, Bias (ambiguous), Bias (disambiguated).</li>
<li><b>CrowS-Pairs</b> - Minimal stereotype pairs.
(<a href='https://huggingface.co/datasets/nyu-mll/crows_pairs' target='_blank' style='color:#2563eb;'>CrowS-Pairs dataset</a>).
Metric: % of stereotyped continuations.</li>
<li><b>HolisticBias</b> - 13 demographic axes with sentiment prompts
(<a href='https://huggingface.co/datasets/fairnlp/holistic-bias' target='_blank' style='color:#2563eb;'>HolisticBias dataset</a>).
Metric: Sentiment skew across identity descriptors.</li>
<li><b>SoFA (Social Fairness Dataset)</b> - 1.49M bias probes covering religion, gender, race, and disability
(<a href='https://huggingface.co/datasets/copenlu/sofa' target='_blank' style='color:#2563eb;'>SoFA dataset</a>).
Metric: Variance of log-perplexity across identity groups.</li>
<li><b>StereoSet</b> - Triplet format (stereotype, anti-stereotype, unrelated)
across gender, race, religion, profession
(<a href='https://github.com/moinnadeem/StereoSet' target='_blank' style='color:#2563eb;'>StereoSet dataset</a>).
Metric: Stereotype Score, Language Modeling Score.</li>
<li><b>ETHICS</b> - Morality judgments across five ethical principles;
we use the <i>Commonsense Morality</i> subset
(<a href='https://huggingface.co/datasets/hendrycks/ethics' target='_blank' style='color:#2563eb;'>ETHICS dataset</a>).
Metric: Accuracy.</li>
<li><b>Moral Stories</b> - First-person scenarios for moral vs. immoral action selection
(<a href='https://huggingface.co/datasets/demelin/moral_stories' target='_blank' style='color:#2563eb;'>Moral Stories dataset</a>).
Metrics: Moral preference Accuracy, Refusal rate.</li>
<li><b>Histoires Morales</b> - French extension of Moral Stories for cross-lingual ethics evaluation.
(<a href='https://huggingface.co/datasets/LabHC/histoires_morales' target='_blank' style='color:#2563eb;'>Moral Stories dataset</a>).
Metric: Accuracy, Refusal rate.</li>
<li><b>RealToxicityPrompts</b> - Measures generation toxicity given neutral prompts
(<a href='https://huggingface.co/datasets/allenai/real-toxicity-prompts' target='_blank' style='color:#2563eb;'>RealToxicityPrompts</a>).
Metric: Average toxicity probability.</li>
<li><b>HarmBench</b> - Evaluates safety by measuring model responses to harmful or unethical prompts
(<a href='https://huggingface.co/datasets/walledai/HarmBench' target='_blank' style='color:#2563eb;'>HarmBench</a>).
Metric: Unsafe response rate.</li>
</ul>
<p style='margin-top:1.5em;'>
All evaluations are implemented via the
<a href='https://github.com/EleutherAI/lm-evaluation-harness'
target='_blank' style='color:#5a8dee;'>LM Evaluation Harness</a>
and follow consistent zero-shot protocols.
</p>
</div>
""")

    # Footer notes shown below the tabs.
    gr.HTML(
        """
<div style='text-align:center; margin-top:30px; font-size:14px; color:#777;'>
<b>Notes</b><br>
• Click column headers to sort ascending/descending<br>
• Model names are clickable links to Hugging Face pages<br><br>
Part of the <a href='https://www.anr-dike.fr/' target='_blank' style='color:#5a8dee;'>⚖️ Diké Project</a>.
</div>
"""
    )

# Serve on all network interfaces, port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)
|