# NOTE(review): the three lines below were HuggingFace web-page residue
# (commit header) accidentally pasted above the module docstring; they are
# not valid Python and are preserved here as comments.
# saleh-alibrahim's picture
# Remove local fallback: API-only data loading, error on failure
# 9d287e6 verified
"""Tawkeed Arabic Benchmark Leaderboard β€” HuggingFace Space."""
import logging
import math
import gradio as gr
import httpx
import pandas as pd
import plotly.graph_objects as go
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# ── Configuration ────────────────────────────────────────────────────────────
# Base URL of the Tawkeed benchmark API. All leaderboard data is fetched from
# here at import time; there is deliberately no local fallback.
API_BASE_URL = "https://benchmark.tawkeed.ai"
# The 22 LLM skill categories. These double as DataFrame column names and
# radar-chart axes, so they must match the API's `scores_by_category` keys.
SKILLS = [
    "MMLU", "General Knowledge", "Reasoning & Math", "RAG QA",
    "Translation (incl Dialects)", "Trust & Safety", "Writing (incl Dialects)",
    "Arabic Language & Grammar", "Reading Comprehension", "Dialect Detection",
    "Diacritization", "Sentiment Analysis", "Summarization", "Instruction Following",
    "Transliteration", "Paraphrasing", "Entity Extraction", "Long Context",
    "Function Calling", "Hallucination", "Coding", "Structuring",
]
# OCR metric display names (DataFrame columns / compare-chart x-axis).
OCR_METRICS = ["WER", "CER", "BLEU", "ChrF", "TEDS", "MARS"]
# Hex colors per letter tier.
# NOTE(review): not referenced by the visible chart code, which uses
# TIER_CHART_COLORS instead — possibly used by markup elsewhere; confirm.
TIER_COLORS = {
    "S": "#22c55e",
    "A": "#3b82f6",
    "B": "#f59e0b",
    "C": "#ef4444",
    "D": "#6b7280",
}
def score_to_tier(score: float) -> str:
    """Map a 0-10 benchmark score onto its letter tier (S best, D worst)."""
    # Inclusive lower bounds, checked best-to-worst; anything below 3.0 is D.
    for lower_bound, tier in ((9.0, "S"), (7.0, "A"), (5.0, "B"), (3.0, "C")):
        if score >= lower_bound:
            return tier
    return "D"
def ocr_wer_to_tier(wer: float) -> str:
    """OCR tier based on WER (lower is better)."""
    # Exclusive upper bounds, checked best-to-worst; WER >= 0.7 is D.
    bands = ((0.1, "S"), (0.3, "A"), (0.5, "B"), (0.7, "C"))
    for upper_bound, tier in bands:
        if wer < upper_bound:
            return tier
    return "D"
# ── Load LLM Results ────────────────────────────────────────────────────────
def load_results() -> list[dict]:
    """Fetch LLM results from the API, best score first.

    Raises on any HTTP/network failure — there is deliberately no fallback.
    """
    endpoint = f"{API_BASE_URL}/api/results"
    response = httpx.get(endpoint, timeout=15.0)
    response.raise_for_status()
    models = response.json()["models"]
    logger.info("Loaded %d models from API: %s", len(models), endpoint)
    return sorted(models, key=lambda entry: entry["average_score"], reverse=True)
# ── Load OCR Results ────────────────────────────────────────────────────────
def load_ocr_results() -> list[dict]:
    """Fetch OCR results from the API, lowest WER first.

    Raises on any HTTP/network failure — there is deliberately no fallback.
    Entries without a "wer" field sort as if WER were 1.0 (worst).
    """
    endpoint = f"{API_BASE_URL}/api/ocr-results"
    response = httpx.get(endpoint, timeout=15.0)
    response.raise_for_status()
    models = response.json()["models"]
    logger.info("Loaded %d OCR models from API: %s", len(models), endpoint)
    return sorted(models, key=lambda entry: entry.get("wer", 1.0))
# ── Build DataFrames ────────────────────────────────────────────────────────
def build_leaderboard_df(results: list[dict]) -> pd.DataFrame:
    """Build the LLM leaderboard DataFrame from pre-sorted API results.

    Rank is the 1-based position in `results`; one extra column per skill
    (None when the model has no score for that skill).
    """
    records = []
    for rank, entry in enumerate(results, 1):
        by_category = entry.get("scores_by_category", {})
        record = {
            "Rank": rank,
            "Model": entry["model_name"],
            "Provider": entry.get("provider", ""),
            "Eval": entry.get("eval_method", ""),
            "Parameters": entry.get("parameters", ""),
            "Score": entry["average_score"],
            "Tier": score_to_tier(entry["average_score"]),
        }
        record.update({skill: by_category.get(skill) for skill in SKILLS})
        records.append(record)
    return pd.DataFrame(records)
def build_ocr_leaderboard_df(results: list[dict]) -> pd.DataFrame:
    """Build the OCR leaderboard DataFrame from pre-sorted API results.

    Rank is the 1-based position in `results`; a model with no WER value
    gets tier "D".
    """
    metric_columns = (
        ("WER", "wer"), ("CER", "cer"), ("BLEU", "bleu"),
        ("ChrF", "chrf"), ("TEDS", "teds"), ("MARS", "mars"),
    )
    records = []
    for rank, entry in enumerate(results, 1):
        word_error_rate = entry.get("wer")
        record = {
            "Rank": rank,
            "Model": entry.get("model_name", ""),
            "Provider": entry.get("provider", ""),
            "Parameters": entry.get("parameters", ""),
        }
        for column, api_key in metric_columns:
            record[column] = entry.get(api_key)
        record["Tier"] = (
            "D" if word_error_rate is None else ocr_wer_to_tier(word_error_rate)
        )
        records.append(record)
    return pd.DataFrame(records)
# Fetch everything once at import (Space startup). A failing API call raises
# here and aborts startup by design — API-only, no local fallback.
ALL_RESULTS = load_results()
LEADERBOARD_DF = build_leaderboard_df(ALL_RESULTS)
ALL_OCR_RESULTS = load_ocr_results()
OCR_LEADERBOARD_DF = build_ocr_leaderboard_df(ALL_OCR_RESULTS)
# ── Chart Color Palette ─────────────────────────────────────────────────────
# Per-trace line colors, cycled with `i % len(CHART_COLORS)` in the charts.
CHART_COLORS = [
    "#06b6d4",  # cyan-500
    "#8b5cf6",  # violet-500
    "#f43f5e",  # rose-500
    "#f59e0b",  # amber-500
    "#10b981",  # emerald-500
    "#ec4899",  # pink-500
]
# Matching translucent fills for radar-chart areas (same index as above).
CHART_FILL_COLORS = [
    "rgba(6,182,212,0.10)",
    "rgba(139,92,246,0.10)",
    "rgba(244,63,94,0.10)",
    "rgba(245,158,11,0.10)",
    "rgba(16,185,129,0.10)",
    "rgba(236,72,153,0.10)",
]
# Font stack applied to all Plotly figures.
CHART_FONT = "Inter, system-ui, -apple-system, sans-serif"
# Dark chart background for embedded Plotly charts
CHART_BG = "rgba(15, 23, 42, 0.0)"
CHART_GRID = "rgba(148,163,184,0.12)"
CHART_AXIS_COLOR = "rgba(148,163,184,0.25)"
CHART_TEXT_COLOR = "#cbd5e1"
# Bar colors keyed by letter tier (used by the skills bar chart).
TIER_CHART_COLORS = {
    "S": "#22d3ee",  # cyan-400
    "A": "#818cf8",  # indigo-400
    "B": "#fbbf24",  # amber-400
    "C": "#fb7185",  # rose-400
    "D": "#94a3b8",  # slate-400
}
# ── LLM Charts ──────────────────────────────────────────────────────────────
def make_radar_chart(model_names: list[str]) -> go.Figure:
    """Radar chart of per-skill scores for the selected models.

    Unknown model names are skipped. Missing skill scores plot as 0.

    BUGFIX: pandas stores absent skill scores as NaN, and NaN is truthy in
    Python, so the previous `row.get(s, 0) or 0` let NaN through into the
    chart. Filter None/NaN explicitly (same check `make_bar_chart` uses).
    """
    fig = go.Figure()
    for i, model_name in enumerate(model_names):
        row = LEADERBOARD_DF[LEADERBOARD_DF["Model"] == model_name]
        if row.empty:
            continue
        row = row.iloc[0]
        scores = []
        for s in SKILLS:
            val = row.get(s, 0)
            if val is None or (isinstance(val, float) and math.isnan(val)):
                val = 0
            scores.append(val)
        color = CHART_COLORS[i % len(CHART_COLORS)]
        fill = CHART_FILL_COLORS[i % len(CHART_FILL_COLORS)]
        fig.add_trace(go.Scatterpolar(
            # Repeat the first point so the polygon closes.
            r=scores + [scores[0]],
            theta=SKILLS + [SKILLS[0]],
            fill="toself",
            fillcolor=fill,
            line=dict(color=color, width=2.5, shape="spline"),
            name=model_name,
            hovertemplate="<b>%{theta}</b><br>Score: %{r:.2f}<extra></extra>",
            mode="lines+markers",
            marker=dict(size=4, color=color),
        ))
    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 10],
                tickfont=dict(size=10, color=CHART_TEXT_COLOR),
                gridcolor=CHART_GRID,
                linecolor=CHART_AXIS_COLOR,
                tickvals=[2, 4, 6, 8, 10],
            ),
            angularaxis=dict(
                tickfont=dict(size=10, color=CHART_TEXT_COLOR),
                gridcolor=CHART_GRID,
                linecolor=CHART_AXIS_COLOR,
                rotation=90,
                direction="clockwise",
            ),
            bgcolor="rgba(0,0,0,0)",
        ),
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.22,
            xanchor="center",
            x=0.5,
            font=dict(size=12, color=CHART_TEXT_COLOR),
            bgcolor="rgba(0,0,0,0)",
        ),
        height=650,
        margin=dict(l=90, r=90, t=50, b=90),
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
        font=dict(family=CHART_FONT, color=CHART_TEXT_COLOR),
    )
    return fig
def make_bar_chart(model_name: str) -> go.Figure:
    """Horizontal bar chart of one model's per-skill scores, tier-colored.

    Returns an empty figure for an unknown model name; skills with
    missing/NaN scores are omitted, and bars are sorted worst-to-best
    (lowest score at the top of the y-axis list).
    """
    match = LEADERBOARD_DF[LEADERBOARD_DF["Model"] == model_name]
    if match.empty:
        return go.Figure()
    record = match.iloc[0]
    pairs = []
    for skill in SKILLS:
        value = record.get(skill)
        if value is None:
            continue
        if isinstance(value, float) and math.isnan(value):
            continue
        pairs.append((skill, float(value)))
    pairs.sort(key=lambda pair: pair[1])
    names = [skill for skill, _ in pairs]
    scores = [value for _, value in pairs]
    bar_colors = [TIER_CHART_COLORS[score_to_tier(value)] for value in scores]
    fig = go.Figure(go.Bar(
        y=names,
        x=scores,
        orientation="h",
        marker=dict(
            color=bar_colors,
            line=dict(width=0),
            cornerradius=6,
        ),
        text=[f"{value:.1f}" for value in scores],
        textposition="outside",
        textfont=dict(size=11, color=CHART_TEXT_COLOR),
        hovertemplate="<b>%{y}</b><br>Score: %{x:.2f}<extra></extra>",
    ))
    fig.update_layout(
        xaxis=dict(
            # 10.8 leaves room for the outside text labels at score 10.
            range=[0, 10.8],
            title=dict(text="Score (0-10)", font=dict(size=12, color=CHART_TEXT_COLOR)),
            tickfont=dict(size=10, color=CHART_TEXT_COLOR),
            gridcolor=CHART_GRID,
            zeroline=False,
        ),
        yaxis=dict(
            tickfont=dict(size=11, color=CHART_TEXT_COLOR),
            automargin=True,
        ),
        height=max(450, len(pairs) * 34),
        margin=dict(l=10, r=50, t=25, b=50),
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
        font=dict(family=CHART_FONT, color=CHART_TEXT_COLOR),
        bargap=0.22,
    )
    return fig
# ── OCR Charts ──────────────────────────────────────────────────────────────
def make_ocr_compare_chart(model_names: list[str]) -> go.Figure:
    """Grouped bar chart comparing selected OCR models across all 6 metrics.

    Unknown model names are skipped; missing/NaN metric values plot as 0.

    BUGFIX: the trace previously passed both `marker_color=` and
    `marker=dict(cornerradius=4)` to `go.Bar`; assigning the `marker` dict
    can discard the color set via the magic-underscore kwarg. Both settings
    now live in a single `marker` dict.
    """
    fig = go.Figure()
    for i, model_name in enumerate(model_names):
        row = OCR_LEADERBOARD_DF[OCR_LEADERBOARD_DF["Model"] == model_name]
        if row.empty:
            continue
        row = row.iloc[0]
        values = []
        for metric in OCR_METRICS:
            val = row.get(metric)
            if val is not None and not (isinstance(val, float) and math.isnan(val)):
                # For WER/CER, display as-is (lower is better)
                # For BLEU/ChrF/TEDS/MARS, display as-is (higher is better)
                values.append(float(val))
            else:
                values.append(0)
        fig.add_trace(go.Bar(
            name=model_name,
            x=OCR_METRICS,
            y=values,
            marker=dict(
                color=CHART_COLORS[i % len(CHART_COLORS)],
                cornerradius=4,
            ),
            # Rates (< 1) get 3 decimals; 0-100 scores get 1 decimal.
            text=[f"{v:.3f}" if v < 1 else f"{v:.1f}" for v in values],
            textposition="outside",
            textfont=dict(color=CHART_TEXT_COLOR),
        ))
    fig.update_layout(
        barmode="group",
        xaxis=dict(
            title=dict(text="Metric", font=dict(size=12, color=CHART_TEXT_COLOR)),
            tickfont=dict(size=11, color=CHART_TEXT_COLOR),
            gridcolor=CHART_GRID,
        ),
        yaxis=dict(
            title=dict(text="Value", font=dict(size=12, color=CHART_TEXT_COLOR)),
            tickfont=dict(size=10, color=CHART_TEXT_COLOR),
            gridcolor=CHART_GRID,
            zeroline=False,
        ),
        height=520,
        margin=dict(l=60, r=40, t=40, b=70),
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
        font=dict(family=CHART_FONT, color=CHART_TEXT_COLOR),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.22,
            xanchor="center",
            x=0.5,
            font=dict(size=12, color=CHART_TEXT_COLOR),
            bgcolor="rgba(0,0,0,0)",
        ),
        bargap=0.18,
    )
    return fig
# ── Build display DataFrames ─────────────────────────────────────────────────
def build_leaderboard_display() -> pd.DataFrame:
    """Markdown-formatted copy of the LLM leaderboard for gr.Dataframe.

    Score and Tier are rendered bold via markdown `**...**`.
    """
    if LEADERBOARD_DF.empty:
        return pd.DataFrame()
    columns = ["Rank", "Model", "Provider", "Eval", "Parameters", "Score", "Tier"]
    display = LEADERBOARD_DF[columns].copy()
    display["Score"] = [f"**{score:.2f}**" for score in display["Score"]]
    display["Tier"] = [f"**{tier}**" for tier in display["Tier"]]
    return display
def build_ocr_leaderboard_display() -> pd.DataFrame:
    """Markdown-formatted copy of the OCR leaderboard for gr.Dataframe.

    Metric columns are fixed-precision strings, "-" when missing; Tier is
    rendered bold via markdown.

    BUGFIX: pandas stores missing metric values as NaN (not None), so the
    previous `x is not None` check was True for NaN and rendered the
    literal string "nan". `pd.notna` catches both None and NaN.
    """
    if OCR_LEADERBOARD_DF.empty:
        return pd.DataFrame()
    df = OCR_LEADERBOARD_DF[["Rank", "Model", "Provider", "Parameters",
                             "WER", "CER", "BLEU", "ChrF", "TEDS", "MARS", "Tier"]].copy()
    for col in ["WER", "CER"]:
        # Error rates: 4 decimal places.
        df[col] = df[col].apply(lambda x: f"{x:.4f}" if pd.notna(x) else "-")
    for col in ["BLEU", "ChrF", "TEDS", "MARS"]:
        # 0-100 scores: 2 decimal places.
        df[col] = df[col].apply(lambda x: f"{x:.2f}" if pd.notna(x) else "-")
    df["Tier"] = df["Tier"].apply(lambda t: f"**{t}**")
    return df
def build_skills_df(model_name: str) -> pd.DataFrame:
    """Per-skill score table for one model, best skills first.

    Skills with missing/NaN scores are omitted; an unknown model name
    yields an empty frame with the expected columns.
    """
    match = LEADERBOARD_DF[LEADERBOARD_DF["Model"] == model_name]
    if match.empty:
        return pd.DataFrame(columns=["Skill", "Score", "Tier"])
    record = match.iloc[0]
    entries = []
    for skill in SKILLS:
        raw = record.get(skill)
        if raw is None or (isinstance(raw, float) and math.isnan(raw)):
            continue
        value = float(raw)
        entries.append({
            "Skill": skill,
            "Score": round(value, 2),
            "Tier": score_to_tier(value),
        })
    table = pd.DataFrame(entries)
    if table.empty:
        return table
    return table.sort_values("Score", ascending=False).reset_index(drop=True)
# Pre-rendered display tables, built once at startup and bound to the
# leaderboard gr.Dataframe components below.
DISPLAY_DF = build_leaderboard_display()
OCR_DISPLAY_DF = build_ocr_leaderboard_display()
# ── Custom CSS ───────────────────────────────────────────────────────────────
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800;900&display=swap');
/* ── Global ─────────────────────────────────────────────────────── */
.gradio-container {
max-width: 1280px !important;
font-family: 'Inter', system-ui, -apple-system, sans-serif !important;
background: transparent !important;
}
.dark .gradio-container,
.gradio-container {
--body-background-fill: #0a0f1a !important;
--background-fill-primary: #0f1629 !important;
--background-fill-secondary: #131b2e !important;
--block-background-fill: #111827 !important;
--block-border-color: rgba(99, 126, 181, 0.12) !important;
--block-label-text-color: #94a3b8 !important;
--body-text-color: #e2e8f0 !important;
--body-text-color-subdued: #94a3b8 !important;
--input-background-fill: #1e293b !important;
--input-border-color: rgba(99, 126, 181, 0.2) !important;
--border-color-primary: rgba(99, 126, 181, 0.12) !important;
--block-shadow: 0 4px 24px rgba(0, 0, 0, 0.3) !important;
--block-border-width: 1px !important;
--block-radius: 16px !important;
--checkbox-label-background-fill: #1e293b !important;
--checkbox-background-color: #1e293b !important;
--table-even-background-fill: rgba(15, 23, 42, 0.4) !important;
--table-odd-background-fill: rgba(30, 41, 59, 0.3) !important;
--table-row-focus: rgba(6, 182, 212, 0.08) !important;
}
/* ── Header Banner ──────────────────────────────────────────────── */
.header-banner {
background: linear-gradient(145deg, #0a0f1a 0%, #0f1d3a 30%, #0c2a50 55%, #0e3b6e 80%, #104080 100%);
border-radius: 20px;
padding: 52px 48px 48px 48px;
margin-bottom: 28px;
position: relative;
overflow: hidden;
border: 1px solid rgba(6, 182, 212, 0.12);
box-shadow:
0 0 80px rgba(6, 182, 212, 0.06),
0 20px 60px rgba(0, 0, 0, 0.4),
inset 0 1px 0 rgba(255, 255, 255, 0.04);
}
.header-banner::before {
content: '';
position: absolute;
top: -60%;
right: -15%;
width: 500px;
height: 500px;
background: radial-gradient(circle, rgba(6, 182, 212, 0.12) 0%, rgba(6, 182, 212, 0.03) 40%, transparent 70%);
border-radius: 50%;
animation: pulse-glow 6s ease-in-out infinite alternate;
}
.header-banner::after {
content: '';
position: absolute;
bottom: -40%;
left: -8%;
width: 400px;
height: 400px;
background: radial-gradient(circle, rgba(139, 92, 246, 0.08) 0%, transparent 65%);
border-radius: 50%;
animation: pulse-glow 8s ease-in-out infinite alternate-reverse;
}
@keyframes pulse-glow {
0% { opacity: 0.5; transform: scale(1); }
100% { opacity: 1; transform: scale(1.08); }
}
.header-badge {
display: inline-block;
background: rgba(6, 182, 212, 0.12);
border: 1px solid rgba(6, 182, 212, 0.25);
color: #67e8f9;
padding: 5px 14px;
border-radius: 100px;
font-size: 11px;
font-weight: 600;
letter-spacing: 1.5px;
text-transform: uppercase;
margin-bottom: 16px;
position: relative;
z-index: 1;
backdrop-filter: blur(8px);
}
.header-title {
font-size: 42px;
font-weight: 900;
color: white;
margin: 0 0 10px 0;
letter-spacing: -1px;
position: relative;
z-index: 1;
line-height: 1.1;
background: linear-gradient(135deg, #ffffff 0%, #e0f2fe 50%, #67e8f9 100%);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
background-clip: text;
}
.header-subtitle {
font-size: 16px;
color: rgba(203, 213, 225, 0.8);
margin: 0;
position: relative;
z-index: 1;
max-width: 620px;
line-height: 1.6;
font-weight: 400;
}
.header-divider {
width: 60px;
height: 2px;
background: linear-gradient(90deg, #06b6d4, rgba(6, 182, 212, 0));
margin: 24px 0;
border-radius: 2px;
position: relative;
z-index: 1;
}
.header-stats {
display: flex;
gap: 40px;
margin-top: 0;
position: relative;
z-index: 1;
flex-wrap: wrap;
}
.header-stat {
display: flex;
flex-direction: column;
padding: 16px 20px;
background: rgba(255, 255, 255, 0.03);
border: 1px solid rgba(255, 255, 255, 0.06);
border-radius: 14px;
backdrop-filter: blur(12px);
min-width: 100px;
transition: all 0.3s ease;
}
.header-stat:hover {
background: rgba(255, 255, 255, 0.06);
border-color: rgba(6, 182, 212, 0.2);
transform: translateY(-2px);
box-shadow: 0 8px 24px rgba(0, 0, 0, 0.3);
}
.header-stat-value {
font-size: 32px;
font-weight: 800;
color: white;
line-height: 1;
letter-spacing: -0.5px;
}
.header-stat-label {
font-size: 11px;
color: rgba(148, 163, 184, 0.7);
text-transform: uppercase;
letter-spacing: 1.5px;
margin-top: 6px;
font-weight: 600;
}
/* ── Tabs ───────────────────────────────────────────────────────── */
.tab-buttons button {
font-size: 14px !important;
font-weight: 600 !important;
letter-spacing: 0.3px !important;
padding: 10px 24px !important;
border-radius: 10px !important;
transition: all 0.25s ease !important;
}
.tab-buttons button.selected {
background: linear-gradient(135deg, rgba(6, 182, 212, 0.15), rgba(139, 92, 246, 0.1)) !important;
border-color: rgba(6, 182, 212, 0.3) !important;
color: #67e8f9 !important;
box-shadow: 0 0 20px rgba(6, 182, 212, 0.1) !important;
}
/* ── Dataframe / Table ──────────────────────────────────────────── */
.gradio-dataframe {
border-radius: 14px !important;
overflow: hidden !important;
border: 1px solid rgba(99, 126, 181, 0.12) !important;
box-shadow: 0 4px 24px rgba(0, 0, 0, 0.2) !important;
}
table thead th {
background: rgba(15, 22, 41, 0.9) !important;
color: #94a3b8 !important;
font-weight: 700 !important;
font-size: 11px !important;
text-transform: uppercase !important;
letter-spacing: 0.8px !important;
padding: 14px 16px !important;
border-bottom: 2px solid rgba(6, 182, 212, 0.15) !important;
}
table tbody td {
padding: 12px 16px !important;
font-size: 13px !important;
border-bottom: 1px solid rgba(99, 126, 181, 0.06) !important;
color: #e2e8f0 !important;
transition: background 0.15s ease !important;
}
table tbody tr:hover td {
background: rgba(6, 182, 212, 0.04) !important;
}
/* ── Dropdowns and Inputs ───────────────────────────────────────── */
.gradio-dropdown, .gradio-textbox, input, textarea, select {
border-radius: 12px !important;
font-size: 14px !important;
}
.gradio-dropdown .wrap {
border-color: rgba(99, 126, 181, 0.2) !important;
background: #1e293b !important;
}
/* ── Plot containers ────────────────────────────────────────────── */
.gradio-plot {
border-radius: 16px !important;
overflow: hidden !important;
border: 1px solid rgba(99, 126, 181, 0.1) !important;
box-shadow: 0 4px 20px rgba(0, 0, 0, 0.15) !important;
}
/* ── Citation ───────────────────────────────────────────────────── */
#citation-button textarea {
font-family: 'JetBrains Mono', 'Fira Code', monospace !important;
font-size: 13px !important;
line-height: 1.6 !important;
background: #0f172a !important;
border: 1px solid rgba(99, 126, 181, 0.15) !important;
border-radius: 12px !important;
color: #67e8f9 !important;
padding: 20px !important;
}
/* ── Markdown content ───────────────────────────────────────────── */
.prose h2, .markdown-text h2 {
color: #e2e8f0 !important;
font-weight: 700 !important;
font-size: 22px !important;
margin-top: 32px !important;
margin-bottom: 12px !important;
padding-bottom: 8px;
border-bottom: 1px solid rgba(99, 126, 181, 0.1);
}
.prose h3, .markdown-text h3 {
color: #cbd5e1 !important;
font-weight: 600 !important;
}
.prose table, .markdown-text table {
border-collapse: collapse !important;
width: 100% !important;
margin: 16px 0 !important;
border-radius: 10px !important;
overflow: hidden !important;
}
.prose table th, .markdown-text table th {
background: rgba(15, 23, 42, 0.8) !important;
color: #94a3b8 !important;
font-weight: 600 !important;
font-size: 12px !important;
text-transform: uppercase !important;
letter-spacing: 0.5px !important;
padding: 12px 16px !important;
text-align: left !important;
border-bottom: 1px solid rgba(99, 126, 181, 0.15) !important;
}
.prose table td, .markdown-text table td {
padding: 10px 16px !important;
color: #cbd5e1 !important;
font-size: 13px !important;
border-bottom: 1px solid rgba(99, 126, 181, 0.06) !important;
}
.prose a, .markdown-text a {
color: #67e8f9 !important;
text-decoration: none !important;
font-weight: 500 !important;
}
.prose a:hover, .markdown-text a:hover {
text-decoration: underline !important;
color: #22d3ee !important;
}
/* ── Section containers ─────────────────────────────────────────── */
.section-card {
background: rgba(17, 24, 39, 0.6);
border: 1px solid rgba(99, 126, 181, 0.1);
border-radius: 16px;
padding: 28px;
margin-bottom: 16px;
backdrop-filter: blur(12px);
box-shadow: 0 4px 20px rgba(0, 0, 0, 0.15);
}
/* ── Footer ─────────────────────────────────────────────────────── */
.footer-text {
text-align: center;
padding: 24px 0 8px 0;
color: #475569;
font-size: 13px;
border-top: 1px solid rgba(99, 126, 181, 0.08);
margin-top: 32px;
}
.footer-text a {
color: #67e8f9 !important;
text-decoration: none;
font-weight: 500;
}
/* ── Responsive ─────────────────────────────────────────────────── */
@media (max-width: 768px) {
.header-banner {
padding: 32px 24px;
border-radius: 14px;
}
.header-title {
font-size: 28px;
}
.header-stats {
gap: 16px;
}
.header-stat {
min-width: 80px;
padding: 12px 14px;
}
.header-stat-value {
font-size: 24px;
}
.gradio-container {
padding: 8px !important;
}
}
@media (max-width: 480px) {
.header-title {
font-size: 22px;
}
.header-subtitle {
font-size: 13px;
}
.header-stats {
gap: 10px;
flex-wrap: wrap;
}
.header-stat {
min-width: 60px;
padding: 10px 12px;
}
}
"""
# ── About Markdown ───────────────────────────────────────────────────────────
# Markdown body for the LLM "About" tab. Runtime string — content intact.
ABOUT_MD = """
## What is the Tawkeed Arabic Benchmark?
The Tawkeed Arabic Benchmark is a comprehensive evaluation suite for Arabic LLMs, covering **970 questions** across **22 skill categories**. It evaluates models on everything from MMLU-style knowledge to dialect writing, diacritization, and Arabic grammar analysis.
## How are scores calculated?
Each question is scored on a **0-10 scale** using one of two methods:
- **LLM-as-Judge** -- GPT evaluates the model's response against a reference answer across dimensions like accuracy, completeness, and fluency.
- **Manual Metrics** -- Automated scoring using ROUGE-L, Levenshtein distance, diacritization accuracy, JSON validity, or word intersection.
The **Benchmark Score** is the average across all questions.
## Tier System
| Tier | Score Range | Description |
|------|-------------|-------------|
| **S** | 9.0 - 10.0 | Exceptional -- near-human performance |
| **A** | 7.0 - 8.9 | Strong -- competent across most tasks |
| **B** | 5.0 - 6.9 | Adequate -- acceptable for general use |
| **C** | 3.0 - 4.9 | Weak -- significant gaps in capability |
| **D** | 0.0 - 2.9 | Failing -- not suitable for Arabic tasks |
## Dataset
The benchmark dataset is available on HuggingFace: [tawkeed-sa/tawkeed-arabic-benchmark](https://huggingface.co/datasets/tawkeed-sa/tawkeed-arabic-benchmark)
## How to submit your model?
To evaluate your model on the Tawkeed Arabic Benchmark, reach out to the Tawkeed team with your model's API endpoint details. We'll run the evaluation and add results to this leaderboard.
"""
# Markdown body for the OCR "About" tab. Runtime string — content intact.
OCR_ABOUT_MD = """
## What is the Tawkeed OCR Benchmark?
The Tawkeed OCR Benchmark evaluates Arabic document OCR and vision-language models across **400 expert-verified images** from the Misraj-DocOCR and KITAB-Bench datasets.
## Metrics
| Metric | Direction | Description |
|--------|-----------|-------------|
| **WER** | Lower is better | Word Error Rate -- proportion of incorrectly recognized words |
| **CER** | Lower is better | Character Error Rate -- proportion of incorrectly recognized characters |
| **BLEU** | Higher is better | Bilingual Evaluation Understudy -- n-gram precision score (0-100) |
| **ChrF** | Higher is better | Character F-Score -- character-level F-measure (0-100) |
| **TEDS** | Higher is better | Tree Edit Distance Similarity -- structural table accuracy (0-100) |
| **MARS** | Higher is better | Mixed Arabic Recognition Score -- blended text + table score (0-100) |
## OCR Tier System (based on WER)
| Tier | WER Range | Description |
|------|-----------|-------------|
| **S** | < 0.10 | Exceptional -- near-perfect recognition |
| **A** | 0.10 - 0.29 | Strong -- high accuracy |
| **B** | 0.30 - 0.49 | Good -- acceptable for most uses |
| **C** | 0.50 - 0.69 | Fair -- noticeable errors |
| **D** | >= 0.70 | Weak -- significant recognition issues |
## Datasets
- **Misraj-DocOCR** -- 400 expert-verified Arabic document images ([Misraj/Misraj-DocOCR](https://huggingface.co/datasets/Misraj/Misraj-DocOCR))
- **KITAB-Bench** -- Arabic PDF-to-markdown reviewed dataset ([Misraj/KITAB_pdf_to_markdown_reviewed](https://huggingface.co/datasets/Misraj/KITAB_pdf_to_markdown_reviewed))
"""
# ── Main App ─────────────────────────────────────────────────────────────────
# Gradio theme: dark slate/cyan palette. Each variable is set for both light
# and dark modes (`*_dark` twins) so the page looks identical either way.
theme = gr.themes.Base(
    primary_hue=gr.themes.colors.cyan,
    secondary_hue=gr.themes.colors.slate,
    neutral_hue=gr.themes.colors.slate,
    font=gr.themes.GoogleFont("Inter"),
    font_mono=gr.themes.GoogleFont("JetBrains Mono"),
).set(
    body_background_fill="#0a0f1a",
    body_background_fill_dark="#0a0f1a",
    body_text_color="#e2e8f0",
    body_text_color_dark="#e2e8f0",
    body_text_color_subdued="#94a3b8",
    body_text_color_subdued_dark="#94a3b8",
    background_fill_primary="#0f1629",
    background_fill_primary_dark="#0f1629",
    background_fill_secondary="#131b2e",
    background_fill_secondary_dark="#131b2e",
    block_background_fill="#111827",
    block_background_fill_dark="#111827",
    block_border_color="rgba(99, 126, 181, 0.12)",
    block_border_color_dark="rgba(99, 126, 181, 0.12)",
    block_border_width="1px",
    block_label_text_color="#94a3b8",
    block_label_text_color_dark="#94a3b8",
    block_radius="16px",
    block_shadow="0 4px 24px rgba(0, 0, 0, 0.3)",
    block_shadow_dark="0 4px 24px rgba(0, 0, 0, 0.3)",
    block_title_text_color="#e2e8f0",
    block_title_text_color_dark="#e2e8f0",
    border_color_primary="rgba(99, 126, 181, 0.12)",
    border_color_primary_dark="rgba(99, 126, 181, 0.12)",
    input_background_fill="#1e293b",
    input_background_fill_dark="#1e293b",
    input_border_color="rgba(99, 126, 181, 0.2)",
    input_border_color_dark="rgba(99, 126, 181, 0.2)",
    input_border_width="1px",
    input_radius="12px",
    button_primary_background_fill="linear-gradient(135deg, #0891b2, #0e7490)",
    button_primary_background_fill_dark="linear-gradient(135deg, #0891b2, #0e7490)",
    button_primary_background_fill_hover="linear-gradient(135deg, #06b6d4, #0891b2)",
    button_primary_background_fill_hover_dark="linear-gradient(135deg, #06b6d4, #0891b2)",
    button_primary_text_color="#ffffff",
    button_primary_text_color_dark="#ffffff",
    button_secondary_background_fill="#1e293b",
    button_secondary_background_fill_dark="#1e293b",
    button_secondary_text_color="#e2e8f0",
    button_secondary_text_color_dark="#e2e8f0",
    checkbox_background_color="#1e293b",
    checkbox_background_color_dark="#1e293b",
    checkbox_label_background_fill="#1e293b",
    checkbox_label_background_fill_dark="#1e293b",
    table_even_background_fill="rgba(15, 23, 42, 0.4)",
    table_even_background_fill_dark="rgba(15, 23, 42, 0.4)",
    table_odd_background_fill="rgba(30, 41, 59, 0.3)",
    table_odd_background_fill_dark="rgba(30, 41, 59, 0.3)",
    table_row_focus="rgba(6, 182, 212, 0.08)",
    table_row_focus_dark="rgba(6, 182, 212, 0.08)",
    shadow_drop="0 4px 12px rgba(0, 0, 0, 0.2)",
    shadow_drop_lg="0 8px 32px rgba(0, 0, 0, 0.3)",
)
# UI layout. Gradio's context managers are order-dependent: components are
# attached to whichever `with` block encloses their construction.
demo = gr.Blocks(css=CUSTOM_CSS, theme=theme, title="Tawkeed Arabic Benchmark Leaderboard")
with demo:
    # Header (self-contained dark bg -- works in both themes)
    gr.HTML(f"""
    <div class="header-banner">
        <div class="header-badge">Arabic AI Evaluation</div>
        <div class="header-title">Tawkeed Arabic Benchmark</div>
        <p class="header-subtitle">
            The comprehensive leaderboard for evaluating Arabic large language models
            and OCR systems across diverse real-world tasks.
        </p>
        <div class="header-divider"></div>
        <div class="header-stats">
            <div class="header-stat">
                <span class="header-stat-value">970</span>
                <span class="header-stat-label">LLM Questions</span>
            </div>
            <div class="header-stat">
                <span class="header-stat-value">22</span>
                <span class="header-stat-label">Skill Categories</span>
            </div>
            <div class="header-stat">
                <span class="header-stat-value">{len(ALL_RESULTS)}</span>
                <span class="header-stat-label">LLM Models</span>
            </div>
            <div class="header-stat">
                <span class="header-stat-value">{len(ALL_OCR_RESULTS)}</span>
                <span class="header-stat-label">OCR Models</span>
            </div>
        </div>
    </div>
    """)
    # Dropdown choices come from the startup DataFrames.
    model_names = LEADERBOARD_DF["Model"].tolist() if not LEADERBOARD_DF.empty else []
    ocr_model_names = OCR_LEADERBOARD_DF["Model"].tolist() if not OCR_LEADERBOARD_DF.empty else []
    # Top-level tabs: LLM Benchmark | OCR Benchmark
    with gr.Tabs(elem_classes="tab-buttons") as top_tabs:
        # ═══════════════════════════════════════════════════════════════════
        # LLM BENCHMARK SECTION
        # ═══════════════════════════════════════════════════════════════════
        with gr.TabItem("LLM Benchmark", id="llm"):
            with gr.Tabs() as llm_tabs:
                # ── LLM Tab 1: Leaderboard ──────────────────────────────
                with gr.TabItem("Leaderboard", id=0):
                    gr.Dataframe(
                        value=DISPLAY_DF,
                        # "markdown" columns render the **bold** formatting.
                        datatype=["number", "markdown", "str", "str", "str", "markdown", "markdown"],
                        interactive=False,
                        wrap=True,
                        show_search="filter",
                        column_widths=[60, 250, 130, 110, 80, 90, 60],
                    )
                # ── LLM Tab 2: Skills Breakdown ─────────────────────────
                with gr.TabItem("Skills Breakdown", id=1):
                    skill_model_dropdown = gr.Dropdown(
                        choices=model_names,
                        value=model_names[0] if model_names else None,
                        label="Select Model",
                    )
                    skills_table = gr.Dataframe(
                        value=build_skills_df(model_names[0]) if model_names else pd.DataFrame(),
                        datatype=["str", "number", "str"],
                        interactive=False,
                        wrap=True,
                        column_widths=[280, 100, 60],
                    )
                    skill_bar_chart = gr.Plot(
                        make_bar_chart(model_names[0]) if model_names else None
                    )
                    # Refresh both the table and the bar chart together.
                    def update_skills(model_name):
                        return build_skills_df(model_name), make_bar_chart(model_name)
                    skill_model_dropdown.change(
                        update_skills,
                        inputs=skill_model_dropdown,
                        outputs=[skills_table, skill_bar_chart],
                    )
                # ── LLM Tab 3: Compare ──────────────────────────────────
                with gr.TabItem("Compare", id=2):
                    compare_dropdown = gr.Dropdown(
                        choices=model_names,
                        # Pre-select the top two models (or fewer if unavailable).
                        value=model_names[:min(2, len(model_names))],
                        label="Select Models to Compare",
                        multiselect=True,
                    )
                    radar_chart = gr.Plot(
                        make_radar_chart(model_names[:min(2, len(model_names))]) if model_names else None
                    )
                    # Empty selection clears the chart instead of erroring.
                    def update_radar(selected):
                        if not selected:
                            return go.Figure()
                        return make_radar_chart(selected)
                    compare_dropdown.change(
                        update_radar,
                        inputs=compare_dropdown,
                        outputs=radar_chart,
                    )
                # ── LLM Tab 4: About ────────────────────────────────────
                with gr.TabItem("About", id=3):
                    gr.Markdown(ABOUT_MD)
        # ═══════════════════════════════════════════════════════════════════
        # OCR BENCHMARK SECTION
        # ═══════════════════════════════════════════════════════════════════
        with gr.TabItem("OCR Benchmark", id="ocr"):
            with gr.Tabs() as ocr_tabs:
                # ── OCR Tab 1: Leaderboard ──────────────────────────────
                with gr.TabItem("Leaderboard", id=10):
                    if OCR_DISPLAY_DF.empty:
                        gr.Markdown("*No OCR benchmark results available yet.*")
                    else:
                        gr.Dataframe(
                            value=OCR_DISPLAY_DF,
                            datatype=["number", "str", "str", "str",
                                      "str", "str", "str", "str", "str", "str", "markdown"],
                            interactive=False,
                            wrap=True,
                            show_search="filter",
                            column_widths=[50, 200, 120, 80, 80, 80, 80, 80, 80, 80, 50],
                        )
                # ── OCR Tab 2: Compare ──────────────────────────────────
                with gr.TabItem("Compare", id=11):
                    if ocr_model_names:
                        ocr_compare_dropdown = gr.Dropdown(
                            choices=ocr_model_names,
                            value=ocr_model_names[:min(2, len(ocr_model_names))],
                            label="Select OCR Models to Compare",
                            multiselect=True,
                        )
                        ocr_compare_chart = gr.Plot(
                            make_ocr_compare_chart(ocr_model_names[:min(2, len(ocr_model_names))])
                        )
                        # Empty selection clears the chart instead of erroring.
                        def update_ocr_compare(selected):
                            if not selected:
                                return go.Figure()
                            return make_ocr_compare_chart(selected)
                        ocr_compare_dropdown.change(
                            update_ocr_compare,
                            inputs=ocr_compare_dropdown,
                            outputs=ocr_compare_chart,
                        )
                    else:
                        gr.Markdown("*No OCR benchmark results available yet.*")
                # ── OCR Tab 3: About ────────────────────────────────────
                with gr.TabItem("About", id=12):
                    gr.Markdown(OCR_ABOUT_MD)
        # ── Citation Tab (shared) ───────────────────────────────────────
        with gr.TabItem("Citation", id="citation"):
            gr.Markdown("### Cite the Tawkeed Arabic Benchmark")
            gr.Textbox(
                value="""@dataset{tawkeed_arabic_benchmark_2026,
title={Tawkeed Arabic Benchmark},
author={Tawkeed Team},
year={2026},
publisher={HuggingFace},
url={https://huggingface.co/datasets/tawkeed-sa/tawkeed-arabic-benchmark}
}""",
                label="BibTeX",
                lines=7,
                show_copy_button=True,
                elem_id="citation-button",
            )
    # Footer
    gr.HTML(
        '<div class="footer-text">'
        'Built by <strong>Tawkeed</strong> &middot; '
        '<a href="https://huggingface.co/datasets/tawkeed-sa/tawkeed-arabic-benchmark" target="_blank">Dataset</a> &middot; '
        '<a href="https://huggingface.co/tawkeed-sa" target="_blank">HuggingFace</a>'
        '</div>'
    )
if __name__ == "__main__":
    # ssr_mode=False: avoid server-side rendering issues on Spaces.
    demo.queue(default_concurrency_limit=40).launch(ssr_mode=False)