# NOTE(review): "Spaces / Runtime error" banner text captured from the HF Space
# page scrape — not part of the source file. See the startup-load fallback below.
"""Tawkeed Arabic Benchmark Leaderboard -- HuggingFace Space."""
import logging
import math

import gradio as gr
import httpx
import pandas as pd
import plotly.graph_objects as go

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# -- Configuration ------------------------------------------------------------

# Base URL of the Tawkeed benchmark API that serves leaderboard results.
API_BASE_URL = "https://benchmark.tawkeed.ai"

# The 22 LLM skill categories evaluated by the benchmark. Order matters:
# it drives radar-chart axes and per-skill DataFrame columns.
SKILLS = [
    "MMLU", "General Knowledge", "Reasoning & Math", "RAG QA",
    "Translation (incl Dialects)", "Trust & Safety", "Writing (incl Dialects)",
    "Arabic Language & Grammar", "Reading Comprehension", "Dialect Detection",
    "Diacritization", "Sentiment Analysis", "Summarization", "Instruction Following",
    "Transliteration", "Paraphrasing", "Entity Extraction", "Long Context",
    "Function Calling", "Hallucination", "Coding", "Structuring",
]

# OCR metric column names. WER/CER: lower is better; BLEU/ChrF/TEDS/MARS:
# higher is better (see make_ocr_compare_chart / the OCR about text).
OCR_METRICS = ["WER", "CER", "BLEU", "ChrF", "TEDS", "MARS"]

# Hex color per tier letter. NOTE(review): charts use TIER_CHART_COLORS
# instead; this mapping is not referenced in the visible code -- confirm
# whether it is still needed before removing.
TIER_COLORS = {
    "S": "#22c55e",
    "A": "#3b82f6",
    "B": "#f59e0b",
    "C": "#ef4444",
    "D": "#6b7280",
}
# Tier boundaries for 0-10 LLM scores, highest first: (minimum score, tier).
_SCORE_TIERS = ((9.0, "S"), (7.0, "A"), (5.0, "B"), (3.0, "C"))


def score_to_tier(score: float) -> str:
    """Map a 0-10 benchmark score to its tier letter (S/A/B/C/D)."""
    for minimum, tier in _SCORE_TIERS:
        if score >= minimum:
            return tier
    return "D"  # everything below 3.0
def ocr_wer_to_tier(wer: float) -> str:
    """OCR tier letter from Word Error Rate (lower WER is better)."""
    # Upper bounds per tier, best first: wer < bound => tier.
    for bound, tier in ((0.1, "S"), (0.3, "A"), (0.5, "B"), (0.7, "C")):
        if wer < bound:
            return tier
    return "D"  # WER >= 0.7
# -- Load LLM Results ---------------------------------------------------------
def load_results() -> list[dict]:
    """Fetch LLM results from the API. Raises on failure -- no fallback."""
    url = f"{API_BASE_URL}/api/results"
    response = httpx.get(url, timeout=15.0)
    response.raise_for_status()
    models = response.json()["models"]
    logger.info("Loaded %d models from API: %s", len(models), url)
    # Rank best-first by overall score (stable for ties).
    return sorted(models, key=lambda m: m["average_score"], reverse=True)
# -- Load OCR Results ---------------------------------------------------------
def load_ocr_results() -> list[dict]:
    """Fetch OCR results from the API. Raises on failure -- no fallback."""
    url = f"{API_BASE_URL}/api/ocr-results"
    response = httpx.get(url, timeout=15.0)
    response.raise_for_status()
    models = response.json()["models"]
    logger.info("Loaded %d OCR models from API: %s", len(models), url)
    # Rank best-first by WER (lower is better); a missing WER counts as 1.0.
    return sorted(models, key=lambda m: m.get("wer", 1.0))
# -- Build DataFrames ---------------------------------------------------------
def build_leaderboard_df(results: list[dict]) -> pd.DataFrame:
    """Flatten API result dicts into a ranked leaderboard DataFrame.

    One row per model: rank (1-based, in input order), metadata, overall
    Score/Tier, plus one column per SKILLS entry (None when the model has
    no score for that skill).
    """
    rows = []
    for rank, result in enumerate(results, start=1):
        per_skill = result.get("scores_by_category", {})
        entry = {
            "Rank": rank,
            "Model": result["model_name"],
            "Provider": result.get("provider", ""),
            "Eval": result.get("eval_method", ""),
            "Parameters": result.get("parameters", ""),
            "Score": result["average_score"],
            "Tier": score_to_tier(result["average_score"]),
        }
        entry.update({skill: per_skill.get(skill) for skill in SKILLS})
        rows.append(entry)
    return pd.DataFrame(rows)
def build_ocr_leaderboard_df(results: list[dict]) -> pd.DataFrame:
    """Flatten API OCR result dicts into a ranked leaderboard DataFrame."""
    rows = []
    for rank, result in enumerate(results, start=1):
        wer = result.get("wer")
        # Tier is derived from WER only; without a WER the model drops to "D".
        tier = "D" if wer is None else ocr_wer_to_tier(wer)
        rows.append({
            "Rank": rank,
            "Model": result.get("model_name", ""),
            "Provider": result.get("provider", ""),
            "Parameters": result.get("parameters", ""),
            "WER": wer,
            "CER": result.get("cer"),
            "BLEU": result.get("bleu"),
            "ChrF": result.get("chrf"),
            "TEDS": result.get("teds"),
            "MARS": result.get("mars"),
            "Tier": tier,
        })
    return pd.DataFrame(rows)
# Load results once at startup. An API/network failure here previously
# propagated out of module import and took down the whole Space with a
# "Runtime error" page; degrade to an empty leaderboard instead so the UI
# still renders (all downstream code already handles empty DataFrames).
try:
    ALL_RESULTS = load_results()
except Exception:
    logger.exception("Failed to load LLM results; starting with empty leaderboard")
    ALL_RESULTS = []
LEADERBOARD_DF = build_leaderboard_df(ALL_RESULTS)

try:
    ALL_OCR_RESULTS = load_ocr_results()
except Exception:
    logger.exception("Failed to load OCR results; starting with empty leaderboard")
    ALL_OCR_RESULTS = []
OCR_LEADERBOARD_DF = build_ocr_leaderboard_df(ALL_OCR_RESULTS)
# -- Chart Color Palette ------------------------------------------------------

# Per-trace line colors, cycled with `i % len(CHART_COLORS)` when more models
# are compared than there are colors.
CHART_COLORS = [
    "#06b6d4",  # cyan-500
    "#8b5cf6",  # violet-500
    "#f43f5e",  # rose-500
    "#f59e0b",  # amber-500
    "#10b981",  # emerald-500
    "#ec4899",  # pink-500
]

# Translucent fills matching CHART_COLORS index-for-index (radar polygons).
CHART_FILL_COLORS = [
    "rgba(6,182,212,0.10)",
    "rgba(139,92,246,0.10)",
    "rgba(244,63,94,0.10)",
    "rgba(245,158,11,0.10)",
    "rgba(16,185,129,0.10)",
    "rgba(236,72,153,0.10)",
]

# Font stack for all Plotly figures.
CHART_FONT = "Inter, system-ui, -apple-system, sans-serif"

# Dark chart background for embedded Plotly charts
CHART_BG = "rgba(15, 23, 42, 0.0)"
CHART_GRID = "rgba(148,163,184,0.12)"
CHART_AXIS_COLOR = "rgba(148,163,184,0.25)"
CHART_TEXT_COLOR = "#cbd5e1"

# Bar colors keyed by tier letter, used by the per-skill bar chart.
TIER_CHART_COLORS = {
    "S": "#22d3ee",  # cyan-400
    "A": "#818cf8",  # indigo-400
    "B": "#fbbf24",  # amber-400
    "C": "#fb7185",  # rose-400
    "D": "#94a3b8",  # slate-400
}
# -- LLM Charts ---------------------------------------------------------------
def make_radar_chart(model_names: list[str]) -> go.Figure:
    """Overlaid radar (spider) chart of per-skill scores for the given models.

    Unknown model names are skipped silently; missing skill scores plot as 0.
    """
    fig = go.Figure()
    for idx, name in enumerate(model_names):
        matches = LEADERBOARD_DF[LEADERBOARD_DF["Model"] == name]
        if matches.empty:
            continue
        record = matches.iloc[0]
        values = [record.get(skill, 0) or 0 for skill in SKILLS]
        line_color = CHART_COLORS[idx % len(CHART_COLORS)]
        fill_color = CHART_FILL_COLORS[idx % len(CHART_FILL_COLORS)]
        # Repeat the first point so the polygon closes.
        fig.add_trace(go.Scatterpolar(
            r=[*values, values[0]],
            theta=[*SKILLS, SKILLS[0]],
            mode="lines+markers",
            fill="toself",
            fillcolor=fill_color,
            line=dict(color=line_color, width=2.5, shape="spline"),
            marker=dict(size=4, color=line_color),
            name=name,
            hovertemplate="<b>%{theta}</b><br>Score: %{r:.2f}<extra></extra>",
        ))
    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 10],
                tickvals=[2, 4, 6, 8, 10],
                tickfont=dict(size=10, color=CHART_TEXT_COLOR),
                gridcolor=CHART_GRID,
                linecolor=CHART_AXIS_COLOR,
            ),
            angularaxis=dict(
                # Start at 12 o'clock and lay skills out clockwise.
                rotation=90,
                direction="clockwise",
                tickfont=dict(size=10, color=CHART_TEXT_COLOR),
                gridcolor=CHART_GRID,
                linecolor=CHART_AXIS_COLOR,
            ),
            bgcolor="rgba(0,0,0,0)",
        ),
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.22,
            xanchor="center",
            x=0.5,
            font=dict(size=12, color=CHART_TEXT_COLOR),
            bgcolor="rgba(0,0,0,0)",
        ),
        height=650,
        margin=dict(l=90, r=90, t=50, b=90),
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
        font=dict(family=CHART_FONT, color=CHART_TEXT_COLOR),
    )
    return fig
def make_bar_chart(model_name: str) -> go.Figure:
    """Horizontal bar chart of one model's per-skill scores, colored by tier.

    Returns an empty figure when the model is not in the leaderboard.
    """
    matches = LEADERBOARD_DF[LEADERBOARD_DF["Model"] == model_name]
    if matches.empty:
        return go.Figure()
    record = matches.iloc[0]

    # Collect (skill, score) pairs, dropping missing/NaN entries.
    pairs = []
    for skill in SKILLS:
        value = record.get(skill)
        if value is None:
            continue
        if isinstance(value, float) and math.isnan(value):
            continue
        pairs.append((skill, float(value)))
    # Ascending sort: with horizontal bars, the first category renders at the
    # bottom, so the strongest skills end up on top.
    pairs.sort(key=lambda pair: pair[1])

    labels = [skill for skill, _ in pairs]
    values = [score for _, score in pairs]
    bar_colors = [TIER_CHART_COLORS[score_to_tier(v)] for v in values]

    fig = go.Figure(go.Bar(
        x=values,
        y=labels,
        orientation="h",
        marker=dict(
            color=bar_colors,
            line=dict(width=0),
            cornerradius=6,
        ),
        text=[f"{v:.1f}" for v in values],
        textposition="outside",
        textfont=dict(size=11, color=CHART_TEXT_COLOR),
        hovertemplate="<b>%{y}</b><br>Score: %{x:.2f}<extra></extra>",
    ))
    fig.update_layout(
        xaxis=dict(
            # Headroom past 10 so "outside" labels are not clipped.
            range=[0, 10.8],
            title=dict(text="Score (0-10)", font=dict(size=12, color=CHART_TEXT_COLOR)),
            tickfont=dict(size=10, color=CHART_TEXT_COLOR),
            gridcolor=CHART_GRID,
            zeroline=False,
        ),
        yaxis=dict(
            tickfont=dict(size=11, color=CHART_TEXT_COLOR),
            automargin=True,
        ),
        # Grow with the number of skills so rows stay readable.
        height=max(450, 34 * len(pairs)),
        margin=dict(l=10, r=50, t=25, b=50),
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
        font=dict(family=CHART_FONT, color=CHART_TEXT_COLOR),
        bargap=0.22,
    )
    return fig
# -- OCR Charts ---------------------------------------------------------------
def make_ocr_compare_chart(model_names: list[str]) -> go.Figure:
    """Bar chart comparing selected OCR models across all 6 metrics.

    Unknown model names are skipped; missing/NaN metric values plot as 0.
    All metrics are shown as-is: WER/CER are lower-is-better, the
    BLEU/ChrF/TEDS/MARS scores are higher-is-better.
    """
    fig = go.Figure()
    for idx, name in enumerate(model_names):
        matches = OCR_LEADERBOARD_DF[OCR_LEADERBOARD_DF["Model"] == name]
        if matches.empty:
            continue
        record = matches.iloc[0]
        values = []
        for metric in OCR_METRICS:
            raw = record.get(metric)
            missing = raw is None or (isinstance(raw, float) and math.isnan(raw))
            values.append(0 if missing else float(raw))
        fig.add_trace(go.Bar(
            name=name,
            x=OCR_METRICS,
            y=values,
            marker_color=CHART_COLORS[idx % len(CHART_COLORS)],
            marker=dict(cornerradius=4),
            # Error rates (< 1) get 3 decimals; 0-100 scores get 1 decimal.
            text=[f"{v:.3f}" if v < 1 else f"{v:.1f}" for v in values],
            textposition="outside",
            textfont=dict(color=CHART_TEXT_COLOR),
        ))
    fig.update_layout(
        barmode="group",
        xaxis=dict(
            title=dict(text="Metric", font=dict(size=12, color=CHART_TEXT_COLOR)),
            tickfont=dict(size=11, color=CHART_TEXT_COLOR),
            gridcolor=CHART_GRID,
        ),
        yaxis=dict(
            title=dict(text="Value", font=dict(size=12, color=CHART_TEXT_COLOR)),
            tickfont=dict(size=10, color=CHART_TEXT_COLOR),
            gridcolor=CHART_GRID,
            zeroline=False,
        ),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.22,
            xanchor="center",
            x=0.5,
            font=dict(size=12, color=CHART_TEXT_COLOR),
            bgcolor="rgba(0,0,0,0)",
        ),
        height=520,
        margin=dict(l=60, r=40, t=40, b=70),
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
        font=dict(family=CHART_FONT, color=CHART_TEXT_COLOR),
        bargap=0.18,
    )
    return fig
# -- Build display DataFrames -------------------------------------------------
def build_leaderboard_display() -> pd.DataFrame:
    """Summary LLM leaderboard table; Score and Tier bolded via markdown."""
    if LEADERBOARD_DF.empty:
        return pd.DataFrame()
    columns = ["Rank", "Model", "Provider", "Eval", "Parameters", "Score", "Tier"]
    display = LEADERBOARD_DF[columns].copy()
    display["Score"] = [f"**{score:.2f}**" for score in display["Score"]]
    display["Tier"] = [f"**{tier}**" for tier in display["Tier"]]
    return display
def build_ocr_leaderboard_display() -> pd.DataFrame:
    """OCR leaderboard table with metrics formatted for display.

    WER/CER get 4 decimals, the 0-100 metrics get 2 decimals, and missing
    values render as "-". Tier is bolded via markdown.
    """
    if OCR_LEADERBOARD_DF.empty:
        return pd.DataFrame()
    df = OCR_LEADERBOARD_DF[["Rank", "Model", "Provider", "Parameters",
                             "WER", "CER", "BLEU", "ChrF", "TEDS", "MARS", "Tier"]].copy()

    # Bug fix: the previous `x is not None` check never fires for missing
    # metrics -- pandas stores missing numerics as NaN, not None -- so absent
    # values were displayed as the string "nan". pd.notna covers both cases.
    def _fmt(value, spec: str) -> str:
        return format(value, spec) if pd.notna(value) else "-"

    for col in ["WER", "CER"]:
        df[col] = df[col].apply(lambda x: _fmt(x, ".4f"))
    for col in ["BLEU", "ChrF", "TEDS", "MARS"]:
        df[col] = df[col].apply(lambda x: _fmt(x, ".2f"))
    df["Tier"] = df["Tier"].apply(lambda t: f"**{t}**")
    return df
def build_skills_df(model_name: str) -> pd.DataFrame:
    """Per-skill Score/Tier table for one model, strongest skills first.

    Returns an empty (Skill, Score, Tier) frame for unknown models; skills
    without a score are omitted.
    """
    matches = LEADERBOARD_DF[LEADERBOARD_DF["Model"] == model_name]
    if matches.empty:
        return pd.DataFrame(columns=["Skill", "Score", "Tier"])
    record = matches.iloc[0]

    entries = []
    for skill in SKILLS:
        value = record.get(skill)
        if value is None or (isinstance(value, float) and math.isnan(value)):
            continue
        value = float(value)
        entries.append({
            "Skill": skill,
            "Score": round(value, 2),
            # Tier from the unrounded score, matching the bar chart.
            "Tier": score_to_tier(value),
        })
    df = pd.DataFrame(entries)
    if df.empty:
        return df
    return df.sort_values("Score", ascending=False).reset_index(drop=True)
# Pre-rendered display tables, built once at startup and bound to the UI below.
DISPLAY_DF = build_leaderboard_display()
OCR_DISPLAY_DF = build_ocr_leaderboard_display()
| # ββ Custom CSS βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| CUSTOM_CSS = """ | |
| @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800;900&display=swap'); | |
| /* ββ Global βββββββββββββββββββββββββββββββββββββββββββββββββββββββ */ | |
| .gradio-container { | |
| max-width: 1280px !important; | |
| font-family: 'Inter', system-ui, -apple-system, sans-serif !important; | |
| background: transparent !important; | |
| } | |
| .dark .gradio-container, | |
| .gradio-container { | |
| --body-background-fill: #0a0f1a !important; | |
| --background-fill-primary: #0f1629 !important; | |
| --background-fill-secondary: #131b2e !important; | |
| --block-background-fill: #111827 !important; | |
| --block-border-color: rgba(99, 126, 181, 0.12) !important; | |
| --block-label-text-color: #94a3b8 !important; | |
| --body-text-color: #e2e8f0 !important; | |
| --body-text-color-subdued: #94a3b8 !important; | |
| --input-background-fill: #1e293b !important; | |
| --input-border-color: rgba(99, 126, 181, 0.2) !important; | |
| --border-color-primary: rgba(99, 126, 181, 0.12) !important; | |
| --block-shadow: 0 4px 24px rgba(0, 0, 0, 0.3) !important; | |
| --block-border-width: 1px !important; | |
| --block-radius: 16px !important; | |
| --checkbox-label-background-fill: #1e293b !important; | |
| --checkbox-background-color: #1e293b !important; | |
| --table-even-background-fill: rgba(15, 23, 42, 0.4) !important; | |
| --table-odd-background-fill: rgba(30, 41, 59, 0.3) !important; | |
| --table-row-focus: rgba(6, 182, 212, 0.08) !important; | |
| } | |
| /* ββ Header Banner ββββββββββββββββββββββββββββββββββββββββββββββββ */ | |
| .header-banner { | |
| background: linear-gradient(145deg, #0a0f1a 0%, #0f1d3a 30%, #0c2a50 55%, #0e3b6e 80%, #104080 100%); | |
| border-radius: 20px; | |
| padding: 52px 48px 48px 48px; | |
| margin-bottom: 28px; | |
| position: relative; | |
| overflow: hidden; | |
| border: 1px solid rgba(6, 182, 212, 0.12); | |
| box-shadow: | |
| 0 0 80px rgba(6, 182, 212, 0.06), | |
| 0 20px 60px rgba(0, 0, 0, 0.4), | |
| inset 0 1px 0 rgba(255, 255, 255, 0.04); | |
| } | |
| .header-banner::before { | |
| content: ''; | |
| position: absolute; | |
| top: -60%; | |
| right: -15%; | |
| width: 500px; | |
| height: 500px; | |
| background: radial-gradient(circle, rgba(6, 182, 212, 0.12) 0%, rgba(6, 182, 212, 0.03) 40%, transparent 70%); | |
| border-radius: 50%; | |
| animation: pulse-glow 6s ease-in-out infinite alternate; | |
| } | |
| .header-banner::after { | |
| content: ''; | |
| position: absolute; | |
| bottom: -40%; | |
| left: -8%; | |
| width: 400px; | |
| height: 400px; | |
| background: radial-gradient(circle, rgba(139, 92, 246, 0.08) 0%, transparent 65%); | |
| border-radius: 50%; | |
| animation: pulse-glow 8s ease-in-out infinite alternate-reverse; | |
| } | |
| @keyframes pulse-glow { | |
| 0% { opacity: 0.5; transform: scale(1); } | |
| 100% { opacity: 1; transform: scale(1.08); } | |
| } | |
| .header-badge { | |
| display: inline-block; | |
| background: rgba(6, 182, 212, 0.12); | |
| border: 1px solid rgba(6, 182, 212, 0.25); | |
| color: #67e8f9; | |
| padding: 5px 14px; | |
| border-radius: 100px; | |
| font-size: 11px; | |
| font-weight: 600; | |
| letter-spacing: 1.5px; | |
| text-transform: uppercase; | |
| margin-bottom: 16px; | |
| position: relative; | |
| z-index: 1; | |
| backdrop-filter: blur(8px); | |
| } | |
| .header-title { | |
| font-size: 42px; | |
| font-weight: 900; | |
| color: white; | |
| margin: 0 0 10px 0; | |
| letter-spacing: -1px; | |
| position: relative; | |
| z-index: 1; | |
| line-height: 1.1; | |
| background: linear-gradient(135deg, #ffffff 0%, #e0f2fe 50%, #67e8f9 100%); | |
| -webkit-background-clip: text; | |
| -webkit-text-fill-color: transparent; | |
| background-clip: text; | |
| } | |
| .header-subtitle { | |
| font-size: 16px; | |
| color: rgba(203, 213, 225, 0.8); | |
| margin: 0; | |
| position: relative; | |
| z-index: 1; | |
| max-width: 620px; | |
| line-height: 1.6; | |
| font-weight: 400; | |
| } | |
| .header-divider { | |
| width: 60px; | |
| height: 2px; | |
| background: linear-gradient(90deg, #06b6d4, rgba(6, 182, 212, 0)); | |
| margin: 24px 0; | |
| border-radius: 2px; | |
| position: relative; | |
| z-index: 1; | |
| } | |
| .header-stats { | |
| display: flex; | |
| gap: 40px; | |
| margin-top: 0; | |
| position: relative; | |
| z-index: 1; | |
| flex-wrap: wrap; | |
| } | |
| .header-stat { | |
| display: flex; | |
| flex-direction: column; | |
| padding: 16px 20px; | |
| background: rgba(255, 255, 255, 0.03); | |
| border: 1px solid rgba(255, 255, 255, 0.06); | |
| border-radius: 14px; | |
| backdrop-filter: blur(12px); | |
| min-width: 100px; | |
| transition: all 0.3s ease; | |
| } | |
| .header-stat:hover { | |
| background: rgba(255, 255, 255, 0.06); | |
| border-color: rgba(6, 182, 212, 0.2); | |
| transform: translateY(-2px); | |
| box-shadow: 0 8px 24px rgba(0, 0, 0, 0.3); | |
| } | |
| .header-stat-value { | |
| font-size: 32px; | |
| font-weight: 800; | |
| color: white; | |
| line-height: 1; | |
| letter-spacing: -0.5px; | |
| } | |
| .header-stat-label { | |
| font-size: 11px; | |
| color: rgba(148, 163, 184, 0.7); | |
| text-transform: uppercase; | |
| letter-spacing: 1.5px; | |
| margin-top: 6px; | |
| font-weight: 600; | |
| } | |
| /* ββ Tabs βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */ | |
| .tab-buttons button { | |
| font-size: 14px !important; | |
| font-weight: 600 !important; | |
| letter-spacing: 0.3px !important; | |
| padding: 10px 24px !important; | |
| border-radius: 10px !important; | |
| transition: all 0.25s ease !important; | |
| } | |
| .tab-buttons button.selected { | |
| background: linear-gradient(135deg, rgba(6, 182, 212, 0.15), rgba(139, 92, 246, 0.1)) !important; | |
| border-color: rgba(6, 182, 212, 0.3) !important; | |
| color: #67e8f9 !important; | |
| box-shadow: 0 0 20px rgba(6, 182, 212, 0.1) !important; | |
| } | |
| /* ββ Dataframe / Table ββββββββββββββββββββββββββββββββββββββββββββ */ | |
| .gradio-dataframe { | |
| border-radius: 14px !important; | |
| overflow: hidden !important; | |
| border: 1px solid rgba(99, 126, 181, 0.12) !important; | |
| box-shadow: 0 4px 24px rgba(0, 0, 0, 0.2) !important; | |
| } | |
| table thead th { | |
| background: rgba(15, 22, 41, 0.9) !important; | |
| color: #94a3b8 !important; | |
| font-weight: 700 !important; | |
| font-size: 11px !important; | |
| text-transform: uppercase !important; | |
| letter-spacing: 0.8px !important; | |
| padding: 14px 16px !important; | |
| border-bottom: 2px solid rgba(6, 182, 212, 0.15) !important; | |
| } | |
| table tbody td { | |
| padding: 12px 16px !important; | |
| font-size: 13px !important; | |
| border-bottom: 1px solid rgba(99, 126, 181, 0.06) !important; | |
| color: #e2e8f0 !important; | |
| transition: background 0.15s ease !important; | |
| } | |
| table tbody tr:hover td { | |
| background: rgba(6, 182, 212, 0.04) !important; | |
| } | |
| /* ββ Dropdowns and Inputs βββββββββββββββββββββββββββββββββββββββββ */ | |
| .gradio-dropdown, .gradio-textbox, input, textarea, select { | |
| border-radius: 12px !important; | |
| font-size: 14px !important; | |
| } | |
| .gradio-dropdown .wrap { | |
| border-color: rgba(99, 126, 181, 0.2) !important; | |
| background: #1e293b !important; | |
| } | |
| /* ββ Plot containers ββββββββββββββββββββββββββββββββββββββββββββββ */ | |
| .gradio-plot { | |
| border-radius: 16px !important; | |
| overflow: hidden !important; | |
| border: 1px solid rgba(99, 126, 181, 0.1) !important; | |
| box-shadow: 0 4px 20px rgba(0, 0, 0, 0.15) !important; | |
| } | |
| /* ββ Citation βββββββββββββββββββββββββββββββββββββββββββββββββββββ */ | |
| #citation-button textarea { | |
| font-family: 'JetBrains Mono', 'Fira Code', monospace !important; | |
| font-size: 13px !important; | |
| line-height: 1.6 !important; | |
| background: #0f172a !important; | |
| border: 1px solid rgba(99, 126, 181, 0.15) !important; | |
| border-radius: 12px !important; | |
| color: #67e8f9 !important; | |
| padding: 20px !important; | |
| } | |
| /* ββ Markdown content βββββββββββββββββββββββββββββββββββββββββββββ */ | |
| .prose h2, .markdown-text h2 { | |
| color: #e2e8f0 !important; | |
| font-weight: 700 !important; | |
| font-size: 22px !important; | |
| margin-top: 32px !important; | |
| margin-bottom: 12px !important; | |
| padding-bottom: 8px; | |
| border-bottom: 1px solid rgba(99, 126, 181, 0.1); | |
| } | |
| .prose h3, .markdown-text h3 { | |
| color: #cbd5e1 !important; | |
| font-weight: 600 !important; | |
| } | |
| .prose table, .markdown-text table { | |
| border-collapse: collapse !important; | |
| width: 100% !important; | |
| margin: 16px 0 !important; | |
| border-radius: 10px !important; | |
| overflow: hidden !important; | |
| } | |
| .prose table th, .markdown-text table th { | |
| background: rgba(15, 23, 42, 0.8) !important; | |
| color: #94a3b8 !important; | |
| font-weight: 600 !important; | |
| font-size: 12px !important; | |
| text-transform: uppercase !important; | |
| letter-spacing: 0.5px !important; | |
| padding: 12px 16px !important; | |
| text-align: left !important; | |
| border-bottom: 1px solid rgba(99, 126, 181, 0.15) !important; | |
| } | |
| .prose table td, .markdown-text table td { | |
| padding: 10px 16px !important; | |
| color: #cbd5e1 !important; | |
| font-size: 13px !important; | |
| border-bottom: 1px solid rgba(99, 126, 181, 0.06) !important; | |
| } | |
| .prose a, .markdown-text a { | |
| color: #67e8f9 !important; | |
| text-decoration: none !important; | |
| font-weight: 500 !important; | |
| } | |
| .prose a:hover, .markdown-text a:hover { | |
| text-decoration: underline !important; | |
| color: #22d3ee !important; | |
| } | |
| /* ββ Section containers βββββββββββββββββββββββββββββββββββββββββββ */ | |
| .section-card { | |
| background: rgba(17, 24, 39, 0.6); | |
| border: 1px solid rgba(99, 126, 181, 0.1); | |
| border-radius: 16px; | |
| padding: 28px; | |
| margin-bottom: 16px; | |
| backdrop-filter: blur(12px); | |
| box-shadow: 0 4px 20px rgba(0, 0, 0, 0.15); | |
| } | |
| /* ββ Footer βββββββββββββββββββββββββββββββββββββββββββββββββββββββ */ | |
| .footer-text { | |
| text-align: center; | |
| padding: 24px 0 8px 0; | |
| color: #475569; | |
| font-size: 13px; | |
| border-top: 1px solid rgba(99, 126, 181, 0.08); | |
| margin-top: 32px; | |
| } | |
| .footer-text a { | |
| color: #67e8f9 !important; | |
| text-decoration: none; | |
| font-weight: 500; | |
| } | |
| /* ββ Responsive βββββββββββββββββββββββββββββββββββββββββββββββββββ */ | |
| @media (max-width: 768px) { | |
| .header-banner { | |
| padding: 32px 24px; | |
| border-radius: 14px; | |
| } | |
| .header-title { | |
| font-size: 28px; | |
| } | |
| .header-stats { | |
| gap: 16px; | |
| } | |
| .header-stat { | |
| min-width: 80px; | |
| padding: 12px 14px; | |
| } | |
| .header-stat-value { | |
| font-size: 24px; | |
| } | |
| .gradio-container { | |
| padding: 8px !important; | |
| } | |
| } | |
| @media (max-width: 480px) { | |
| .header-title { | |
| font-size: 22px; | |
| } | |
| .header-subtitle { | |
| font-size: 13px; | |
| } | |
| .header-stats { | |
| gap: 10px; | |
| flex-wrap: wrap; | |
| } | |
| .header-stat { | |
| min-width: 60px; | |
| padding: 10px 12px; | |
| } | |
| } | |
| """ | |
| # ββ About Markdown βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| ABOUT_MD = """ | |
| ## What is the Tawkeed Arabic Benchmark? | |
| The Tawkeed Arabic Benchmark is a comprehensive evaluation suite for Arabic LLMs, covering **970 questions** across **22 skill categories**. It evaluates models on everything from MMLU-style knowledge to dialect writing, diacritization, and Arabic grammar analysis. | |
| ## How are scores calculated? | |
| Each question is scored on a **0-10 scale** using one of two methods: | |
| - **LLM-as-Judge** -- GPT evaluates the model's response against a reference answer across dimensions like accuracy, completeness, and fluency. | |
| - **Manual Metrics** -- Automated scoring using ROUGE-L, Levenshtein distance, diacritization accuracy, JSON validity, or word intersection. | |
| The **Benchmark Score** is the average across all questions. | |
| ## Tier System | |
| | Tier | Score Range | Description | | |
| |------|-------------|-------------| | |
| | **S** | 9.0 - 10.0 | Exceptional -- near-human performance | | |
| | **A** | 7.0 - 8.9 | Strong -- competent across most tasks | | |
| | **B** | 5.0 - 6.9 | Adequate -- acceptable for general use | | |
| | **C** | 3.0 - 4.9 | Weak -- significant gaps in capability | | |
| | **D** | 0.0 - 2.9 | Failing -- not suitable for Arabic tasks | | |
| ## Dataset | |
| The benchmark dataset is available on HuggingFace: [tawkeed-sa/tawkeed-arabic-benchmark](https://huggingface.co/datasets/tawkeed-sa/tawkeed-arabic-benchmark) | |
| ## How to submit your model? | |
| To evaluate your model on the Tawkeed Arabic Benchmark, reach out to the Tawkeed team with your model's API endpoint details. We'll run the evaluation and add results to this leaderboard. | |
| """ | |
| OCR_ABOUT_MD = """ | |
| ## What is the Tawkeed OCR Benchmark? | |
| The Tawkeed OCR Benchmark evaluates Arabic document OCR and vision-language models across **400 expert-verified images** from the Misraj-DocOCR and KITAB-Bench datasets. | |
| ## Metrics | |
| | Metric | Direction | Description | | |
| |--------|-----------|-------------| | |
| | **WER** | Lower is better | Word Error Rate -- proportion of incorrectly recognized words | | |
| | **CER** | Lower is better | Character Error Rate -- proportion of incorrectly recognized characters | | |
| | **BLEU** | Higher is better | Bilingual Evaluation Understudy -- n-gram precision score (0-100) | | |
| | **ChrF** | Higher is better | Character F-Score -- character-level F-measure (0-100) | | |
| | **TEDS** | Higher is better | Tree Edit Distance Similarity -- structural table accuracy (0-100) | | |
| | **MARS** | Higher is better | Mixed Arabic Recognition Score -- blended text + table score (0-100) | | |
| ## OCR Tier System (based on WER) | |
| | Tier | WER Range | Description | | |
| |------|-----------|-------------| | |
| | **S** | < 0.10 | Exceptional -- near-perfect recognition | | |
| | **A** | 0.10 - 0.29 | Strong -- high accuracy | | |
| | **B** | 0.30 - 0.49 | Good -- acceptable for most uses | | |
| | **C** | 0.50 - 0.69 | Fair -- noticeable errors | | |
| | **D** | >= 0.70 | Weak -- significant recognition issues | | |
| ## Datasets | |
| - **Misraj-DocOCR** -- 400 expert-verified Arabic document images ([Misraj/Misraj-DocOCR](https://huggingface.co/datasets/Misraj/Misraj-DocOCR)) | |
| - **KITAB-Bench** -- Arabic PDF-to-markdown reviewed dataset ([Misraj/KITAB_pdf_to_markdown_reviewed](https://huggingface.co/datasets/Misraj/KITAB_pdf_to_markdown_reviewed)) | |
| """ | |
# -- Main App -----------------------------------------------------------------
# Dark slate/cyan Gradio theme. Every *_dark variant mirrors its light value
# so the palette looks identical whichever mode the browser requests.
theme = gr.themes.Base(
    primary_hue=gr.themes.colors.cyan,
    secondary_hue=gr.themes.colors.slate,
    neutral_hue=gr.themes.colors.slate,
    font=gr.themes.GoogleFont("Inter"),
    font_mono=gr.themes.GoogleFont("JetBrains Mono"),
).set(
    body_background_fill="#0a0f1a",
    body_background_fill_dark="#0a0f1a",
    body_text_color="#e2e8f0",
    body_text_color_dark="#e2e8f0",
    body_text_color_subdued="#94a3b8",
    body_text_color_subdued_dark="#94a3b8",
    background_fill_primary="#0f1629",
    background_fill_primary_dark="#0f1629",
    background_fill_secondary="#131b2e",
    background_fill_secondary_dark="#131b2e",
    block_background_fill="#111827",
    block_background_fill_dark="#111827",
    block_border_color="rgba(99, 126, 181, 0.12)",
    block_border_color_dark="rgba(99, 126, 181, 0.12)",
    block_border_width="1px",
    block_label_text_color="#94a3b8",
    block_label_text_color_dark="#94a3b8",
    block_radius="16px",
    block_shadow="0 4px 24px rgba(0, 0, 0, 0.3)",
    block_shadow_dark="0 4px 24px rgba(0, 0, 0, 0.3)",
    block_title_text_color="#e2e8f0",
    block_title_text_color_dark="#e2e8f0",
    border_color_primary="rgba(99, 126, 181, 0.12)",
    border_color_primary_dark="rgba(99, 126, 181, 0.12)",
    input_background_fill="#1e293b",
    input_background_fill_dark="#1e293b",
    input_border_color="rgba(99, 126, 181, 0.2)",
    input_border_color_dark="rgba(99, 126, 181, 0.2)",
    input_border_width="1px",
    input_radius="12px",
    button_primary_background_fill="linear-gradient(135deg, #0891b2, #0e7490)",
    button_primary_background_fill_dark="linear-gradient(135deg, #0891b2, #0e7490)",
    button_primary_background_fill_hover="linear-gradient(135deg, #06b6d4, #0891b2)",
    button_primary_background_fill_hover_dark="linear-gradient(135deg, #06b6d4, #0891b2)",
    button_primary_text_color="#ffffff",
    button_primary_text_color_dark="#ffffff",
    button_secondary_background_fill="#1e293b",
    button_secondary_background_fill_dark="#1e293b",
    button_secondary_text_color="#e2e8f0",
    button_secondary_text_color_dark="#e2e8f0",
    checkbox_background_color="#1e293b",
    checkbox_background_color_dark="#1e293b",
    checkbox_label_background_fill="#1e293b",
    checkbox_label_background_fill_dark="#1e293b",
    table_even_background_fill="rgba(15, 23, 42, 0.4)",
    table_even_background_fill_dark="rgba(15, 23, 42, 0.4)",
    table_odd_background_fill="rgba(30, 41, 59, 0.3)",
    table_odd_background_fill_dark="rgba(30, 41, 59, 0.3)",
    table_row_focus="rgba(6, 182, 212, 0.08)",
    table_row_focus_dark="rgba(6, 182, 212, 0.08)",
    shadow_drop="0 4px 12px rgba(0, 0, 0, 0.2)",
    shadow_drop_lg="0 8px 32px rgba(0, 0, 0, 0.3)",
)
# ── UI ──────────────────────────────────────────────────────────────────────
demo = gr.Blocks(css=CUSTOM_CSS, theme=theme, title="Tawkeed Arabic Benchmark Leaderboard")

with demo:
    # Header (self-contained dark bg -- works in both themes).
    # The "Skill Categories" stat is derived from SKILLS so it can never drift
    # from the actual category list.
    # NOTE(review): the "970" LLM-question count is hard-coded — no module
    # constant exists for it; update manually when the dataset changes.
    gr.HTML(f"""
    <div class="header-banner">
        <div class="header-badge">Arabic AI Evaluation</div>
        <div class="header-title">Tawkeed Arabic Benchmark</div>
        <p class="header-subtitle">
            The comprehensive leaderboard for evaluating Arabic large language models
            and OCR systems across diverse real-world tasks.
        </p>
        <div class="header-divider"></div>
        <div class="header-stats">
            <div class="header-stat">
                <span class="header-stat-value">970</span>
                <span class="header-stat-label">LLM Questions</span>
            </div>
            <div class="header-stat">
                <span class="header-stat-value">{len(SKILLS)}</span>
                <span class="header-stat-label">Skill Categories</span>
            </div>
            <div class="header-stat">
                <span class="header-stat-value">{len(ALL_RESULTS)}</span>
                <span class="header-stat-label">LLM Models</span>
            </div>
            <div class="header-stat">
                <span class="header-stat-value">{len(ALL_OCR_RESULTS)}</span>
                <span class="header-stat-label">OCR Models</span>
            </div>
        </div>
    </div>
    """)

    # Model name lists drive the dropdown choices; empty leaderboards yield
    # empty dropdowns rather than crashing on a missing "Model" column.
    model_names = LEADERBOARD_DF["Model"].tolist() if not LEADERBOARD_DF.empty else []
    ocr_model_names = OCR_LEADERBOARD_DF["Model"].tolist() if not OCR_LEADERBOARD_DF.empty else []

    # Top-level tabs: LLM Benchmark | OCR Benchmark
    with gr.Tabs(elem_classes="tab-buttons") as top_tabs:
        # ───────────────────────────────────────────────────────────────────
        # LLM BENCHMARK SECTION
        # ───────────────────────────────────────────────────────────────────
        with gr.TabItem("LLM Benchmark", id="llm"):
            with gr.Tabs() as llm_tabs:
                # ── LLM Tab 1: Leaderboard (static, searchable table) ──────
                with gr.TabItem("Leaderboard", id=0):
                    gr.Dataframe(
                        value=DISPLAY_DF,
                        datatype=["number", "markdown", "str", "str", "str", "markdown", "markdown"],
                        interactive=False,
                        wrap=True,
                        show_search="filter",
                        column_widths=[60, 250, 130, 110, 80, 90, 60],
                    )

                # ── LLM Tab 2: Skills Breakdown (per-model table + bar) ────
                with gr.TabItem("Skills Breakdown", id=1):
                    skill_model_dropdown = gr.Dropdown(
                        choices=model_names,
                        value=model_names[0] if model_names else None,
                        label="Select Model",
                    )
                    skills_table = gr.Dataframe(
                        value=build_skills_df(model_names[0]) if model_names else pd.DataFrame(),
                        datatype=["str", "number", "str"],
                        interactive=False,
                        wrap=True,
                        column_widths=[280, 100, 60],
                    )
                    skill_bar_chart = gr.Plot(
                        make_bar_chart(model_names[0]) if model_names else None
                    )

                    def update_skills(model_name):
                        """Refresh the skills table and bar chart for *model_name*."""
                        return build_skills_df(model_name), make_bar_chart(model_name)

                    skill_model_dropdown.change(
                        update_skills,
                        inputs=skill_model_dropdown,
                        outputs=[skills_table, skill_bar_chart],
                    )

                # ── LLM Tab 3: Compare (multi-model radar chart) ───────────
                with gr.TabItem("Compare", id=2):
                    # Slicing clamps automatically, so [:2] is safe even when
                    # fewer than two models are available.
                    compare_dropdown = gr.Dropdown(
                        choices=model_names,
                        value=model_names[:2],
                        label="Select Models to Compare",
                        multiselect=True,
                    )
                    radar_chart = gr.Plot(
                        make_radar_chart(model_names[:2]) if model_names else None
                    )

                    def update_radar(selected):
                        """Rebuild the radar chart; blank figure when nothing is selected."""
                        if not selected:
                            return go.Figure()
                        return make_radar_chart(selected)

                    compare_dropdown.change(
                        update_radar,
                        inputs=compare_dropdown,
                        outputs=radar_chart,
                    )

                # ── LLM Tab 4: About ───────────────────────────────────────
                with gr.TabItem("About", id=3):
                    gr.Markdown(ABOUT_MD)

        # ───────────────────────────────────────────────────────────────────
        # OCR BENCHMARK SECTION
        # ───────────────────────────────────────────────────────────────────
        with gr.TabItem("OCR Benchmark", id="ocr"):
            with gr.Tabs() as ocr_tabs:
                # ── OCR Tab 1: Leaderboard ─────────────────────────────────
                with gr.TabItem("Leaderboard", id=10):
                    if OCR_DISPLAY_DF.empty:
                        gr.Markdown("*No OCR benchmark results available yet.*")
                    else:
                        gr.Dataframe(
                            value=OCR_DISPLAY_DF,
                            datatype=["number", "str", "str", "str",
                                      "str", "str", "str", "str", "str", "str", "markdown"],
                            interactive=False,
                            wrap=True,
                            show_search="filter",
                            column_widths=[50, 200, 120, 80, 80, 80, 80, 80, 80, 80, 50],
                        )

                # ── OCR Tab 2: Compare ─────────────────────────────────────
                with gr.TabItem("Compare", id=11):
                    if ocr_model_names:
                        ocr_compare_dropdown = gr.Dropdown(
                            choices=ocr_model_names,
                            value=ocr_model_names[:2],
                            label="Select OCR Models to Compare",
                            multiselect=True,
                        )
                        ocr_compare_chart = gr.Plot(
                            make_ocr_compare_chart(ocr_model_names[:2])
                        )

                        def update_ocr_compare(selected):
                            """Rebuild the OCR comparison chart; blank figure when empty."""
                            if not selected:
                                return go.Figure()
                            return make_ocr_compare_chart(selected)

                        ocr_compare_dropdown.change(
                            update_ocr_compare,
                            inputs=ocr_compare_dropdown,
                            outputs=ocr_compare_chart,
                        )
                    else:
                        gr.Markdown("*No OCR benchmark results available yet.*")

                # ── OCR Tab 3: About ───────────────────────────────────────
                with gr.TabItem("About", id=12):
                    gr.Markdown(OCR_ABOUT_MD)

        # ── Citation Tab (shared between both benchmarks) ──────────────────
        with gr.TabItem("Citation", id="citation"):
            gr.Markdown("### Cite the Tawkeed Arabic Benchmark")
            gr.Textbox(
                value="""@dataset{tawkeed_arabic_benchmark_2026,
  title={Tawkeed Arabic Benchmark},
  author={Tawkeed Team},
  year={2026},
  publisher={HuggingFace},
  url={https://huggingface.co/datasets/tawkeed-sa/tawkeed-arabic-benchmark}
}""",
                label="BibTeX",
                lines=7,
                show_copy_button=True,
                elem_id="citation-button",
            )

    # Footer
    gr.HTML(
        '<div class="footer-text">'
        'Built by <strong>Tawkeed</strong> · '
        '<a href="https://huggingface.co/datasets/tawkeed-sa/tawkeed-arabic-benchmark" target="_blank">Dataset</a> · '
        '<a href="https://huggingface.co/tawkeed-sa" target="_blank">HuggingFace</a>'
        '</div>'
    )
# Script entry point: enable request queuing (at most 40 concurrent event
# handlers) and serve the leaderboard with server-side rendering disabled.
if __name__ == "__main__":
    app = demo.queue(default_concurrency_limit=40)
    app.launch(ssr_mode=False)