Spaces:

HeshamHaroon
/

Arabic-Function-Calling-Leaderboard

Running

App Files Files Community

HeshamHaroon committed 17 days ago

Commit

d9d7dd0

verified ·

1 Parent(s): 9a35f45

Update: Auto-evaluation on Space startup

Browse files

Files changed (1) hide show

afcl/app.py +326 -125

afcl/app.py CHANGED Viewed

@@ -2,8 +2,7 @@
 Arabic Function Calling Leaderboard (AFCL)
 ==========================================
-A Gradio-based leaderboard that evaluates LLMs on Arabic function calling.
-Evaluation runs on HuggingFace Space infrastructure.
 """
 import gradio as gr
@@ -13,70 +12,170 @@ import os
 import re
 import time
 import requests
-from pathlib import Path
 from typing import Dict, List, Optional
 from threading import Thread
 from datasets import load_dataset
-import huggingface_hub
 # Constants
-TITLE = "🏆 Arabic Function Calling Leaderboard"
-TITLE_AR = "🏆 لوحة تقييم استدعاء الدوال بالعربية"
-DESCRIPTION = """
-The **Arabic Function Calling Leaderboard (AFCL)** evaluates Large Language Models on their ability to understand Arabic queries and generate appropriate function calls.
-**لوحة تقييم استدعاء الدوال بالعربية** تقيّم نماذج اللغة الكبيرة على قدرتها على فهم الاستعلامات العربية وإنشاء استدعاءات الدوال المناسبة.
-"""
 # All 28 Models to evaluate
 MODELS_TO_EVALUATE = [
     # Arabic-Native LLMs
-    {"model": "Jais-30B-Chat", "model_id": "inceptionai/jais-30b-chat-v3", "organization": "Inception AI"},
-    {"model": "ALLaM-7B-Instruct", "model_id": "sdaia/allam-1-7b-instruct", "organization": "SDAIA"},
-    {"model": "SILMA-9B-Instruct", "model_id": "silma-ai/SILMA-9B-Instruct-v1.0", "organization": "Silma AI"},
-    {"model": "Fanar-Star-1.2B", "model_id": "QatarComputing/fanar-star-1.2b", "organization": "QCRI"},
-    {"model": "AceGPT-13B-Chat", "model_id": "FreedomIntelligence/AceGPT-13B-chat", "organization": "FreedomIntelligence"},
-    {"model": "AraGPT2-Mega", "model_id": "aubmindlab/aragpt2-mega", "organization": "AUB MIND Lab"},
     # Multilingual with strong Arabic
-    {"model": "Qwen2.5-72B-Instruct", "model_id": "Qwen/Qwen2.5-72B-Instruct", "organization": "Alibaba Qwen"},
-    {"model": "Qwen2.5-32B-Instruct", "model_id": "Qwen/Qwen2.5-32B-Instruct", "organization": "Alibaba Qwen"},
-    {"model": "Qwen2.5-7B-Instruct", "model_id": "Qwen/Qwen2.5-7B-Instruct", "organization": "Alibaba Qwen"},
-    {"model": "Llama-3.1-70B-Instruct", "model_id": "meta-llama/Llama-3.1-70B-Instruct", "organization": "Meta"},
-    {"model": "Llama-3.1-8B-Instruct", "model_id": "meta-llama/Llama-3.1-8B-Instruct", "organization": "Meta"},
-    {"model": "Gemma-2-27B-IT", "model_id": "google/gemma-2-27b-it", "organization": "Google"},
-    {"model": "Gemma-2-9B-IT", "model_id": "google/gemma-2-9b-it", "organization": "Google"},
     # Cohere Arabic Models
-    {"model": "Aya-Expanse-32B", "model_id": "CohereForAI/aya-expanse-32b", "organization": "Cohere For AI"},
-    {"model": "Aya-Expanse-8B", "model_id": "CohereForAI/aya-expanse-8b", "organization": "Cohere For AI"},
-    {"model": "c4ai-command-r7b-arabic", "model_id": "CohereForAI/c4ai-command-r7b-arabic-02-2025", "organization": "Cohere For AI"},
     # Falcon (UAE)
-    {"model": "Falcon-180B-Chat", "model_id": "tiiuae/falcon-180B-chat", "organization": "TII UAE"},
-    {"model": "Falcon-40B-Instruct", "model_id": "tiiuae/falcon-40b-instruct", "organization": "TII UAE"},
     # Mistral
-    {"model": "Mistral-Large-Instruct", "model_id": "mistralai/Mistral-Large-Instruct-2411", "organization": "Mistral AI"},
-    {"model": "Mixtral-8x22B-Instruct", "model_id": "mistralai/Mixtral-8x22B-Instruct-v0.1", "organization": "Mistral AI"},
-    {"model": "Mistral-7B-Instruct", "model_id": "mistralai/Mistral-7B-Instruct-v0.3", "organization": "Mistral AI"},
     # Others
-    {"model": "DeepSeek-V3", "model_id": "deepseek-ai/DeepSeek-V3", "organization": "DeepSeek"},
-    {"model": "Phi-4", "model_id": "microsoft/phi-4", "organization": "Microsoft"},
-    {"model": "Phi-3-Mini-Instruct", "model_id": "microsoft/Phi-3-mini-4k-instruct", "organization": "Microsoft"},
-    {"model": "BLOOM-176B", "model_id": "bigscience/bloom", "organization": "BigScience"},
-    {"model": "BLOOMZ-7B1", "model_id": "bigscience/bloomz-7b1", "organization": "BigScience"},
     # Arabic Fine-tuned
-    {"model": "Arabic-Llama-3.1-8B", "model_id": "Ammar-Arabi/Arabic-Llama-3.1-8B-Instruct", "organization": "Ammar Arabi"},
-    {"model": "Llama3-8B-Arabic-Instruct", "model_id": "MahmoudAshraf/Llama3-8B-Arabic-instruct", "organization": "Mahmoud Ashraf"},
 ]
 # Global state
 LEADERBOARD_DATA = []
-EVALUATION_STATUS = "Not started"
 def load_evaluation_dataset():
@@ -101,21 +200,18 @@ def load_evaluation_dataset():
 def create_prompt(query: str, functions: List[Dict]) -> str:
     """Create evaluation prompt."""
-    func_desc = "You are a function calling AI. Given the user query and available functions, respond with a JSON function call.\n\nAvailable functions:\n"
     for f in functions:
         func_desc += f"- {f.get('name')}: {f.get('description', '')}\n"
     return f"""{func_desc}
-User Query (Arabic): {query}
-Respond ONLY with a JSON object:
-{{"name": "function_name", "arguments": {{"param1": "value1"}}}}
-If no function should be called:
-{{"name": null, "arguments": {{}}}}
-JSON Response:"""
 def call_model(model_id: str, prompt: str) -> str:
@@ -124,17 +220,13 @@ def call_model(model_id: str, prompt: str) -> str:
     headers = {"Authorization": f"Bearer {token}"}
     url = f"https://api-inference.huggingface.co/models/{model_id}"
-    payload = {
-        "inputs": prompt,
-        "parameters": {"max_new_tokens": 200, "temperature": 0.1}
-    }
     try:
         response = requests.post(url, headers=headers, json=payload, timeout=60)
         if response.status_code == 503:
             time.sleep(20)
             response = requests.post(url, headers=headers, json=payload, timeout=60)
         result = response.json()
         if isinstance(result, list) and result:
             return result[0].get("generated_text", "")
@@ -197,11 +289,11 @@ def run_evaluation():
     """Run full evaluation on all models."""
     global LEADERBOARD_DATA, EVALUATION_STATUS
-    EVALUATION_STATUS = "Loading dataset..."
     samples = load_evaluation_dataset()
     if not samples:
-        EVALUATION_STATUS = "Failed to load dataset"
         return
     results = []
@@ -211,7 +303,8 @@ def run_evaluation():
         model_name = model_config['model']
         model_id = model_config['model_id']
-        EVALUATION_STATUS = f"Evaluating {model_name} ({idx+1}/{total_models})..."
         category_scores = {}
         category_counts = {}
@@ -228,7 +321,7 @@ def run_evaluation():
             except:
                 pass
             category_counts[cat] += 1
-            time.sleep(0.5)  # Rate limiting
         # Calculate scores
         scores = {cat: round((category_scores[cat] / category_counts[cat]) * 100, 1)
@@ -243,6 +336,8 @@ def run_evaluation():
             "model": model_name,
             "model_id": model_id,
             "organization": model_config['organization'],
             "overall": round(overall, 1),
             "simple": scores.get('simple', 0),
             "multiple": scores.get('multiple', 0),
@@ -253,108 +348,214 @@ def run_evaluation():
             "status": "completed"
         })
-    # Sort and rank
-    results = sorted(results, key=lambda x: x['overall'], reverse=True)
-    for i, r in enumerate(results, 1):
-        r['rank'] = i
-    LEADERBOARD_DATA = results
-    EVALUATION_STATUS = f"Completed - {len(results)} models evaluated"
 def get_leaderboard_df():
     """Get leaderboard as DataFrame."""
     if not LEADERBOARD_DATA:
-        # Return empty with pending status
-        data = [{"rank": i+1, "model": m["model"], "organization": m["organization"],
-                 "overall": "-", "status": "⏳ Pending"}
-                for i, m in enumerate(MODELS_TO_EVALUATE)]
         return pd.DataFrame(data)
-    df = pd.DataFrame(LEADERBOARD_DATA)
-    cols = ["rank", "model", "organization", "overall", "simple", "multiple",
-            "parallel", "parallel_multiple", "irrelevance", "dialect_handling"]
-    df = df[[c for c in cols if c in df.columns]]
-    # Format percentages
-    for col in df.columns:
-        if df[col].dtype in ['float64', 'float32', 'int64']:
-            if col != 'rank':
-                df[col] = df[col].apply(lambda x: f"{x:.1f}%")
-    return df
 def create_app():
     """Create the Gradio app."""
-    with gr.Blocks(title="Arabic FC Leaderboard", theme=gr.themes.Soft()) as app:
-        gr.Markdown(f"""
-        <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #1a5f2a 0%, #2d8f4a 100%); border-radius: 12px; color: white; margin-bottom: 20px;">
-            <h1>{TITLE_AR}</h1>
-            <h2>{TITLE}</h2>
-            <p>Evaluating LLMs on Arabic Function Calling | تقييم نماذج اللغة على استدعاء الدوال بالعربية</p>
         </div>
         """)
-        gr.Markdown(DESCRIPTION)
         with gr.Row():
-            gr.Markdown(f"""
-            <div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
-                <div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">{len(MODELS_TO_EVALUATE)}</div>
-                <div>Models | النماذج</div>
             </div>
             """)
-            gr.Markdown("""
-            <div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
-                <div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">147</div>
-                <div>Test Samples | عينات</div>
             </div>
             """)
-            gr.Markdown("""
-            <div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
-                <div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">10</div>
-                <div>Categories | الفئات</div>
             </div>
             """)
-        status_text = gr.Markdown(f"**Status:** {EVALUATION_STATUS}")
         with gr.Tabs():
             with gr.TabItem("🏆 Leaderboard"):
-                leaderboard_df = gr.DataFrame(
                     value=get_leaderboard_df(),
-                    interactive=False
                 )
-                def refresh_leaderboard():
-                    return get_leaderboard_df(), f"**Status:** {EVALUATION_STATUS}"
-                refresh_btn = gr.Button("🔄 Refresh | تحديث")
-                refresh_btn.click(refresh_leaderboard, outputs=[leaderboard_df, status_text])
-            with gr.TabItem("📊 About"):
-                gr.Markdown("""
-                ## Evaluation Categories
-                | Category | Samples | Description |
-                |----------|---------|-------------|
-                | Simple | ~20 | Single function call |
-                | Multiple | ~20 | Select from multiple functions |
-                | Parallel | ~20 | Multiple calls |
-                | Parallel Multiple | ~20 | Complex multi-call |
-                | Irrelevance | ~20 | Should not call |
-                | Dialect | ~15 | Egyptian/Gulf/Levantine |
-                ## Dataset
-                📊 [HeshamHaroon/Arabic_Function_Calling](https://huggingface.co/datasets/HeshamHaroon/Arabic_Function_Calling)
                 """)
-        gr.Markdown("""
-        ---
-        <div style="text-align: center; color: #666;">
-            Built for the Arabic NLP community | بُني لمجتمع معالجة اللغة العربية
         </div>
         """)

 Arabic Function Calling Leaderboard (AFCL)
 ==========================================
+Professional leaderboard for evaluating LLMs on Arabic function calling.
 """
 import gradio as gr
 import re
 import time
 import requests
 from typing import Dict, List, Optional
 from threading import Thread
 from datasets import load_dataset
 # Constants
+TITLE = "Arabic Function Calling Leaderboard"
+TITLE_AR = "لوحة تقييم استدعاء الدوال بالعربية"
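 # All 28 models to evaluate. Each entry: "model" (display name), "model_id" (used to build the inference API URL), "organization", "params" (size label), "type" (category label)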
 MODELS_TO_EVALUATE = [
     # Arabic-Native LLMs
+    {"model": "Jais-30B-Chat", "model_id": "inceptionai/jais-30b-chat-v3", "organization": "Inception AI", "params": "30B", "type": "Arabic-Native"},
+    {"model": "ALLaM-7B-Instruct", "model_id": "sdaia/allam-1-7b-instruct", "organization": "SDAIA", "params": "7B", "type": "Arabic-Native"},
+    {"model": "SILMA-9B-Instruct", "model_id": "silma-ai/SILMA-9B-Instruct-v1.0", "organization": "Silma AI", "params": "9B", "type": "Arabic-Native"},
+    {"model": "Fanar-Star-1.2B", "model_id": "QatarComputing/fanar-star-1.2b", "organization": "QCRI", "params": "1.2B", "type": "Arabic-Native"},
+    {"model": "AceGPT-13B-Chat", "model_id": "FreedomIntelligence/AceGPT-13B-chat", "organization": "FreedomIntelligence", "params": "13B", "type": "Arabic-Native"},
+    {"model": "AraGPT2-Mega", "model_id": "aubmindlab/aragpt2-mega", "organization": "AUB MIND Lab", "params": "1.5B", "type": "Arabic-Native"},
     # Multilingual with strong Arabic
+    {"model": "Qwen2.5-72B-Instruct", "model_id": "Qwen/Qwen2.5-72B-Instruct", "organization": "Alibaba", "params": "72B", "type": "Multilingual"},
+    {"model": "Qwen2.5-32B-Instruct", "model_id": "Qwen/Qwen2.5-32B-Instruct", "organization": "Alibaba", "params": "32B", "type": "Multilingual"},
+    {"model": "Qwen2.5-7B-Instruct", "model_id": "Qwen/Qwen2.5-7B-Instruct", "organization": "Alibaba", "params": "7B", "type": "Multilingual"},
+    {"model": "Llama-3.1-70B-Instruct", "model_id": "meta-llama/Llama-3.1-70B-Instruct", "organization": "Meta", "params": "70B", "type": "Multilingual"},
+    {"model": "Llama-3.1-8B-Instruct", "model_id": "meta-llama/Llama-3.1-8B-Instruct", "organization": "Meta", "params": "8B", "type": "Multilingual"},
+    {"model": "Gemma-2-27B-IT", "model_id": "google/gemma-2-27b-it", "organization": "Google", "params": "27B", "type": "Multilingual"},
+    {"model": "Gemma-2-9B-IT", "model_id": "google/gemma-2-9b-it", "organization": "Google", "params": "9B", "type": "Multilingual"},
     # Cohere Arabic Models
+    {"model": "Aya-Expanse-32B", "model_id": "CohereForAI/aya-expanse-32b", "organization": "Cohere", "params": "32B", "type": "Multilingual"},
+    {"model": "Aya-Expanse-8B", "model_id": "CohereForAI/aya-expanse-8b", "organization": "Cohere", "params": "8B", "type": "Multilingual"},
+    {"model": "Command-R7B-Arabic", "model_id": "CohereForAI/c4ai-command-r7b-arabic-02-2025", "organization": "Cohere", "params": "7B", "type": "Arabic-Tuned"},
     # Falcon (UAE)
+    {"model": "Falcon-180B-Chat", "model_id": "tiiuae/falcon-180B-chat", "organization": "TII UAE", "params": "180B", "type": "Multilingual"},
+    {"model": "Falcon-40B-Instruct", "model_id": "tiiuae/falcon-40b-instruct", "organization": "TII UAE", "params": "40B", "type": "Multilingual"},
     # Mistral
+    {"model": "Mistral-Large", "model_id": "mistralai/Mistral-Large-Instruct-2411", "organization": "Mistral AI", "params": "123B", "type": "Multilingual"},
+    {"model": "Mixtral-8x22B", "model_id": "mistralai/Mixtral-8x22B-Instruct-v0.1", "organization": "Mistral AI", "params": "141B", "type": "Multilingual"},
+    {"model": "Mistral-7B-Instruct", "model_id": "mistralai/Mistral-7B-Instruct-v0.3", "organization": "Mistral AI", "params": "7B", "type": "Multilingual"},
     # Others
+    {"model": "DeepSeek-V3", "model_id": "deepseek-ai/DeepSeek-V3", "organization": "DeepSeek", "params": "671B", "type": "Multilingual"},
+    {"model": "Phi-4", "model_id": "microsoft/phi-4", "organization": "Microsoft", "params": "14B", "type": "Multilingual"},
+    {"model": "Phi-3-Mini", "model_id": "microsoft/Phi-3-mini-4k-instruct", "organization": "Microsoft", "params": "3.8B", "type": "Multilingual"},
+    {"model": "BLOOM-176B", "model_id": "bigscience/bloom", "organization": "BigScience", "params": "176B", "type": "Multilingual"},
+    {"model": "BLOOMZ-7B1", "model_id": "bigscience/bloomz-7b1", "organization": "BigScience", "params": "7B", "type": "Multilingual"},
     # Arabic Fine-tuned
+    {"model": "Arabic-Llama-3.1-8B", "model_id": "Ammar-Arabi/Arabic-Llama-3.1-8B-Instruct", "organization": "Community", "params": "8B", "type": "Arabic-Tuned"},
+    {"model": "Llama3-8B-Arabic", "model_id": "MahmoudAshraf/Llama3-8B-Arabic-instruct", "organization": "Community", "params": "8B", "type": "Arabic-Tuned"},
 ]
 # Global state
 LEADERBOARD_DATA = []
+# Shared progress snapshot: run_evaluation() writes "current" (stage message) and "progress"
+# (set to idx + 1 as each model's evaluation begins); get_status_html() reads all three keys.
+# "total" is fixed at import time to the number of configured models.
+EVALUATION_STATUS = {"current": "Initializing...", "progress": 0, "total": len(MODELS_TO_EVALUATE)}
+# Custom CSS for the dark-themed UI; passed to gr.Blocks(css=CUSTOM_CSS) in create_app()
+CUSTOM_CSS = """
+/* Professional Dark Theme */
+.gradio-container {
+    background: linear-gradient(135deg, #0f0f1a 0%, #1a1a2e 100%) !important;
+    font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif !important;
+}
+/* Header styling */
+.header-container {
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+    border-radius: 16px;
+    padding: 32px;
+    margin-bottom: 24px;
+    box-shadow: 0 20px 40px rgba(102, 126, 234, 0.3);
+}
+/* Stats cards */
+.stat-card {
+    background: rgba(255,255,255,0.05);
+    backdrop-filter: blur(10px);
+    border: 1px solid rgba(255,255,255,0.1);
+    border-radius: 12px;
+    padding: 24px;
+    text-align: center;
+    transition: transform 0.3s ease;
+}
+.stat-card:hover {
+    transform: translateY(-4px);
+}
+.stat-value {
+    font-size: 2.5rem;
+    font-weight: 700;
+    background: linear-gradient(135deg, #667eea, #764ba2);
+    -webkit-background-clip: text;
+    -webkit-text-fill-color: transparent;
+}
+.stat-label {
+    color: #a0a0a0;
+    font-size: 0.9rem;
+    margin-top: 8px;
+}
+/* Table styling */
+.leaderboard-table {
+    background: rgba(255,255,255,0.02) !important;
+    border-radius: 12px !important;
+    border: 1px solid rgba(255,255,255,0.1) !important;
+}
+/* Rank badges */
+.rank-1 { color: #ffd700 !important; font-weight: bold; }
+.rank-2 { color: #c0c0c0 !important; font-weight: bold; }
+.rank-3 { color: #cd7f32 !important; font-weight: bold; }
+/* Progress bar */
+.progress-container {
+    background: rgba(255,255,255,0.1);
+    border-radius: 8px;
+    padding: 16px;
+    margin: 16px 0;
+}
+.progress-bar {
+    height: 8px;
+    background: linear-gradient(90deg, #667eea, #764ba2);
+    border-radius: 4px;
+    transition: width 0.5s ease;
+}
+/* Tabs */
+.tabs {
+    border: none !important;
+}
+.tab-nav {
+    background: transparent !important;
+    border-bottom: 2px solid rgba(255,255,255,0.1) !important;
+}
+.tab-nav button {
+    color: #a0a0a0 !important;
+    font-weight: 500 !important;
+    padding: 12px 24px !important;
+}
+.tab-nav button.selected {
+    color: #667eea !important;
+    border-bottom: 2px solid #667eea !important;
+}
+/* Category pills */
+.category-pill {
+    display: inline-block;
+    padding: 4px 12px;
+    border-radius: 20px;
+    font-size: 0.75rem;
+    font-weight: 500;
+}
+.cat-arabic { background: #22c55e20; color: #22c55e; }
+.cat-multilingual { background: #3b82f620; color: #3b82f6; }
+.cat-tuned { background: #f59e0b20; color: #f59e0b; }
+"""
 def load_evaluation_dataset():
 def create_prompt(query: str, functions: List[Dict]) -> str:
     """Create evaluation prompt."""
+    # Prompt layout: instruction header, one "- name: description" bullet per
+    # entry in `functions`, the user query, then two JSON reply templates.
+    func_desc = "You are a function calling AI. Respond with JSON only.\n\nFunctions:\n"
     for f in functions:
+        # A missing 'name' renders as the text "None"; a missing 'description' renders as empty.
         func_desc += f"- {f.get('name')}: {f.get('description', '')}\n"
+    # Doubled braces below are f-string escapes that emit literal { and } in the JSON templates.
     return f"""{func_desc}
+Query: {query}
+Response format: {{"name": "function_name", "arguments": {{"key": "value"}}}}
+If no function applies: {{"name": null, "arguments": {{}}}}
+JSON:"""
 def call_model(model_id: str, prompt: str) -> str:
     headers = {"Authorization": f"Bearer {token}"}
     url = f"https://api-inference.huggingface.co/models/{model_id}"
+    payload = {"inputs": prompt, "parameters": {"max_new_tokens": 200, "temperature": 0.1}}
     try:
         response = requests.post(url, headers=headers, json=payload, timeout=60)
         if response.status_code == 503:
             time.sleep(20)
             response = requests.post(url, headers=headers, json=payload, timeout=60)
         result = response.json()
         if isinstance(result, list) and result:
             return result[0].get("generated_text", "")
     """Run full evaluation on all models."""
     global LEADERBOARD_DATA, EVALUATION_STATUS
+    EVALUATION_STATUS["current"] = "Loading dataset..."
     samples = load_evaluation_dataset()
     if not samples:
+        EVALUATION_STATUS["current"] = "Failed to load dataset"
         return
     results = []
         model_name = model_config['model']
         model_id = model_config['model_id']
+        EVALUATION_STATUS["current"] = f"Evaluating {model_name}..."
+        EVALUATION_STATUS["progress"] = idx + 1
         category_scores = {}
         category_counts = {}
             except:
                 pass
             category_counts[cat] += 1
+            time.sleep(0.5)
         # Calculate scores
         scores = {cat: round((category_scores[cat] / category_counts[cat]) * 100, 1)
             "model": model_name,
             "model_id": model_id,
             "organization": model_config['organization'],
+            "params": model_config['params'],
+            "type": model_config['type'],
             "overall": round(overall, 1),
             "simple": scores.get('simple', 0),
             "multiple": scores.get('multiple', 0),
             "status": "completed"
         })
+        # Update global data after each model
+        temp_results = sorted(results, key=lambda x: x['overall'], reverse=True)
+        for i, r in enumerate(temp_results, 1):
+            r['rank'] = i
+        LEADERBOARD_DATA = temp_results
+    EVALUATION_STATUS["current"] = "Evaluation Complete"
+    EVALUATION_STATUS["progress"] = total_models
 def get_leaderboard_df():
     """Get leaderboard as DataFrame."""
     if not LEADERBOARD_DATA:
+        # No results yet: list every configured model in declaration order,
+        # numbered from 1, with em-dash placeholders in all score columns.
+        data = []
+        for i, m in enumerate(MODELS_TO_EVALUATE, 1):
+            data.append({
+                "Rank": i,
+                "Model": m["model"],
+                "Org": m["organization"],
+                "Size": m["params"],
+                "Type": m["type"],
+                "Overall": "—",
+                "Simple": "—",
+                "Multiple": "—",
+                "Parallel": "—",
+                "Irrelevance": "—",
+                "Dialect": "—",
+            })
         return pd.DataFrame(data)
+    # Results available: one row per entry, in the order stored in LEADERBOARD_DATA.
+    # NOTE(review): r['parallel'], r['irrelevance'] and r['dialect_handling'] are indexed
+    # directly; the result dict is only partly visible here - confirm run_evaluation always sets them.
+    data = []
+    for r in LEADERBOARD_DATA:
+        data.append({
+            # Ranks 1-3 become strings with a medal prefix; any other rank is passed through unchanged.
+            "Rank": f"🥇 {r['rank']}" if r['rank'] == 1 else f"🥈 {r['rank']}" if r['rank'] == 2 else f"🥉 {r['rank']}" if r['rank'] == 3 else r['rank'],
+            "Model": r['model'],
+            "Org": r['organization'],
+            "Size": r['params'],
+            "Type": r['type'],
+            # Scores are rendered as text with a trailing percent sign.
+            "Overall": f"{r['overall']}%",
+            "Simple": f"{r['simple']}%",
+            "Multiple": f"{r['multiple']}%",
+            "Parallel": f"{r['parallel']}%",
+            "Irrelevance": f"{r['irrelevance']}%",
+            "Dialect": f"{r['dialect_handling']}%",
+        })
+    return pd.DataFrame(data)
+def get_status_html():
+    """Get evaluation status as HTML.
+
+    Reads the "progress", "total" and "current" entries of the module-level
+    EVALUATION_STATUS dict and returns an HTML string showing the current
+    message, a "progress/total models" counter, and a bar whose width is the
+    completion percentage.
+    """
+    progress = EVALUATION_STATUS["progress"]
+    total = EVALUATION_STATUS["total"]
+    current = EVALUATION_STATUS["current"]
+    # Avoid dividing by zero when no models are configured; the bar then stays empty.
+    pct = (progress / total) * 100 if total > 0 else 0
+    return f"""
+    <div style="background: rgba(102,126,234,0.1); border: 1px solid rgba(102,126,234,0.3); border-radius: 12px; padding: 20px; margin: 16px 0;">
+        <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 12px;">
+            <span style="color: #667eea; font-weight: 600;">📊 {current}</span>
+            <span style="color: #a0a0a0;">{progress}/{total} models</span>
+        </div>
+        <div style="background: rgba(255,255,255,0.1); border-radius: 8px; height: 8px; overflow: hidden;">
+            <div style="background: linear-gradient(90deg, #667eea, #764ba2); height: 100%; width: {pct}%; transition: width 0.5s ease;"></div>
+        </div>
+    </div>
+    """
 def create_app():
     """Create the Gradio app."""
+    with gr.Blocks(title="AFCL - Arabic Function Calling Leaderboard", css=CUSTOM_CSS, theme=gr.themes.Base()) as app:
+        # Header
+        gr.HTML("""
+        <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 16px; padding: 40px; margin-bottom: 24px; text-align: center;">
+            <h1 style="color: white; font-size: 2.5rem; margin: 0; font-weight: 700;">
+                🏆 Arabic Function Calling Leaderboard
+            </h1>
+            <p style="color: rgba(255,255,255,0.9); font-size: 1.1rem; margin-top: 8px;">
+                لوحة تقييم استدعاء الدوال بالعربية
+            </p>
+            <p style="color: rgba(255,255,255,0.7); font-size: 0.95rem; margin-top: 16px; max-width: 600px; margin-left: auto; margin-right: auto;">
+                Comprehensive benchmark evaluating LLMs on Arabic function calling across 10 categories including dialects
+            </p>
         </div>
         """)
+        # Stats Row
         with gr.Row():
+            gr.HTML(f"""
+            <div style="background: rgba(255,255,255,0.03); border: 1px solid rgba(255,255,255,0.1); border-radius: 12px; padding: 24px; text-align: center; flex: 1;">
+                <div style="font-size: 2.5rem; font-weight: 700; background: linear-gradient(135deg, #667eea, #764ba2); -webkit-background-clip: text; -webkit-text-fill-color: transparent;">{len(MODELS_TO_EVALUATE)}</div>
+                <div style="color: #a0a0a0; font-size: 0.9rem; margin-top: 8px;">Models</div>
+            </div>
+            """)
+            gr.HTML("""
+            <div style="background: rgba(255,255,255,0.03); border: 1px solid rgba(255,255,255,0.1); border-radius: 12px; padding: 24px; text-align: center; flex: 1;">
+                <div style="font-size: 2.5rem; font-weight: 700; background: linear-gradient(135deg, #22c55e, #16a34a); -webkit-background-clip: text; -webkit-text-fill-color: transparent;">147</div>
+                <div style="color: #a0a0a0; font-size: 0.9rem; margin-top: 8px;">Test Samples</div>
             </div>
             """)
+            gr.HTML("""
+            <div style="background: rgba(255,255,255,0.03); border: 1px solid rgba(255,255,255,0.1); border-radius: 12px; padding: 24px; text-align: center; flex: 1;">
+                <div style="font-size: 2.5rem; font-weight: 700; background: linear-gradient(135deg, #f59e0b, #d97706); -webkit-background-clip: text; -webkit-text-fill-color: transparent;">10</div>
+                <div style="color: #a0a0a0; font-size: 0.9rem; margin-top: 8px;">Categories</div>
             </div>
             """)
+            gr.HTML("""
+            <div style="background: rgba(255,255,255,0.03); border: 1px solid rgba(255,255,255,0.1); border-radius: 12px; padding: 24px; text-align: center; flex: 1;">
+                <div style="font-size: 2.5rem; font-weight: 700; background: linear-gradient(135deg, #ec4899, #be185d); -webkit-background-clip: text; -webkit-text-fill-color: transparent;">3</div>
+                <div style="color: #a0a0a0; font-size: 0.9rem; margin-top: 8px;">Dialects</div>
             </div>
             """)
+        # Status
+        status_html = gr.HTML(get_status_html())
+        # Tabs
         with gr.Tabs():
             with gr.TabItem("🏆 Leaderboard"):
+                leaderboard_table = gr.DataFrame(
                     value=get_leaderboard_df(),
+                    interactive=False,
+                    wrap=True,
                 )
+                with gr.Row():
+                    refresh_btn = gr.Button("🔄 Refresh Results", variant="primary", size="lg")
+                def refresh():
+                    return get_leaderboard_df(), get_status_html()
+                refresh_btn.click(refresh, outputs=[leaderboard_table, status_html])
+            with gr.TabItem("📊 Categories"):
+                gr.HTML("""
+                <div style="padding: 24px;">
+                    <h3 style="color: #667eea; margin-bottom: 24px;">Evaluation Categories</h3>
+                    <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 16px;">
+                        <div style="background: rgba(255,255,255,0.03); border: 1px solid rgba(255,255,255,0.1); border-radius: 12px; padding: 20px;">
+                            <h4 style="color: #22c55e; margin: 0;">Simple</h4>
+                            <p style="color: #a0a0a0; margin: 8px 0 0 0; font-size: 0.9rem;">Single function, single call scenarios</p>
+                        </div>
+                        <div style="background: rgba(255,255,255,0.03); border: 1px solid rgba(255,255,255,0.1); border-radius: 12px; padding: 20px;">
+                            <h4 style="color: #3b82f6; margin: 0;">Multiple</h4>
+                            <p style="color: #a0a0a0; margin: 8px 0 0 0; font-size: 0.9rem;">Select correct function from 2-4 options</p>
+                        </div>
+                        <div style="background: rgba(255,255,255,0.03); border: 1px solid rgba(255,255,255,0.1); border-radius: 12px; padding: 20px;">
+                            <h4 style="color: #f59e0b; margin: 0;">Parallel</h4>
+                            <p style="color: #a0a0a0; margin: 8px 0 0 0; font-size: 0.9rem;">Multiple calls of same function</p>
+                        </div>
+                        <div style="background: rgba(255,255,255,0.03); border: 1px solid rgba(255,255,255,0.1); border-radius: 12px; padding: 20px;">
+                            <h4 style="color: #ec4899; margin: 0;">Parallel Multiple</h4>
+                            <p style="color: #a0a0a0; margin: 8px 0 0 0; font-size: 0.9rem;">Multiple functions, multiple calls</p>
+                        </div>
+                        <div style="background: rgba(255,255,255,0.03); border: 1px solid rgba(255,255,255,0.1); border-radius: 12px; padding: 20px;">
+                            <h4 style="color: #ef4444; margin: 0;">Irrelevance</h4>
+                            <p style="color: #a0a0a0; margin: 8px 0 0 0; font-size: 0.9rem;">Correctly reject when no function applies</p>
+                        </div>
+                        <div style="background: rgba(255,255,255,0.03); border: 1px solid rgba(255,255,255,0.1); border-radius: 12px; padding: 20px;">
+                            <h4 style="color: #8b5cf6; margin: 0;">Dialect Handling</h4>
+                            <p style="color: #a0a0a0; margin: 8px 0 0 0; font-size: 0.9rem;">Egyptian 🇪🇬 / Gulf 🇸🇦 / Levantine 🇱🇧</p>
+                        </div>
+                    </div>
+                </div>
+                """)
+            with gr.TabItem("📖 About"):
+                gr.HTML("""
+                <div style="padding: 24px; max-width: 800px;">
+                    <h3 style="color: #667eea;">About AFCL</h3>
+                    <p style="color: #c0c0c0; line-height: 1.8;">
+                        The <strong>Arabic Function Calling Leaderboard (AFCL)</strong> is the first comprehensive benchmark
+                        for evaluating LLMs on function calling capabilities in Arabic. It tests models across Modern Standard
+                        Arabic (MSA) and three major dialects: Egyptian, Gulf, and Levantine.
+                    </p>
+                    <h4 style="color: #22c55e; margin-top: 24px;">Dataset</h4>
+                    <p style="color: #c0c0c0;">
+                        📊 <a href="https://huggingface.co/datasets/HeshamHaroon/Arabic_Function_Calling" style="color: #667eea;">HeshamHaroon/Arabic_Function_Calling</a>
+                    </p>
+                    <h4 style="color: #f59e0b; margin-top: 24px;">Scoring</h4>
+                    <p style="color: #c0c0c0; line-height: 1.8;">
+                        Models are scored using AST-based matching with Arabic text normalization.
+                        The overall score is a weighted average across all categories, with emphasis on
+                        irrelevance detection and dialect handling.
+                    </p>
+                    <h4 style="color: #ec4899; margin-top: 24px;">Citation</h4>
+                    <pre style="background: rgba(255,255,255,0.05); padding: 16px; border-radius: 8px; color: #a0a0a0; overflow-x: auto;">
+@misc{afcl2024,
+    title={Arabic Function Calling Leaderboard},
+    author={Hesham Haroon},
+    year={2024},
+    url={https://huggingface.co/spaces/HeshamHaroon/Arabic-Function-Calling-Leaderboard}
+}</pre>
+                </div>
                 """)
+        # Footer
+        gr.HTML("""
+        <div style="text-align: center; padding: 24px; margin-top: 24px; border-top: 1px solid rgba(255,255,255,0.1);">
+            <p style="color: #666; font-size: 0.9rem;">
+                Built for the Arabic NLP Community | بُني لمجتمع معالجة اللغة العربية
+            </p>
         </div>
         """)