Spaces:

HeshamHaroon
/

Arabic-Function-Calling-Leaderboard

Running

App Files Files Community

HeshamHaroon committed 17 days ago

Commit

6add5d0

verified ·

1 Parent(s): 6315c7d

Initial release: Arabic Function Calling Leaderboard

Browse files

Files changed (1) hide show

afcl/app.py +77 -84

afcl/app.py CHANGED Viewed

@@ -17,10 +17,6 @@ from .data.loader import (
     load_leaderboard, save_leaderboard, load_benchmark,
     calculate_overall_score, CATEGORY_WEIGHTS
 )
-from .visualization.charts import (
-    create_radar_chart, create_bar_chart,
-    create_category_comparison, create_dialect_breakdown
-)
 # Constants
 TITLE = "🏆 Arabic Function Calling Leaderboard"
@@ -34,14 +30,14 @@ The **Arabic Function Calling Leaderboard (AFCL)** evaluates Large Language Mode
 # Column definitions
 LEADERBOARD_COLUMNS = {
-    "rank": {"label": "المرتبة", "label_en": "Rank", "type": "number"},
     "model": {"label": "النموذج", "label_en": "Model", "type": "str"},
     "organization": {"label": "المنظمة", "label_en": "Organization", "type": "str"},
     "overall": {"label": "الدقة الكلية", "label_en": "Overall", "type": "number"},
     "simple": {"label": "بسيط", "label_en": "Simple", "type": "number"},
     "multiple": {"label": "متعدد", "label_en": "Multiple", "type": "number"},
     "parallel": {"label": "متوازي", "label_en": "Parallel", "type": "number"},
-    "parallel_multiple": {"label": "متوازي متعدد", "label_en": "Parallel Multiple", "type": "number"},
     "irrelevance": {"label": "اللا صلة", "label_en": "Irrelevance", "type": "number"},
     "dialect_handling": {"label": "اللهجات", "label_en": "Dialects", "type": "number"},
     "status": {"label": "الحالة", "label_en": "Status", "type": "str"},
@@ -64,11 +60,13 @@ def get_leaderboard_data() -> List[Dict]:
 def format_leaderboard_dataframe(data: List[Dict], use_arabic: bool = True) -> pd.DataFrame:
     """Convert leaderboard data to pandas DataFrame."""
     df = pd.DataFrame(data)
-    # Select columns to display
-    display_cols = ["rank", "model", "organization", "overall", "simple", "multiple",
-                    "parallel", "parallel_multiple", "irrelevance", "dialect_handling", "status"]
     df = df[[c for c in display_cols if c in df.columns]]
     # Rename columns based on language preference
@@ -80,67 +78,54 @@ def format_leaderboard_dataframe(data: List[Dict], use_arabic: bool = True) -> p
     df = df.rename(columns=column_mapping)
-    # Format numeric columns (show as percentage, but mark 0.0 as "Pending")
     for col in df.columns:
         if df[col].dtype in ['float64', 'float32']:
-            df[col] = df[col].apply(lambda x: "⏳ Pending" if x == 0.0 else f"{x:.1f}%")
     # Format status column
     status_col = "الحالة" if use_arabic else "Status"
     if status_col in df.columns:
-        df[status_col] = df[status_col].apply(lambda x: "⏳ قيد التقييم" if x == "pending" else "✅ مكتمل" if use_arabic else "⏳ Pending" if x == "pending" else "✅ Completed")
     return df
-def create_leaderboard_tab(use_arabic: bool = True):
-    """Create the main leaderboard tab content."""
     data = get_leaderboard_data()
-    df = format_leaderboard_dataframe(data, use_arabic)
-    return gr.DataFrame(
-        value=df,
-        interactive=False,
-        wrap=True,
-    )
-def create_visualization_tab():
-    """Create the visualization tab with charts."""
-    data = get_leaderboard_data()
-    # Prepare data for charts
-    model_scores = {
-        entry["model"]: {k: v for k, v in entry.items() if k not in ["rank", "model"]}
-        for entry in data
-    }
-    with gr.Row():
-        with gr.Column():
-            radar_chart = create_radar_chart(
-                {k: v for k, v in list(model_scores.items())[:5]},
-                use_arabic=True,
-                title="مقارنة النماذج - Category Comparison"
-            )
-            gr.Plot(value=radar_chart)
-    with gr.Row():
-        with gr.Column():
-            bar_chart = create_bar_chart(
-                data,
-                metric="overall",
-                use_arabic=True,
-                title="أفضل النماذج - Top Models"
-            )
-            gr.Plot(value=bar_chart)
-    with gr.Row():
-        category_chart = create_category_comparison(
-            data,
-            use_arabic=True,
-            title="أداء الفئات - Category Performance"
-        )
-        gr.Plot(value=category_chart)
 def create_submit_tab():
@@ -220,14 +205,20 @@ def create_about_tab():
     ## Evaluation Categories | فئات التقييم
-    | Category | الفئة | Description | الوصف |
-    |----------|-------|-------------|-------|
-    | Simple | بسيط | Single function, single call | دالة واحدة، استدعاء واحد |
-    | Multiple | متعدد | Select correct function from options | اختيار الدالة الصحيحة من عدة خيارات |
-    | Parallel | متوازي | Multiple calls of same function | استدعاءات متعددة لنفس الدالة |
-    | Parallel Multiple | متوازي متعدد | Multiple functions, multiple calls | دوال متعددة، استدعاءات متعددة |
-    | Irrelevance | اللا صلة | No function should be called | لا يجب استدعاء أي دالة |
-    | Dialect Handling | اللهجات | Egyptian/Gulf/Levantine queries | استعلامات مصرية/خليجية/شامية |
     ## Scoring Formula | معادلة التقييم
@@ -245,19 +236,14 @@ def create_about_tab():
     - Multi-Turn: 15%
     - Native Arabic: 10%
-    ## Evaluation Methodology | منهجية التقييم
-    1. **AST-Based Matching**: Function calls are compared using Abstract Syntax Tree matching with Arabic text normalization.
-    2. **Arabic Normalization**: Handles diacritics (tashkeel), alef variants, and Arabic-Indic numerals.
-    3. **Order-Agnostic Parallel Evaluation**: For parallel calls, order doesn't matter - we use bipartite matching.
     ## Dataset | مجموعة البيانات
-    - **Total Samples**: 1,470+
     - **Languages**: Arabic (MSA + Dialects) & English
-    - **Source**: Translated from BFCL with additional dialect variants
     ## Citation | الاقتباس
@@ -269,12 +255,6 @@ def create_about_tab():
         url={https://huggingface.co/spaces/HeshamHaroon/Arabic-Function-Calling-Leaderboard}
     }
     ```
-    ## Contact | التواصل
-    For questions or contributions, please open an issue on the repository.
-    للأسئلة أو المساهمات، يرجى فتح مشكلة في المستودع.
     """)
@@ -305,11 +285,20 @@ def create_app():
         # Stats row
         data = get_leaderboard_data()
         with gr.Row():
             gr.Markdown(f"""
             <div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
                 <div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">{len(data)}</div>
-                <div style="color: #666;">Models to Evaluate | النماذج للتقييم</div>
             </div>
             """)
             gr.Markdown("""
@@ -318,10 +307,14 @@ def create_app():
                 <div style="color: #666;">Test Samples | عينات الاختبار</div>
             </div>
             """)
-            gr.Markdown("""
-            <div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
-                <div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">10</div>
-                <div style="color: #666;">Categories | الفئات</div>
             </div>
             """)
@@ -335,8 +328,8 @@ def create_app():
                     wrap=True,
                 )
-            with gr.TabItem("📊 Visualizations | الرسوم البيانية"):
-                create_visualization_tab()
             with gr.TabItem("📤 Submit | إرسال"):
                 create_submit_tab()

     load_leaderboard, save_leaderboard, load_benchmark,
     calculate_overall_score, CATEGORY_WEIGHTS
 )
 # Constants
 TITLE = "🏆 Arabic Function Calling Leaderboard"
 # Column definitions
 LEADERBOARD_COLUMNS = {
+    "rank": {"label": "#", "label_en": "#", "type": "number"},
     "model": {"label": "النموذج", "label_en": "Model", "type": "str"},
     "organization": {"label": "المنظمة", "label_en": "Organization", "type": "str"},
     "overall": {"label": "الدقة الكلية", "label_en": "Overall", "type": "number"},
     "simple": {"label": "بسيط", "label_en": "Simple", "type": "number"},
     "multiple": {"label": "متعدد", "label_en": "Multiple", "type": "number"},
     "parallel": {"label": "متوازي", "label_en": "Parallel", "type": "number"},
+    "parallel_multiple": {"label": "متوازي متعدد", "label_en": "P. Multiple", "type": "number"},
     "irrelevance": {"label": "اللا صلة", "label_en": "Irrelevance", "type": "number"},
     "dialect_handling": {"label": "اللهجات", "label_en": "Dialects", "type": "number"},
     "status": {"label": "الحالة", "label_en": "Status", "type": "str"},
 def format_leaderboard_dataframe(data: List[Dict], use_arabic: bool = True) -> pd.DataFrame:
     """Convert leaderboard entries into a display-ready pandas DataFrame.

     Args:
         data: Leaderboard entries, one dict per model.
         use_arabic: Selects the language of the status column header that is
             looked up and of the status labels written into it.

     Returns:
         An empty DataFrame when ``data`` is empty. Otherwise a DataFrame
         restricted to the display columns that exist in ``data``, with float
         columns rendered as percentage strings ("-" for 0.0 or a missing
         value) and the status column rendered as a localized label.
     """
     if not data:
         return pd.DataFrame()
     df = pd.DataFrame(data)
     # Select columns to display (fewer columns for cleaner view)
     display_cols = ["rank", "model", "organization", "overall", "status"]
     df = df[[c for c in display_cols if c in df.columns]]
     # Rename columns based on language preference
     # NOTE(review): `column_mapping` is built in lines that fall between two
     # diff hunks and are not visible here -- confirm against the full file.
     df = df.rename(columns=column_mapping)
     # Format numeric columns (show as percentage, but mark 0.0 as "-")
     for col in df.columns:
         if df[col].dtype in ['float64', 'float32']:
             # A missing score becomes NaN in a float column; show it as "-"
             # as well instead of the literal text "nan%".
             df[col] = df[col].apply(
                 lambda x: "-" if pd.isna(x) or x == 0.0 else f"{x:.1f}%"
             )
     # Format status column
     status_col = "الحالة" if use_arabic else "Status"
     if status_col in df.columns:
         # Pick the label pair by language first. A single chained conditional
         # expression tests `x == "pending"` before `use_arabic`, so pending
         # rows would always receive the Arabic label.
         if use_arabic:
             pending_label, done_label = "⏳ قيد الانتظار", "✅ مكتمل"
         else:
             pending_label, done_label = "⏳ Pending", "✅ Done"
         df[status_col] = df[status_col].apply(
             lambda x: pending_label if x == "pending" else done_label
         )
     return df
def create_models_list_tab():
    """Create the models list tab: every leaderboard model, grouped by organization.

    Entries come from ``get_leaderboard_data()``. They are grouped by their
    ``organization`` value (entries without one are listed under "Other") and
    rendered as one Markdown bullet list per organization, with organizations
    in sorted order. Each bullet links to the entry's ``model_url`` ("#" when
    absent) and shows the entry's own evaluation status.

    Returns:
        The ``gr.Markdown`` component holding the generated content.
    """
    data = get_leaderboard_data()

    # Group by organization. `or "Other"` also covers an explicit None/empty
    # value, which would otherwise make sorted() fail on mixed key types.
    orgs = {}
    for entry in data:
        orgs.setdefault(entry.get("organization") or "Other", []).append(entry)

    # Blank lines separate the paragraphs; without one before `---`, Markdown
    # treats that line as a setext heading underline instead of a rule.
    header = """
## 📋 Models Queue | قائمة النماذج للتقييم

The following **{total}** models are queued for evaluation on the Arabic Function Calling benchmark:

النماذج التالية (**{total}** نموذج) في قائمة الانتظار للتقييم:

---

""".format(total=len(data))

    parts = [header]
    for org in sorted(orgs):
        parts.append(f"### {org}\n")
        for m in orgs[org]:
            model_url = m.get("model_url") or "#"
            # Same rule as the leaderboard table and the stats row: anything
            # that is not "pending" counts as evaluated.
            status_label = "⏳ Pending" if m.get("status") == "pending" else "✅ Done"
            parts.append(f"- [{m['model']}]({model_url}) - {status_label}\n")
        parts.append("\n")
    return gr.Markdown("".join(parts))
 def create_submit_tab():
     ## Evaluation Categories | فئات التقييم
+    | Category | الفئة | Samples | Description |
+    |----------|-------|---------|-------------|
+    | Simple | بسيط | 200 | Single function, single call |
+    | Multiple | متعدد | 200 | Select correct function from options |
+    | Parallel | متوازي | 200 | Multiple calls of same function |
+    | Parallel Multiple | متوازي متعدد | 200 | Multiple functions, multiple calls |
+    | Irrelevance | اللا صلة | 200 | No function should be called |
+    | Dialect Handling | اللهجات | 150 | Egyptian/Gulf/Levantine queries |
+    | Java | جافا | 100 | Java API function calls |
+    | JavaScript | جافاسكريبت | 50 | JS function calls |
+    | REST | REST | 70 | REST API calls |
+    | SQL | SQL | 100 | SQL query generation |
+    **Total: 1,470 samples**
     ## Scoring Formula | معادلة التقييم
     - Multi-Turn: 15%
     - Native Arabic: 10%
     ## Dataset | مجموعة البيانات
+    📊 **[HeshamHaroon/Arabic_Function_Calling](https://huggingface.co/datasets/HeshamHaroon/Arabic_Function_Calling)**
+    - **Total Samples**: 1,470
     - **Languages**: Arabic (MSA + Dialects) & English
+    - **Categories**: 10 evaluation categories
+    - **Source**: Translated from BFCL with dialect variants
     ## Citation | الاقتباس
         url={https://huggingface.co/spaces/HeshamHaroon/Arabic-Function-Calling-Leaderboard}
     }
     ```
     """)
         # Stats row
         data = get_leaderboard_data()
+        evaluated = len([d for d in data if d.get("status") != "pending"])
+        pending = len([d for d in data if d.get("status") == "pending"])
         with gr.Row():
             gr.Markdown(f"""
             <div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
                 <div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">{len(data)}</div>
+                <div style="color: #666;">Total Models | إجمالي النماذج</div>
+            </div>
+            """)
+            gr.Markdown(f"""
+            <div style="text-align: center; padding: 15px; background: #fff3cd; border-radius: 8px;">
+                <div style="font-size: 2rem; font-weight: bold; color: #856404;">{pending}</div>
+                <div style="color: #856404;">⏳ Pending | قيد الانتظار</div>
             </div>
             """)
             gr.Markdown("""
                 <div style="color: #666;">Test Samples | عينات الاختبار</div>
             </div>
             """)
+        # Notice about pending evaluation
+        if pending > 0:
+            gr.Markdown(f"""
+            <div style="padding: 15px; background: #fff3cd; border: 1px solid #ffc107; border-radius: 8px; margin: 15px 0;">
+                ⏳ <strong>Evaluation in Progress | التقييم قيد التنفيذ</strong><br>
+                {pending} models are waiting to be evaluated. Results will be updated as evaluations complete.<br>
+                {pending} نموذج في انتظار التقييم. سيتم تحديث النتائج فور اكتمال التقييم.
             </div>
             """)
                     wrap=True,
                 )
+            with gr.TabItem("📋 Models | النماذج"):
+                create_models_list_tab()
             with gr.TabItem("📤 Submit | إرسال"):
                 create_submit_tab()