Spaces:

HeshamHaroon
/

Arabic-Function-Calling-Leaderboard

Running

File size: 4,745 Bytes

566d03e

"""
Data Loader
===========

Load benchmark data and evaluation results.
"""

import json
import os
from typing import Dict, List, Optional
from pathlib import Path
from datasets import load_dataset


# Weight of each category in the weighted overall score.
# The non-zero weights below nominally add up to 1.0.
CATEGORY_WEIGHTS = {
    "simple": 0.15,
    "multiple": 0.10,
    "parallel": 0.10,
    "parallel_multiple": 0.10,
    "irrelevance": 0.15,
    "dialect_handling": 0.15,
    "multi_turn": 0.15,
    "native_arabic": 0.10,
    # Programming categories are listed with a weight of 0.0, so they add
    # nothing to the weighted overall score.
    **dict.fromkeys(("java", "javascript", "rest", "sql"), 0.0),
}


def load_benchmark(
    dataset_name: str = "HeshamHaroon/Arabic_Function_Calling",
    split: str = "test",
    category: Optional[str] = None
) -> List[Dict]:
    """
    Load benchmark samples, preferring the HuggingFace dataset.

    If ``load_dataset`` raises, the error is printed and a local
    ``arabic_fc_dataset/checkpoint.json`` (three directories above this
    file) is used instead when it exists; otherwise the original error
    is re-raised.

    Args:
        dataset_name: HuggingFace dataset repository
        split: Dataset split passed to ``load_dataset``
        category: Optional category filter

    Returns:
        List of sample dictionaries

    Raises:
        Exception: Whatever ``load_dataset`` raised, when no local
            checkpoint file is available.
    """
    try:
        rows = load_dataset(dataset_name, split=split)
    except Exception as exc:
        print(f"Error loading dataset: {exc}")
        # Fallback to local data
        checkpoint = Path(__file__).parent.parent.parent / "arabic_fc_dataset" / "checkpoint.json"
        if not checkpoint.exists():
            # No local copy either: surface the original loading error.
            raise
        with open(checkpoint, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)
        cached = payload.get('samples', [])
        if not category:
            return cached
        return [entry for entry in cached if entry.get('category') == category]

    selected = []
    for row in rows:
        record = {
            'id': row['id'],
            'query_en': row['query_en'],
            'query_ar': row['query_ar'],
        }

        # 'functions' and 'ground_truth' are JSON-decoded when non-empty.
        if row['functions']:
            record['functions'] = json.loads(row['functions'])
        else:
            record['functions'] = []

        if row['ground_truth']:
            record['ground_truth'] = json.loads(row['ground_truth'])
        else:
            record['ground_truth'] = None

        record['category'] = row['category']
        record['source'] = row.get('source', '')
        record['dialect'] = row.get('dialect', '')

        # The record is fully built before filtering, so every row is
        # still parsed even when it ends up being skipped.
        if category is not None and record['category'] != category:
            continue
        selected.append(record)

    return selected


def load_results(results_dir: str = "data/results") -> Dict[str, Dict]:
    """
    Load evaluation results for all models.

    Every ``*.json`` file directly inside ``results_dir`` is parsed, and
    the file name without its extension is used as the model name.

    Args:
        results_dir: Directory containing result JSON files

    Returns:
        Dictionary mapping model names to their results, in sorted file
        name order. Empty when ``results_dir`` is missing or is not a
        directory.

    Raises:
        json.JSONDecodeError: If a result file is not valid JSON.
    """
    results: Dict[str, Dict] = {}
    results_path = Path(results_dir)

    if not results_path.is_dir():
        return results

    # Path.glob yields entries in arbitrary filesystem order; sorting makes
    # the order of the returned mapping reproducible across machines.
    for file_path in sorted(results_path.glob("*.json")):
        if not file_path.is_file():
            # A directory that happens to be named "*.json" is not a result.
            continue
        with open(file_path, 'r', encoding='utf-8') as f:
            results[file_path.stem] = json.load(f)

    return results


def load_leaderboard(leaderboard_path: str = "data/leaderboard.json") -> List[Dict]:
    """
    Read the leaderboard file and return its entries, best score first.

    Args:
        leaderboard_path: Path of the leaderboard JSON file

    Returns:
        Entries ordered by descending 'overall' value (entries without
        one are treated as 0); an empty list when the file does not exist
    """
    def _overall(entry):
        return entry.get('overall', 0)

    source = Path(leaderboard_path)
    if source.exists():
        with open(source, 'r', encoding='utf-8') as handle:
            entries = json.load(handle)
        return sorted(entries, key=_overall, reverse=True)

    return []


def save_leaderboard(entries: List[Dict], leaderboard_path: str = "data/leaderboard.json") -> None:
    """
    Save leaderboard data to file.

    Entries are ordered by descending 'overall' value (missing values
    count as 0) and each written entry gets a 1-based 'rank'. The parent
    directory is created when needed.

    Args:
        entries: Leaderboard entries; they are not modified
        leaderboard_path: Destination JSON file
    """
    path = Path(leaderboard_path)
    path.parent.mkdir(parents=True, exist_ok=True)

    # Sort by overall score
    sorted_entries = sorted(entries, key=lambda x: x.get('overall', 0), reverse=True)

    # Rank shallow copies: sorted() only copies the list, so assigning
    # 'rank' on its items would write into the caller's dictionaries.
    ranked_entries = [
        {**entry, 'rank': position}
        for position, entry in enumerate(sorted_entries, 1)
    ]

    with open(path, 'w', encoding='utf-8') as f:
        json.dump(ranked_entries, f, ensure_ascii=False, indent=2)


def calculate_overall_score(category_scores: Dict[str, float]) -> float:
    """
    Combine per-category scores into one weighted average.

    Only categories that appear in both CATEGORY_WEIGHTS (with a positive
    weight) and ``category_scores`` take part; the result is normalised by
    the total weight of those categories.

    Args:
        category_scores: Dictionary mapping category names to scores (0-100)

    Returns:
        Overall weighted score (0-100), or 0.0 when no weighted category
        has a score
    """
    numerator = 0
    denominator = 0

    for name, share in CATEGORY_WEIGHTS.items():
        # Skip categories without a score and categories with no weight.
        if name not in category_scores or not share > 0:
            continue
        numerator += category_scores[name] * share
        denominator += share

    if denominator == 0:
        return 0.0
    return numerator / denominator


def get_category_stats(samples: List[Dict]) -> Dict[str, int]:
    """Count samples per category; samples without one count as 'unknown'."""
    counts: Dict[str, int] = {}
    for entry in samples:
        label = entry.get('category', 'unknown')
        if label in counts:
            counts[label] += 1
        else:
            counts[label] = 1
    return counts