import json
import os
from datetime import datetime

import gradio as gr
import pandas as pd
from datasets import load_dataset

from envs import API, EVAL_REQUESTS_PATH, REQUEST_QUEUE_REPO, OWNER, RESULT_DATASET_NAME

custom_css = """
.gradio-container {
    font-family: 'IBM Plex Sans', sans-serif;
}

.leaderboard-table {
    margin-top: 20px;
}

h1 {
    color: #2c3e50;
    margin-bottom: 10px;
}

.description {
    color: #7f8c8d;
    margin-bottom: 30px;
}

.tab-nav button {
    font-size: 16px;
    font-weight: 600;
}

#leaderboard-table {
    margin-top: 15px;
    text-align: center;
}

#leaderboard-table th,
#leaderboard-table td {
    text-align: center;
    vertical-align: middle;
}

#leaderboard-table td:first-child,
#leaderboard-table th:first-child {
    text-align: left;
    max-width: 500px;
}
"""

LLM_BENCHMARKS_ABOUT_TEXT = f"""
# CRCIS LLM Leaderboard

This leaderboard tracks the performance of Large Language Models on the **CRCIS Benchmark**.

## Evaluation Details
- **Tasks**: Multiple-choice questions across multiple subjects including:
  - Humanities - History
  - Quran - General Information
  - Quran - Tafsir
- **Evaluation Method**: 5-shot evaluation using lm-evaluation-harness
- **Metric**: Accuracy (%)
- **Dataset**: [{OWNER}/{RESULT_DATASET_NAME}](https://huggingface.co/datasets/{OWNER}/{RESULT_DATASET_NAME})

## Background and Goals

This leaderboard provides a comprehensive benchmarking system for evaluating LLMs on CRCIS-specific tasks. 
The evaluation framework is based on the open-source [LM Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness), 
offering a reliable platform for assessing model performance on specialized domain knowledge.

## Data Integrity

To maintain evaluation integrity and prevent overfitting, the full benchmark dataset is used for evaluation.
This approach ensures that results genuinely represent each model's capabilities.

"""

LLM_BENCHMARKS_SUBMIT_TEXT = """## Submitting a Model for Evaluation

To submit your model for evaluation, follow these steps:

1. **Ensure your model is on Hugging Face**: Your model must be publicly available on [Hugging Face](https://huggingface.co/).

2. **Submit Request**: Fill out the form below with your model's information and Hugging Face identifier.

3. **Evaluation Queue**: Submissions will be queued and processed. The evaluation may take some time depending on the queue.

4. **Results**: Once the evaluation is complete, your model's results will be emailed to you.

We appreciate your contributions to the LLM ecosystem!
"""


def load_results_from_hf_dataset():
    """Load results from HuggingFace dataset."""
    try:
        dataset = load_dataset(
            f"{OWNER}/{RESULT_DATASET_NAME}", 
            split="results",
            download_mode="force_redownload"
        )
        print(f"Loaded {len(dataset)} entries from the dataset.")
        return pd.DataFrame(dataset)
    except Exception as e:
        print(f"Error loading dataset: {e}")
        # Return sample data if loading fails
        return pd.DataFrame([
            {
                "model": "Sample Model", 
                "overall": 75.5,
                "precision": "fp16",
                "#parameters (B)": 7.0
            }
        ])


def sort_dataframe_by_column(df, column_name):
    """Sort dataframe by specified column in descending order."""
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.")
    return df.sort_values(by=column_name, ascending=False).reset_index(drop=True)


def make_clickable_model(model_name):
    """Make model name clickable with link to HuggingFace."""
    link = f"https://huggingface.co/{model_name}"
    style = "color: var(--link-text-color); text-decoration: underline; text-decoration-style: dotted;"
    return f'<a target="_blank" href="{link}" style="{style}">{model_name}</a>'


def prepare_leaderboard_data(overall_score_column="overall", model_name_column="model"):
    """Prepare and format the leaderboard DataFrame."""
    df = load_results_from_hf_dataset()
    
    # Remove columns with parentheses (except those with #)
    df = df[[col for col in df.columns if ("(" not in col and ")" not in col) or "#" in col]]
    
    # Sort by overall score (descending)
    df = sort_dataframe_by_column(df, overall_score_column)
    
    # Make model names clickable
    df[model_name_column] = df[model_name_column].apply(make_clickable_model)
    
    return df


def submit(model_name, model_id, contact_email):
    """Handle model submission to evaluation queue."""
    if model_name == "" or model_id == "" or contact_email == "":
        gr.Info("Please fill all the fields")
        return

    try:
        user_name = ""
        if "/" in model_id:
            user_name = model_id.split("/")[0]
            model_path = model_id.split("/")[1]
        else:
            gr.Error("Model ID must be in the format 'username/model-name'")
            return

        eval_entry = {
            "model_name": model_name,
            "model_id": model_id,
            "contact_email": contact_email,
        }

        # Date stamp (YYYYMMDD) appended to the filename
        timestamp = datetime.now().strftime("%Y%m%d")

        out_dir = f"{EVAL_REQUESTS_PATH}/{user_name}"
        os.makedirs(out_dir, exist_ok=True)

        # One request file per user/model/day
        out_path = f"{out_dir}/{user_name}_{model_path}_{timestamp}.json"

        with open(out_path, "w") as f:
            f.write(json.dumps(eval_entry))

        print("Uploading eval file")
        API.upload_file(
            path_or_fileobj=out_path,
            # Assumes the local eval-requests path contains an "eval-queue/" segment;
            # everything after it becomes the file's path inside the requests repo
            path_in_repo=out_path.split("eval-queue/")[1],
            repo_id=REQUEST_QUEUE_REPO,
            repo_type="dataset",
            commit_message=f"Add {model_name} to eval queue",
        )

        gr.Info("Successfully submitted", duration=10)
        # Remove the local file
        os.remove(out_path)
    except Exception as e:
        # gr.Error is an exception: raise it so Gradio shows the failure in the UI
        raise gr.Error(f"Error submitting the model: {e}") from e