# leaderboard/utils.py
import json
import os
from datetime import datetime
import gradio as gr
import pandas as pd
from datasets import load_dataset
from envs import API, EVAL_REQUESTS_PATH, REQUEST_QUEUE_REPO, OWNER, RESULT_DATASET_NAME
custom_css = """
.gradio-container {
font-family: 'IBM Plex Sans', sans-serif;
}
.leaderboard-table {
margin-top: 20px;
}
h1 {
color: #2c3e50;
margin-bottom: 10px;
}
.description {
color: #7f8c8d;
margin-bottom: 30px;
}
.tab-nav button {
font-size: 16px;
font-weight: 600;
}
#leaderboard-table {
margin-top: 15px;
text-align: center;
}
#leaderboard-table th,
#leaderboard-table td {
text-align: center;
vertical-align: middle;
}
#leaderboard-table td:first-child,
#leaderboard-table th:first-child {
text-align: left;
max-width: 500px;
}
"""
LLM_BENCHMARKS_ABOUT_TEXT = f"""
# CRCIS LLM Leaderboard
This leaderboard tracks the performance of Large Language Models on the **CRCIS Benchmark**.
## Evaluation Details
- **Tasks**: Multiple-choice questions across multiple subjects including:
- Humanities - History
- Quran - General Information
- Quran - Tafsir
- **Evaluation Method**: 5-shot evaluation using lm-evaluation-harness
- **Metric**: Accuracy (%)
- **Dataset**: [{OWNER}/{RESULT_DATASET_NAME}](https://huggingface.co/datasets/{OWNER}/{RESULT_DATASET_NAME})
## Background and Goals
This leaderboard provides a comprehensive benchmarking system for evaluating LLMs on CRCIS-specific tasks.
The evaluation framework is based on the open-source [LM Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness),
offering a reliable platform for assessing model performance on specialized domain knowledge.
## Data Integrity
To maintain evaluation integrity and prevent overfitting, the full benchmark dataset is used for evaluation.
This approach ensures that results genuinely represent each model's capabilities.
"""
LLM_BENCHMARKS_SUBMIT_TEXT = """## Submitting a Model for Evaluation
To submit your model for evaluation, follow these steps:
1. **Ensure your model is on Hugging Face**: Your model must be publicly available on [Hugging Face](https://huggingface.co/).
2. **Submit Request**: Fill out the form below with your model's information and Hugging Face identifier.
3. **Evaluation Queue**: Submissions will be queued and processed. The evaluation may take some time depending on the queue.
4. **Results**: Once the evaluation is complete, your model's results will be emailed to you.
We appreciate your contributions to the LLM ecosystem!
"""
def load_results_from_hf_dataset():
"""Load results from HuggingFace dataset."""
try:
dataset = load_dataset(
f"{OWNER}/{RESULT_DATASET_NAME}",
split="results",
download_mode="force_redownload"
)
print(f"Loaded {len(dataset)} entries from the dataset.")
return pd.DataFrame(dataset)
except Exception as e:
print(f"Error loading dataset: {e}")
# Return sample data if loading fails
return pd.DataFrame([
{
"model": "Sample Model",
"overall": 75.5,
"precision": "fp16",
"#parameters (B)": 7.0
}
])
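# The returned frame is expected to have one row per evaluated model with at least the
# "model" and "overall" columns that prepare_leaderboard_data() relies on below (an
# assumption based on the fallback sample row above).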
def sort_dataframe_by_column(df, column_name):
"""Sort dataframe by specified column in descending order."""
if column_name not in df.columns:
raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.")
return df.sort_values(by=column_name, ascending=False).reset_index(drop=True)
def make_clickable_model(model_name):
"""Make model name clickable with link to HuggingFace."""
link = f"https://huggingface.co/{model_name}"
style = "color: var(--link-text-color); text-decoration: underline; text-decoration-style: dotted;"
return f'<a target="_blank" href="{link}" style="{style}">{model_name}</a>'
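# For example, make_clickable_model("org/model-name") yields an anchor tag pointing at
# https://huggingface.co/org/model-name, which renders as a link when the table column is
# displayed as HTML (an assumption about how the app configures its table component).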
def prepare_leaderboard_data(overall_score_column="overall", model_name_column="model"):
"""Prepare and format the leaderboard DataFrame."""
df = load_results_from_hf_dataset()
# Remove columns with parentheses (except those with #)
df = df[[col for col in df.columns if ("(" not in col and ")" not in col) or "#" in col]]
# Sort by overall score (descending)
df = sort_dataframe_by_column(df, overall_score_column)
# Make model names clickable
df[model_name_column] = df[model_name_column].apply(make_clickable_model)
return df
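# Sketch of how the prepared frame might be rendered (component wiring and column datatypes
# are assumptions, not taken from this file; it also assumes the model column comes first):
#
#     leaderboard_df = prepare_leaderboard_data()
#     gr.Dataframe(
#         value=leaderboard_df,
#         datatype=["html"] + ["number"] * (len(leaderboard_df.columns) - 1),
#         elem_id="leaderboard-table",
#     )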
def submit(model_name, model_id, contact_email):
"""Handle model submission to evaluation queue."""
if model_name == "" or model_id == "" or contact_email == "":
gr.Info("Please fill all the fields")
return
    # Validate the model id before touching the filesystem or the Hub.
    # gr.Error only surfaces in the UI when it is raised, not merely constructed.
    if "/" not in model_id:
        raise gr.Error("Model ID must be in the format 'username/model-name'")
    user_name, model_path = model_id.split("/", 1)

    try:
eval_entry = {
"model_name": model_name,
"model_id": model_id,
"contact_email": contact_email,
}
# Get the current timestamp to add to the filename
timestamp = datetime.now().strftime("%Y%m%d")
OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
os.makedirs(OUT_DIR, exist_ok=True)
# Add the timestamp to the filename
out_path = f"{OUT_DIR}/{user_name}_{model_path}_{timestamp}.json"
with open(out_path, "w") as f:
f.write(json.dumps(eval_entry))
print("Uploading eval file")
API.upload_file(
path_or_fileobj=out_path,
            # Upload relative to the local queue root so the repo path is "<user>/<file>.json"
            path_in_repo=os.path.relpath(out_path, EVAL_REQUESTS_PATH),
repo_id=REQUEST_QUEUE_REPO,
repo_type="dataset",
commit_message=f"Add {model_name} to eval queue",
)
gr.Info("Successfully submitted", duration=10)
# Remove the local file
os.remove(out_path)
    except Exception as e:
        # Re-raise as gr.Error so the failure is shown in the UI instead of being swallowed.
        raise gr.Error(f"Error submitting the model: {e}") from e
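# Sketch of how submit() might be wired into the Gradio app (labels and layout are
# assumptions; the real UI lives in the app entry point):
#
#     with gr.Blocks(css=custom_css) as demo:
#         model_name = gr.Textbox(label="Model name")
#         model_id = gr.Textbox(label="Hugging Face model ID (username/model-name)")
#         contact_email = gr.Textbox(label="Contact email")
#         gr.Button("Submit for evaluation").click(
#             submit, inputs=[model_name, model_id, contact_email]
#         )
#     demo.launch()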