import json
import os
from datetime import datetime

import gradio as gr
import pandas as pd
from datasets import load_dataset

from envs import API, EVAL_REQUESTS_PATH, REQUEST_QUEUE_REPO, OWNER, RESULT_DATASET_NAME

custom_css = """
.gradio-container {
    font-family: 'IBM Plex Sans', sans-serif;
}

.leaderboard-table {
    margin-top: 20px;
}

h1 {
    color: #2c3e50;
    margin-bottom: 10px;
}

.description {
    color: #7f8c8d;
    margin-bottom: 30px;
}

.tab-nav button {
    font-size: 16px;
    font-weight: 600;
}

#leaderboard-table {
    margin-top: 15px;
    text-align: center;
}

#leaderboard-table th,
#leaderboard-table td {
    text-align: center;
    vertical-align: middle;
}

#leaderboard-table td:first-child,
#leaderboard-table th:first-child {
    text-align: left;
    max-width: 500px;
}
"""

LLM_BENCHMARKS_ABOUT_TEXT = f"""
# CRCIS LLM Leaderboard

This leaderboard tracks the performance of Large Language Models on the **CRCIS Benchmark**.

## Evaluation Details

- **Tasks**: Multiple-choice questions across multiple subjects, including:
  - Humanities
  - History
  - Quran
  - General Information
  - Tafsir
- **Evaluation Method**: 5-shot evaluation using lm-evaluation-harness
- **Metric**: Accuracy (%)
- **Dataset**: [{OWNER}/{RESULT_DATASET_NAME}](https://huggingface.co/datasets/{OWNER}/{RESULT_DATASET_NAME})

## Background and Goals

This leaderboard provides a comprehensive benchmarking system for evaluating LLMs on CRCIS-specific tasks. The evaluation framework is based on the open-source [LM Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness), offering a reliable platform for assessing model performance on specialized domain knowledge.

## Data Integrity

To maintain evaluation integrity and prevent overfitting, the full benchmark dataset is used for evaluation. This approach ensures that results genuinely represent each model's capabilities.
"""

LLM_BENCHMARKS_SUBMIT_TEXT = """## Submitting a Model for Evaluation

To submit your model for evaluation, follow these steps:

1. **Ensure your model is on Hugging Face**: Your model must be publicly available on [Hugging Face](https://huggingface.co/).
2. **Submit Request**: Fill out the form below with your model's information and Hugging Face identifier.
3. **Evaluation Queue**: Submissions are queued and processed in order; evaluation time depends on the length of the queue.
4. **Results**: Once the evaluation is complete, your model's results will be emailed to you.

We appreciate your contributions to the LLM ecosystem!
"""
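# The About text above says results come from 5-shot runs with
# lm-evaluation-harness. As a rough sketch of how such a run could be
# launched (the task name "crcis_benchmark" is a placeholder assumption,
# not something defined in this repo):
#
#   lm_eval --model hf \
#       --model_args pretrained=username/model-name,dtype=float16 \
#       --tasks crcis_benchmark \
#       --num_fewshot 5 \
#       --batch_size 8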
""" def load_results_from_hf_dataset(): """Load results from HuggingFace dataset.""" try: dataset = load_dataset( f"{OWNER}/{RESULT_DATASET_NAME}", split="results", download_mode="force_redownload" ) print(f"Loaded {len(dataset)} entries from the dataset.") return pd.DataFrame(dataset) except Exception as e: print(f"Error loading dataset: {e}") # Return sample data if loading fails return pd.DataFrame([ { "model": "Sample Model", "overall": 75.5, "precision": "fp16", "#parameters (B)": 7.0 } ]) def sort_dataframe_by_column(df, column_name): """Sort dataframe by specified column in descending order.""" if column_name not in df.columns: raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.") return df.sort_values(by=column_name, ascending=False).reset_index(drop=True) def make_clickable_model(model_name): """Make model name clickable with link to HuggingFace.""" link = f"https://huggingface.co/{model_name}" style = "color: var(--link-text-color); text-decoration: underline; text-decoration-style: dotted;" return f'{model_name}' def prepare_leaderboard_data(overall_score_column="overall", model_name_column="model"): """Prepare and format the leaderboard DataFrame.""" df = load_results_from_hf_dataset() # Remove columns with parentheses (except those with #) df = df[[col for col in df.columns if ("(" not in col and ")" not in col) or "#" in col]] # Sort by overall score (descending) df = sort_dataframe_by_column(df, overall_score_column) # Make model names clickable df[model_name_column] = df[model_name_column].apply(make_clickable_model) return df def submit(model_name, model_id, contact_email): """Handle model submission to evaluation queue.""" if model_name == "" or model_id == "" or contact_email == "": gr.Info("Please fill all the fields") return try: user_name = "" if "/" in model_id: user_name = model_id.split("/")[0] model_path = model_id.split("/")[1] else: gr.Error("Model ID must be in the format 'username/model-name'") return eval_entry = { "model_name": model_name, "model_id": model_id, "contact_email": contact_email, } # Get the current timestamp to add to the filename timestamp = datetime.now().strftime("%Y%m%d") OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}" os.makedirs(OUT_DIR, exist_ok=True) # Add the timestamp to the filename out_path = f"{OUT_DIR}/{user_name}_{model_path}_{timestamp}.json" with open(out_path, "w") as f: f.write(json.dumps(eval_entry)) print("Uploading eval file") API.upload_file( path_or_fileobj=out_path, path_in_repo=out_path.split("eval-queue/")[1], repo_id=REQUEST_QUEUE_REPO, repo_type="dataset", commit_message=f"Add {model_name} to eval queue", ) gr.Info("Successfully submitted", duration=10) # Remove the local file os.remove(out_path) except Exception as e: gr.Error(f"Error submitting the model: {e}")