import json
import os
from datetime import datetime

import gradio as gr
import pandas as pd
from datasets import load_dataset

from envs import API, EVAL_REQUESTS_PATH, OWNER, REQUEST_QUEUE_REPO, RESULT_DATASET_NAME
| custom_css = """ | |
| .gradio-container { | |
| font-family: 'IBM Plex Sans', sans-serif; | |
| } | |
| .leaderboard-table { | |
| margin-top: 20px; | |
| } | |
| h1 { | |
| color: #2c3e50; | |
| margin-bottom: 10px; | |
| } | |
| .description { | |
| color: #7f8c8d; | |
| margin-bottom: 30px; | |
| } | |
| .tab-nav button { | |
| font-size: 16px; | |
| font-weight: 600; | |
| } | |
| #leaderboard-table { | |
| margin-top: 15px; | |
| text-align: center; | |
| } | |
| #leaderboard-table th, | |
| #leaderboard-table td { | |
| text-align: center; | |
| vertical-align: middle; | |
| } | |
| #leaderboard-table td:first-child, | |
| #leaderboard-table th:first-child { | |
| text-align: left; | |
| max-width: 500px; | |
| } | |
| """ | |
| LLM_BENCHMARKS_ABOUT_TEXT = f""" | |
| # CRCIS LLM Leaderboard | |
| This leaderboard tracks the performance of Large Language Models on the **CRCIS Benchmark**. | |
| ## Evaluation Details | |
| - **Tasks**: Multiple-choice questions across multiple subjects including: | |
| - Humanities - History | |
| - Quran - General Information | |
| - Quran - Tafsir | |
| - **Evaluation Method**: 5-shot evaluation using lm-evaluation-harness | |
| - **Metric**: Accuracy (%) | |
| - **Dataset**: [{OWNER}/{RESULT_DATASET_NAME}](https://huggingface.co/datasets/{OWNER}/{RESULT_DATASET_NAME}) | |
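
For reference, a 5-shot run of this kind can be launched through the harness's Python API roughly as follows (the model and task names below are illustrative placeholders, not the exact CRCIS task identifiers):

```python
from lm_eval import simple_evaluate

# Sketch only: task names are placeholders, not the real CRCIS task IDs.
results = simple_evaluate(
    model="hf",
    model_args="pretrained=<model-id>",
    tasks=["crcis_history", "crcis_quran_general", "crcis_quran_tafsir"],
    num_fewshot=5,
)
```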

## Background and Goals

This leaderboard provides a comprehensive benchmarking system for evaluating LLMs on CRCIS-specific tasks.
The evaluation framework is based on the open-source [LM Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness),
offering a reliable platform for assessing model performance on specialized domain knowledge.

## Data Integrity

To maintain evaluation integrity and prevent overfitting, the full benchmark dataset is used for evaluation.
This approach ensures that results genuinely represent each model's capabilities.
"""
| LLM_BENCHMARKS_SUBMIT_TEXT = """## Submitting a Model for Evaluation | |
| To submit your model for evaluation, follow these steps: | |
| 1. **Ensure your model is on Hugging Face**: Your model must be publicly available on [Hugging Face](https://huggingface.co/). | |
| 2. **Submit Request**: Fill out the form below with your model's information and Hugging Face identifier. | |
| 3. **Evaluation Queue**: Submissions will be queued and processed. The evaluation may take some time depending on the queue. | |
| 4. **Results**: Once the evaluation is complete, your model's results will be E-mailed to you. | |
| We appreciate your contributions to the LLM ecosystem! | |
| """ | |
def load_results_from_hf_dataset():
    """Load results from the Hugging Face results dataset."""
    try:
        dataset = load_dataset(
            f"{OWNER}/{RESULT_DATASET_NAME}",
            split="results",
            download_mode="force_redownload",
        )
        print(f"Loaded {len(dataset)} entries from the dataset.")
        return pd.DataFrame(dataset)
    except Exception as e:
        print(f"Error loading dataset: {e}")
        # Fall back to sample data if loading fails
        return pd.DataFrame([
            {
                "model": "Sample Model",
                "overall": 75.5,
                "precision": "fp16",
                "#parameters (B)": 7.0,
            }
        ])
def sort_dataframe_by_column(df, column_name):
    """Sort the dataframe by the specified column in descending order."""
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.")
    return df.sort_values(by=column_name, ascending=False).reset_index(drop=True)
def make_clickable_model(model_name):
    """Turn a model name into a link to its Hugging Face page."""
    link = f"https://huggingface.co/{model_name}"
    style = "color: var(--link-text-color); text-decoration: underline; text-decoration-style: dotted;"
    return f'<a target="_blank" href="{link}" style="{style}">{model_name}</a>'
def prepare_leaderboard_data(overall_score_column="overall", model_name_column="model"):
    """Prepare and format the leaderboard DataFrame."""
    df = load_results_from_hf_dataset()
    # Drop columns whose names contain parentheses, except those marked with #
    df = df[[col for col in df.columns if ("(" not in col and ")" not in col) or "#" in col]]
    # Sort by overall score (descending)
    df = sort_dataframe_by_column(df, overall_score_column)
    # Make model names clickable
    df[model_name_column] = df[model_name_column].apply(make_clickable_model)
    return df
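
# Note: after prepare_leaderboard_data(), the "model" column holds raw HTML
# anchors, so whichever component displays the frame has to render that column
# as HTML; with gr.Dataframe that would look roughly like:
#
#     df = prepare_leaderboard_data()
#     gr.Dataframe(value=df, datatype=["html"] + ["str"] * (len(df.columns) - 1))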
def submit(model_name, model_id, contact_email):
    """Handle a model submission to the evaluation queue."""
    if model_name == "" or model_id == "" or contact_email == "":
        gr.Info("Please fill in all fields")
        return
    if "/" not in model_id:
        raise gr.Error("Model ID must be in the format 'username/model-name'")
    user_name, model_path = model_id.split("/", 1)
    try:
        eval_entry = {
            "model_name": model_name,
            "model_id": model_id,
            "contact_email": contact_email,
        }
        # Add the current date to the filename to keep submissions distinct
        timestamp = datetime.now().strftime("%Y%m%d")
        out_dir = f"{EVAL_REQUESTS_PATH}/{user_name}"
        os.makedirs(out_dir, exist_ok=True)
        out_path = f"{out_dir}/{user_name}_{model_path}_{timestamp}.json"
        with open(out_path, "w") as f:
            json.dump(eval_entry, f)
        print("Uploading eval file")
        API.upload_file(
            path_or_fileobj=out_path,
            path_in_repo=out_path.split("eval-queue/")[1],
            repo_id=REQUEST_QUEUE_REPO,
            repo_type="dataset",
            commit_message=f"Add {model_name} to eval queue",
        )
        gr.Info("Successfully submitted", duration=10)
        # Clean up the local copy once it has been uploaded
        os.remove(out_path)
    except Exception as e:
        # gr.Error must be raised, not just constructed, to surface in the UI
        raise gr.Error(f"Error submitting the model: {e}")