from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str


# Init: to update with your specific keys
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    task0 = Task("task_name1", "metric_name", "First task")
    task1 = Task("task_name2", "metric_name", "Second task")
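
# Illustrative sketch (an assumption on our part, not from the original file):
# each model's results JSON is expected to use the benchmark/metric names above
# as keys, roughly:
#   {"results": {"task_name1": {"metric_name": 0.83},
#                "task_name2": {"metric_name": 0.71}}}
# so the enum can be iterated to pull scores into leaderboard columns:
#   for task in Tasks:
#       score = results["results"][task.value.benchmark][task.value.metric]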

# Your leaderboard name
TITLE = """<h1 align="center" id="space-title">π Auto Arena of LLMs</h1>"""

# subtitle
SUB_TITLE = """<h2 align="center" id="space-title">Automating LLM Evaluations with Agent Peer-battles and Committee Discussions</h2>"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
This leaderboard is produced by a fully automated large language model (LLM) evaluation framework that employs LLM agents in peer battles and committee discussions.
You can find more details on the [project page](https://auto-arena.github.io/) and in our [paper]().
"""
# For additional details such as datasets, evaluation criteria, and reproducibility, please refer to the "π About" tab.
# Stay tuned for the *SeaBench leaderboard* - focusing on evaluating the model's ability to respond to general human instructions in real-world multi-turn settings.
# """

# Which evaluations are you running? How can people reproduce what you have?
LLM_BENCHMARKS_TEXT = """
"""
# You can find the detailed numerical results in the results Hugging Face dataset: https://huggingface.co/datasets/SeaLLMs/SeaExam-results

EVALUATION_QUEUE_TEXT = """
"""

CITATION_BUTTON_LABEL = ""
CITATION_BUTTON_TEXT = r"""
"""

CONTACT_TEXT = """
## Contact
"""