# Leaderboard task definitions and display texts (Hugging Face Space).
from dataclasses import dataclass
from enum import Enum


# BUG FIX: `dataclass` was imported but the decorator was missing, so
# `Task(...)` calls below (in the Tasks enum) would fail with TypeError
# because no __init__ is generated for the annotated fields.
@dataclass
class Task:
    """One leaderboard column: a benchmark, the metric read from its
    results, and the human-readable column header."""

    benchmark: str  # benchmark key as it appears in the results files
    metric: str     # metric name to extract (here always "aggregate_score")
    col_name: str   # display name for the leaderboard column
# Init: to update with your specific keys — one enum member per leaderboard column.
class Tasks(Enum):
    """Registry of all benchmarks shown on the leaderboard.

    Each member wraps a Task(benchmark_key, metric, column_header).
    Member order determines enum iteration order, so it also fixes the
    order in which columns can be built from this registry.
    """

    task0 = Task("bbq", "aggregate_score", "Prejudiced Answers: BBQ")
    task1 = Task("bold", "aggregate_score", "Biased Completions: BOLD")
    task2 = Task("toxicity", "aggregate_score", "Toxic Completions of Benign Text: RealToxicityPrompts")
    task3 = Task("toxicity_advbench", "aggregate_score", "Following Harmful Instructions: AdvBench")
    task4 = Task("forecasting_consistency", "aggregate_score", "Monotonicity Checks")
    task5 = Task("self_check_consistency", "aggregate_score", "Self-Check Consistency")
    task6 = Task("boolq_contrast_robustness", "aggregate_score", "BoolQ Contrast Set")
    task7 = Task("imdb_contrast_robustness", "aggregate_score", "IMDB Contrast Set")
    task8 = Task("calibration_big_bench", "aggregate_score", "Logit Calibration: BIG-Bench")
    task9 = Task("calibration_big_bench_i_know", "aggregate_score", "Self-Assessment: TriviaQA")
    task10 = Task("decoding_trust", "aggregate_score", "Income Fairness: DecodingTrust")
    task11 = Task("hellaswag", "aggregate_score", "Common Sense Reasoning: HellaSwag")
    task12 = Task("human_eval", "aggregate_score", "Coding: HumanEval")
    task13 = Task("instruction_goal_hijacking", "aggregate_score", "Goal Hijacking and Prompt Leakage")
    task14 = Task("multiturn_goal_hijacking", "aggregate_score", "Rule Following")
    task15 = Task("reddit_bias", "aggregate_score", "Representation Bias: RedditBias")
    task16 = Task("truthful_qa_mc2", "aggregate_score", "Truthfulness: TruthfulQA MC2")
    task17 = Task("mmlu", "aggregate_score", "General Knowledge: MMLU")
    task18 = Task("ai2_reasoning", "aggregate_score", "Reasoning: AI2 Reasoning Challenge")
    task19 = Task("human_deception", "aggregate_score", "Denying Human Presence")
    task20 = Task("memorization", "aggregate_score", "Copyrighted Material Memorization")
    task21 = Task("privacy", "aggregate_score", "PII Extraction by Association")
    task22 = Task("fairllm", "aggregate_score", "Recommendation Consistency: FaiRLLM")
    task23 = Task("mmlu_robustness", "aggregate_score", "MMLU: Robustness")
    # Deliberately disabled benchmark; kept for reference so the numbering gap is explained.
    # task24 = Task("training_data_suitability", "aggregate_score", "Training Data Suitability")
    task24 = Task("watermarking", "aggregate_score", "Watermark Reliability & Robustness")
    task25 = Task("dataset_bias", "aggregate_score", "Bias of the Dataset")
    task26 = Task("dataset_toxicity", "aggregate_score", "Toxicity of the Dataset")
# Leaderboard page heading, rendered as raw HTML by the UI.
TITLE = '<h1 align="center" id="space-title">EU AI Act Compliance Leaderboard</h1>'
# Which evaluations are you running? How can people reproduce what you have?
# Placeholder text (currently just a newline) to be filled in with the
# benchmark description and reproduction instructions.
# FIX: dropped the needless f-prefix — the string has no placeholders (ruff F541).
LLM_BENCHMARKS_TEXT = """
"""
# Text shown next to the model-submission queue; placeholder (a single
# newline) awaiting real submission instructions.
EVALUATION_QUEUE_TEXT = "\n"
# Label shown on the UI button that copies the citation snippet below.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
# BibTeX entry for the COMPL-AI paper; raw string so TeX escapes like \'{c}
# survive verbatim.
CITATION_BUTTON_TEXT = r"""
@article{complai24,
      title={COMPL-AI Framework: A Technical Interpretation and LLM Benchmarking Suite for the EU Artificial Intelligence Act},
      author={Philipp Guldimann and Alexander Spiridonov and Robin Staab and Nikola Jovanovi\'{c} and Mark Vero and Velko Vechev and Anna Gueorguieva and Mislav Balunovi\'{c} and Nikola Konstantinov and Pavol Bielik and Petar Tsankov and Martin Vechev},
      year={2024},
      eprint={2410.07959},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2410.07959},
}
"""