diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f7f7ed4d82f04224440a0d164d2cc24c0e758990 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/README.md @@ -0,0 +1,50 @@ +# AfriMMLU + +### Paper + +IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models +https://arxiv.org/pdf/2406.03368 + +IrokoBench is a human-translated benchmark dataset for 16 typologically diverse +low-resource African languages covering three tasks: natural language inference (AfriXNLI), +mathematical reasoning (AfriMGSM), and multi-choice knowledge-based QA (AfriMMLU). + + +### Citation + +``` +@misc{adelani2024irokobenchnewbenchmarkafrican, + title={IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models}, + author={David Ifeoluwa Adelani and Jessica Ojo and Israel Abebe Azime and Jian Yun Zhuang and Jesujoba O. 
Alabi and Xuanli He and Millicent Ochieng and Sara Hooker and Andiswa Bukula and En-Shiun Annie Lee and Chiamaka Chukwuneke and Happy Buzaaba and Blessing Sibanda and Godson Kalipe and Jonathan Mukiibi and Salomon Kabongo and Foutse Yuehgoh and Mmasibidi Setaka and Lolwethu Ndolela and Nkiruka Odu and Rooweither Mabuya and Shamsuddeen Hassan Muhammad and Salomey Osei and Sokhar Samb and Tadesse Kebede Guge and Pontus Stenetorp}, + year={2024}, + eprint={2406.03368}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2406.03368}, +} +``` + +### Groups and Tasks + +#### Groups + +* `afrimmlu`: All afrimmlu tasks +* `afrimmlu_direct`: afrimmlu_direct evaluates models' performance on the curated dataset +* `afrimmlu_translate`: afrimmlu_translate evaluates models in the translate-test setting + +#### Tasks +* `afrimmlu_direct_{language_code}`: each task evaluates for one language +* `afrimmlu_translate_{language_code}`: each task evaluates for one language + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + +If other tasks on this dataset are already supported: +* [x] Is the "Main" variant of this task clearly denoted? +* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [x] Have you noted which, if any, published evaluation setups are matched by this variant? 
+ * [x] Checked for equivalence with v0.3.0 LM Evaluation Harness diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_common_yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_common_yaml new file mode 100644 index 0000000000000000000000000000000000000000..2cda741a7e757bd28010b916e20d0c9ee11fc989 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_common_yaml @@ -0,0 +1,37 @@ +group: + - afrimmlu + - afrimmlu_direct +task: null +dataset_path: masakhane/afrimmlu +dataset_name: null +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: validation +doc_to_text: !function utils.doc_to_text +doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}" +doc_to_choice: !function utils.doc_to_choice +should_decontaminate: true +doc_to_decontamination_query: "Question: {{question}}\nAnswer:" +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_amh.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aa60c668fd9b2879f020f990655e7eedce2b3a81 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_amh.yaml @@ -0,0 +1,3 @@ +dataset_name: amh +include: afrimmlu_common_yaml +task: afrimmlu_direct_amh diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_eng.yaml 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a1e647cdf1d0278c73744288fa61cd7709550231 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_eng.yaml @@ -0,0 +1,3 @@ +dataset_name: eng +include: afrimmlu_common_yaml +task: afrimmlu_direct_eng diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ewe.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1cc45ddc0e50d1bb4992aecdb4f5208dbb77881b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ewe.yaml @@ -0,0 +1,3 @@ +dataset_name: ewe +include: afrimmlu_common_yaml +task: afrimmlu_direct_ewe diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ibo.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6abb2c4a467986751376679b31ec5db8a7af0886 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ibo.yaml @@ -0,0 +1,3 @@ +dataset_name: ibo +include: afrimmlu_common_yaml +task: afrimmlu_direct_ibo diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_kin.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2f81f709c4812db3ecfa71bbb9cfb74099a10aab --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_kin.yaml @@ -0,0 +1,3 @@ +dataset_name: kin +include: afrimmlu_common_yaml +task: afrimmlu_direct_kin diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_twi.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2695d4a156d4b59dbb2c483ebdbbc16e01c7a415 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_twi.yaml @@ -0,0 +1,3 @@ +dataset_name: twi +include: afrimmlu_common_yaml +task: afrimmlu_direct_twi diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_wol.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..027f837637fb061d227d33e925d3030af51c3cbe --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_wol.yaml @@ -0,0 +1,3 @@ +dataset_name: wol +include: afrimmlu_common_yaml +task: afrimmlu_direct_wol diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_xho.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8e0c12972d01be342a6838b0eab4c1f609d6dc48 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_xho.yaml @@ -0,0 +1,3 @@ +dataset_name: xho +include: afrimmlu_common_yaml +task: afrimmlu_direct_xho diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_yor.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2a9f7645c2259a607f871e54b07c14ab962ed04c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_yor.yaml @@ -0,0 +1,3 @@ +dataset_name: yor +include: afrimmlu_common_yaml 
+task: afrimmlu_direct_yor diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_zul.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9d8d3b415b44ef4ab0b762f411006c7b00d54226 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_zul.yaml @@ -0,0 +1,3 @@ +dataset_name: zul +include: afrimmlu_common_yaml +task: afrimmlu_direct_zul diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/fewshot.sh b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/fewshot.sh new file mode 100644 index 0000000000000000000000000000000000000000..c69c48d7dff4e2495485023187dc162742c7ca6a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/fewshot.sh @@ -0,0 +1,8 @@ +lm_eval --model hf \ + --model_args pretrained=masakhane/African-ultrachat-alpaca \ + --tasks afrimmlu_direct_amh,afrimmlu_direct_eng,afrimmlu_direct_ewe,afrimmlu_direct_fra,afrimmlu_direct_hau,afrimmlu_direct_ibo,afrimmlu_direct_kin,afrimmlu_direct_lin,afrimmlu_direct_lug,afrimmlu_direct_orm,afrimmlu_direct_sna,afrimmlu_direct_sot,afrimmlu_direct_twi,afrimmlu_direct_wol,afrimmlu_direct_xho,afrimmlu_direct_yor,afrimmlu_direct_zul \ + --device cuda:0 \ + --batch_size 1 \ + --num_fewshot 0 \ + --verbosity DEBUG \ + --wandb_args project=afrimmlu diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/utils.py b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9d02b342b2e3c9f3d3bd66d3f62330aa53c9159c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/utils.py @@ -0,0 +1,32 @@ +from lm_eval.utils import weighted_f1_score + + +def doc_to_choice(doc): + choices = eval(doc["choices"]) + return choices + + +def doc_to_text(doc): + output = """You are a highly 
knowledgeable and intelligent artificial intelligence + model answers multiple-choice questions about '{subject}' + + Question: '''{question}''' + + Choices: + A: ''{choice1}''' + B: ''{choice2}''' + C: ''{choice3}''' + D: ''{choice4}''' + + Answer: """ + + choices = eval(doc["choices"]) + text = output.format( + subject=doc["subject"], + question=doc["question"], + choice1=choices[0], + choice2=choices[1], + choice3=choices[2], + choice4=choices[3], + ) + return text diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/README.md new file mode 100644 index 0000000000000000000000000000000000000000..091b8bb6e26e6584a1ec19afd02331b36f604ad9 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/README.md @@ -0,0 +1,127 @@ +# C-Eval (Validation) + +### Paper +C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models +https://arxiv.org/pdf/2305.08322.pdf + +C-Eval is a comprehensive Chinese evaluation suite for foundation models. +It consists of 13948 multi-choice questions spanning 52 diverse disciplines +and four difficulty levels. 
+ +Homepage: https://cevalbenchmark.com/ + +### Citation + +```bibtex +@article{huang2023ceval, + title={C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models}, + author={Huang, Yuzhen and Bai, Yuzhuo and Zhu, Zhihao and Zhang, Junlei and Zhang, Jinghan and Su, Tangjun and Liu, Junteng and Lv, Chuancheng and Zhang, Yikai and Lei, Jiayi and Fu, Yao and Sun, Maosong and He, Junxian}, + journal={arXiv preprint arXiv:2305.08322}, + year={2023} +} +``` + + +SUBJECTS = { + "computer_network":"计算机网络", + "operating_system":"操作系统", + "computer_architecture":"计算机组成", + "college_programming":"大学编程", + "college_physics":"大学物理", + "college_chemistry":"大学化学", + "advanced_mathematics":"高等数学", + "probability_and_statistics":"概率统计", + "discrete_mathematics":"离散数学", + "electrical_engineer":"注册电气工程师", + "metrology_engineer":"注册计量师", + "high_school_mathematics":"高中数学", + "high_school_physics":"高中物理", + "high_school_chemistry":"高中化学", + "high_school_biology":"高中生物", + "middle_school_mathematics":"初中数学", + "middle_school_biology":"初中生物", + "middle_school_physics":"初中物理", + "middle_school_chemistry":"初中化学", + "veterinary_medicine":"兽医学", + "college_economics":"大学经济学", + "business_administration":"工商管理", + "marxism":"马克思主义基本原理", + "mao_zedong_thought":"毛泽东思想和中国特色社会主义理论体系概论", + "education_science":"教育学", + "teacher_qualification":"教师资格", + "high_school_politics":"高中政治", + "high_school_geography":"高中地理", + "middle_school_politics":"初中政治", + "middle_school_geography":"初中地理", + "modern_chinese_history":"近代史纲要", + "ideological_and_moral_cultivation":"思想道德修养与法律基础", + "logic":"逻辑学", + "law":"法学", + "chinese_language_and_literature":"中国语言文学", + "art_studies":"艺术学", + "professional_tour_guide":"导游资格", + "legal_professional":"法律职业资格", + "high_school_chinese":"高中语文", + "high_school_history":"高中历史", + "middle_school_history":"初中历史", + "civil_servant":"公务员", + "sports_science":"体育学", + "plant_protection":"植物保护", + "basic_medicine":"基础医学", + "clinical_medicine":"临床医学", + 
"urban_and_rural_planner":"注册城乡规划师", + "accountant":"注册会计师", + "fire_engineer":"注册消防工程师", + "environmental_impact_assessment_engineer":"环境影响评价工程师", + "tax_accountant":"税务师", + "physician":"医师资格" +} + + +# CMMLU + +### Paper + +CMMLU: Measuring massive multitask language understanding in Chinese +https://arxiv.org/abs/2306.09212 + +CMMLU is a comprehensive evaluation benchmark specifically designed to evaluate the knowledge and reasoning abilities of LLMs within the context of Chinese language and culture. +CMMLU covers a wide range of subjects, comprising 67 topics that span from elementary to advanced professional levels. + +Homepage: https://github.com/haonan-li/CMMLU + +### Citation + +```bibtex +@misc{li2023cmmlu, + title={CMMLU: Measuring massive multitask language understanding in Chinese}, + author={Haonan Li and Yixuan Zhang and Fajri Koto and Yifei Yang and Hai Zhao and Yeyun Gong and Nan Duan and Timothy Baldwin}, + year={2023}, + eprint={2306.09212}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` + +### Groups and Tasks + +#### Groups + +- `ceval-valid`: All 52 subjects of the C-Eval dataset, evaluated following the methodology in MMLU's original implementation. This implementation consists solely of the validation set of C-Eval, as the test set requires submission of model predictions to an external site. + +#### Tasks + + +The following tasks evaluate subjects in the C-Eval dataset using loglikelihood-based multiple-choice scoring: +- `ceval-valid_{subject_english}` + +### Checklist + +* [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? + +If other tasks on this dataset are already supported: +* [x] Is the "Main" variant of this task clearly denoted? +* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates? 
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/_ceval-valid.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/_ceval-valid.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5ce636dab975ed507cd1112751183d2bae3779f0 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/_ceval-valid.yaml @@ -0,0 +1,63 @@ +aggregate_metric_list: +- aggregation: mean + metric: acc + weight_by_size: true +- aggregation: mean + metric: acc_norm + weight_by_size: true +group: ceval-valid +metadata: + version: 2.0 +task: + - ceval-valid_computer_network + - ceval-valid_operating_system + - ceval-valid_computer_architecture + - ceval-valid_college_programming + - ceval-valid_college_physics + - ceval-valid_college_chemistry + - ceval-valid_advanced_mathematics + - ceval-valid_probability_and_statistics + - ceval-valid_discrete_mathematics + - ceval-valid_electrical_engineer + - ceval-valid_metrology_engineer + - ceval-valid_high_school_mathematics + - ceval-valid_high_school_physics + - ceval-valid_high_school_chemistry + - ceval-valid_high_school_biology + - ceval-valid_middle_school_mathematics + - ceval-valid_middle_school_biology + - ceval-valid_middle_school_physics + - ceval-valid_middle_school_chemistry + - ceval-valid_veterinary_medicine + - ceval-valid_college_economics + - ceval-valid_business_administration + - ceval-valid_marxism + - ceval-valid_mao_zedong_thought + - ceval-valid_education_science + - ceval-valid_teacher_qualification + - ceval-valid_high_school_politics + - ceval-valid_high_school_geography + - ceval-valid_middle_school_politics + - ceval-valid_middle_school_geography + - ceval-valid_modern_chinese_history + - ceval-valid_ideological_and_moral_cultivation + - ceval-valid_logic + - ceval-valid_law + - ceval-valid_chinese_language_and_literature + - ceval-valid_art_studies + - 
ceval-valid_professional_tour_guide + - ceval-valid_legal_professional + - ceval-valid_high_school_chinese + - ceval-valid_high_school_history + - ceval-valid_middle_school_history + - ceval-valid_civil_servant + - ceval-valid_sports_science + - ceval-valid_plant_protection + - ceval-valid_basic_medicine + - ceval-valid_clinical_medicine + - ceval-valid_urban_and_rural_planner + - ceval-valid_accountant + - ceval-valid_fire_engineer + - ceval-valid_environmental_impact_assessment_engineer + - ceval-valid_tax_accountant + - ceval-valid_physician diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_accountant.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_accountant.yaml new file mode 100644 index 0000000000000000000000000000000000000000..04f669eda4c5b27bc8efb719820667040da8ae8c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_accountant.yaml @@ -0,0 +1,4 @@ +"dataset_name": "accountant" +"description": "以下是中国关于注册会计师的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_accountant" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_advanced_mathematics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_advanced_mathematics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9778347b0c6fbc67b7d65b33aba7d9fdb1487a54 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_advanced_mathematics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "advanced_mathematics" +"description": "以下是中国关于高等数学的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_advanced_mathematics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_art_studies.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_art_studies.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..627226260a6388a13d0f1759b54d8251339eb194 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_art_studies.yaml @@ -0,0 +1,4 @@ +"dataset_name": "art_studies" +"description": "以下是中国关于艺术学的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_art_studies" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_business_administration.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_business_administration.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b1f96f334259ecadf0504d7bc107c96ef2049a9e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_business_administration.yaml @@ -0,0 +1,4 @@ +"dataset_name": "business_administration" +"description": "以下是中国关于工商管理的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_business_administration" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_chinese_language_and_literature.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_chinese_language_and_literature.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e285b59d0992148421c5a10094c8ff94e97a87c9 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_chinese_language_and_literature.yaml @@ -0,0 +1,4 @@ +"dataset_name": "chinese_language_and_literature" +"description": "以下是中国关于中国语言文学的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_chinese_language_and_literature" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_civil_servant.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_civil_servant.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0aad21b99c3e10fadd916a5a091d7499af718729 --- /dev/null +++ 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_civil_servant.yaml @@ -0,0 +1,4 @@ +"dataset_name": "civil_servant" +"description": "以下是中国关于公务员的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_civil_servant" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_clinical_medicine.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_clinical_medicine.yaml new file mode 100644 index 0000000000000000000000000000000000000000..97c08d06266727d43161cea97f8a914024a20ca8 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_clinical_medicine.yaml @@ -0,0 +1,4 @@ +"dataset_name": "clinical_medicine" +"description": "以下是中国关于临床医学的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_clinical_medicine" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_college_chemistry.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_college_chemistry.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9ba89714e621bb9e693a2b738a02027ec70169ef --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_college_chemistry.yaml @@ -0,0 +1,4 @@ +"dataset_name": "college_chemistry" +"description": "以下是中国关于大学化学的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_college_chemistry" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_college_economics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_college_economics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..10b89f8fd6fc0938dce44f27ad3cc67c83f60178 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_college_economics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "college_economics" +"description": "以下是中国关于大学经济学的单项选择题,请选出其中的正确答案。\n\n" +"include": 
"_default_ceval_yaml" +"task": "ceval-valid_college_economics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_college_programming.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_college_programming.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4cfe029a7e888aa0da80e3b3ba1c071dd8b7d5cc --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_college_programming.yaml @@ -0,0 +1,4 @@ +"dataset_name": "college_programming" +"description": "以下是中国关于大学编程的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_college_programming" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_computer_architecture.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_computer_architecture.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d17454a73eb34c6eefe32b1bdb9697cbd931e8d3 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_computer_architecture.yaml @@ -0,0 +1,4 @@ +"dataset_name": "computer_architecture" +"description": "以下是中国关于计算机组成的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_computer_architecture" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_computer_network.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_computer_network.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9391dbbdc8c2f307b0553d401413a5159d46a53f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_computer_network.yaml @@ -0,0 +1,4 @@ +"dataset_name": "computer_network" +"description": "以下是中国关于计算机网络的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_computer_network" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_discrete_mathematics.yaml 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_discrete_mathematics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f2bd42046300cd3eff136817cbd85031e7b8fecc --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_discrete_mathematics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "discrete_mathematics" +"description": "以下是中国关于离散数学的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_discrete_mathematics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_education_science.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_education_science.yaml new file mode 100644 index 0000000000000000000000000000000000000000..985edf982226b4ab5a8de90c4cc27b5b4e331405 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_education_science.yaml @@ -0,0 +1,4 @@ +"dataset_name": "education_science" +"description": "以下是中国关于教育学的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_education_science" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_electrical_engineer.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_electrical_engineer.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cc946b99d36b2ab5215c9ab1458891284a1d93ac --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_electrical_engineer.yaml @@ -0,0 +1,4 @@ +"dataset_name": "electrical_engineer" +"description": "以下是中国关于注册电气工程师的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_electrical_engineer" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_environmental_impact_assessment_engineer.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_environmental_impact_assessment_engineer.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..d6ed06fd740324ff89840529fa889334a7bbc832 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_environmental_impact_assessment_engineer.yaml @@ -0,0 +1,4 @@ +"dataset_name": "environmental_impact_assessment_engineer" +"description": "以下是中国关于环境影响评价工程师的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_environmental_impact_assessment_engineer" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_biology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_biology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..26cbc8b5a8af3bc90363d86e8e0744fcf3b90654 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_biology.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_biology" +"description": "以下是中国关于高中生物的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_high_school_biology" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_geography.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_geography.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a2bb10ca6132dd5d3619802d1502240c620986f1 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_geography.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_geography" +"description": "以下是中国关于高中地理的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_high_school_geography" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_history.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_history.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9335bc0f791cb174744b6bfd6d0d612cb6721346 --- /dev/null +++ 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_history.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_history" +"description": "以下是中国关于高中历史的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_high_school_history" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_physics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_physics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e3aa084a99fb253cf0a96db80449217d80927eb6 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_physics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_physics" +"description": "以下是中国关于高中物理的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_high_school_physics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_ideological_and_moral_cultivation.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_ideological_and_moral_cultivation.yaml new file mode 100644 index 0000000000000000000000000000000000000000..33e341eb2eeebf898641469adb4092e44bb675c9 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_ideological_and_moral_cultivation.yaml @@ -0,0 +1,4 @@ +"dataset_name": "ideological_and_moral_cultivation" +"description": "以下是中国关于思想道德修养与法律基础的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_ideological_and_moral_cultivation" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_law.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_law.yaml new file mode 100644 index 0000000000000000000000000000000000000000..921709ce12b34c703bb5f5439bac45d188c26e7a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_law.yaml @@ -0,0 +1,4 @@ +"dataset_name": "law" +"description": 
"以下是中国关于法学的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_law" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_legal_professional.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_legal_professional.yaml new file mode 100644 index 0000000000000000000000000000000000000000..897ed74ffd9c99436fd8d6cec99d79c6b82170be --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_legal_professional.yaml @@ -0,0 +1,4 @@ +"dataset_name": "legal_professional" +"description": "以下是中国关于法律职业资格的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_legal_professional" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_logic.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_logic.yaml new file mode 100644 index 0000000000000000000000000000000000000000..12912dafe5af2997bcaefb60bb2850fbca54c749 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_logic.yaml @@ -0,0 +1,4 @@ +"dataset_name": "logic" +"description": "以下是中国关于逻辑学的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_logic" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_mao_zedong_thought.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_mao_zedong_thought.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0899b735436349b9db8aebaa189d9893df7d477d --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_mao_zedong_thought.yaml @@ -0,0 +1,4 @@ +"dataset_name": "mao_zedong_thought" +"description": "以下是中国关于毛泽东思想和中国特色社会主义理论体系概论的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_mao_zedong_thought" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_marxism.yaml 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_marxism.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bfd3d5dbed6dade2f8e04d0c037b394f5a87d8ab --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_marxism.yaml @@ -0,0 +1,4 @@ +"dataset_name": "marxism" +"description": "以下是中国关于马克思主义基本原理的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_marxism" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_metrology_engineer.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_metrology_engineer.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d312ceff04bd9d874c5eea8e84a72003b9f5be46 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_metrology_engineer.yaml @@ -0,0 +1,4 @@ +"dataset_name": "metrology_engineer" +"description": "以下是中国关于注册计量师的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_metrology_engineer" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_middle_school_geography.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_middle_school_geography.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8d179a2f592baac85a39d55c6a103203433283b6 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_middle_school_geography.yaml @@ -0,0 +1,4 @@ +"dataset_name": "middle_school_geography" +"description": "以下是中国关于初中地理的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_middle_school_geography" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_middle_school_history.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_middle_school_history.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b61a8ee835d45987b9ff347a9a9a3f9510e7617a --- 
/dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_middle_school_history.yaml @@ -0,0 +1,4 @@ +"dataset_name": "middle_school_history" +"description": "以下是中国关于初中历史的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_middle_school_history" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_middle_school_mathematics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_middle_school_mathematics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..698103d463233fad3ad8444d14bb2752167eeabd --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_middle_school_mathematics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "middle_school_mathematics" +"description": "以下是中国关于初中数学的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_middle_school_mathematics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_middle_school_physics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_middle_school_physics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dbe69686af439ee3331ba2b9f8d246b1dd454e55 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_middle_school_physics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "middle_school_physics" +"description": "以下是中国关于初中物理的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_middle_school_physics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_middle_school_politics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_middle_school_politics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0cf20c29b47857772b6c8c3f71f4353589b8e69d --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_middle_school_politics.yaml @@ -0,0 +1,4 @@ 
+"dataset_name": "middle_school_politics" +"description": "以下是中国关于初中政治的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_middle_school_politics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_modern_chinese_history.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_modern_chinese_history.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b1621075b85f65e213009cfc00e530c5f974fd8b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_modern_chinese_history.yaml @@ -0,0 +1,4 @@ +"dataset_name": "modern_chinese_history" +"description": "以下是中国关于近代史纲要的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_modern_chinese_history" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_operating_system.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_operating_system.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0c7afea6f22276f496fc6df1a30151f47fabc6b4 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_operating_system.yaml @@ -0,0 +1,4 @@ +"dataset_name": "operating_system" +"description": "以下是中国关于操作系统的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_operating_system" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_plant_protection.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_plant_protection.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6d73e014bb71d7a8df5990726fffd44c0eefe679 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_plant_protection.yaml @@ -0,0 +1,4 @@ +"dataset_name": "plant_protection" +"description": "以下是中国关于植物保护的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_plant_protection" diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_probability_and_statistics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_probability_and_statistics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..82d1fcbb2f410b47cd2956f82741f25ceefcf118 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_probability_and_statistics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "probability_and_statistics" +"description": "以下是中国关于概率统计的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_probability_and_statistics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_tax_accountant.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_tax_accountant.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8c461a3401b0bddc816486be34039b1832759ebb --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_tax_accountant.yaml @@ -0,0 +1,4 @@ +"dataset_name": "tax_accountant" +"description": "以下是中国关于税务师的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_tax_accountant" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_urban_and_rural_planner.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_urban_and_rural_planner.yaml new file mode 100644 index 0000000000000000000000000000000000000000..957a53fbd6ccff5574cad2b5a325e24086df1ee2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_urban_and_rural_planner.yaml @@ -0,0 +1,4 @@ +"dataset_name": "urban_and_rural_planner" +"description": "以下是中国关于注册城乡规划师的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_urban_and_rural_planner" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_veterinary_medicine.yaml 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_veterinary_medicine.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a493fd6518e4513db06949228d9b381f37c75c9b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_veterinary_medicine.yaml @@ -0,0 +1,4 @@ +"dataset_name": "veterinary_medicine" +"description": "以下是中国关于兽医学的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_veterinary_medicine" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/polemo2/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/polemo2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..837c704dfd5219fe49016b0eb9052b75dc612b99 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/polemo2/README.md @@ -0,0 +1,57 @@ +# PolEmo 2.0 + +### Paper + +Title: `Multi-Level Sentiment Analysis of PolEmo 2.0: Extended Corpus of Multi-Domain Consumer Reviews` + +Abstract: https://aclanthology.org/K19-1092/ + +The PolEmo 2.0 is a dataset of online consumer reviews in Polish from four domains: medicine, hotels, products, and university. It is human-annotated on a level of full reviews and individual sentences. It comprises over 8000 reviews, about 85% from the medicine and hotel domains. +The goal is to predict the sentiment of a review. There are two separate test sets, to allow for in-domain (medicine and hotels) as well as out-of-domain (products and university) validation. 
+ +Homepage: https://clarin-pl.eu/dspace/handle/11321/710 + + +### Citation + +``` +@inproceedings{kocon-etal-2019-multi, + title = "Multi-Level Sentiment Analysis of {P}ol{E}mo 2.0: Extended Corpus of Multi-Domain Consumer Reviews", + author = "Koco{\'n}, Jan and + Mi{\l}kowski, Piotr and + Za{\'s}ko-Zieli{\'n}ska, Monika", + booktitle = "Proceedings of the 23rd Conference on Computational Natural Language Learning (CoNLL)", + month = nov, + year = "2019", + address = "Hong Kong, China", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/K19-1092", + doi = "10.18653/v1/K19-1092", + pages = "980--991", + abstract = "In this article we present an extended version of PolEmo {--} a corpus of consumer reviews from 4 domains: medicine, hotels, products and school. Current version (PolEmo 2.0) contains 8,216 reviews having 57,466 sentences. Each text and sentence was manually annotated with sentiment in 2+1 scheme, which gives a total of 197,046 annotations. We obtained a high value of Positive Specific Agreement, which is 0.91 for texts and 0.88 for sentences. PolEmo 2.0 is publicly available under a Creative Commons copyright license. We explored recent deep learning approaches for the recognition of sentiment, such as Bi-directional Long Short-Term Memory (BiLSTM) and Bidirectional Encoder Representations from Transformers (BERT).", +} +``` + +### Groups and Tasks + +#### Groups + +* `polemo2`: Evaluates `polemo2_in` and `polemo2_out` + +#### Tasks + +* `polemo2_in`: evaluates sentiment predictions of in-domain (medicine and hotels) reviews +* `polemo2_out`: evaluates sentiment predictions of out-of-domain (products and university) reviews + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? 
+ + +If other tasks on this dataset are already supported: +* [x] Is the "Main" variant of this task clearly denoted? +* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [x] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/polemo2/polemo2_in.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/polemo2/polemo2_in.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c667cf6e43f4abb3e73ca7226978c747e626eac8 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/polemo2/polemo2_in.yaml @@ -0,0 +1,46 @@ +tag: + - polemo2 +task: polemo2_in +dataset_path: allegro/klej-polemo2-in +dataset_name: null +output_type: generate_until +training_split: train +validation_split: validation +test_split: test +doc_to_text: "Opinia: \"{{sentence}}\"\nOkreśl sentyment podanej opinii. Możliwe odpowiedzi:\nA - Neutralny\nB - Negatywny\nC - Pozytywny\nD - Niejednoznaczny\nPrawidłowa odpowiedź:" +doc_to_target: "{{['__label__meta_zero', '__label__meta_minus_m', '__label__meta_plus_m', '__label__meta_amb'].index(target)}}" +should_decontaminate: true +doc_to_decontamination_query: "{{sentence}}" +generation_kwargs: + until: + - "." 
+ - "," + do_sample: false + temperature: 0.0 + max_gen_toks: 50 +filter_list: + - name: "score-first" + filter: + - function: "regex" + regex_pattern: "(\\b[ABCD]\\b)" + - function: "take_first" + - function: "map" + mapping_dict: + A: 0 + B: 1 + C: 2 + D: 3 + default_value: -1 + - function: "take_first" +metric_list: + - metric: f1 + aggregation: mean + higher_is_better: true + hf_evaluate: true + average: micro + - metric: accuracy + aggregation: mean + higher_is_better: true + hf_evaluate: true +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/polemo2/polemo2_out.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/polemo2/polemo2_out.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bc1508faf7e33261dc9a4a44b3fd269147730f01 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/polemo2/polemo2_out.yaml @@ -0,0 +1,4 @@ +include: polemo2_in.yaml +task: polemo2_out +dataset_path: allegro/klej-polemo2-out +dataset_name: null diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bd36302619a2cc1b40b57ef758d328d85580e420 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/README.md @@ -0,0 +1,39 @@ +# Translation Tasks + +### Paper + + + +### Citation + +``` + +``` + +### Groups and Tasks + +#### Groups + +* `gpt3_translation_tasks` +* `wmt14` +* `wmt16` +* `wmt20` +* `iwslt2017` + +#### Tasks + +* + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [x] Is the task an existing benchmark in the literature? + * [ ] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? 
+ + +If other tasks on this dataset are already supported: +* [x] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? + * [ ] Checked for equivalence with v0.3.0 LM Evaluation Harness diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/iwslt2017_ar-en.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/iwslt2017_ar-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..824f4eba6730f57ef5282ec557b884b1dc772db9 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/iwslt2017_ar-en.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: iwslt2017-en-ar +dataset_path: iwslt2017 +doc_to_target: ' {{translation["en"]}}' +doc_to_text: 'Arabic phrase: {{translation["ar"]}} + + English phrase:' +tag: +- translation +- iwslt2017 +include: wmt_common_yaml +task: iwslt2017-ar-en diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/iwslt2017_en-ar.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/iwslt2017_en-ar.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c4b124b01807c5af10b95c30d6251107c0c95c9c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/iwslt2017_en-ar.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: iwslt2017-en-ar +dataset_path: iwslt2017 +doc_to_target: ' {{translation["ar"]}}' +doc_to_text: 'English phrase: {{translation["en"]}} + + Arabic phrase:' +tag: +- translation +- iwslt2017 +include: wmt_common_yaml +task: iwslt2017-en-ar diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/utils.py b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/utils.py new file mode 100644 index 
0000000000000000000000000000000000000000..f30c4d86259259a325edcee3b64ad3199b966c96 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/utils.py @@ -0,0 +1,118 @@ +import argparse + +import yaml + + +try: + import pycountry +except ModuleNotFoundError: + raise Exception( + "`pycountry` is required for generating translation task prompt templates. \ +please install pycountry via pip install lm-eval[multilingual] or pip install -e .[multilingual]", + ) + + +# Different translation benchmarks included in the library. Mostly WMT. +# These correspond to dataset names (subsets) on HuggingFace for each dataset. +# A yaml file is generated by this script for each language pair. + +gpt3_translation_benchmarks = { + "wmt14": ["fr-en"], # ["en-fr", "fr-en"], # French + "wmt16": [ + "ro-en", + "de-en", + ], # ["en-ro", "ro-en", "de-en", "en-de"], # German, Romanian +} + +# 28 total +LANGUAGES = { + **gpt3_translation_benchmarks, + # "wmt20": sacrebleu.get_langpairs_for_testset("wmt20"), + "iwslt2017": ["en-ar"], # Arabic +} + + +def code_to_language(code): + # key is alpha_2 or alpha_3 depending on the code length + language_tuple = pycountry.languages.get(**{f"alpha_{len(code)}": code}) + return language_tuple.name + + +def gen_lang_yamls(output_dir: str, overwrite: bool) -> None: + """ + Generate a yaml file for each language. + + :param output_dir: The directory to output the files to. + :param overwrite: Whether to overwrite files if they already exist. 
+ """ + err = [] + for lang in LANGUAGES.keys(): + for dataset_name in LANGUAGES[lang]: + src_lang, _, tgt_lang = dataset_name.partition("-") + for src, tgt in [[src_lang, tgt_lang], [tgt_lang, src_lang]]: + # both translation directions for each lang pair + lang_pair = src + "-" + tgt + file_name = f"{lang}_{lang_pair}.yaml" + try: + source, target = code_to_language(src), code_to_language(tgt) + + groups = ["generate_until", "translation", lang] + if lang in gpt3_translation_benchmarks.keys(): + groups += ["gpt3_translation_benchmarks"] + + with open( + f"{output_dir}/{file_name}", + "w" if overwrite else "x", + encoding="utf8", + ) as f: + f.write("# Generated by utils.py\n") + yaml.dump( + { + "include": "wmt_common_yaml", + "group": groups, + "dataset_path": lang, + "dataset_name": dataset_name + if not (lang == "iwslt2017") + else "iwslt2017-" + dataset_name, + "task": f"{lang}-{lang_pair}", + "doc_to_text": f"{source} phrase: " + + "{{translation[" + + f'"{src}"' + + "]}}\n" + + f"{target} phrase:", + "doc_to_target": " {{" + + "translation[" + + f'"{tgt}"]' + + "}}", + }, + f, + ) + except FileExistsError: + err.append(file_name) + + if len(err) > 0: + raise FileExistsError( + "Files were not created because they already exist (use --overwrite flag):" + f" {', '.join(err)}" + ) + + +def main() -> None: + """Parse CLI args and generate language-specific yaml files.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--overwrite", + default=False, + action="store_true", + help="Overwrite files if they already exist", + ) + parser.add_argument( + "--output-dir", default=".", help="Directory to write yaml files to" + ) + args = parser.parse_args() + + gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite) + + +if __name__ == "__main__": + main() diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/wmt14_en-fr.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/wmt14_en-fr.yaml new file mode 100644 
index 0000000000000000000000000000000000000000..dd5f3ce0438746a0a3dbfd0151b178b383191175 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/wmt14_en-fr.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: fr-en +dataset_path: wmt14 +doc_to_target: ' {{translation["fr"]}}' +doc_to_text: 'English phrase: {{translation["en"]}} + + French phrase:' +tag: +- translation +- wmt14 +- gpt3_translation_benchmarks +include: wmt_common_yaml +task: wmt14-en-fr diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/wmt14_fr-en.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/wmt14_fr-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d91fed417607115a6b3c2a67afdb0c1b4c6410cf --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/wmt14_fr-en.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: fr-en +dataset_path: wmt14 +doc_to_target: ' {{translation["en"]}}' +doc_to_text: 'French phrase: {{translation["fr"]}} + + English phrase:' +tag: +- translation +- wmt14 +- gpt3_translation_benchmarks +include: wmt_common_yaml +task: wmt14-fr-en diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/wmt16_de-en.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/wmt16_de-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d5ac1805ee77927129d1d668b455731511874485 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/wmt16_de-en.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: de-en +dataset_path: wmt16 +doc_to_target: ' {{translation["en"]}}' +doc_to_text: 'German phrase: {{translation["de"]}} + + English phrase:' +tag: +- translation +- wmt16 +- gpt3_translation_benchmarks +include: wmt_common_yaml +task: wmt16-de-en diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/wmt16_en-de.yaml 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/wmt16_en-de.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d52ab498f7d6baa2cba68b1c35e3eb8dcd34a6df --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/wmt16_en-de.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: de-en +dataset_path: wmt16 +doc_to_target: ' {{translation["de"]}}' +doc_to_text: 'English phrase: {{translation["en"]}} + + German phrase:' +tag: +- translation +- wmt16 +- gpt3_translation_benchmarks +include: wmt_common_yaml +task: wmt16-en-de diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/wmt16_en-ro.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/wmt16_en-ro.yaml new file mode 100644 index 0000000000000000000000000000000000000000..096f8743b4bfb56b332aea69b0056ade8d200fd7 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/wmt16_en-ro.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: ro-en +dataset_path: wmt16 +doc_to_target: ' {{translation["ro"]}}' +doc_to_text: 'English phrase: {{translation["en"]}} + + Romanian phrase:' +tag: +- translation +- wmt16 +- gpt3_translation_benchmarks +include: wmt_common_yaml +task: wmt16-en-ro diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/wmt16_ro-en.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/wmt16_ro-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9e338347cc9d885d03e639856976b795a50a0ce2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/wmt16_ro-en.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: ro-en +dataset_path: wmt16 +doc_to_target: ' {{translation["en"]}}' +doc_to_text: 'Romanian phrase: {{translation["ro"]}} + + English phrase:' +tag: +- translation +- wmt16 +- gpt3_translation_benchmarks +include: wmt_common_yaml +task: wmt16-ro-en diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/wmt_common_yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/wmt_common_yaml new file mode 100644 index 0000000000000000000000000000000000000000..2cb3c7c8f8d8305e9907c89c94d6f8fd95c709fc --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/translation/wmt_common_yaml @@ -0,0 +1,17 @@ +output_type: generate_until +training_split: train +validation_split: validation +fewshot_split: validation +test_split: test +metric_list: + - metric: bleu + - metric: ter + - metric: chrf +generation_kwargs: + until: + - "\n" + do_sample: false + temperature: 0.0 +repeats: 1 +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/wmdp/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/wmdp/README.md new file mode 100644 index 0000000000000000000000000000000000000000..aadde0c1dce80cf0f6fe17fbb8a2a1563f34051b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/wmdp/README.md @@ -0,0 +1,50 @@ +# WMDP + +### Paper + +Title: `The WMDP Benchmark: Measuring and Reducing Malicious Use With Unlearning` + +Abstract: `https://arxiv.org/abs/2403.03218` + +`The Weapons of Mass Destruction Proxy (WMDP) benchmark is a dataset of 4,157 multiple-choice questions surrounding hazardous knowledge in biosecurity, cybersecurity, and chemical security. WMDP serves as both a proxy evaluation for hazardous knowledge in large language models (LLMs) and a benchmark for unlearning methods to remove such knowledge.` + +Homepage: https://wmdp.ai + + +### Citation + +``` +@misc{li2024wmdp, + title={The WMDP Benchmark: Measuring and Reducing Malicious Use With Unlearning}, + author={Nathaniel Li and Alexander Pan and Anjali Gopal and Summer Yue and Daniel Berrios and Alice Gatti and Justin D. Li and Ann-Kathrin Dombrowski and Shashwat Goel and Long Phan and Gabriel Mukobi and Nathan Helm-Burger and Rassin Lababidi and Lennart Justen and Andrew B. 
Liu and Michael Chen and Isabelle Barrass and Oliver Zhang and Xiaoyuan Zhu and Rishub Tamirisa and Bhrugu Bharathi and Adam Khoja and Zhenqi Zhao and Ariel Herbert-Voss and Cort B. Breuer and Andy Zou and Mantas Mazeika and Zifan Wang and Palash Oswal and Weiran Liu and Adam A. Hunt and Justin Tienken-Harder and Kevin Y. Shih and Kemper Talley and John Guan and Russell Kaplan and Ian Steneker and David Campbell and Brad Jokubaitis and Alex Levinson and Jean Wang and William Qian and Kallol Krishna Karmakar and Steven Basart and Stephen Fitz and Mindy Levine and Ponnurangam Kumaraguru and Uday Tupakula and Vijay Varadharajan and Yan Shoshitaishvili and Jimmy Ba and Kevin M. Esvelt and Alexandr Wang and Dan Hendrycks}, + year={2024}, + eprint={2403.03218}, + archivePrefix={arXiv}, + primaryClass={cs.LG} +} +``` + +### Groups, Tags, and Tasks + +#### Groups + +* `wmdp`: All 4,157 multiple-choice questions in biosecurity, cybersecurity, and chemical security + +#### Tasks + +* `wmdp_bio`: 1,520 multiple-choice questions in biosecurity +* `wmdp_cyber`: 2,225 multiple-choice questions in cybersecurity +* `wmdp_chem`: 412 multiple-choice questions in chemical security + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? + * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? 
diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/wmdp/_default_template_yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/wmdp/_default_template_yaml new file mode 100644 index 0000000000000000000000000000000000000000..7c2f25c1d7ae111422411d8d27e6210300adff4f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/wmdp/_default_template_yaml @@ -0,0 +1,15 @@ +dataset_path: cais/wmdp +test_split: test +training_split: null +validation_split: null +num_fewshot: 0 +output_type: multiple_choice +doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:" +doc_to_choice: ["A", "B", "C", "D"] +doc_to_target: answer +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/wmdp/_wmdp.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/wmdp/_wmdp.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ec1c795264d990b018d1112bf490591c43a815fa --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/wmdp/_wmdp.yaml @@ -0,0 +1,11 @@ +group: wmdp +task: + - wmdp_bio + - wmdp_chem + - wmdp_cyber +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: True +metadata: + version: 1 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/wmdp/wmdp_bio.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/wmdp/wmdp_bio.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1096b6f873048709ea16b189c3a244856a2272c0 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/wmdp/wmdp_bio.yaml @@ -0,0 +1,4 @@ +"task": "wmdp_bio" +"dataset_name": "wmdp-bio" +"include": "_default_template_yaml" +"description": "The following are multiple choice questions (with answers) about biology.\n\n" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/wmdp/wmdp_chem.yaml 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/wmdp/wmdp_chem.yaml new file mode 100644 index 0000000000000000000000000000000000000000..788d6d618bb6f7328841374b2a98a675f9f51849 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/wmdp/wmdp_chem.yaml @@ -0,0 +1,4 @@ +"task": "wmdp_chem" +"dataset_name": "wmdp-chem" +"include": "_default_template_yaml" +"description": "The following are multiple choice questions (with answers) about chemistry.\n\n" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/wmdp/wmdp_cyber.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/wmdp/wmdp_cyber.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cac9ba825d719ac7a651ba24443ee6d7fa22567f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/wmdp/wmdp_cyber.yaml @@ -0,0 +1,4 @@ +"task": "wmdp_cyber" +"dataset_name": "wmdp-cyber" +"include": "_default_template_yaml" +"description": "The following are multiple choice questions (with answers) about cybersecurity.\n\n"