diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/copa_ar/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/copa_ar/README.md new file mode 100644 index 0000000000000000000000000000000000000000..972acb9f7431d34c216bbd27fee35f7ca138dcf5 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/copa_ar/README.md @@ -0,0 +1,40 @@ +# Arabic COPA + +### Paper + +Original Title: `COPA` + + + +The Choice Of Plausible Alternatives (COPA) evaluation provides researchers with a tool for assessing progress in open-domain commonsense causal reasoning. + +[Homepage](https://people.ict.usc.edu/~gordon/copa.html) + +AlGhafa has translated this dataset to Arabic [AlGhafa](https://aclanthology.org/2023.arabicnlp-1.21.pdf) + +The link to the Arabic version of the dataset [COPA](https://gitlab.com/tiiuae/alghafa/-/tree/main/arabic-eval/copa_ar) + +### Citation + +### Groups and Tasks + +#### Groups + +* Not part of a group yet. + +#### Tasks + +* `copa_ar` + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? + * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [x] Is the "Main" variant of this task clearly denoted? +* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [x] Have you noted which, if any, published evaluation setups are matched by this variant? 
diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/copa_ar/copa_ar.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/copa_ar/copa_ar.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e35d1688babf0b5386f70f563fa923242540d0d5 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/copa_ar/copa_ar.yaml @@ -0,0 +1,21 @@ +task: copa_ar +dataset_path: Hennara/copa_ar +dataset_name: null +output_type: multiple_choice +training_split: null +validation_split: null +test_split: test +doc_to_text: "السؤال: {{query}}\nالجواب:" +doc_to_choice: "{{[sol1, sol2]}}" +doc_to_target: label +should_decontaminate: true +doc_to_decontamination_query: query +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/piqa_ar/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/piqa_ar/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e1b71e93da4c00104c38c03b9d4486966e8ad567 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/piqa_ar/README.md @@ -0,0 +1,43 @@ +# Arabic PIQA + +### Paper + +Original Title: `PIQA: Reasoning about Physical Commonsense in Natural Language` + +Original paper: [PIQA](https://arxiv.org/abs/1911.11641) + +Physical Interaction: Question Answering (PIQA) is a physical commonsense +reasoning and a corresponding benchmark dataset. PIQA was designed to investigate +the physical knowledge of existing models. To what extent are current approaches +actually learning about the world? 
+ +[Homepage](https://yonatanbisk.com/piqa) + +AlGhafa has translated this dataset to Arabic [AlGhafa](https://aclanthology.org/2023.arabicnlp-1.21.pdf) + +The link to the Arabic version of the dataset [PIQA](https://gitlab.com/tiiuae/alghafa/-/tree/main/arabic-eval/pica_ar) + +### Citation + +### Groups and Tasks + +#### Groups + +* Not part of a group yet. + +#### Tasks + +* `piqa_ar` + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? + * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [x] Is the "Main" variant of this task clearly denoted? +* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [x] Have you noted which, if any, published evaluation setups are matched by this variant? 
diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/piqa_ar/piqa_ar.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/piqa_ar/piqa_ar.yaml new file mode 100644 index 0000000000000000000000000000000000000000..19dfaee0c609f409d3bd6e37163054c2e80af37a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/piqa_ar/piqa_ar.yaml @@ -0,0 +1,21 @@ +task: piqa_ar +dataset_path: Hennara/pica_ar +dataset_name: null +output_type: multiple_choice +training_split: null +validation_split: null +test_split: test +doc_to_text: "السؤال: {{goal}}\nالجواب:" +doc_to_choice: "{{[sol1, sol2]}}" +doc_to_target: label +should_decontaminate: true +doc_to_decontamination_query: goal +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7690c205c45e0c425acb025940097f10ad181c73 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/README.md @@ -0,0 +1,48 @@ +# CMMLU + +### Paper + +CMMLU: Measuring massive multitask language understanding in Chinese +https://arxiv.org/abs/2306.09212 + +CMMLU is a comprehensive evaluation benchmark specifically designed to evaluate the knowledge and reasoning abilities of LLMs within the context of Chinese language and culture. +CMMLU covers a wide range of subjects, comprising 67 topics that span from elementary to advanced professional levels. 
+ +Homepage: https://github.com/haonan-li/CMMLU + +### Citation + +```bibtex +@misc{li2023cmmlu, + title={CMMLU: Measuring massive multitask language understanding in Chinese}, + author={Haonan Li and Yixuan Zhang and Fajri Koto and Yifei Yang and Hai Zhao and Yeyun Gong and Nan Duan and Timothy Baldwin}, + year={2023}, + eprint={2306.09212}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` + +### Groups and Tasks + +#### Groups + +- `cmmlu`: All 67 subjects of the CMMLU dataset, evaluated following the methodology in MMLU's original implementation. + +#### Tasks + + +The following tasks evaluate subjects in the CMMLU dataset using loglikelihood-based multiple-choice scoring: +- `cmmlu_{subject_english}` + +### Checklist + +* [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? + * [x] If yes, does the original paper provide a reference implementation? + * [x] Yes, original implementation contributed by author of the benchmark + +If other tasks on this dataset are already supported: +* [x] Is the "Main" variant of this task clearly denoted? +* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [x] Have you noted which, if any, published evaluation setups are matched by this variant? 
diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/_cmmlu.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/_cmmlu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4101b18ee4e65cddb3ee71f3e238894b8a667f81 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/_cmmlu.yaml @@ -0,0 +1,78 @@ +group: cmmlu +task: + - cmmlu_agronomy + - cmmlu_anatomy + - cmmlu_ancient_chinese + - cmmlu_arts + - cmmlu_astronomy + - cmmlu_business_ethics + - cmmlu_chinese_civil_service_exam + - cmmlu_chinese_driving_rule + - cmmlu_chinese_food_culture + - cmmlu_chinese_foreign_policy + - cmmlu_chinese_history + - cmmlu_chinese_literature + - cmmlu_chinese_teacher_qualification + - cmmlu_clinical_knowledge + - cmmlu_college_actuarial_science + - cmmlu_college_education + - cmmlu_college_engineering_hydrology + - cmmlu_college_law + - cmmlu_college_mathematics + - cmmlu_college_medical_statistics + - cmmlu_college_medicine + - cmmlu_computer_science + - cmmlu_computer_security + - cmmlu_conceptual_physics + - cmmlu_construction_project_management + - cmmlu_economics + - cmmlu_education + - cmmlu_electrical_engineering + - cmmlu_elementary_chinese + - cmmlu_elementary_commonsense + - cmmlu_elementary_information_and_technology + - cmmlu_elementary_mathematics + - cmmlu_ethnology + - cmmlu_food_science + - cmmlu_genetics + - cmmlu_global_facts + - cmmlu_high_school_biology + - cmmlu_high_school_chemistry + - cmmlu_high_school_geography + - cmmlu_high_school_mathematics + - cmmlu_high_school_physics + - cmmlu_high_school_politics + - cmmlu_human_sexuality + - cmmlu_international_law + - cmmlu_journalism + - cmmlu_jurisprudence + - cmmlu_legal_and_moral_basis + - cmmlu_logical + - cmmlu_machine_learning + - cmmlu_management + - cmmlu_marketing + - cmmlu_marxist_theory + - cmmlu_modern_chinese + - cmmlu_nutrition + - cmmlu_philosophy + - cmmlu_professional_accounting + - cmmlu_professional_law + - 
cmmlu_professional_medicine + - cmmlu_professional_psychology + - cmmlu_public_relations + - cmmlu_security_study + - cmmlu_sociology + - cmmlu_sports_science + - cmmlu_traditional_chinese_medicine + - cmmlu_virology + - cmmlu_world_history + - cmmlu_world_religions +aggregate_metric_list: + - aggregation: mean + metric: acc + weight_by_size: true + - aggregation: mean + metric: acc_norm + weight_by_size: true +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/_default_template_yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/_default_template_yaml new file mode 100644 index 0000000000000000000000000000000000000000..18bcd59c8ef7f8adf7139a70eee029517b44e257 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/_default_template_yaml @@ -0,0 +1,18 @@ +dataset_path: haonan-li/cmmlu +test_split: test +fewshot_split: dev +fewshot_config: + sampler: first_n +output_type: multiple_choice +doc_to_text: "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:" +doc_to_choice: ["A", "B", "C", "D"] +doc_to_target: "{{['A', 'B', 'C', 'D'].index(Answer)}}" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_arts.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_arts.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6007825cb9f3cd8c0af7e25c7de6d1c965f612a0 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_arts.yaml @@ -0,0 +1,4 @@ +"dataset_name": "arts" +"description": "以下是关于艺术学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_arts" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_actuarial_science.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_actuarial_science.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3543486b113bdc0a56ac96feadbbc1f3a8ed997b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_actuarial_science.yaml @@ -0,0 +1,4 @@ +"dataset_name": "college_actuarial_science" +"description": "以下是关于大学精算学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_college_actuarial_science" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_engineering_hydrology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_engineering_hydrology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d52288a4d96e3eee909a7f33c845ba2fa9590aba --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_engineering_hydrology.yaml @@ -0,0 +1,4 @@ +"dataset_name": "college_engineering_hydrology" +"description": "以下是关于大学工程水文学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": 
"cmmlu_college_engineering_hydrology" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_mathematics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_mathematics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7876a584e7e3c936d30c7e4ad81381ec7e535493 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_mathematics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "college_mathematics" +"description": "以下是关于大学数学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_college_mathematics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_medical_statistics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_medical_statistics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f4839bdcac6dc3ba2ee7b874a1700db1d760b49c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_medical_statistics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "college_medical_statistics" +"description": "以下是关于大学医学统计的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_college_medical_statistics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_computer_science.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_computer_science.yaml new file mode 100644 index 0000000000000000000000000000000000000000..86c874e539d21d55540e7e5adce32a624d4a706c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_computer_science.yaml @@ -0,0 +1,4 @@ +"dataset_name": "computer_science" +"description": "以下是关于计算机科学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_computer_science" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_computer_security.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_computer_security.yaml new 
file mode 100644 index 0000000000000000000000000000000000000000..9471546184de5dde5edeb8031a64e588c7594f8f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_computer_security.yaml @@ -0,0 +1,4 @@ +"dataset_name": "computer_security" +"description": "以下是关于计算机安全的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_computer_security" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_anatomy.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_anatomy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..84161ec30ee875253d988a395f892b7982631765 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_anatomy.yaml @@ -0,0 +1,4 @@ +"dataset_name": "anatomy" +"description": "以下是关于解剖学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_anatomy" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_arts.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_arts.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6007825cb9f3cd8c0af7e25c7de6d1c965f612a0 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_arts.yaml @@ -0,0 +1,4 @@ +"dataset_name": "arts" +"description": "以下是关于艺术学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_arts" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_astronomy.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_astronomy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5ad9a8f2c886e189c380b9f01104fca11a2ef529 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_astronomy.yaml @@ -0,0 +1,4 @@ +"dataset_name": "astronomy" +"description": "以下是关于天文学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" 
+"task": "cmmlu_astronomy" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_chinese_civil_service_exam.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_chinese_civil_service_exam.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dcf6c7e6eeb52f551442de521ed4cc4fdfd272f1 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_chinese_civil_service_exam.yaml @@ -0,0 +1,4 @@ +"dataset_name": "chinese_civil_service_exam" +"description": "以下是关于中国公务员考试的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_chinese_civil_service_exam" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_college_education.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_college_education.yaml new file mode 100644 index 0000000000000000000000000000000000000000..952f351cb005d300becc2f5e3b7d5b8579b979a5 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_college_education.yaml @@ -0,0 +1,4 @@ +"dataset_name": "college_education" +"description": "以下是关于大学教育学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_college_education" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_computer_science.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_computer_science.yaml new file mode 100644 index 0000000000000000000000000000000000000000..86c874e539d21d55540e7e5adce32a624d4a706c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_computer_science.yaml @@ -0,0 +1,4 @@ +"dataset_name": "computer_science" +"description": "以下是关于计算机科学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_computer_science" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_computer_security.yaml 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_computer_security.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9471546184de5dde5edeb8031a64e588c7594f8f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_computer_security.yaml @@ -0,0 +1,4 @@ +"dataset_name": "computer_security" +"description": "以下是关于计算机安全的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_computer_security" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_economics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_economics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4602efb430d49e3a876b7243c4cfffe506094b34 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_economics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "economics" +"description": "以下是关于经济学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_economics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_elementary_chinese.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_elementary_chinese.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6f67be3fc40f5c038b455edcc6076675a4451261 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_elementary_chinese.yaml @@ -0,0 +1,4 @@ +"dataset_name": "elementary_chinese" +"description": "以下是关于小学语文的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_elementary_chinese" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_elementary_information_and_technology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_elementary_information_and_technology.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..98c7d3c8f2d85f3c52a3314253d2d2151f7116ae --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_elementary_information_and_technology.yaml @@ -0,0 +1,4 @@ +"dataset_name": "elementary_information_and_technology" +"description": "以下是关于小学信息技术的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_elementary_information_and_technology" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_elementary_mathematics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_elementary_mathematics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f702312ca07c2b882d17c88d30dbe87a837ce5c6 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_elementary_mathematics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "elementary_mathematics" +"description": "以下是关于初等数学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_elementary_mathematics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_ethnology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_ethnology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..88a653a9ee5e5978113626a35acbe50bd2ea5437 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_ethnology.yaml @@ -0,0 +1,4 @@ +"dataset_name": "ethnology" +"description": "以下是关于民族学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_ethnology" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_genetics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_genetics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..be57628b6f0d3dd2bc6719e08f9aaddb45ac7fa2 --- /dev/null +++ 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_genetics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "genetics" +"description": "以下是关于遗传学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_genetics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_legal_and_moral_basis.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_legal_and_moral_basis.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a5e3ee13b6e9670f33068bc731acebf7489737ec --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_legal_and_moral_basis.yaml @@ -0,0 +1,4 @@ +"dataset_name": "legal_and_moral_basis" +"description": "以下是关于法律与道德基础的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_legal_and_moral_basis" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_marxist_theory.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_marxist_theory.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f99fa17514a10e8bf587b50ae9dd997b80c00225 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_marxist_theory.yaml @@ -0,0 +1,4 @@ +"dataset_name": "marxist_theory" +"description": "以下是关于马克思主义理论的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_marxist_theory" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_modern_chinese.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_modern_chinese.yaml new file mode 100644 index 0000000000000000000000000000000000000000..13b2ccc4f939876616ceeda42d211e96347ce060 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_modern_chinese.yaml @@ -0,0 +1,4 @@ +"dataset_name": "modern_chinese" +"description": "以下是关于现代汉语的单项选择题,请直接给出正确答案的选项。\n\n" +"include": 
"_default_template_yaml" +"task": "cmmlu_modern_chinese" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_philosophy.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_philosophy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..17340fa490f0350e6e532b2c67f8c81fa63bfb3a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_philosophy.yaml @@ -0,0 +1,4 @@ +"dataset_name": "philosophy" +"description": "以下是关于哲学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_philosophy" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_professional_medicine.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_professional_medicine.yaml new file mode 100644 index 0000000000000000000000000000000000000000..92fed45e74f9b69b2c7b595a4bb682318fe0b81c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_professional_medicine.yaml @@ -0,0 +1,4 @@ +"dataset_name": "professional_medicine" +"description": "以下是关于专业医学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_professional_medicine" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_professional_psychology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_professional_psychology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..83f0255591a17711d6ac99cf164a29ffe2a69866 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_professional_psychology.yaml @@ -0,0 +1,4 @@ +"dataset_name": "professional_psychology" +"description": "以下是关于专业心理学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_professional_psychology" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_security_study.yaml 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_security_study.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c9660f041fcb24ed83089c624f7ef6c6962c5d8b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_security_study.yaml @@ -0,0 +1,4 @@ +"dataset_name": "security_study" +"description": "以下是关于安全研究的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_security_study" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_sports_science.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_sports_science.yaml new file mode 100644 index 0000000000000000000000000000000000000000..35e5bb9cc4c40abcf271955f068788f85e44794a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_sports_science.yaml @@ -0,0 +1,4 @@ +"dataset_name": "sports_science" +"description": "以下是关于体育学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_sports_science" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_virology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_virology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1560b84f682493ef53a9c26ae1d36ac520ff46c7 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_virology.yaml @@ -0,0 +1,4 @@ +"dataset_name": "virology" +"description": "以下是关于病毒学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_virology" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_education.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_education.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f1dc8a8a4fbc9664da04e2288cf782a9cc1e1877 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_education.yaml @@ -0,0 
+1,4 @@ +"dataset_name": "education" +"description": "以下是关于教育学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_education" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_elementary_chinese.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_elementary_chinese.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6f67be3fc40f5c038b455edcc6076675a4451261 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_elementary_chinese.yaml @@ -0,0 +1,4 @@ +"dataset_name": "elementary_chinese" +"description": "以下是关于小学语文的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_elementary_chinese" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_elementary_commonsense.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_elementary_commonsense.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3017edd999a0ee04de4a5dd8c7dc4b1b6218f5e3 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_elementary_commonsense.yaml @@ -0,0 +1,4 @@ +"dataset_name": "elementary_commonsense" +"description": "以下是关于小学常识的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_elementary_commonsense" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_elementary_mathematics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_elementary_mathematics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f702312ca07c2b882d17c88d30dbe87a837ce5c6 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_elementary_mathematics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "elementary_mathematics" +"description": "以下是关于初等数学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_elementary_mathematics" diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_ethnology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_ethnology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..88a653a9ee5e5978113626a35acbe50bd2ea5437 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_ethnology.yaml @@ -0,0 +1,4 @@ +"dataset_name": "ethnology" +"description": "以下是关于民族学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_ethnology" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_genetics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_genetics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..be57628b6f0d3dd2bc6719e08f9aaddb45ac7fa2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_genetics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "genetics" +"description": "以下是关于遗传学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_genetics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_high_school_mathematics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_high_school_mathematics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3598501c1763d5f1c19444e1b18bb242149fdd34 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_high_school_mathematics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_mathematics" +"description": "以下是关于高中数学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_high_school_mathematics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_high_school_politics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_high_school_politics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5f689dff61a4ea55628b04f9bed5202e48c6eb70 --- /dev/null +++ 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_high_school_politics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_politics" +"description": "以下是关于高中政治的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_high_school_politics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_international_law.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_international_law.yaml new file mode 100644 index 0000000000000000000000000000000000000000..32112d3c8b6ee26ee786439053c2d1f1da5b04c2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_international_law.yaml @@ -0,0 +1,4 @@ +"dataset_name": "international_law" +"description": "以下是关于国际法学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_international_law" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_logical.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_logical.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4c4ac2e12abb2fa29dd2e194f5f1b9417f61142b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_logical.yaml @@ -0,0 +1,4 @@ +"dataset_name": "logical" +"description": "以下是关于逻辑学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_logical" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_machine_learning.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_machine_learning.yaml new file mode 100644 index 0000000000000000000000000000000000000000..062cd1cd73add5caf387f6b4717c5ed837e2c7f7 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_machine_learning.yaml @@ -0,0 +1,4 @@ +"dataset_name": "machine_learning" +"description": "以下是关于机器学习的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_machine_learning" diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_marxist_theory.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_marxist_theory.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f99fa17514a10e8bf587b50ae9dd997b80c00225 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_marxist_theory.yaml @@ -0,0 +1,4 @@ +"dataset_name": "marxist_theory" +"description": "以下是关于马克思主义理论的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_marxist_theory" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_nutrition.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_nutrition.yaml new file mode 100644 index 0000000000000000000000000000000000000000..23d52c45e07134b2ff4f7c1a8e55ba19acfbcfd9 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_nutrition.yaml @@ -0,0 +1,4 @@ +"dataset_name": "nutrition" +"description": "以下是关于营养学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_nutrition" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_philosophy.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_philosophy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..17340fa490f0350e6e532b2c67f8c81fa63bfb3a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_philosophy.yaml @@ -0,0 +1,4 @@ +"dataset_name": "philosophy" +"description": "以下是关于哲学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_philosophy" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_professional_accounting.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_professional_accounting.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bed3485d787d921fb25bbbfbad7671118acfc42b --- /dev/null +++ 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_professional_accounting.yaml @@ -0,0 +1,4 @@ +"dataset_name": "professional_accounting" +"description": "以下是关于专业会计的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_professional_accounting" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_public_relations.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_public_relations.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a1c3711ef7734df27852065cf894f9c9cff9d776 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_public_relations.yaml @@ -0,0 +1,4 @@ +"dataset_name": "public_relations" +"description": "以下是关于公共关系的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_public_relations" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_security_study.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_security_study.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c9660f041fcb24ed83089c624f7ef6c6962c5d8b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_security_study.yaml @@ -0,0 +1,4 @@ +"dataset_name": "security_study" +"description": "以下是关于安全研究的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_security_study" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_world_history.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_world_history.yaml new file mode 100644 index 0000000000000000000000000000000000000000..993ce0ab6e390a81286df213e5d3ddd9fe3908bd --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_world_history.yaml @@ -0,0 +1,4 @@ +"dataset_name": "world_history" +"description": "以下是关于世界历史的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_world_history" diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/super_glue/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/super_glue/README.md new file mode 100644 index 0000000000000000000000000000000000000000..868b3a931d7c1c1d5658baccfe7f9e77e8afaf4a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/super_glue/README.md @@ -0,0 +1,81 @@ +# SuperGLUE + +### Paper + +Title: `SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems` +Abstract: `https://w4ngatang.github.io/static/papers/superglue.pdf` + +SuperGLUE is a benchmark styled after GLUE with a new set of more difficult language +understanding tasks. + +Homepage: https://super.gluebenchmark.com/ + +### Citation + +``` +@inproceedings{NEURIPS2019_4496bf24, + author = {Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel}, + booktitle = {Advances in Neural Information Processing Systems}, + editor = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d\textquotesingle Alch\'{e}-Buc and E. Fox and R. Garnett}, + pages = {}, + publisher = {Curran Associates, Inc.}, + title = {SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems}, + url = {https://proceedings.neurips.cc/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf}, + volume = {32}, + year = {2019} +} +``` + +### Groups, Tags, and Tasks + +#### Groups + +None. + +#### Tags + +* `super-glue-lm-eval-v1`: SuperGLUE eval adapted from LM Eval V1 +* `super-glue-t5-prompt`: SuperGLUE prompt and evaluation that matches the T5 paper (if using accelerate, will error if record is included.) 
+ +#### Tasks + +Comparison between validation split score on T5x and LM-Eval (T5x models converted to HF) +| T5V1.1 Base | SGLUE | BoolQ | CB | Copa | MultiRC | ReCoRD | RTE | WiC | WSC | +| ----------- | ------| ----- | --------- | ---- | ------- | ------ | --- | --- | --- | +| T5x | 69.47 | 78.47(acc) | 83.93(f1) 87.5(acc) | 50(acc) | 73.81(f1) 33.26(em) | 70.09(em) 71.34(f1) | 78.7(acc) | 63.64(acc) | 75(acc) | +| LM-Eval | 71.35 | 79.36(acc) | 83.63(f1) 87.5(acc) | 63(acc) | 73.45(f1) 33.26(em) | 69.85(em) 68.86(f1) | 78.34(acc) | 65.83(acc) | 75.96(acc) | + + + +* `super-glue-lm-eval-v1` + - `boolq` + - `cb` + - `copa` + - `multirc` + - `record` + - `rte` + - `wic` + - `wsc` + +* `super-glue-t5-prompt` + - `super_glue-boolq-t5-prompt` + - `super_glue-cb-t5-prompt` + - `super_glue-copa-t5-prompt` + - `super_glue-multirc-t5-prompt` + - `super_glue-record-t5-prompt` + - `super_glue-rte-t5-prompt` + - `super_glue-wic-t5-prompt` + - `super_glue-wsc-t5-prompt` + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [ ] Is the task an existing benchmark in the literature? + * [ ] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? 
import numpy as np


def cb_multi_fi(items):
    """Unweighted (macro) average of one-vs-rest F1 over the 3 CB classes.

    ``items`` is an iterable of ``(prediction, gold)`` class-index pairs
    accumulated by the harness; returns a single float.
    """
    # sklearn is imported lazily so the dependency is only needed when the
    # metric is actually aggregated.
    from sklearn.metrics import f1_score

    preds, golds = zip(*items)
    preds = np.array(preds)
    golds = np.array(golds)
    # Binary F1 for each class treated one-vs-rest, then the unweighted mean.
    per_class = [
        f1_score(y_true=golds == label, y_pred=preds == label) for label in range(3)
    ]
    return np.mean(per_class)
def mean_3class_f1(predictions, references):  # This is a passthrough function
    """Map one (generation, gold) pair of label strings to class indices.

    Unrecognized generations fall back to class 0; the actual F1 is computed
    later by ``agg_mean_3class_f1`` over all collected pairs.
    """
    labels = ["entailment", "contradiction", "neutral"]
    generated = predictions[0]
    pred_idx = labels.index(generated) if generated in labels else 0
    gold_idx = labels.index(references[0])
    return (pred_idx, gold_idx)


def agg_mean_3class_f1(items):
    """Compute the unweighted (macro) average of the per-class F1 scores."""
    import sklearn.metrics

    predictions, references = zip(*items)
    # fbeta_score with beta=1 over labels 0..2, macro-averaged — identical to
    # averaging the three per-class F1 values.
    return sklearn.metrics.fbeta_score(
        references, predictions, beta=1, labels=range(3), average="macro"
    )
import collections

import numpy as np


def f1(predictions, references):  # This is a passthrough function
    """Turn one (generation, gold) pair into integer labels for F1 aggregation.

    The gold string looks like ``"<question_id>_<True|False>"``; an
    unparseable generation is scored as the opposite of the gold class.
    """
    labels = ["False", "True"]
    gold = labels.index(references[0].split("_")[-1])
    generated = predictions[0]
    pred = labels.index(generated) if generated in labels else not bool(gold)
    return (pred, gold)


def agg_f1(items):
    """Binary F1 over all accumulated (prediction, reference) pairs."""
    from sklearn.metrics import f1_score

    preds, golds = zip(*items)
    return f1_score(np.asarray(golds), np.asarray(preds))


def em(predictions, references):  # This is a passthrough function
    """Like ``f1`` but also keeps the question id so EM can group by question."""
    labels = ["False", "True"]
    group_id, gold_str = references[0].split("_")
    gold = labels.index(gold_str)
    generated = predictions[0]
    pred = labels.index(generated) if generated in labels else not bool(gold)
    return (group_id, pred, gold)


def agg_em(items):
    """Per-question exact match: 1.0 iff every answer in the group is right."""
    by_group = collections.defaultdict(list)
    for group_id, pred, gold in items:
        by_group[group_id].append((gold, pred))

    scores = [
        float(np.array_equal([g for g, _ in pairs], [p for _, p in pairs]))
        for pairs in by_group.values()
    ]
    return np.mean(scores)
# --- preprocess_wsc.py ---
from lm_eval.utils import general_detokenize


def default_doc_to_text(x):
    """Build the WSC prompt: the passage with the pronoun span starred, plus a
    yes/no question asking whether the pronoun refers to the candidate noun."""
    raw_passage = x["text"]
    # NOTE: HuggingFace span indices are word-based not character-based.
    pre = " ".join(raw_passage.split()[: x["span2_index"]])
    # +1 skips the space that followed the pronoun in the original passage.
    post = raw_passage[len(pre) + len(x["span2_text"]) + 1 :]
    passage = general_detokenize(pre + " *{}*".format(x["span2_text"]) + post)
    noun = x["span1_text"]
    pronoun = x["span2_text"]
    text = (
        f"Passage: {passage}\n"
        + f'Question: In the passage above, does the pronoun "*{pronoun}*" refer to "*{noun}*"?\n'
        + "Answer:"
    )
    return text


# --- t5_utils.py ---
import re
from typing import List


def doc_to_text(x):
    """T5-style WSC input: the passage with the pronoun highlighted as *<pronoun>*."""
    text = re.sub(r" X ", " *" + x["span2_text"] + "* ", _wsc_inputs(x))
    return "wsc: " + text


def _wsc_inputs(x):
    """Return the passage with the target pronoun replaced by the placeholder "X"."""
    words = x["text"].split(" ")

    # We would need some special logic to handle the case where the pronoun is the
    # first or last word in the text. None of the examples in WSC seem to have
    # this, so we are ignoring these cases.
    assert x["span2_index"] > 0
    assert x["span2_index"] < len(words)
    pronoun_index = x["span2_index"]

    def create_input():
        # Sanity check: the word at span2_index must be the pronoun itself.
        assert words[pronoun_index] == x["span2_text"]

        return " ".join(
            [
                " ".join(words[:pronoun_index]),
                "X",
                " ".join(words[pronoun_index + 1 :]),
            ]
        )

    # Handle some special cases.
    if (
        x["text"]
        == 'The boy continued to whip the pony , and eventually the pony threw him over. John laughed out quite loud. "Good for him," he said. '
    ):
        return (
            "The boy continued to whip the pony , and eventually the pony threw "
            'him over. John laughed out quite loud. "Good for X ," he said.'
        )

    # Using the span2_index, we get 'use' instead of 'it'.
    if (
        x["text"]
        == "When they had eventually calmed down a bit , and had gotten home, Mr. Farley put the magic pebble in an iron safe . Some day they might want to use it , but really for now, what more could they wish for?"
    ):
        return (
            "When they had eventually calmed down a bit , and had gotten home, "
            "Mr. Farley put the magic pebble in an iron safe . Some day they might "
            "want to use X , but really for now, what more could they wish for?"
        )

    return create_input()


# Words ignored when comparing a generated referent against the gold span.
DETERMINERS = {
    "a",
    "an",
    "few",
    "her",
    "his",
    "each",
    "every",
    "many",
    "much",
    "my",
    "our",
    "some",
    "that",
    "the",
    "their",
    "these",
    "this",
    "those",
    "which",
    "whose",
    "your",
}


def clean(s: str) -> str:
    """Ignore capitalization and determiners."""
    s = s.strip().lower()
    return " ".join([w for w in s.split(" ") if w not in DETERMINERS])


def process_results(docs: dict, resps: List):
    """Score one generated referent against the gold span (docs["span1_text"]).

    Returns {"accuracy": 1.0} when the fuzzy referent match agrees with the
    gold boolean label, else {"accuracy": 0.0}.
    """
    prediction = clean(resps[0])
    reference = clean(docs["span1_text"])

    if ("'" in prediction) != ("'" in reference):
        # Make sure we don't mark cases where the prediction is "Bob" and the
        # referent is "Bob's hat" as predicting the referent.
        predicted_referent = False
    else:
        prediction_words = set(prediction.split(" "))
        referent_words = set(reference.split(" "))

        # Handle cases where the prediction is "fuzzy bunny" and the referent is
        # "bunny".
        predicted_referent = prediction_words.issubset(
            referent_words
        ) or referent_words.issubset(prediction_words)

    acc = 1.0 if predicted_referent == docs["label"] else 0.0
    return {"accuracy": acc}
import re


def wikitext_detokenizer(doc):
    """Reverse WikiText tokenization artifacts (spacing around punctuation,
    "@"-style number separators, heading markers) so perplexity is measured
    on natural-looking text."""
    text = doc["page"]
    # contractions
    text = text.replace("s '", "s'")
    text = re.sub(r"/' [0-9]/", r"/'[0-9]/", text)
    # number separators: "5 @,@ 000" -> "5,000", etc.
    for token, repl in ((" @-@ ", "-"), (" @,@ ", ","), (" @.@ ", ".")):
        text = text.replace(token, repl)
    # punctuation: drop the space *before* the mark, keep the one after
    for mark in (":", ";", ".", "!", "?", ","):
        text = text.replace(f" {mark} ", f"{mark} ")
    # double brackets: trim padding just inside (), [], {} and quote pairs
    text = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", text)
    text = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", text)
    text = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", text)
    text = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', text)
    text = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", text)
    # miscellaneous: collapse heading markers, degrees sign, stray spaces
    text = text.replace("= = = =", "====")
    text = text.replace("= = =", "===")
    text = text.replace("= =", "==")
    text = text.replace(" " + chr(176) + " ", chr(176))
    text = text.replace(" \n", "\n")
    text = text.replace("\n ", "\n")
    text = text.replace(" N ", " 1 ")
    text = text.replace(" 's", "'s")

    return text


def process_results(doc, results):
    """Package the rolling loglikelihood with word/byte counts so the harness
    can aggregate word/byte perplexity and bits-per-byte."""
    (loglikelihood,) = results
    page = doc["page"]
    # IMPORTANT: wikitext counts number of words in *original doc before detokenization*
    word_count = len(re.split(r"\s+", page))
    byte_count = len(page.encode("utf-8"))
    return {
        "word_perplexity": (loglikelihood, word_count),
        "byte_perplexity": (loglikelihood, byte_count),
        "bits_per_byte": (loglikelihood, byte_count),
    }
- metric: bits_per_byte +metadata: + version: 2.0 +dataset_kwargs: + trust_remote_code: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/winogrande/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/winogrande/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d763dffc02ada2e9c619e3ab74423f81dd368d8a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/winogrande/README.md @@ -0,0 +1,54 @@ +# WinoGrande + +### Paper + +Title: `WinoGrande: An Adversarial Winograd Schema Challenge at Scale` + +Abstract: https://arxiv.org/abs/1907.10641 + +WinoGrande is a collection of 44k problems, inspired by Winograd Schema Challenge +(Levesque, Davis, and Morgenstern 2011), but adjusted to improve the scale and +robustness against the dataset-specific bias. Formulated as a fill-in-a-blank +task with binary options, the goal is to choose the right option for a given +sentence which requires commonsense reasoning. + +NOTE: This evaluation of Winogrande uses partial evaluation as described by +Trinh & Le in Simple Method for Commonsense Reasoning (2018). +See: https://arxiv.org/abs/1806.02847 + +Homepage: https://leaderboard.allenai.org/winogrande/submissions/public + + +### Citation + +``` +@article{sakaguchi2019winogrande, + title={WinoGrande: An Adversarial Winograd Schema Challenge at Scale}, + author={Sakaguchi, Keisuke and Bras, Ronan Le and Bhagavatula, Chandra and Choi, Yejin}, + journal={arXiv preprint arXiv:1907.10641}, + year={2019} +} +``` + +### Groups and Tasks + +#### Groups + +* Not part of a group yet. + +#### Tasks + +* `winogrande` + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [ ] Is the task an existing benchmark in the literature? + * [ ] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? 
If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/winogrande/__pycache__/preprocess_winogrande.cpython-310.pyc b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/winogrande/__pycache__/preprocess_winogrande.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..64b65c267ae3fe9b50fce09d4be56df9e772b21d Binary files /dev/null and b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/winogrande/__pycache__/preprocess_winogrande.cpython-310.pyc differ diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/winogrande/default.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/winogrande/default.yaml new file mode 100644 index 0000000000000000000000000000000000000000..12e0077a70f79a333c273b4be2feddc498f8fa31 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/winogrande/default.yaml @@ -0,0 +1,19 @@ +task: winogrande +dataset_path: winogrande +dataset_name: winogrande_xl +output_type: multiple_choice +training_split: train +validation_split: validation +doc_to_text: !function preprocess_winogrande.doc_to_text +doc_to_target: !function preprocess_winogrande.doc_to_target +doc_to_choice: !function preprocess_winogrande.doc_to_choice +should_decontaminate: true +doc_to_decontamination_query: sentence +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 +dataset_kwargs: + trust_remote_code: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/winogrande/preprocess_winogrande.py 
def doc_to_text(doc):
    """Return the 0-based index of the gold option ("1" -> 0, "2" -> 1).

    Winogrande uses partial evaluation: this index selects which choice
    prefix is correct, and the text after the blank is the shared target.
    """
    return {"1": 0, "2": 1}[doc["answer"]]


def doc_to_target(doc):
    """The continuation shared by both choices: everything after the blank."""
    sentence = doc["sentence"]
    cut = sentence.index("_") + 1
    return sentence[cut:].strip()


def doc_to_choice(doc):
    """Both options spliced onto the sentence prefix up to the blank."""
    sentence = doc["sentence"]
    prefix = sentence[: sentence.index("_")]
    return [prefix + option for option in (doc["option1"], doc["option2"])]