diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5e218e599d3b5147cb0417b4656b9fb2ebb1ee9e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/README.md @@ -0,0 +1,50 @@ +# ACLUE + +### Paper + +Can Large Language Model Comprehend Ancient Chinese? A Preliminary Test on ACLUE +https://arxiv.org/abs/2310.09550 + +The Ancient Chinese Language Understanding Evaluation (ACLUE) is an evaluation benchmark focused on ancient Chinese language comprehension. It aims to assess the performance of large-scale language models on understanding ancient Chinese. The benchmark comprises 15 tasks spanning various domains, including lexical, syntactic, semantic, inference, and knowledge. ACLUE's tasks are derived from a combination of manually curated questions from publicly available resources, and automatically +generated questions from classical Chinese language corpora. The range of questions spans from the Xia dynasty (2070 BCE) to the Ming dynasty (1368 CE). ACLUE adopts a multiple-choice question format for all tasks. + +Homepage: https://github.com/isen-zhang/ACLUE + +### Citation + +```bibtex +@inproceedings{zhang-li-2023-large, + title = "Can Large Langauge Model Comprehend {A}ncient {C}hinese? A Preliminary Test on {ACLUE}", + author = "Zhang, Yixuan and Li, Haonan", + booktitle = "Proceedings of the Ancient Language Processing Workshop", + month = sep, + year = "2023", + address = "Varna, Bulgaria", + publisher = "INCOMA Ltd., Shoumen, Bulgaria", + url = "https://aclanthology.org/2023.alp-1.9", + pages = "80--87" +} +``` + +### Groups, Tags, and Tasks + +#### Groups + +- `aclue`: All 15 subjects of the ACLUE dataset, evaluated following the methodology in CMMLU's original implementation. 
+ +#### Tasks + +The following tasks evaluate subjects in the ACLUE dataset using loglikelihood-based multiple-choice scoring: +- `aclue_{subject_english}` + +### Checklist + +* [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? + * [x] If yes, does the original paper provide a reference implementation? + * [x] Yes, original implementation contributed by author of the benchmark + +If other tasks on this dataset are already supported: +* [x] Is the "Main" variant of this task clearly denoted? +* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [x] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/_aclue.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/_aclue.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a2ae37ef5a6794a0db58005ea14f3943e56c87e3 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/_aclue.yaml @@ -0,0 +1,26 @@ +group: aclue +task: + - aclue_ancient_chinese_culture + - aclue_ancient_literature + - aclue_ancient_medical + - aclue_ancient_phonetics + - aclue_basic_ancient_chinese + - aclue_couplet_prediction + - aclue_homographic_character_resolution + - aclue_named_entity_recognition + - aclue_poetry_appreciate + - aclue_poetry_context_prediction + - aclue_poetry_quality_assessment + - aclue_poetry_sentiment_analysis + - aclue_polysemy_resolution + - aclue_reading_comprehension + - aclue_sentence_segmentation +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true + - metric: acc_norm + aggregation: mean + weight_by_size: true +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/_default_template_yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/_default_template_yaml new file mode 
100644 index 0000000000000000000000000000000000000000..9505197a72cef39b25bd5ef39d65c13bd97a89ea --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/_default_template_yaml @@ -0,0 +1,18 @@ +dataset_path: tyouisen/aclue +test_split: test +fewshot_split: dev +fewshot_config: + sampler: first_n +output_type: multiple_choice +doc_to_text: "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:" +doc_to_choice: ["A", "B", "C", "D"] +doc_to_target: "{{['A', 'B', 'C', 'D'].index(Answer)}}" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/_generate_configs.py b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/_generate_configs.py new file mode 100644 index 0000000000000000000000000000000000000000..8bd1792ae3d200b422c6f804ef7d89252591b2a7 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/_generate_configs.py @@ -0,0 +1,82 @@ +""" +Take in a YAML, and output all other splits with this YAML +""" + +import argparse +import os + +import yaml +from tqdm import tqdm + +from lm_eval.utils import eval_logger + + +SUBJECTS = { + "古文单字多义": "polysemy_resolution", + "诗词情感分类": "poetry_sentiment_analysis", + "古汉语命名体识别": "named_entity_recognition", + "古汉语知识": "basic_ancient_chinese", + "古诗词上下句预测": "poetry_context_prediction", + "古文断句": "sentence_segmentation", + "对联": "couplet_prediction", + "古诗词曲鉴赏": "poetry_appreciate", + "国学常识": "ancient_chinese_culture", + "古音学": "ancient_phonetics", + "通假字": "homographic_character_resolution", + "古代文学知识": "ancient_literature", + "医古文": "ancient_medical", + "古诗词质量评估": "poetry_quality_assessment", + "古文阅读理解": "reading_comprehension", +} + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--base_yaml_path", required=True) + parser.add_argument("--save_prefix_path", 
default="aclue") + parser.add_argument("--cot_prompt_path", default=None) + parser.add_argument("--task_prefix", default="") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + + # get filename of base_yaml so we can `"include": ` it in our other YAMLs. + base_yaml_name = os.path.split(args.base_yaml_path)[-1] + with open(args.base_yaml_path, encoding="utf-8") as f: + base_yaml = yaml.full_load(f) + + if args.cot_prompt_path is not None: + import json + + with open(args.cot_prompt_path, encoding="utf-8") as f: + cot_file = json.load(f) + + for subject_zh, subject_eng in tqdm(SUBJECTS.items()): + if args.cot_prompt_path is not None: + description = cot_file[subject_eng] + else: + description = ( + f"以下是关于{subject_zh}的单项选择题,请直接给出正确答案的选项。\n\n" + ) + + yaml_dict = { + "include": base_yaml_name, + "task": f"aclue_{args.task_prefix}_{subject_eng}" + if args.task_prefix != "" + else f"aclue_{subject_eng}", + "dataset_name": subject_eng, + "description": description, + } + + file_save_path = args.save_prefix_path + f"_{subject_eng}.yaml" + eval_logger.info(f"Saving yaml for subset {subject_eng} to {file_save_path}") + with open(file_save_path, "w", encoding="utf-8") as yaml_file: + yaml.dump( + yaml_dict, + yaml_file, + width=float("inf"), + allow_unicode=True, + default_style='"', + ) diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_ancient_chinese_culture.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_ancient_chinese_culture.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c9f52077dedd24ce500247a4b606eea83fac6320 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_ancient_chinese_culture.yaml @@ -0,0 +1,4 @@ +"dataset_name": "ancient_chinese_culture" +"description": "以下是关于国学常识的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "aclue_ancient_chinese_culture" diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_ancient_literature.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_ancient_literature.yaml new file mode 100644 index 0000000000000000000000000000000000000000..641befa3aa1920d8dca1c7007a4fe8cd24ab8e77 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_ancient_literature.yaml @@ -0,0 +1,4 @@ +"dataset_name": "ancient_literature" +"description": "以下是关于古代文学知识的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "aclue_ancient_literature" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_ancient_medical.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_ancient_medical.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bceaa702c53a1526fc84cf8f5141570352581a44 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_ancient_medical.yaml @@ -0,0 +1,4 @@ +"dataset_name": "ancient_medical" +"description": "以下是关于医古文的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "aclue_ancient_medical" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_ancient_phonetics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_ancient_phonetics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2fe908e531a07466a66f58f2f5009d5111d5a02d --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_ancient_phonetics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "ancient_phonetics" +"description": "以下是关于古音学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "aclue_ancient_phonetics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_basic_ancient_chinese.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_basic_ancient_chinese.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..5afb88be88b8778fde06cff3a2084bce14397174 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_basic_ancient_chinese.yaml @@ -0,0 +1,4 @@ +"dataset_name": "basic_ancient_chinese" +"description": "以下是关于古汉语知识的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "aclue_basic_ancient_chinese" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_couplet_prediction.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_couplet_prediction.yaml new file mode 100644 index 0000000000000000000000000000000000000000..63124eed8eb2c2987e7145ee4633e010407641be --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_couplet_prediction.yaml @@ -0,0 +1,4 @@ +"dataset_name": "couplet_prediction" +"description": "以下是关于对联的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "aclue_couplet_prediction" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_homographic_character_resolution.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_homographic_character_resolution.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7d50e35d5f31badfc13b1815fffa487b3fc64c82 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_homographic_character_resolution.yaml @@ -0,0 +1,4 @@ +"dataset_name": "homographic_character_resolution" +"description": "以下是关于通假字的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "aclue_homographic_character_resolution" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_named_entity_recognition.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_named_entity_recognition.yaml new file mode 100644 index 0000000000000000000000000000000000000000..566e93019b994528bb003f46fb458ed725ef8af1 --- /dev/null +++ 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_named_entity_recognition.yaml @@ -0,0 +1,4 @@ +"dataset_name": "named_entity_recognition" +"description": "以下是关于古汉语命名体识别的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "aclue_named_entity_recognition" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_poetry_appreciate.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_poetry_appreciate.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4642992674a1f159fe101859dead4509df6c8166 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_poetry_appreciate.yaml @@ -0,0 +1,4 @@ +"dataset_name": "poetry_appreciate" +"description": "以下是关于古诗词曲鉴赏的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "aclue_poetry_appreciate" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_poetry_context_prediction.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_poetry_context_prediction.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1b408b659657b4677e056f93c59f2a59ef60cb95 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_poetry_context_prediction.yaml @@ -0,0 +1,4 @@ +"dataset_name": "poetry_context_prediction" +"description": "以下是关于古诗词上下句预测的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "aclue_poetry_context_prediction" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_poetry_quality_assessment.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_poetry_quality_assessment.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a7a7bee2c4ca59e0dc7b2f3fdc08371a9a585d42 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_poetry_quality_assessment.yaml @@ -0,0 +1,4 @@ +"dataset_name": "poetry_quality_assessment" +"description": 
"以下是关于古诗词质量评估的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "aclue_poetry_quality_assessment" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_poetry_sentiment_analysis.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_poetry_sentiment_analysis.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6e1367f8043d7e1e9ebcd01dfbaacfbdeb0f9fec --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_poetry_sentiment_analysis.yaml @@ -0,0 +1,4 @@ +"dataset_name": "poetry_sentiment_analysis" +"description": "以下是关于诗词情感分类的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "aclue_poetry_sentiment_analysis" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_polysemy_resolution.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_polysemy_resolution.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ee0deea16f6bcb6906fd68e2e65bf72ea276e74a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_polysemy_resolution.yaml @@ -0,0 +1,4 @@ +"dataset_name": "polysemy_resolution" +"description": "以下是关于古文单字多义的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "aclue_polysemy_resolution" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_reading_comprehension.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_reading_comprehension.yaml new file mode 100644 index 0000000000000000000000000000000000000000..92f2455d8089bcc3b7d1ff8b99c03144b5b7d61d --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_reading_comprehension.yaml @@ -0,0 +1,4 @@ +"dataset_name": "reading_comprehension" +"description": "以下是关于古文阅读理解的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "aclue_reading_comprehension" diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_sentence_segmentation.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_sentence_segmentation.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9d81c3fe6eae35a6adc888d9c73430aa891bfe86 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aclue/aclue_sentence_segmentation.yaml @@ -0,0 +1,4 @@ +"dataset_name": "sentence_segmentation" +"description": "以下是关于古文断句的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "aclue_sentence_segmentation" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7bfc27cbc13f0fe8051ace916c67e7b54ca612ea --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/README.md @@ -0,0 +1,60 @@ +# Arithmetic + +### Paper + +Title: `Language Models are Few-Shot Learners` +Abstract: https://arxiv.org/abs/2005.14165 + +A small battery of 10 tests that involve asking language models a simple arithmetic +problem in natural language. 
+ +Homepage: https://github.com/openai/gpt-3/tree/master/data + + +### Citation + +``` +@inproceedings{NEURIPS2020_1457c0d6, + author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario}, + booktitle = {Advances in Neural Information Processing Systems}, + editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin}, + pages = {1877--1901}, + publisher = {Curran Associates, Inc.}, + title = {Language Models are Few-Shot Learners}, + url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf}, + volume = {33}, + year = {2020} +} +``` + +### Groups, Tags, and Tasks + +#### Tags + +* `arithmetic`: Evaluates `1dc` to `5ds` + +#### Tasks + +* `arithmetic_1dc` +* `arithmetic_2da` +* `arithmetic_2dm` +* `arithmetic_2ds` +* `arithmetic_3da` +* `arithmetic_3ds` +* `arithmetic_4da` +* `arithmetic_4ds` +* `arithmetic_5da` +* `arithmetic_5ds` + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [ ] Is the task an existing benchmark in the literature? + * [ ] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? 
+* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_1dc.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_1dc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2e9c9277bcde23f513674bf0da8d3b92c1428418 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_1dc.yaml @@ -0,0 +1,18 @@ +tag: + - arithmetic +task: arithmetic_1dc +dataset_path: EleutherAI/arithmetic +dataset_name: arithmetic_1dc +output_type: loglikelihood +validation_split: validation +test_split: null +doc_to_text: "{{context}}" +doc_to_target: "{{completion}}" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 +dataset_kwargs: + trust_remote_code: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_2da.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_2da.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a186d76e8971072947dd6e9322e701ecc8815e89 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_2da.yaml @@ -0,0 +1,5 @@ +include: arithmetic_1dc.yaml +task: arithmetic_2da +dataset_name: arithmetic_2da +dataset_kwargs: + trust_remote_code: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_2dm.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_2dm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..471bd4b4449f280412d9ee69566d4f80fd623671 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_2dm.yaml @@ -0,0 +1,5 @@ +include: arithmetic_1dc.yaml +task: arithmetic_2dm +dataset_name: arithmetic_2dm 
+dataset_kwargs: + trust_remote_code: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_2ds.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_2ds.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f8e762486b818ee8b2962c94f46edaefb36da6b5 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_2ds.yaml @@ -0,0 +1,5 @@ +include: arithmetic_1dc.yaml +task: arithmetic_2ds +dataset_name: arithmetic_2ds +dataset_kwargs: + trust_remote_code: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_3da.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_3da.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a4870d04f0c47ea61a75504ce051bd929ee1840e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_3da.yaml @@ -0,0 +1,5 @@ +include: arithmetic_1dc.yaml +task: arithmetic_3da +dataset_name: arithmetic_3da +dataset_kwargs: + trust_remote_code: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_3ds.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_3ds.yaml new file mode 100644 index 0000000000000000000000000000000000000000..37f9ff0d2536d6c55c3e0f1676fe8218395d7b6c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_3ds.yaml @@ -0,0 +1,5 @@ +include: arithmetic_1dc.yaml +task: arithmetic_3ds +dataset_name: arithmetic_3ds +dataset_kwargs: + trust_remote_code: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_4da.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_4da.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4c04c6249fc520010317fe2503813acf86780844 --- /dev/null +++ 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_4da.yaml @@ -0,0 +1,5 @@ +include: arithmetic_1dc.yaml +task: arithmetic_4da +dataset_name: arithmetic_4da +dataset_kwargs: + trust_remote_code: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_4ds.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_4ds.yaml new file mode 100644 index 0000000000000000000000000000000000000000..282b3d1e51e886b3509a68ffb921238eb8e49cb0 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_4ds.yaml @@ -0,0 +1,5 @@ +include: arithmetic_1dc.yaml +task: arithmetic_4ds +dataset_name: arithmetic_4ds +dataset_kwargs: + trust_remote_code: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_5da.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_5da.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5365cfbeb94d8fea5d782500a8f88ecfc19dafdb --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_5da.yaml @@ -0,0 +1,5 @@ +include: arithmetic_1dc.yaml +task: arithmetic_5da +dataset_name: arithmetic_5da +dataset_kwargs: + trust_remote_code: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_5ds.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_5ds.yaml new file mode 100644 index 0000000000000000000000000000000000000000..51d95da0074dd32b7c99e0d80e2a54765279c5bc --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_5ds.yaml @@ -0,0 +1,5 @@ +include: arithmetic_1dc.yaml +task: arithmetic_5ds +dataset_name: arithmetic_5ds +dataset_kwargs: + trust_remote_code: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/_default_ceval_yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/_default_ceval_yaml new file mode 100644 index 
0000000000000000000000000000000000000000..8e5af554355e651feb91e724768e6abd9c1208c4 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/_default_ceval_yaml @@ -0,0 +1,18 @@ +dataset_path: ceval/ceval-exam +validation_split: val +fewshot_split: dev +fewshot_config: + sampler: first_n +output_type: multiple_choice +doc_to_text: "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:" +doc_to_choice: ["A", "B", "C", "D"] +doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 2.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/_generate_configs.py b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/_generate_configs.py new file mode 100644 index 0000000000000000000000000000000000000000..9050c75c0644f2dc8bfd800f2573b23c90988668 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/_generate_configs.py @@ -0,0 +1,142 @@ +""" +Take in a YAML, and output all other splits with this YAML +""" + +import argparse +import os + +import yaml +from tqdm import tqdm + +from lm_eval.utils import eval_logger + + +SUBJECTS = { + "computer_network": "计算机网络", + "operating_system": "操作系统", + "computer_architecture": "计算机组成", + "college_programming": "大学编程", + "college_physics": "大学物理", + "college_chemistry": "大学化学", + "advanced_mathematics": "高等数学", + "probability_and_statistics": "概率统计", + "discrete_mathematics": "离散数学", + "electrical_engineer": "注册电气工程师", + "metrology_engineer": "注册计量师", + "high_school_mathematics": "高中数学", + "high_school_physics": "高中物理", + "high_school_chemistry": "高中化学", + "high_school_biology": "高中生物", + "middle_school_mathematics": "初中数学", + "middle_school_biology": "初中生物", + "middle_school_physics": "初中物理", + "middle_school_chemistry": "初中化学", + "veterinary_medicine": "兽医学", + "college_economics": "大学经济学", + 
"business_administration": "工商管理", + "marxism": "马克思主义基本原理", + "mao_zedong_thought": "毛泽东思想和中国特色社会主义理论体系概论", + "education_science": "教育学", + "teacher_qualification": "教师资格", + "high_school_politics": "高中政治", + "high_school_geography": "高中地理", + "middle_school_politics": "初中政治", + "middle_school_geography": "初中地理", + "modern_chinese_history": "近代史纲要", + "ideological_and_moral_cultivation": "思想道德修养与法律基础", + "logic": "逻辑学", + "law": "法学", + "chinese_language_and_literature": "中国语言文学", + "art_studies": "艺术学", + "professional_tour_guide": "导游资格", + "legal_professional": "法律职业资格", + "high_school_chinese": "高中语文", + "high_school_history": "高中历史", + "middle_school_history": "初中历史", + "civil_servant": "公务员", + "sports_science": "体育学", + "plant_protection": "植物保护", + "basic_medicine": "基础医学", + "clinical_medicine": "临床医学", + "urban_and_rural_planner": "注册城乡规划师", + "accountant": "注册会计师", + "fire_engineer": "注册消防工程师", + "environmental_impact_assessment_engineer": "环境影响评价工程师", + "tax_accountant": "税务师", + "physician": "医师资格", +} + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--base_yaml_path", required=True) + parser.add_argument("--save_prefix_path", default="ceval-valid") + parser.add_argument("--cot_prompt_path", default=None) + parser.add_argument("--task_prefix", default="") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + + # get filename of base_yaml so we can `"include": ` it in our other YAMLs. 
+ base_yaml_name = os.path.split(args.base_yaml_path)[-1] + with open(args.base_yaml_path, encoding="utf-8") as f: + base_yaml = yaml.full_load(f) + + if args.cot_prompt_path is not None: + import json + + with open(args.cot_prompt_path, encoding="utf-8") as f: + cot_file = json.load(f) + + for subject_eng, subject_zh in tqdm(SUBJECTS.items()): + if args.cot_prompt_path is not None: + description = cot_file[subject_eng] + else: + description = ( + f"以下是中国关于{subject_zh}的单项选择题,请选出其中的正确答案。\n\n" + ) + + yaml_dict = { + "include": base_yaml_name, + "task": f"ceval-valid_{args.task_prefix}_{subject_eng}" + if args.task_prefix != "" + else f"ceval-valid_{subject_eng}", + "dataset_name": subject_eng, + "description": description, + } + + file_save_path = args.save_prefix_path + f"_{subject_eng}.yaml" + eval_logger.info(f"Saving yaml for subset {subject_eng} to {file_save_path}") + with open(file_save_path, "w", encoding="utf-8") as yaml_file: + yaml.dump( + yaml_dict, + yaml_file, + width=float("inf"), + allow_unicode=True, + default_style='"', + ) + + # write group config out + + group_yaml_dict = { + "group": "ceval-valid", + "task": [f"ceval-valid_{task_name}" for task_name in SUBJECTS.keys()], + "aggregate_metric_list": [ + {"metric": "acc", "aggregation": "mean", "weight_by_size": True}, + {"metric": "acc_norm", "aggregation": "mean", "weight_by_size": True}, + ], + "metadata": {"version": 1.0}, + } + + file_save_path = "_" + args.save_prefix_path + ".yaml" + + with open(file_save_path, "w", encoding="utf-8") as group_yaml_file: + yaml.dump( + group_yaml_dict, + group_yaml_file, + width=float("inf"), + allow_unicode=True, + default_style='"', + ) diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_basic_medicine.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_basic_medicine.yaml new file mode 100644 index 0000000000000000000000000000000000000000..907bf8eb361548775047a5cbfe03befb89041dba --- /dev/null +++ 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_basic_medicine.yaml @@ -0,0 +1,4 @@ +"dataset_name": "basic_medicine" +"description": "以下是中国关于基础医学的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_basic_medicine" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_college_physics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_college_physics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..50e7f01c18e66ad47b6bac6db1f5ce4bc9cfec9a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_college_physics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "college_physics" +"description": "以下是中国关于大学物理的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_college_physics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_fire_engineer.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_fire_engineer.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ddb6fa779bd59c59bf65052f162ea4ddc0018eef --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_fire_engineer.yaml @@ -0,0 +1,4 @@ +"dataset_name": "fire_engineer" +"description": "以下是中国关于注册消防工程师的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_fire_engineer" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_chemistry.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_chemistry.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8e140af2932ba9751ae0617bd50a48ae7c925e3d --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_chemistry.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_chemistry" +"description": "以下是中国关于高中化学的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" 
+"task": "ceval-valid_high_school_chemistry" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_chinese.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_chinese.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0a2befa89ceea3cd7ee86ada08eb5423b075dac8 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_chinese.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_chinese" +"description": "以下是中国关于高中语文的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_high_school_chinese" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_mathematics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_mathematics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..abc00afa68ba2ffd8d7e63b4db228d7f379762ff --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_mathematics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_mathematics" +"description": "以下是中国关于高中数学的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_high_school_mathematics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_politics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_politics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7f77391bffa4ae32afac484494a1c5b284a3a0e6 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_politics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_politics" +"description": "以下是中国关于高中政治的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_high_school_politics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_middle_school_biology.yaml 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_middle_school_biology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6f8725356987a3c55e8d5b346485a6318bae6c26 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_middle_school_biology.yaml @@ -0,0 +1,4 @@ +"dataset_name": "middle_school_biology" +"description": "以下是中国关于初中生物的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_middle_school_biology" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_middle_school_chemistry.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_middle_school_chemistry.yaml new file mode 100644 index 0000000000000000000000000000000000000000..28708b5b0a7fda122443a9fe2bbd980cfb103804 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_middle_school_chemistry.yaml @@ -0,0 +1,4 @@ +"dataset_name": "middle_school_chemistry" +"description": "以下是中国关于初中化学的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_middle_school_chemistry" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_physician.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_physician.yaml new file mode 100644 index 0000000000000000000000000000000000000000..66abd59f32c1fd5ac47fa86f8afe2d080f9ad408 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_physician.yaml @@ -0,0 +1,4 @@ +"dataset_name": "physician" +"description": "以下是中国关于医师资格的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_physician" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_professional_tour_guide.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_professional_tour_guide.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..3e670fda482646b59273cecfe58c1af9d8d1e0b4 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_professional_tour_guide.yaml @@ -0,0 +1,4 @@ +"dataset_name": "professional_tour_guide" +"description": "以下是中国关于导游资格的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_professional_tour_guide" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_sports_science.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_sports_science.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e47473994a4765804c6536f71e55c3fda5937279 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_sports_science.yaml @@ -0,0 +1,4 @@ +"dataset_name": "sports_science" +"description": "以下是中国关于体育学的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_sports_science" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_teacher_qualification.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_teacher_qualification.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ca08e24a7103ebf2114cc9d1b370cb214c19f293 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_teacher_qualification.yaml @@ -0,0 +1,4 @@ +"dataset_name": "teacher_qualification" +"description": "以下是中国关于教师资格的单项选择题,请选出其中的正确答案。\n\n" +"include": "_default_ceval_yaml" +"task": "ceval-valid_teacher_qualification" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/commonsense_qa/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/commonsense_qa/README.md new file mode 100644 index 0000000000000000000000000000000000000000..94ef87a57a20ea744c11ec7f1304ff545470247f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/commonsense_qa/README.md @@ -0,0 +1,60 @@ +# Task-name + +### Paper + 
+Title: `COMMONSENSEQA: A Question Answering Challenge Targeting +Commonsense Knowledge` + +Abstract: https://arxiv.org/pdf/1811.00937.pdf + +CommonsenseQA is a multiple-choice question answering dataset that requires different types of commonsense knowledge to predict the correct answers. +It contains 12,102 questions with one correct answer and four distractor answers. + +Homepage: https://www.tau-nlp.org/commonsenseqa + + +### Citation + +``` +@inproceedings{talmor-etal-2019-commonsenseqa, + title = "{C}ommonsense{QA}: A Question Answering Challenge Targeting Commonsense Knowledge", + author = "Talmor, Alon and + Herzig, Jonathan and + Lourie, Nicholas and + Berant, Jonathan", + booktitle = "Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)", + month = jun, + year = "2019", + address = "Minneapolis, Minnesota", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/N19-1421", + doi = "10.18653/v1/N19-1421", + pages = "4149--4158", + archivePrefix = "arXiv", + eprint = "1811.00937", + primaryClass = "cs", +} +``` + +### Groups and Tasks + +#### Groups + +* Not part of a group yet. + +#### Tasks + +* `commonsense_qa`: Represents the "random" split from the paper. Uses an MMLU-style prompt, as (presumably) used by Llama evaluations. + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? + * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? 
+* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/commonsense_qa/default.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/commonsense_qa/default.yaml new file mode 100644 index 0000000000000000000000000000000000000000..31d31b01253698a83d40cf6f4fa8a2dc4aa47d3e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/commonsense_qa/default.yaml @@ -0,0 +1,12 @@ +task: commonsense_qa +dataset_path: tau/commonsense_qa +training_split: train +validation_split: validation +output_type: multiple_choice +doc_to_text: "Question: {{ question.strip() }}\nA. {{choices['text'][0]}}\nB. {{choices['text'][1]}}\nC. {{choices['text'][2]}}\nD. {{choices['text'][3]}}\nE. {{choices['text'][4]}}\nAnswer:" +doc_to_target: answerKey +doc_to_choice: ['A', 'B', 'C', 'D', 'E'] +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/copal_id/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/copal_id/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ff1647e226ddff3bc10add35222aa8ea5b06570e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/copal_id/README.md @@ -0,0 +1,47 @@ +# COPAL + +### Paper + +Title: `COPAL-ID: Indonesian Language Reasoning with Local Culture and Nuances` + +Abstract: `https://arxiv.org/abs/2311.01012` + +`COPAL-ID is an Indonesian causal commonsense reasoning dataset that captures local nuances. It provides a more natural portrayal of day-to-day causal reasoning within the Indonesian (especially Jakartan) cultural sphere. 
Professionally written and validated from scratch by natives, COPAL-ID is more fluent and free from awkward phrases, unlike the translated XCOPA-ID.`
from functools import partial


def convert_choice(choice):
    """Return *choice* with its first character lower-cased, rest unchanged."""
    head, tail = choice[0], choice[1:]
    return head.lower() + tail


def doc_to_text(doc, connector):
    """Render the premise as a prompt ending in the cause/effect connector.

    The premise's final character (its closing punctuation) is dropped and the
    connector matching ``doc["question"]`` ("cause" or "effect") is appended.
    """
    conn = connector[doc["question"]]
    premise = doc["premise"].strip()
    return f"{premise[:-1]} {conn}"


def doc_to_choice(doc):
    """Return both candidate continuations with their first letters lower-cased."""
    return [convert_choice(doc[key]) for key in ("choice1", "choice2")]


# Indonesian connectors: "karena" = because (cause), "maka" = so/then (effect).
doc_to_text_id = partial(
    doc_to_text,
    connector={"cause": "karena", "effect": "maka"},
)
--git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/README.md new file mode 100644 index 0000000000000000000000000000000000000000..693f60c3a3efda911fbfa0be9a7c64ce55fa22b1 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/README.md @@ -0,0 +1,94 @@ +# FrenchBench + +### Paper + +FrenchBench is a benchmark for evaluating French language models, introduced in the paper +[CroissantLLM: A Truly Bilingual French-English Language Model](https://arxiv.org/abs/2402.00786). +It is a collection of tasks that evaluate the ability of a language model to understand and generate French text. +This benchmark is constructed both from openly available datasets, as well as newly released manually annotated data. + +### Citation + +```bibtex +@misc{faysse2024croissantllm, + title={CroissantLLM: A Truly Bilingual French-English Language Model}, + author={Manuel Faysse and Patrick Fernandes and Nuno M. Guerreiro and António Loison and Duarte M. Alves and Caio Corro and Nicolas Boizard and João Alves and Ricardo Rei and Pedro H. Martins and Antoni Bigata Casademunt and François Yvon and André F. T. Martins and Gautier Viaud and Céline Hudelot and Pierre Colombo}, + year={2024}, + eprint={2402.00786}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` + +### Groups, Tags, and Tasks + +#### Tags + +- `french_bench`: All tasks (non-perplexity based) +- `french_bench_gen`: All official generative tasks +- `french_bench_mc`: All official multiple choice tasks +- `french_bench_perplexity`: All perplexity-based tasks (0 shot is recommended) +- `french_bench_extra`: All extra tasks + +#### Tasks + + +The following tasks evaluate tasks on the French Bench dataset using various scoring methods. 
+ - french_bench_boolqa + - french_bench_fquadv2 + - french_bench_fquadv2_bool + - french_bench_fquadv2_genq + - french_bench_fquadv2_hasAns + - french_bench_topic_based_nli + - french_bench_multifquad + - french_bench_grammar + - french_bench_vocab + - french_bench_reading_comp + - french_bench_xnli (modified XNLI) + - french_bench_orangesum_abstract + - french_bench_orangesum_title + - french_bench_trivia + - french_bench_hellaswag + - french_bench_arc_challenge + +The french bench also includes other tasks from various benchmarks: +- `belebele_fra_Latn`: Belebele French +- `wmt14-en-fr`: WMT14 English-French +- `wmt14-fr-en`: WMT14 French-English + +# Not to use in few-shot +- `crows_pairs_french`: Crows Pairs French +- `french_bench_opus_perplexity`: Opus Perplexity + + +### Usage + +```bash +# openai +lm_eval --model openai-completions --model_args engine=text-davinci-003 --tasks french_bench --limit 100 --num_fewshot 3 --batch_size auto --output_path data/french_bench/davinci-003/results_french_bench_3shot.json +lm_eval --model openai-completions --model_args engine=text-davinci-003 --tasks french_bench_opus_perplexity,crows_pairs_french --limit 100 --batch_size auto --output_path data/french_bench/davinci-003/results_french_bench2_0shot.json + + +lm_eval --model hf --model_args pretrained=gpt2 --tasks french_bench --device cuda:0 --limit 100 --num_fewshot 3 --batch_size 8 --output_path data/french_bench/gpt2/results_french_bench_3shot.json +lm_eval --model hf --model_args pretrained=gpt2 --tasks french_bench_opus_perplexity,crows_pairs_french --device cuda:0 --limit 100 --batch_size auto --output_path data/french_bench/gpt2/results_french_bench2_0shot.json + +lm_eval --model hf --model_args pretrained=meta-llama/Llama-2-7b-hf --tasks french_bench --device cuda:0 --limit 100 --num_fewshot 3 --batch_size 4 --output_path data/french_bench/llama-2-7b-hf/results_french_bench_3shot.json +lm_eval --model hf --model_args pretrained=meta-llama/Llama-2-7b-hf --tasks 
french_bench_opus_perplexity,crows_pairs_french --device cuda:0 --limit 100 --batch_size auto --output_path data/french_bench/llama-2-7b-hf/results_french_bench2_0shot.json +``` + +HF and Accelerate options can be added when loading a model: +```bash + accelerate launch -m lm_eval --model hf --model_args pretrained=meta-llama/Llama-2-7b-hf,dtype="float16" --tasks french_bench +``` + +### Checklist + +* [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? + * [x] If yes, does the original paper provide a reference implementation? + * [x] Yes, original implementation contributed by author of the benchmark + +If other tasks on this dataset are already supported: +* [x] Is the "Main" variant of this task clearly denoted? +* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [x] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/_default_template_yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/_default_template_yaml new file mode 100644 index 0000000000000000000000000000000000000000..ae3bfd1fc8d2974288922e55a7ec5d55054a90d4 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/_default_template_yaml @@ -0,0 +1,4 @@ +test_split: test +fewshot_split: valid +fewshot_config: + sampler: first_n diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_arc_challenge.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_arc_challenge.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7884b0dc9cd9639d4a67cff0086b44978e84b14a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_arc_challenge.yaml @@ -0,0 +1,21 @@ +tag: + - french_bench + - french_bench_mc +task: 
french_bench_arc_challenge +dataset_path: manu/french_bench_arc_challenge +output_type: multiple_choice +training_split: train +validation_split: validation +test_split: test +doc_to_text: "Question: {{question}}\nRéponse:" +doc_to_target: "{{['A', 'B', 'C', 'D'].index(answerKey)}}" +doc_to_choice: "{{choices}}" +should_decontaminate: true +doc_to_decontamination_query: "Question: {{question}}\nRéponse:" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_fquadv2_bool.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_fquadv2_bool.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4e07d2ec0d28505bceec367920e0a56617c6af45 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_fquadv2_bool.yaml @@ -0,0 +1,21 @@ +include: "_default_template_yaml" +tag: + - french_bench + - french_bench_extra +description: "D'après l'information présente dans le contexte, est il possible de répondre à la question ?" +task: french_bench_fquadv2_bool +dataset_path: manu/fquad2_test +output_type: multiple_choice +validation_split: valid +doc_to_text: "\nContexte: {{context}}\n\nQuestion: {{question}}\n\nD'après l'information présente dans le contexte, répondre à la question est:\nA. Possible \nB. 
Impossible\n\nRéponse:" +doc_to_choice: ["A", "B"] +doc_to_target: "{{[False, True].index(is_impossible)}}" +should_decontaminate: true +doc_to_decontamination_query: context +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_fquadv2_genq.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_fquadv2_genq.yaml new file mode 100644 index 0000000000000000000000000000000000000000..380518520326753402e265f453f99ed6b1e1043d --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_fquadv2_genq.yaml @@ -0,0 +1,31 @@ +include: "_default_template_yaml" +tag: + - french_bench + - french_bench_gen +description: "D'après l'information dans le contexte donné, quelle question a été posée pour obtenir la réponse donnée ?" +task: french_bench_fquadv2_genq +dataset_path: manu/fquad2_test +output_type: generate_until +validation_split: valid_hasAns +test_split: test_hasAns +fewshot_split: valid_hasAns +doc_to_text: "\nContexte: {{context}}\n\nRéponse: {% if answers.text| length > 0 %}{{answers.text[0]}}{% else %}{{['Impossible']}}{% endif %}\n\nQuestion:" +doc_to_target: "{{question}}" +target_delimiter: " " +should_decontaminate: true +doc_to_decontamination_query: question +generation_kwargs: + until: + - "\n" +# filter_list: +# - name: remove_whitespace +# filter: +# - function: remove_whitespace +# - function: take_first +metric_list: + - metric: !function utils.rouge1 + higher_is_better: true + aggregation: !function utils.rouge1_agg + - metric: !function utils.f1 + aggregation: mean + higher_is_better: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_fquadv2_hasAns.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_fquadv2_hasAns.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..6eedbabb5f5e3c1a381c43d20a26ec7ce3a1d103 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_fquadv2_hasAns.yaml @@ -0,0 +1,34 @@ +include: "_default_template_yaml" +tag: + - french_bench + - french_bench_gen +description: "D'après l'information dans le contexte donné, donne la réponse à la question en citant quelques mots du contexte. Si il est impossible de répondre avec les informations du contexte, répond 'Impossible'." +task: french_bench_fquadv2_hasAns +dataset_path: manu/fquad2_test +output_type: generate_until +validation_split: valid_hasAns +test_split: test_hasAns +fewshot_split: valid_hasAns +doc_to_text: "\nContexte: {{context}}\n\nQuestion: {{question}}\n\nRéponse:" +doc_to_target: "{% if answers.text| length > 0 %}{{answers.text[0]}}{% else %}{{['Impossible']}}{% endif %}" +target_delimiter: " " +should_decontaminate: true +doc_to_decontamination_query: context +generation_kwargs: + until: + - "\n" +# filter_list: +# - name: remove_whitespace +# filter: +# - function: remove_whitespace +# - function: take_first +metric_list: + - metric: !function utils.exact + aggregation: mean + higher_is_better: true + - metric: !function utils.f1 + aggregation: mean + higher_is_better: true + - metric: !function utils.rouge1 + higher_is_better: true + aggregation: !function utils.rouge1_agg diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_multifquad.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_multifquad.yaml new file mode 100644 index 0000000000000000000000000000000000000000..71301bf29e55a0954527e7963cdcbbba92710337 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_multifquad.yaml @@ -0,0 +1,34 @@ +include: "_default_template_yaml" +tag: + - french_bench + - french_bench_gen +description: "D'après l'information dans le contexte donné, donne la réponse à la 
question en citant quelques extraits du contexte." +task: french_bench_multifquad +dataset_path: manu/multifquad_test +output_type: generate_until +validation_split: valid +test_split: test +fewshot_split: valid +doc_to_text: "\nContexte: {{context}}\n\nQuestion: {{question}}\n\nRéponse:" +doc_to_target: "{{', '.join(answers.text)}}" +target_delimiter: " " +should_decontaminate: true +doc_to_decontamination_query: context +generation_kwargs: + until: + - "\n" +# filter_list: +# - name: remove_whitespace +# filter: +# - function: remove_whitespace +# - function: take_first +metric_list: + - metric: !function utils.exact + aggregation: mean + higher_is_better: true + - metric: !function utils.f1 + aggregation: mean + higher_is_better: true + - metric: !function utils.rouge1 + higher_is_better: true + aggregation: !function utils.rouge1_agg diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_orangesum_title.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_orangesum_title.yaml new file mode 100644 index 0000000000000000000000000000000000000000..90b547e038ef3ff0086641b136461411240aaf5a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_orangesum_title.yaml @@ -0,0 +1,28 @@ +include: "_default_template_yaml" +tag: + - french_bench + - french_bench_extra +description: "Trouve le titre de l'article." 
+task: french_bench_orangesum_title +dataset_path: orange_sum +dataset_name: title +output_type: generate_until +validation_split: validation +fewshot_split: validation +doc_to_text: "\nArticle: {{text}}\n\nTitre:" +doc_to_target: "{{summary}}" +target_delimiter: " " +should_decontaminate: true +doc_to_decontamination_query: summary +generation_kwargs: + until: + - "\n" +# filter_list: +# - name: remove_whitespace +# filter: +# - function: remove_whitespace +# - function: take_first +metric_list: + - metric: !function utils.rouge1 + higher_is_better: true + aggregation: !function utils.rouge1_agg diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_reading_comp.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_reading_comp.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f3abeadad711b746f998f6b1f7253ca1285e5e24 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_reading_comp.yaml @@ -0,0 +1,22 @@ +include: "_default_template_yaml" +tag: + - french_bench + - french_bench_extra +# description: "Répond au mieux en complétant la question avec une des réponses proposées." 
+dataset_path: manu/french-bench-grammar-vocab-reading +output_type: multiple_choice +validation_split: Reading +fewshot_split: Reading +test_split: Reading +# doc_to_text: "Context: {{context}}\nQuestion: {{question.strip()}}\nA: {{answerA}}\nB: {{answerB}}\nC: {{answerC}}\nD: {{answerD}}\nRéponse:" +# doc_to_choice: "{{['A: '+answerA, 'B: '+answerB, 'C: '+answerC, 'D: '+answerD]}}" +doc_to_text: "Context: {{context}}\n\n" +doc_to_choice: "{{[question.replace('<...>', answerA) if '<...>' in question else question + ' ' +answerA, question.replace('<...>', answerB) if '<...>' in question else question + ' ' + answerB, question.replace('<...>', answerC) if '<...>' in question else question + ' ' + answerC, question.replace('<...>', answerD) if '<...>' in question else question + ' ' + answerD]}}" +doc_to_target: '{{["answerA", "answerB", "answerC", "answerD"].index("answer" + answer)}}' +# doc_to_choice: "{{['A: '+answerA, 'B: '+answerB, 'C: '+answerC, 'D: '+answerD]}}" +# doc_to_target: answer +task: french_bench_reading_comp +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_trivia.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_trivia.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3b69b0f12b8be7078e5a0e35c812709ef496fa5f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_trivia.yaml @@ -0,0 +1,36 @@ +include: "_default_template_yaml" +tag: + - french_bench + - french_bench_gen +task: french_bench_trivia +dataset_path: manu/french-trivia +output_type: generate_until +validation_split: train +test_split: train +fewshot_split: train +doc_to_text: "{{Question}}\nAnswer:" +doc_to_target: "{{Answer}}" +target_delimiter: " " +should_decontaminate: true +doc_to_decontamination_query: Question +generation_kwargs: + until: + - "\n" +# filter_list: +# - name: 
import re


def wikitext_detokenizer(doc):
    """Reverse the word-level tokenization of a wikitext paragraph.

    Undoes the spacing artifacts of the wikitext corpus format (contraction
    splits, ``@-@``-style number separators, padded punctuation/brackets and
    ``= =``-style headings) in ``doc["paragraph"]`` and returns the cleaned text.
    """
    text = doc["paragraph"]

    # contractions
    text = text.replace("s '", "s'")
    text = re.sub(r"/' [0-9]/", r"/'[0-9]/", text)

    # number separators (@-@, @,@, @.@ are wikitext escapes)
    for token, repl in ((" @-@ ", "-"), (" @,@ ", ","), (" @.@ ", ".")):
        text = text.replace(token, repl)

    # punctuation: drop the space before, keep the one after
    for punct in (":", ";", ".", "!", "?", ","):
        text = text.replace(f" {punct} ", f"{punct} ")

    # brackets / quotes: strip padding just inside each matched pair
    text = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", text)
    text = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", text)
    text = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", text)
    text = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', text)
    text = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", text)

    # miscellaneous: heading markers (longest first), degree sign, stray
    # spaces around newlines, the " N " number placeholder, possessives
    for token, repl in (
        ("= = = =", "===="),
        ("= = =", "==="),
        ("= =", "=="),
        (" " + chr(176) + " ", chr(176)),
        (" \n", "\n"),
        ("\n ", "\n"),
        (" N ", " 1 "),
        (" 's", "'s"),
    ):
        text = text.replace(token, repl)

    return text


def process_results(doc, results):
    """Map a single loglikelihood result onto the perplexity metric tuples.

    IMPORTANT: word/byte counts are taken from the *original* paragraph
    (before detokenization), matching the wikitext evaluation convention.
    """
    (loglikelihood,) = results
    paragraph = doc["paragraph"]
    n_words = len(re.split(r"\s+", paragraph))
    n_bytes = len(paragraph.encode("utf-8"))
    return {
        "word_perplexity": (loglikelihood, n_words),
        "byte_perplexity": (loglikelihood, n_bytes),
        "bits_per_byte": (loglikelihood, n_bytes),
    }
exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def get_tokens(s): + if not s: + return [] + return normalize_answer(s).split() + + +# Exact match (the normalized answer exactly match the gold answer) +def exact(predictions, references): + return int(normalize_answer(references[0]) == normalize_answer(predictions[0])) + + +# The F-score of predicted tokens versus the gold answer +def f1(predictions, references): + gold_toks = get_tokens(references[0]) + pred_toks = get_tokens(predictions[0]) + common = collections.Counter(gold_toks) & collections.Counter(pred_toks) + num_same = sum(common.values()) + if len(gold_toks) == 0 or len(pred_toks) == 0: + # If either is no-answer, then F1 is 1 if they agree, 0 otherwise + return int(gold_toks == pred_toks) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(pred_toks) + recall = 1.0 * num_same / len(gold_toks) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def rouge1(items): + """ + # passthrough for efficiency + """ + return items + + +def rouge1_agg(items): + """ + Higher is better + """ + refs = list(zip(*items))[0] + preds = list(zip(*items))[1] + rouge_scorer = evaluate.load("rouge") + return rouge_scorer.compute(predictions=preds, references=refs)["rouge1"] + + +def is_included(items): + """ + # passthrough for efficiency + """ + if items[0] in items[1]: + return True + return False + + +def preprocess(text): + text = text.strip() + # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag. + text = text.replace(" [title]", ". 
") + text = re.sub("\\[.*?\\]", "", text) + text = text.replace(" ", " ") + return text + + +def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: + def _process_doc(doc): + ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize() + out_doc = { + "query": preprocess(doc["activity_label"] + ": " + ctx), + "choices": [preprocess(ending) for ending in doc["endings"]], + "gold": int(doc["label"]), + } + return out_doc + + return dataset.map(_process_doc) diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/nq_open/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/nq_open/README.md new file mode 100644 index 0000000000000000000000000000000000000000..01792089a675f0cd17c28819e63212750815a554 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/nq_open/README.md @@ -0,0 +1,26 @@ +### Paper + +Question Answering dataset based on aggregated user queries from Google Search. + +Homepage: https://research.google/pubs/natural-questions-a-benchmark-for-question-answering-research/ + +Homepage: [google-research-datasets/natural-questions@master/nq_open](https://github.com/google-research-datasets/natural-questions/tree/master/nq_open) + +Paper: [aclanthology.org/P19-1612](https://aclanthology.org/P19-1612/) + +Derived from the Natural Questions dataset, introduced in https://storage.googleapis.com/gweb-research2023-media/pubtools/pdf/1f7b46b5378d757553d3e92ead36bda2e4254244.pdf . + + +### Citation + +``` +@article{47761, +title = {Natural Questions: a Benchmark for Question Answering Research}, +author = {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee and Kristina N. 
Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le and Slav Petrov}, +year = {2019}, +journal = {Transactions of the Association of Computational Linguistics}} +``` + +### Tasks + +* `nq_open` diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/nq_open/nq_open.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/nq_open/nq_open.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9b2af0eee0171cdce7c133356d0312c6c10ef0ea --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/nq_open/nq_open.yaml @@ -0,0 +1,32 @@ +task: nq_open +dataset_path: nq_open +output_type: generate_until +training_split: train +validation_split: validation +description: "Answer these questions:\n\n" +doc_to_text: "Q: {{question}}?\nA:" +doc_to_target: "{{answer}}" # TODO: should be multi-target +fewshot_delimiter: "\n" +generation_kwargs: + until: + - "\n" + - "." + - "," + do_sample: false + temperature: 0.0 +filter_list: + - name: remove_whitespace + filter: + - function: remove_whitespace + - function: take_first +target_delimiter: " " +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "\\b(?:The |the |An |A |The |a |an )" +metadata: + version: 4.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/squad_completion/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/squad_completion/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8b2c1f1d828bf60f92c94ac98333d34fe15e3974 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/squad_completion/README.md @@ -0,0 +1,54 @@ +# Squad-completion + +### Paper + +Title: Simple Linear Attention Language Models Balance The Recall-Throughput Tradeoff + +A Variant of the SQuAD question answering task, as implemented by Based. 
See [the SQuAD v2 task README](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/squadv2/README.md) for more info.
diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/squad_completion/squad_completion.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/squad_completion/squad_completion.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2c5ebe7092f3427e63cd0765e46441c894010fa2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/squad_completion/squad_completion.yaml @@ -0,0 +1,2 @@ +task: squad_completion +class: !function task.SQUADCompletion diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/squad_completion/task.py b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/squad_completion/task.py new file mode 100644 index 0000000000000000000000000000000000000000..7603a9868453875cf2605d08f22e67026f3fb101 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/squad_completion/task.py @@ -0,0 +1,98 @@ +import re +from typing import List + +import numpy as np + +from lm_eval.api.instance import Instance +from lm_eval.api.task import ConfigurableTask + + +class SQUADCompletion(ConfigurableTask): + VERSION = 0 + DATASET_PATH = "hazyresearch/based-squad" + DATASET_NAME = "default" + + def __init__(self, **kwargs): + super().__init__(config={"metadata": {"version": self.VERSION}}) + + def has_training_docs(self): + return False + + def has_validation_docs(self): + return True + + def has_test_docs(self): + return False + + def validation_docs(self): + return self.dataset["validation"] + + def doc_to_text(self, doc): + return doc["text"] + + def doc_to_target(self, doc): + return doc["value"] + + def construct_requests(self, doc, ctx, **kwargs): + """Uses RequestFactory to construct Requests and returns an iterable of + Requests which will be sent to the LM. + + :param doc: + The document as returned from training_docs, validation_docs, or test_docs. + :param ctx: str + The context string, generated by fewshot_context. 
This includes the natural + language description, as well as the few shot examples, and the question + part of the document for `doc`. + """ + + return [ + Instance( + request_type="generate_until", + doc=doc, + arguments=(ctx, {"until": ["\n"], "max_gen_toks": 48}), + idx=0, + **kwargs, + ) + ] + + def process_results(self, doc, results): + """Take a single document and the LM results and evaluates, returning a + dict where keys are the names of submetrics and values are the values of + the metric for that one document + + :param doc: + The document as returned from training_docs, validation_docs, or test_docs. + :param results: + The results of the requests created in construct_requests. + """ + # continuation, (logprob_unanswerable, _) = results + continuation = results + + return {"contains": contains_score(continuation[0], [doc["value"]])} + + def aggregation(self): + """ + :returns: {str: [float] -> float} + A dictionary where keys are the names of submetrics and values are + functions that aggregate a list of metrics + """ + return { + "contains": np.mean, # Exact match (the normalized answer exactly match the gold answer) + } + + def higher_is_better(self): + """ + :returns: {str: bool} + A dictionary where keys are the names of submetrics and values are + whether a higher value of the submetric is better + """ + return { + "contains": True, # Exact match (the normalized answer exactly match the gold answer + } + + +def contains_score(prediction: str, labels: List[str]): + return max( + int(bool(re.search(re.compile(re.escape(label), re.IGNORECASE), prediction))) + for label in labels + ) diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/swag/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/swag/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ba1e71af5c93431a4fc051c7abc078d058d06827 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/swag/README.md @@ -0,0 +1,52 @@ +# SWAG + +### Paper 
+ +Title: `SWAG: A Large-Scale Adversarial Dataset for Grounded Commonsense Inference` + +Abstract: https://arxiv.org/pdf/1808.05326.pdf + +SWAG (Situations With Adversarial Generations) is an adversarial dataset +that consists of 113k multiple choice questions about grounded situations. Each +question is a video caption from LSMDC or ActivityNet Captions, with four answer +choices about what might happen next in the scene. The correct answer is the +(real) video caption for the next event in the video; the three incorrect +answers are adversarially generated and human verified, so as to fool machines +but not humans. + +Homepage: https://rowanzellers.com/swag/ + + +### Citation + +``` +@inproceedings{zellers2018swagaf, + title={SWAG: A Large-Scale Adversarial Dataset for Grounded Commonsense Inference}, + author={Zellers, Rowan and Bisk, Yonatan and Schwartz, Roy and Choi, Yejin}, + booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing (EMNLP)", + year={2018} +} +``` + +### Groups and Tasks + +#### Groups + +* Not a part of a task yet. + +#### Tasks + +* `swag` + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [ ] Is the task an existing benchmark in the literature? + * [ ] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? 
diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/swag/swag.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/swag/swag.yaml new file mode 100644 index 0000000000000000000000000000000000000000..13e30566eaf91fc6ab51ac169c41ede3d9c2bedc --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/swag/swag.yaml @@ -0,0 +1,19 @@ +task: swag +dataset_path: swag +dataset_name: regular +output_type: multiple_choice +training_split: train +validation_split: validation +test_split: null +doc_to_text: startphrase +doc_to_target: label +doc_to_choice: "{{[ending0, ending1, ending2, ending3]}}" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xnli_eu/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xnli_eu/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ce646d4d9cb4e4e93a8a55d16b11b0cbf290225e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xnli_eu/README.md @@ -0,0 +1,50 @@ +# XNLIeu + +### Paper + +Title: XNLIeu: a dataset for cross-lingual NLI in Basque + +Abstract: https://arxiv.org/abs/2404.06996 + +XNLI is a popular Natural Language Inference (NLI) benchmark widely used to evaluate cross-lingual Natural Language Understanding (NLU) capabilities across languages. In this paper, we expand XNLI to include Basque, a low-resource language that can greatly benefit from transfer-learning approaches. The new dataset, dubbed XNLIeu, has been developed by first machine-translating the English XNLI corpus into Basque, followed by a manual post-edition step. 
We have conducted a series of experiments using mono- and multilingual LLMs to assess a) the effect of professional post-edition on the MT system; b) the best cross-lingual strategy for NLI in Basque; and c) whether the choice of the best cross-lingual strategy is influenced by the fact that the dataset is built by translation. The results show that post-edition is necessary and that the translate-train cross-lingual strategy obtains better results overall, although the gain is lower when tested in a dataset that has been built natively from scratch. Our code and datasets are publicly available under open licenses at https://github.com/hitz-zentroa/xnli-eu. + +Homepage: https://github.com/hitz-zentroa/xnli-eu + + +### Citation + +```bibtex +@misc{heredia2024xnlieu, + title={XNLIeu: a dataset for cross-lingual NLI in Basque}, + author={Maite Heredia and Julen Etxaniz and Muitze Zulaika and Xabier Saralegi and Jeremy Barnes and Aitor Soroa}, + year={2024}, + eprint={2404.06996}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` + +### Groups, Tags, and Tasks + +#### Tags + +* `xnli_eu_mt_native`: Includes MT and Native variants of the XNLIeu dataset. + +#### Tasks + +* `xnli_eu`: XNLI in Basque postedited from MT. +* `xnli_eu_mt`: XNLI in Basque machine translated from English. +* `xnli_eu_native`: XNLI in Basque natively created. + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? + * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? 
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xnli_eu/xnli_common_yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xnli_eu/xnli_common_yaml new file mode 100644 index 0000000000000000000000000000000000000000..4950a8996806739858b4261f9d0b005cd508fafe --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xnli_eu/xnli_common_yaml @@ -0,0 +1,15 @@ +task: null +dataset_path: xnli +dataset_name: null +output_type: multiple_choice +training_split: train +validation_split: validation +doc_to_text: null +doc_to_target: label +doc_to_choice: null +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xnli_eu/xnli_eu.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xnli_eu/xnli_eu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b78eb7e771b48577a3fca3a29c6a9e921c6a8d26 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xnli_eu/xnli_eu.yaml @@ -0,0 +1,8 @@ +include: xnli_common_yaml +task: xnli_eu +dataset_path: HiTZ/xnli-eu +dataset_name: eu +doc_to_choice: '{{[premise+", ezta? Bai, "+hypothesis,premise+", ezta? Gainera, +"+hypothesis,premise+", ezta? 
Ez, "+hypothesis]}}' +doc_to_text: "" +test_split: test diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xnli_eu/xnli_eu_mt.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xnli_eu/xnli_eu_mt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c0fbf5416b4c10bc640a25f8a3a63dd5fb903128 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xnli_eu/xnli_eu_mt.yaml @@ -0,0 +1,4 @@ +include: xnli_eu.yaml +tag: xnli_eu_mt_native +task: xnli_eu_mt +dataset_name: eu_mt diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xnli_eu/xnli_eu_native.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xnli_eu/xnli_eu_native.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e841f37e7ff36b238b85f05f9de7fd7fc488cbb2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xnli_eu/xnli_eu_native.yaml @@ -0,0 +1,6 @@ +include: xnli_eu.yaml +tag: xnli_eu_mt_native +task: xnli_eu_native +training_split: null +validation_split: null +dataset_name: eu_native