koichi12 commited on
Commit
2644716
·
verified ·
1 Parent(s): 696104f

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/README.md +50 -0
  2. scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/_bertaqa_template +15 -0
  3. scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_en.yaml +4 -0
  4. scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_en_mt_gemma-7b.yaml +4 -0
  5. scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_en_mt_hitz.yaml +4 -0
  6. scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_en_mt_itzuli.yaml +4 -0
  7. scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_en_mt_latxa-13b-v1.1.yaml +4 -0
  8. scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_en_mt_latxa-13b-v1.yaml +4 -0
  9. scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_en_mt_latxa-70b-v1.1.yaml +4 -0
  10. scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_en_mt_latxa-70b-v1.yaml +4 -0
  11. scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_en_mt_latxa-7b-v1.1.yaml +4 -0
  12. scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_en_mt_latxa-7b-v1.yaml +4 -0
  13. scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_en_mt_llama-2-13b.yaml +4 -0
  14. scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_en_mt_llama-2-70b.yaml +4 -0
  15. scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_en_mt_llama-2-7b.yaml +4 -0
  16. scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_en_mt_madlad.yaml +4 -0
  17. scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_en_mt_nllb.yaml +4 -0
  18. scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_eu.yaml +4 -0
  19. scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/_generate_configs.py +164 -0
  20. scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_agronomy.yaml +4 -0
  21. scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_chinese_food_culture.yaml +4 -0
  22. scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_conceptual_physics.yaml +4 -0
  23. scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_business_ethics.yaml +4 -0
  24. scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_chinese_foreign_policy.yaml +4 -0
  25. scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_chinese_teacher_qualification.yaml +4 -0
  26. scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_construction_project_management.yaml +4 -0
  27. scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_high_school_geography.yaml +4 -0
  28. scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_jurisprudence.yaml +4 -0
  29. scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_global_facts.yaml +4 -0
  30. scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_human_sexuality.yaml +4 -0
  31. scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_management.yaml +4 -0
  32. scripts/yans/lm-evaluation-harness/lm_eval/tasks/eus_trivia/README.md +54 -0
  33. scripts/yans/lm-evaluation-harness/lm_eval/tasks/eus_trivia/eus_trivia.yaml +16 -0
  34. scripts/yans/lm-evaluation-harness/lm_eval/tasks/eus_trivia/utils.py +41 -0
  35. scripts/yans/lm-evaluation-harness/lm_eval/tasks/kormedmcqa/README.md +47 -0
  36. scripts/yans/lm-evaluation-harness/lm_eval/tasks/kormedmcqa/_kormedmcqa.yaml +11 -0
  37. scripts/yans/lm-evaluation-harness/lm_eval/tasks/kormedmcqa/kormedmcqa_doctor.yaml +26 -0
  38. scripts/yans/lm-evaluation-harness/lm_eval/tasks/kormedmcqa/kormedmcqa_nurse.yaml +26 -0
  39. scripts/yans/lm-evaluation-harness/lm_eval/tasks/kormedmcqa/kormedmcqa_pharm.yaml +26 -0
  40. scripts/yans/lm-evaluation-harness/lm_eval/tasks/logiqa2/README.md +52 -0
  41. scripts/yans/lm-evaluation-harness/lm_eval/tasks/logiqa2/logieval.yaml +29 -0
  42. scripts/yans/lm-evaluation-harness/lm_eval/tasks/logiqa2/logiqa2.yaml +21 -0
  43. scripts/yans/lm-evaluation-harness/lm_eval/tasks/logiqa2/utils_logiqa2.py +27 -0
  44. scripts/yans/lm-evaluation-harness/lm_eval/tasks/openbookqa/README.md +54 -0
  45. scripts/yans/lm-evaluation-harness/lm_eval/tasks/openbookqa/openbookqa.yaml +21 -0
  46. scripts/yans/lm-evaluation-harness/lm_eval/tasks/scrolls/README.md +31 -0
  47. scripts/yans/lm-evaluation-harness/lm_eval/tasks/scrolls/scrolls_contractnli.yaml +3 -0
  48. scripts/yans/lm-evaluation-harness/lm_eval/tasks/scrolls/scrolls_govreport.yaml +3 -0
  49. scripts/yans/lm-evaluation-harness/lm_eval/tasks/scrolls/scrolls_narrativeqa.yaml +3 -0
  50. scripts/yans/lm-evaluation-harness/lm_eval/tasks/scrolls/scrolls_qasper.yaml +3 -0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/README.md ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # BertaQA
2
+
3
+ ### Paper
4
+
5
+ Title: BertaQA: How Much Do Language Models Know About Local Culture?
6
+
7
+ Abstract: https://arxiv.org/abs/2406.07302
8
+
9
+ Large Language Models (LLMs) exhibit extensive knowledge about the world, but most evaluations have been limited to global or anglocentric subjects. This raises the question of how well these models perform on topics relevant to other cultures, whose presence on the web is not that prominent. To address this gap, we introduce BertaQA, a multiple-choice trivia dataset that is parallel in English and Basque. The dataset consists of a local subset with questions pertinent to the Basque culture, and a global subset with questions of broader interest. We find that state-of-the-art LLMs struggle with local cultural knowledge, even as they excel on global topics. However, we show that continued pre-training in Basque significantly improves the models' performance on Basque culture, even when queried in English. To our knowledge, this is the first solid evidence of knowledge transfer from a low-resource to a high-resource language. Our analysis sheds light on the complex interplay between language and knowledge, and reveals that some prior findings do not fully hold when reassessed on local topics. Our dataset and evaluation code are available under open licenses at https://github.com/juletx/BertaQA.
10
+
11
+ Homepage: https://github.com/juletx/BertaQA
12
+
13
+ ### Citation
14
+
15
+ ```
16
+ @misc{etxaniz2024bertaqa,
17
+ title={BertaQA: How Much Do Language Models Know About Local Culture?},
18
+ author={Julen Etxaniz and Gorka Azkune and Aitor Soroa and Oier Lopez de Lacalle and Mikel Artetxe},
19
+ year={2024},
20
+ eprint={2406.07302},
21
+ archivePrefix={arXiv},
22
+ primaryClass={cs.CL}
23
+ }
24
+ ```
25
+
26
+ ### Groups and Tasks
27
+
28
+ #### Groups
29
+
30
+ - `bertaqa`: Group of BertaQA tasks.
31
+
32
+ #### Tasks
33
+
34
+ - `bertaqa_eu`: Trivia questions in Basque.
35
+ - `bertaqa_en`: Trivia questions in English, human-translated from Basque.
36
+ - `bertaqa_en_mt_*`: Trivia questions in English, machine-translated from Basque with different models.
37
+
38
+ ### Checklist
39
+
40
+ For adding novel benchmarks/datasets to the library:
41
+
42
+ - [ ] Is the task an existing benchmark in the literature?
43
+ - [ ] Have you referenced the original paper that introduced the task?
44
+ - [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
45
+
46
+ If other tasks on this dataset are already supported:
47
+
48
+ - [ ] Is the "Main" variant of this task clearly denoted?
49
+ - [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
50
+ - [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/_bertaqa_template ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tag: bertaqa
2
+ dataset_path: HiTZ/BertaQA
3
+ dataset_name: null
4
+ validation_split: null
5
+ test_split: test
6
+ fewshot_split: test
7
+ output_type: multiple_choice
8
+ doc_to_choice: ["A", "B", "C"]
9
+ doc_to_target: answer
10
+ metric_list:
11
+ - metric: acc
12
+ aggregation: mean
13
+ higher_is_better: true
14
+ metadata:
15
+ version: 0.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_en.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ task: bertaqa_en
2
+ include: _bertaqa_template
3
+ dataset_name: en
4
+ doc_to_text: "Question: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nAnswer:"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_en_mt_gemma-7b.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ task: bertaqa_en_mt_gemma-7b
2
+ include: _bertaqa_template
3
+ dataset_name: en_mt_gemma-7b
4
+ doc_to_text: "Question: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nAnswer:"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_en_mt_hitz.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ task: bertaqa_en_mt_hitz
2
+ include: _bertaqa_template
3
+ dataset_name: en_mt_hitz
4
+ doc_to_text: "Question: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nAnswer:"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_en_mt_itzuli.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ task: bertaqa_en_mt_itzuli
2
+ include: _bertaqa_template
3
+ dataset_name: en_mt_itzuli
4
+ doc_to_text: "Question: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nAnswer:"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_en_mt_latxa-13b-v1.1.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ task: bertaqa_en_mt_latxa-13b-v1.1
2
+ include: _bertaqa_template
3
+ dataset_name: en_mt_latxa-13b-v1.1
4
+ doc_to_text: "Question: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nAnswer:"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_en_mt_latxa-13b-v1.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ task: bertaqa_en_mt_latxa-13b-v1
2
+ include: _bertaqa_template
3
+ dataset_name: en_mt_latxa-13b-v1
4
+ doc_to_text: "Question: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nAnswer:"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_en_mt_latxa-70b-v1.1.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ task: bertaqa_en_mt_latxa-70b-v1.1
2
+ include: _bertaqa_template
3
+ dataset_name: en_mt_latxa-70b-v1.1
4
+ doc_to_text: "Question: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nAnswer:"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_en_mt_latxa-70b-v1.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ task: bertaqa_en_mt_latxa-70b-v1
2
+ include: _bertaqa_template
3
+ dataset_name: en_mt_latxa-70b-v1
4
+ doc_to_text: "Question: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nAnswer:"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_en_mt_latxa-7b-v1.1.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ task: bertaqa_en_mt_latxa-7b-v1.1
2
+ include: _bertaqa_template
3
+ dataset_name: en_mt_latxa-7b-v1.1
4
+ doc_to_text: "Question: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nAnswer:"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_en_mt_latxa-7b-v1.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ task: bertaqa_en_mt_latxa-7b-v1
2
+ include: _bertaqa_template
3
+ dataset_name: en_mt_latxa-7b-v1
4
+ doc_to_text: "Question: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nAnswer:"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_en_mt_llama-2-13b.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ task: bertaqa_en_mt_llama-2-13b
2
+ include: _bertaqa_template
3
+ dataset_name: en_mt_llama-2-13b
4
+ doc_to_text: "Question: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nAnswer:"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_en_mt_llama-2-70b.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ task: bertaqa_en_mt_llama-2-70b
2
+ include: _bertaqa_template
3
+ dataset_name: en_mt_llama-2-70b
4
+ doc_to_text: "Question: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nAnswer:"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_en_mt_llama-2-7b.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ task: bertaqa_en_mt_llama-2-7b
2
+ include: _bertaqa_template
3
+ dataset_name: en_mt_llama-2-7b
4
+ doc_to_text: "Question: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nAnswer:"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_en_mt_madlad.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ task: bertaqa_en_mt_madlad
2
+ include: _bertaqa_template
3
+ dataset_name: en_mt_madlad
4
+ doc_to_text: "Question: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nAnswer:"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_en_mt_nllb.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ task: bertaqa_en_mt_nllb
2
+ include: _bertaqa_template
3
+ dataset_name: en_mt_nllb
4
+ doc_to_text: "Question: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nAnswer:"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/bertaqa/bertaqa_eu.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ task: bertaqa_eu
2
+ include: _bertaqa_template
3
+ dataset_name: eu
4
+ doc_to_text: "Galdera: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nErantzuna:"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/_generate_configs.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Take in a YAML, and output all other splits with this YAML

Reads a base task YAML (``--base_yaml_path``) and, for every CMMLU subject,
writes a per-subject task YAML that ``include``s it, plus one group YAML
aggregating all subject tasks (see the ``__main__`` block below).
"""

import argparse
import os

import yaml
from tqdm import tqdm

from lm_eval.utils import eval_logger


# Maps each CMMLU subject identifier (used as the dataset config name and
# as the task-name suffix) to its Chinese display name, which is
# interpolated into the generated task's prompt description.
SUBJECTS = {
    "agronomy": "农学",
    "anatomy": "解剖学",
    "ancient_chinese": "古汉语",
    "arts": "艺术学",
    "astronomy": "天文学",
    "business_ethics": "商业伦理",
    "chinese_civil_service_exam": "中国公务员考试",
    "chinese_driving_rule": "中国驾驶规则",
    "chinese_food_culture": "中国饮食文化",
    "chinese_foreign_policy": "中国外交政策",
    "chinese_history": "中国历史",
    "chinese_literature": "中国文学",
    "chinese_teacher_qualification": "中国教师资格",
    "clinical_knowledge": "临床知识",
    "college_actuarial_science": "大学精算学",
    "college_education": "大学教育学",
    "college_engineering_hydrology": "大学工程水文学",
    "college_law": "大学法律",
    "college_mathematics": "大学数学",
    "college_medical_statistics": "大学医学统计",
    "college_medicine": "大学医学",
    "computer_science": "计算机科学",
    "computer_security": "计算机安全",
    "conceptual_physics": "概念物理学",
    "construction_project_management": "建设工程管理",
    "economics": "经济学",
    "education": "教育学",
    "electrical_engineering": "电气工程",
    "elementary_chinese": "小学语文",
    "elementary_commonsense": "小学常识",
    "elementary_information_and_technology": "小学信息技术",
    "elementary_mathematics": "初等数学",
    "ethnology": "民族学",
    "food_science": "食品科学",
    "genetics": "遗传学",
    "global_facts": "全球事实",
    "high_school_biology": "高中生物",
    "high_school_chemistry": "高中化学",
    "high_school_geography": "高中地理",
    "high_school_mathematics": "高中数学",
    "high_school_physics": "高中物理学",
    "high_school_politics": "高中政治",
    "human_sexuality": "人类性行为",
    "international_law": "国际法学",
    "journalism": "新闻学",
    "jurisprudence": "法理学",
    "legal_and_moral_basis": "法律与道德基础",
    "logical": "逻辑学",
    "machine_learning": "机器学习",
    "management": "管理学",
    "marketing": "市场营销",
    "marxist_theory": "马克思主义理论",
    "modern_chinese": "现代汉语",
    "nutrition": "营养学",
    "philosophy": "哲学",
    "professional_accounting": "专业会计",
    "professional_law": "专业法学",
    "professional_medicine": "专业医学",
    "professional_psychology": "专业心理学",
    "public_relations": "公共关系",
    "security_study": "安全研究",
    "sociology": "社会学",
    "sports_science": "体育学",
    "traditional_chinese_medicine": "中医中药",
    "virology": "病毒学",
    "world_history": "世界历史",
    "world_religions": "世界宗教",
}
84
+
def parse_args() -> argparse.Namespace:
    """Parse command-line options for the CMMLU config generator.

    Returns:
        argparse.Namespace: Parsed options with attributes
            ``base_yaml_path`` (str, required), ``save_prefix_path`` (str,
            default ``"cmmlu"``), ``cot_prompt_path`` (str | None) and
            ``task_prefix`` (str, default ``""``).
    """
    parser = argparse.ArgumentParser(
        description="Generate per-subject CMMLU task YAMLs from a base YAML."
    )
    parser.add_argument(
        "--base_yaml_path",
        required=True,
        help="Path to the base YAML that every generated config will include.",
    )
    parser.add_argument(
        "--save_prefix_path",
        default="cmmlu",
        help="Filename prefix for the generated per-subject YAML files.",
    )
    parser.add_argument(
        "--cot_prompt_path",
        default=None,
        help="Optional JSON file mapping subject names to chain-of-thought "
        "prompts used as task descriptions.",
    )
    parser.add_argument(
        "--task_prefix",
        default="",
        help="Optional token inserted into task names: cmmlu_<prefix>_<subject>.",
    )
    return parser.parse_args()
92
+
93
+
def _task_name(task_prefix: str, subject: str) -> str:
    """Task name for a subject, with the optional user prefix inserted."""
    if task_prefix != "":
        return f"cmmlu_{task_prefix}_{subject}"
    return f"cmmlu_{subject}"


def main() -> None:
    """Emit one YAML per CMMLU subject plus an aggregating group YAML."""
    args = parse_args()

    # Filename (sans directory) of the base YAML so generated configs can
    # `include` it by relative name.
    base_yaml_name = os.path.split(args.base_yaml_path)[-1]
    with open(args.base_yaml_path, encoding="utf-8") as f:
        # Loaded only to verify the base YAML parses; the generated files
        # reference it via `include` rather than embedding its contents.
        base_yaml = yaml.full_load(f)  # noqa: F841

    if args.cot_prompt_path is not None:
        import json

        with open(args.cot_prompt_path, encoding="utf-8") as f:
            cot_file = json.load(f)

    # Shared yaml.dump settings: no line wrapping, keep CJK text readable,
    # force double-quoted scalars.
    dump_kwargs = {
        "width": float("inf"),
        "allow_unicode": True,
        "default_style": '"',
    }

    for subject_eng, subject_zh in tqdm(SUBJECTS.items()):
        if args.cot_prompt_path is not None:
            # Chain-of-thought description supplied per subject.
            description = cot_file[subject_eng]
        else:
            description = (
                f"以下是关于{subject_zh}的单项选择题,请直接给出正确答案的选项。\n\n"
            )

        subject_yaml = {
            "include": base_yaml_name,
            "task": _task_name(args.task_prefix, subject_eng),
            "dataset_name": subject_eng,
            "description": description,
        }

        file_save_path = args.save_prefix_path + f"_{subject_eng}.yaml"
        eval_logger.info(f"Saving yaml for subset {subject_eng} to {file_save_path}")
        with open(file_save_path, "w", encoding="utf-8") as yaml_file:
            yaml.dump(subject_yaml, yaml_file, **dump_kwargs)

    # Group config aggregating every per-subject task.
    group_yaml_dict = {
        "group": "cmmlu",
        "task": [_task_name(args.task_prefix, subject) for subject in SUBJECTS],
        "aggregate_metric_list": [
            {"metric": "acc", "aggregation": "mean", "weight_by_size": True},
            {"metric": "acc_norm", "aggregation": "mean", "weight_by_size": True},
        ],
        "metadata": {"version": 0.0},
    }

    group_save_path = "_" + args.save_prefix_path + ".yaml"
    with open(group_save_path, "w", encoding="utf-8") as group_yaml_file:
        yaml.dump(group_yaml_dict, group_yaml_file, **dump_kwargs)


if __name__ == "__main__":
    main()
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_agronomy.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ "dataset_name": "agronomy"
2
+ "description": "以下是关于农学的单项选择题,请直接给出正确答案的选项。\n\n"
3
+ "include": "_default_template_yaml"
4
+ "task": "cmmlu_agronomy"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_chinese_food_culture.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ "dataset_name": "chinese_food_culture"
2
+ "description": "以下是关于中国饮食文化的单项选择题,请直接给出正确答案的选项。\n\n"
3
+ "include": "_default_template_yaml"
4
+ "task": "cmmlu_chinese_food_culture"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_conceptual_physics.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ "dataset_name": "conceptual_physics"
2
+ "description": "以下是关于概念物理学的单项选择题,请直接给出正确答案的选项。\n\n"
3
+ "include": "_default_template_yaml"
4
+ "task": "cmmlu_conceptual_physics"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_business_ethics.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ "dataset_name": "business_ethics"
2
+ "description": "以下是关于商业伦理的单项选择题,请直接给出正确答案的选项。\n\n"
3
+ "include": "_default_template_yaml"
4
+ "task": "cmmlu_business_ethics"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_chinese_foreign_policy.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ "dataset_name": "chinese_foreign_policy"
2
+ "description": "以下是关于中国外交政策的单项选择题,请直接给出正确答案的选项。\n\n"
3
+ "include": "_default_template_yaml"
4
+ "task": "cmmlu_chinese_foreign_policy"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_chinese_teacher_qualification.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ "dataset_name": "chinese_teacher_qualification"
2
+ "description": "以下是关于中国教师资格的单项选择题,请直接给出正确答案的选项。\n\n"
3
+ "include": "_default_template_yaml"
4
+ "task": "cmmlu_chinese_teacher_qualification"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_construction_project_management.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ "dataset_name": "construction_project_management"
2
+ "description": "以下是关于建设工程管理的单项选择题,请直接给出正确答案的选项。\n\n"
3
+ "include": "_default_template_yaml"
4
+ "task": "cmmlu_construction_project_management"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_high_school_geography.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ "dataset_name": "high_school_geography"
2
+ "description": "以下是关于高中地理的单项选择题,请直接给出正确答案的选项。\n\n"
3
+ "include": "_default_template_yaml"
4
+ "task": "cmmlu_high_school_geography"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_jurisprudence.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ "dataset_name": "jurisprudence"
2
+ "description": "以下是关于法理学的单项选择题,请直接给出正确答案的选项。\n\n"
3
+ "include": "_default_template_yaml"
4
+ "task": "cmmlu_jurisprudence"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_global_facts.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ "dataset_name": "global_facts"
2
+ "description": "以下是关于全球事实的单项选择题,请直接给出正确答案的选项。\n\n"
3
+ "include": "_default_template_yaml"
4
+ "task": "cmmlu_global_facts"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_human_sexuality.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ "dataset_name": "human_sexuality"
2
+ "description": "以下是关于人类性行为的单项选择题,请直接给出正确答案的选项。\n\n"
3
+ "include": "_default_template_yaml"
4
+ "task": "cmmlu_human_sexuality"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_management.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ "dataset_name": "management"
2
+ "description": "以下是关于管理学的单项选择题,请直接给出正确答案的选项。\n\n"
3
+ "include": "_default_template_yaml"
4
+ "task": "cmmlu_management"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/eus_trivia/README.md ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # EusTrivia
2
+
3
+ ### Paper
4
+
5
+ Title: Latxa: An Open Language Model and Evaluation Suite for Basque
6
+
7
+ Abstract: https://arxiv.org/abs/2403.20266
8
+
9
+ EusTrivia consists of 1,715 trivia questions from multiple online sources. 56.3% of the questions are elementary level (grades 3-6), while the rest are considered challenging. A significant portion of the questions focus specifically on the Basque Country, its language and culture. Each multiple-choice question contains two, three or four choices (3.84 on average) and a single correct answer. Five areas of knowledge are covered:
10
+
11
+ - **Humanities and Natural Sciences** (27.8%): This category encompasses questions about history, geography, biology, ecology and other social and natural sciences.
12
+ - **Leisure and Art** (24.5%): This category includes questions on sports and athletes, performative and plastic arts and artists, architecture, cultural events, and related topics.
13
+ - **Music** (16.0%): Here are grouped all the questions about music and musicians, both classical and contemporary.
14
+ - **Language and Literature** (17.1%): This category is concerned with all kinds of literature productions and writers, as well as metalinguistic questions (e.g., definitions, synonyms, and word usage).
15
+ - **Mathematics and ICT** (14.5%): This category covers mathematical problems and questions about ICT, as well as questions about people known for their contributions to these fields of knowledge.
16
+
17
+ Homepage: https://github.com/hitz-zentroa/latxa
18
+
19
+
20
+ ### Citation
21
+
22
+ ```
23
+ @misc{etxaniz2024latxa,
24
+ title={Latxa: An Open Language Model and Evaluation Suite for Basque},
25
+ author={Julen Etxaniz and Oscar Sainz and Naiara Perez and Itziar Aldabe and German Rigau and Eneko Agirre and Aitor Ormazabal and Mikel Artetxe and Aitor Soroa},
26
+ year={2024},
27
+ eprint={2403.20266},
28
+ archivePrefix={arXiv},
29
+ primaryClass={cs.CL}
30
+ }
31
+ ```
32
+
33
+ ### Groups and Tasks
34
+
35
+ #### Groups
36
+
37
+ There are no groups.
38
+
39
+ #### Tasks
40
+
41
+ * `eus_trivia`: EusTrivia consists of 1,715 trivia questions from multiple online sources.
42
+
43
+ ### Checklist
44
+
45
+ For adding novel benchmarks/datasets to the library:
46
+ * [ ] Is the task an existing benchmark in the literature?
47
+ * [ ] Have you referenced the original paper that introduced the task?
48
+ * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
49
+
50
+
51
+ If other tasks on this dataset are already supported:
52
+ * [ ] Is the "Main" variant of this task clearly denoted?
53
+ * [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
54
+ * [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
scripts/yans/lm-evaluation-harness/lm_eval/tasks/eus_trivia/eus_trivia.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_path: HiTZ/EusTrivia
2
+ dataset_name: default
3
+ task: eus_trivia
4
+ doc_to_text: !function utils.doc_to_text
5
+ doc_to_choice: !function utils.doc_to_choice
6
+ validation_split: null
7
+ test_split: test
8
+ fewshot_split: test
9
+ output_type: multiple_choice
10
+ doc_to_target: answer
11
+ metric_list:
12
+ - metric: acc
13
+ aggregation: mean
14
+ higher_is_better: true
15
+ metadata:
16
+ version: 0.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/eus_trivia/utils.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from typing import List


# Multiple-choice answer labels. EusTrivia questions carry between two and
# four candidate answers, so at most the first four letters are ever used.
letters = ["A", "B", "C", "D"]
5
+
6
+
def doc_to_text(doc) -> str:
    """Render a trivia document as a Basque multiple-choice prompt.

    Args:
        doc (dict): Document with at least a ``question`` string and a
            ``candidates`` list of answer strings (two to four entries).

    Returns:
        str: Prompt of the form ``Galdera: <question>`` followed by one
        ``<letter>: <candidate>`` line per candidate and a trailing
        ``Erantzuna:`` cue.

    Raises:
        ValueError: If fewer than two candidates are present.
    """
    options = doc["candidates"]
    if len(options) < 2:
        raise ValueError("Invalid number of candidates")
    # Pair each candidate with its letter label; zip caps at four labels.
    lines = [f"Galdera: {doc['question']}"]
    for label, option in zip("ABCD", options):
        lines.append(f"{label}: {option}")
    lines.append("Erantzuna:")
    return "\n".join(lines)
26
+
27
+
def doc_to_choice(doc) -> List[str]:
    """Return the letter labels valid for this document's candidates.

    Args:
        doc (dict): Document with a ``candidates`` list (two to four
            entries).

    Returns:
        List[str]: The first ``len(candidates)`` letters of ``A``-``D``.

    Raises:
        ValueError: If fewer than two candidates are present.
    """
    count = len(doc["candidates"])
    if count < 2:
        raise ValueError("Invalid number of candidates")
    return list("ABCD")[:count]
scripts/yans/lm-evaluation-harness/lm_eval/tasks/kormedmcqa/README.md ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # KorMedMCQA
2
+
3
+ ### Paper
4
+
5
+ Title: `KorMedMCQA: Multi-Choice Question Answering Benchmark for Korean Healthcare Professional Licensing Examinations`
6
+
7
+ Abstract: `We introduce KorMedMCQA, the first Korean multiple-choice question answering (MCQA) benchmark derived from Korean healthcare professional licensing examinations, covering from the year 2012 to year 2023. This dataset consists of a selection of questions from the license examinations for doctors, nurses, and pharmacists, featuring a diverse array of subjects. We conduct baseline experiments on various large language models, including proprietary/open-source, multilingual/Korean-additional pretrained, and clinical context pretrained models, highlighting the potential for further enhancements. We make our data publicly available on HuggingFace and provide an evaluation script via LM-Harness, inviting further exploration and advancement in Korean healthcare environments.`
8
+
9
+
10
+ Paper : https://arxiv.org/abs/2403.01469
11
+
12
+ Homepage: https://huggingface.co/datasets/sean0042/KorMedMCQA
13
+
14
+
15
+ ### Citation
16
+
17
+ ```
18
+ @article{kweon2024kormedmcqa,
19
+ title={KorMedMCQA: Multi-Choice Question Answering Benchmark for Korean Healthcare Professional Licensing Examinations},
20
+ author={Sunjun Kweon and Byungjin Choi and Minkyu Kim and Rae Woong Park and Edward Choi},
21
+ journal={arXiv preprint arXiv:2403.01469},
22
+ year={2024}
23
+ }
24
+ ```
25
+
26
+ ### Groups and Tasks
27
+
28
+ * `kormedmcqa`: Runs `kormedmcqa_doctor`, `kormedmcqa_nurse`, and `kormedmcqa_pharm`.
29
+
30
+ #### Tasks
31
+
32
+ * `kormedmcqa_doctor`: `Official Korean Doctor Examination`
33
+ * `kormedmcqa_nurse`: `Official Korean Nurse Examination`
34
+ * `kormedmcqa_pharm`: `Official Korean Pharmacist Examination`
35
+
36
+ ### Checklist
37
+
38
+ For adding novel benchmarks/datasets to the library:
39
+ * [x] Is the task an existing benchmark in the literature?
40
+ * [x] Have you referenced the original paper that introduced the task?
41
+ * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
42
+
43
+
44
+ If other tasks on this dataset are already supported:
45
+ * [ ] Is the "Main" variant of this task clearly denoted?
46
+ * [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
47
+ * [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
scripts/yans/lm-evaluation-harness/lm_eval/tasks/kormedmcqa/_kormedmcqa.yaml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ group: kormedmcqa
2
+ task:
3
+ - kormedmcqa_doctor
4
+ - kormedmcqa_nurse
5
+ - kormedmcqa_pharm
6
+ aggregate_metric_list:
7
+ - metric: exact_match
8
+ aggregation: mean
9
+ weight_by_size: true
10
+ metadata:
11
+ version: 0.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/kormedmcqa/kormedmcqa_doctor.yaml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task : kormedmcqa_doctor
2
+ dataset_path : sean0042/KorMedMCQA
3
+ dataset_name : doctor
4
+ test_split : test
5
+ fewshot_split : dev
6
+ fewshot_config:
7
+ sampler: first_n
8
+ output_type: generate_until
9
+ doc_to_text: "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nE. {{E}}\n정답:"
10
+ doc_to_target: "{{['A', 'B', 'C', 'D', 'E'][answer-1]}}"
11
+ metric_list:
12
+ - metric: exact_match
13
+ aggregation: mean
14
+ higher_is_better: true
15
+ ignore_case: true
16
+ ignore_punctuation: true
17
+ regexes_to_ignore:
18
+ - " "
19
+ generation_kwargs:
20
+ until:
21
+ - "Q:"
22
+ - "\n\n"
23
+ - "</s>"
24
+ - "."
25
+ do_sample: false
26
+ temperature: 0.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/kormedmcqa/kormedmcqa_nurse.yaml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task : kormedmcqa_nurse
2
+ dataset_path : sean0042/KorMedMCQA
3
+ dataset_name : nurse
4
+ test_split : test
5
+ fewshot_split : dev
6
+ fewshot_config:
7
+ sampler: first_n
8
+ output_type: generate_until
9
+ doc_to_text: "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nE. {{E}}\n정답:"
10
+ doc_to_target: "{{['A', 'B', 'C', 'D', 'E'][answer-1]}}"
11
+ metric_list:
12
+ - metric: exact_match
13
+ aggregation: mean
14
+ higher_is_better: true
15
+ ignore_case: true
16
+ ignore_punctuation: true
17
+ regexes_to_ignore:
18
+ - " "
19
+ generation_kwargs:
20
+ until:
21
+ - "Q:"
22
+ - "\n\n"
23
+ - "</s>"
24
+ - "."
25
+ do_sample: false
26
+ temperature: 0.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/kormedmcqa/kormedmcqa_pharm.yaml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task : kormedmcqa_pharm
2
+ dataset_path : sean0042/KorMedMCQA
3
+ dataset_name : pharm
4
+ test_split : test
5
+ fewshot_split : dev
6
+ fewshot_config:
7
+ sampler: first_n
8
+ output_type: generate_until
9
+ doc_to_text: "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nE. {{E}}\n정답:"
10
+ doc_to_target: "{{['A', 'B', 'C', 'D', 'E'][answer-1]}}"
11
+ metric_list:
12
+ - metric: exact_match
13
+ aggregation: mean
14
+ higher_is_better: true
15
+ ignore_case: true
16
+ ignore_punctuation: true
17
+ regexes_to_ignore:
18
+ - " "
19
+ generation_kwargs:
20
+ until:
21
+ - "Q:"
22
+ - "\n\n"
23
+ - "</s>"
24
+ - "."
25
+ do_sample: false
26
+ temperature: 0.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/logiqa2/README.md ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # LogiQA 2.0
2
+
3
+ ### Paper
4
+
5
+ LogiQA 2.0 — An Improved Dataset for Logical Reasoning in Natural Language Understanding https://ieeexplore.ieee.org/document/10174688
6
+
7
+
8
+ The dataset is an amendment and re-annotation of LogiQA in 2020, a large-scale logical reasoning reading comprehension dataset adapted from the Chinese Civil Service Examination. This new version has an increased data size, the texts are refined with manual translation by professionals, and improved by removing items with distinctive cultural features like Chinese idioms.
9
+
10
+ Furthermore, a two-way natural language inference (NLI) task is introduced, resulting in 35k premise-hypothesis pairs with gold labels, making it the first large-scale NLI dataset for complex logical reasoning.
11
+
12
+ Homepage: https://github.com/csitfun/LogiQA2.0
13
+
14
+ ### Citation
15
+
16
+ ```bibtex
17
+ @ARTICLE{10174688,
18
+ author={Liu, Hanmeng and Liu, Jian and Cui, Leyang and Teng, Zhiyang and Duan, Nan and Zhou, Ming and Zhang, Yue},
19
+ journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing},
20
+ title={LogiQA 2.0 — An Improved Dataset for Logical Reasoning in Natural Language Understanding},
21
+ year={2023},
22
+ volume={},
23
+ number={},
24
+ pages={1-16},
25
+ doi={10.1109/TASLP.2023.3293046}}
26
+ ```
27
+
28
+ ### Groups and Tasks
29
+
30
+ #### Groups
31
+
32
+ * Not part of a group yet
33
+
34
+ #### Tasks
35
+
36
+ * `logiqa2_zh`: The original dataset in Chinese.
37
+ * `logiqa2_NLI`: The NLI version of the dataset converted from the MRC version.
38
+ * `logieval`: Prompt based; https://github.com/csitfun/LogiEval
39
+
40
+ NOTE! The subtasks have not been verified yet.
41
+
42
+ ### Checklist
43
+
44
+ * [x] Is the task an existing benchmark in the literature?
45
+ * [x] Have you referenced the original paper that introduced the task?
46
+ * [x] If yes, does the original paper provide a reference implementation?
47
+ * [x] The original paper does not. There is another implementation of this task, but it is designed for instruction-tuned models: https://github.com/csitfun/LogiEval
48
+
49
+ If other tasks on this dataset are already supported:
50
+ * [x] Is the "Main" variant of this task clearly denoted?
51
+ * [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
52
+ * [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
scripts/yans/lm-evaluation-harness/lm_eval/tasks/logiqa2/logieval.yaml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: logieval
2
+ dataset_path: baber/logiqa2
3
+ dataset_name: logieval
4
+ output_type: generate_until
5
+ training_split: train
6
+ test_split: test
7
+ # Instructions + {content}
8
+ doc_to_text: "Instructions: You will be presented with a passage and a question about that passage. There are four options to be chosen from, you need to choose the only correct option to answer that question. If the first option is right, you generate the answer 'A', if the second option is right, you generate the answer 'B', if the third option is right, you generate the answer 'C', if the fourth option is right, you generate the answer 'D'. Read the question and options thoroughly and select the correct answer from the four answer labels. Read the passage thoroughly to ensure you know what the passage entails.\n{{content}}"
9
+ doc_to_target: "{{ideal}}"
10
+ metric_list:
11
+ - metric: exact_match
12
+ aggregation: mean
13
+ higher_is_better: true
14
+ generation_kwargs:
15
+ do_sample: false
16
+ num_fewshot: 1
17
+ filter_list:
18
+ - name: "get-answer"
19
+ filter:
20
+ - function: "regex"
21
+ # starts with A-D excluding leading spaces
22
+ # original implementation uses a.startswith(b)
23
+ # https://github.com/openai/evals/blob/305b237cdb3884c7ddb6a5d12cb184a83551fcba/evals/api.py#L84
24
+ regex_pattern: "^\\s*([A-D])"
25
+ - function: "take_first"
26
+ metadata:
27
+ version: 0.0
28
+ dataset_kwargs:
29
+ trust_remote_code: true
scripts/yans/lm-evaluation-harness/lm_eval/tasks/logiqa2/logiqa2.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: logiqa2
2
+ dataset_path: baber/logiqa2
3
+ dataset_name: logiqa2
4
+ output_type: multiple_choice
5
+ training_split: train
6
+ validation_split: validation
7
+ test_split: test
8
+ doc_to_choice: "{{options}}"
9
+ doc_to_text: !function utils_logiqa2.doc_to_text
10
+ doc_to_target: "{{answer}}"
11
+ doc_to_decontamination_query: "{{context}}"
12
+ should_decontaminate: false
13
+ metric_list:
14
+ - metric: acc
15
+ aggregation: mean
16
+ higher_is_better: true
17
+ - metric: acc_norm
18
+ aggregation: mean
19
+ higher_is_better: true
20
+ metadata:
21
+ version: 0.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/logiqa2/utils_logiqa2.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copied from Master
def doc_to_text(doc) -> str:
    """Render a LogiQA 2.0 example as a multiple-choice prompt.

    Produces:
        Passage: <passage>
        Question: <question>
        A. <choice1>
        B. <choice2>
        C. <choice3>
        D. <choice4>
        Answer:
    """
    header = f"Passage: {doc['text']}\nQuestion: {doc['question']}\n"
    # Pair each option with its uppercase letter label; extra options
    # beyond four are dropped, matching the original zip behavior.
    body = "".join(
        f"{label}. {option}\n" for label, option in zip("ABCD", doc["options"])
    )
    return header + body + "Answer:"


# # https://github.com/csitfun/LogiQA2.0/blob/main/logiqa2nli/nli-prompt.py
# def doc_to_textNLI(doc):
#     maj_premise = ' '.join(list(doc['major_premise']))
#     min_premise = ' '.join(list(doc['minor_premise']))
#     hypo = doc['conclusion']
#     prompt_input = "Given the fact: " + maj_premise + ' ' + min_premise + " Does it follow that: " + hypo + " Yes or no?"
#     return prompt_input
scripts/yans/lm-evaluation-harness/lm_eval/tasks/openbookqa/README.md ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # OpenBookQA
2
+
3
+ ### Paper
4
+
5
+ Title: `Can a Suit of Armor Conduct Electricity? A New Dataset for Open Book Question Answering`
6
+
7
+ Abstract: https://arxiv.org/abs/1809.02789
8
+
9
+ OpenBookQA is a question-answering dataset modeled after open book exams for
10
+ assessing human understanding of a subject. It consists of 5,957 multiple-choice
11
+ elementary-level science questions (4,957 train, 500 dev, 500 test), which probe
12
+ the understanding of a small “book” of 1,326 core science facts and the application
13
+ of these facts to novel situations. For training, the dataset includes a mapping
14
+ from each question to the core science fact it was designed to probe. Answering
15
+ OpenBookQA questions requires additional broad common knowledge, not contained
16
+ in the book. The questions, by design, are answered incorrectly by both a retrieval-
17
+ based algorithm and a word co-occurrence algorithm.
18
+
19
+ Homepage: https://allenai.org/data/open-book-qa
20
+
21
+
22
+ ### Citation
23
+
24
+ ```
25
+ @inproceedings{OpenBookQA2018,
26
+ title={Can a Suit of Armor Conduct Electricity? A New Dataset for Open Book Question Answering},
27
+ author={Todor Mihaylov and Peter Clark and Tushar Khot and Ashish Sabharwal},
28
+ booktitle={EMNLP},
29
+ year={2018}
30
+ }
31
+ ```
32
+
33
+ ### Groups and Tasks
34
+
35
+ #### Groups
36
+
37
+ * Not part of a group yet
38
+
39
+ #### Tasks
40
+
41
+ * `openbookqa`
42
+
43
+ ### Checklist
44
+
45
+ For adding novel benchmarks/datasets to the library:
46
+ * [ ] Is the task an existing benchmark in the literature?
47
+ * [ ] Have you referenced the original paper that introduced the task?
48
+ * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
49
+
50
+
51
+ If other tasks on this dataset are already supported:
52
+ * [ ] Is the "Main" variant of this task clearly denoted?
53
+ * [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
54
+ * [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
scripts/yans/lm-evaluation-harness/lm_eval/tasks/openbookqa/openbookqa.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: openbookqa
2
+ dataset_path: openbookqa
3
+ dataset_name: main
4
+ output_type: multiple_choice
5
+ training_split: train
6
+ validation_split: validation
7
+ test_split: test
8
+ doc_to_text: question_stem
9
+ doc_to_target: "{{choices.label.index(answerKey.lstrip())}}"
10
+ doc_to_choice: "{{choices.text}}"
11
+ should_decontaminate: true
12
+ doc_to_decontamination_query: question_stem
13
+ metric_list:
14
+ - metric: acc
15
+ aggregation: mean
16
+ higher_is_better: true
17
+ - metric: acc_norm
18
+ aggregation: mean
19
+ higher_is_better: true
20
+ metadata:
21
+ version: 1.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/scrolls/README.md ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SCROLLS: Standardized CompaRison Over Long Language Sequences
3
+ https://arxiv.org/abs/2201.03533
4
+
5
+ SCROLLS is a suite of datasets that require synthesizing information over long texts.
6
+ The benchmark includes seven natural language tasks across multiple domains,
7
+ including summarization, question answering, and natural language inference.
8
+
9
+ Homepage: https://www.scrolls-benchmark.com/
10
+
11
+ Since SCROLLS tasks are generally longer than the maximum sequence length of many models,
12
+ it is possible to create "subset" tasks that contain only those samples whose tokenized length
13
+ is less than some pre-defined limit. For example, to create a subset of "Qasper" that would
14
+ be suitable for a model using the GPTNeoX tokenizer and a 4K maximum sequence length:
15
+
16
+ ```
17
+ class QasperGPTNeoX4K(Qasper):
18
+ PRUNE_TOKENIZERS = ["EleutherAI/pythia-410m-deduped"]
19
+ PRUNE_MAX_TOKENS = 4096
20
+ PRUNE_NUM_PROC = _num_cpu_cores() # optional, to speed up pruning of large datasets like NarrativeQA
21
+ ```
22
+
23
+ `PRUNE_TOKENIZERS` can contain more than one tokenizer; this will include only samples that are
24
+ less than `PRUNE_MAX_TOKENS` for ALL of the tokenizers. This can be useful for comparing models
25
+ that use different tokenizers but the same maximum sequence length.
26
+
27
+ Once the subset task class has been defined in this file, it can be used by adding the class
28
+ to `lm_eval/tasks/__init__.py`.
29
+
30
+ NOTE: GovReport may need `max_gen_toks` set larger for causal models.
31
+ """
scripts/yans/lm-evaluation-harness/lm_eval/tasks/scrolls/scrolls_contractnli.yaml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ group: scrolls
2
+ task: scrolls_contractnli
3
+ class: !function task.ContractNLI
scripts/yans/lm-evaluation-harness/lm_eval/tasks/scrolls/scrolls_govreport.yaml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ group: scrolls
2
+ task: scrolls_govreport
3
+ class: !function task.GovReport
scripts/yans/lm-evaluation-harness/lm_eval/tasks/scrolls/scrolls_narrativeqa.yaml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ group: scrolls
2
+ task: scrolls_narrativeqa
3
+ class: !function task.NarrativeQA
scripts/yans/lm-evaluation-harness/lm_eval/tasks/scrolls/scrolls_qasper.yaml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ group: scrolls
2
+ task: scrolls_qasper
3
+ class: !function task.Qasper