koichi12 committed on
Commit
a55c946
·
verified ·
1 Parent(s): 55e98b2

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. scripts/yans/lm-evaluation-harness/lm_eval/tasks/eus_reading/README.md +48 -0
  2. scripts/yans/lm-evaluation-harness/lm_eval/tasks/eus_reading/eus_reading.yaml +16 -0
  3. scripts/yans/lm-evaluation-harness/lm_eval/tasks/eus_reading/utils.py +41 -0
  4. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/_generate_configs.py +158 -0
  5. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/_cot_prompts.json +0 -0
  6. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu.yaml +32 -0
  7. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_anatomy.yaml +75 -0
  8. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_astronomy.yaml +70 -0
  9. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_business_ethics.yaml +75 -0
  10. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_clinical_knowledge.yaml +48 -0
  11. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_biology.yaml +75 -0
  12. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_chemistry.yaml +49 -0
  13. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_mathematics.yaml +73 -0
  14. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_medicine.yaml +68 -0
  15. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_physics.yaml +61 -0
  16. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_computer_security.yaml +50 -0
  17. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_conceptual_physics.yaml +49 -0
  18. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_econometrics.yaml +87 -0
  19. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_electrical_engineering.yaml +47 -0
  20. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_formal_logic.yaml +70 -0
  21. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_global_facts.yaml +49 -0
  22. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_chemistry.yaml +66 -0
  23. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_european_history.yaml +199 -0
  24. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_geography.yaml +53 -0
  25. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_government_and_politics.yaml +61 -0
  26. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_macroeconomics.yaml +53 -0
  27. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_mathematics.yaml +51 -0
  28. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_microeconomics.yaml +56 -0
  29. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_physics.yaml +50 -0
  30. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_psychology.yaml +64 -0
  31. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_statistics.yaml +81 -0
  32. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_international_law.yaml +70 -0
  33. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_jurisprudence.yaml +59 -0
  34. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_machine_learning.yaml +74 -0
  35. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_management.yaml +46 -0
  36. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_marketing.yaml +56 -0
  37. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_medical_genetics.yaml +51 -0
  38. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_miscellaneous.yaml +43 -0
  39. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_scenarios.yaml +62 -0
  40. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_nutrition.yaml +63 -0
  41. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_philosophy.yaml +44 -0
  42. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_prehistory.yaml +59 -0
  43. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_accounting.yaml +63 -0
  44. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_law.yaml +122 -0
  45. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_medicine.yaml +82 -0
  46. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_psychology.yaml +62 -0
  47. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_security_studies.yaml +104 -0
  48. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_us_foreign_policy.yaml +56 -0
  49. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_world_religions.yaml +42 -0
  50. scripts/yans/lm-evaluation-harness/lm_eval/tasks/pile/README.md +68 -0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/eus_reading/README.md ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # EusReading
2
+
3
+ ### Paper
4
+
5
+ Title: Latxa: An Open Language Model and Evaluation Suite for Basque
6
+
7
+ Abstract: https://arxiv.org/abs/2403.20266
8
+
9
+ EusReading consists of 352 reading comprehension exercises (irakurmena) sourced from the set of past EGA exams from 1998 to 2008. Each test generally has 10 multiple-choice questions, with 4 choices and a single correct answer. These exercises are more challenging than Belebele due to the complexity and length of the input texts. As a result, EusReading is useful to measure long context understanding of models.
10
+
11
+ Homepage: https://github.com/hitz-zentroa/latxa
12
+
13
+
14
+ ### Citation
15
+
16
+ ```
17
+ @misc{etxaniz2024latxa,
18
+ title={Latxa: An Open Language Model and Evaluation Suite for Basque},
19
+ author={Julen Etxaniz and Oscar Sainz and Naiara Perez and Itziar Aldabe and German Rigau and Eneko Agirre and Aitor Ormazabal and Mikel Artetxe and Aitor Soroa},
20
+ year={2024},
21
+ eprint={2403.20266},
22
+ archivePrefix={arXiv},
23
+ primaryClass={cs.CL}
24
+ }
25
+ ```
26
+
27
+ ### Groups and Tasks
28
+
29
+ #### Groups
30
+
31
+ There are no groups.
32
+
33
+ #### Tasks
34
+
35
+ * `eus_reading`: EusReading consists of 352 reading comprehension exercises (irakurmena) sourced from the set of past EGA exams from 1998 to 2008.
36
+
37
+ ### Checklist
38
+
39
+ For adding novel benchmarks/datasets to the library:
40
+ * [ ] Is the task an existing benchmark in the literature?
41
+ * [ ] Have you referenced the original paper that introduced the task?
42
+ * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
43
+
44
+
45
+ If other tasks on this dataset are already supported:
46
+ * [ ] Is the "Main" variant of this task clearly denoted?
47
+ * [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
48
+ * [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
scripts/yans/lm-evaluation-harness/lm_eval/tasks/eus_reading/eus_reading.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_path: HiTZ/EusReading
2
+ dataset_name: default
3
+ task: eus_reading
4
+ doc_to_text: !function utils.doc_to_text_context
5
+ doc_to_choice: !function utils.doc_to_choice
6
+ validation_split: null
7
+ test_split: test
8
+ fewshot_split: test
9
+ output_type: multiple_choice
10
+ doc_to_target: answer
11
+ metric_list:
12
+ - metric: acc
13
+ aggregation: mean
14
+ higher_is_better: true
15
+ metadata:
16
+ version: 0.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/eus_reading/utils.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+
3
+
4
letters = ["A", "B", "C", "D"]


def doc_to_text_context(doc) -> str:
    """
    Build the prompt text for one document.

    The prompt has the passage ("Pasartea"), the question ("Galdera"), one
    lettered line per candidate answer, and a trailing "Erantzuna:" cue.

    Args:
        doc (dict): A dictionary with "context", "question" and "candidates"
            entries; "candidates" is the sequence of answer options.

    Returns:
        str: The formatted passage, question and lettered answer choices.

    Raises:
        ValueError: If there are fewer than 2 candidates, or more candidates
            than there are entries in ``letters``.
    """
    candidates = doc["candidates"]
    num_choices = len(candidates)
    # Reject over-long candidate lists as well: slicing ``letters`` would
    # otherwise silently drop every option past the last available letter.
    if not 2 <= num_choices <= len(letters):
        raise ValueError("Invalid number of candidates")
    formatted_choices = "\n".join(
        f"{letter}: {candidate}" for letter, candidate in zip(letters, candidates)
    )
    return f"Pasartea: {doc['context']}\n\nGaldera: {doc['question']}\n{formatted_choices}\nErantzuna:"
26
+
27
+
28
def doc_to_choice(doc) -> List[str]:
    """
    Return the answer-choice labels for a document.

    Args:
        doc (dict): A dictionary with a "candidates" entry holding the
            sequence of answer options.

    Returns:
        list: One label from ``letters`` per candidate, in order.

    Raises:
        ValueError: If there are fewer than 2 candidates, or more candidates
            than there are entries in ``letters``.
    """
    num_choices = len(doc["candidates"])
    # Reject over-long candidate lists as well: slicing ``letters`` would
    # otherwise silently return fewer labels than there are candidates.
    if not 2 <= num_choices <= len(letters):
        raise ValueError("Invalid number of candidates")
    return letters[:num_choices]
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/_generate_configs.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Generate one task YAML per MMLU subject, each including the given base YAML, plus a group YAML listing the category tags.
3
+ """
4
+
5
+ import argparse
6
+ import logging
7
+ import os
8
+
9
+ import yaml
10
+ from tqdm import tqdm
11
+
12
+
13
+ eval_logger = logging.getLogger("lm-eval")
14
+
15
+
16
# SUBJECTS maps each MMLU subject name to its category; the script below uses the subject as the dataset name and the category to build each generated task's tag.
+ SUBJECTS = {
17
+ "abstract_algebra": "stem",
18
+ "anatomy": "stem",
19
+ "astronomy": "stem",
20
+ "business_ethics": "other",
21
+ "clinical_knowledge": "other",
22
+ "college_biology": "stem",
23
+ "college_chemistry": "stem",
24
+ "college_computer_science": "stem",
25
+ "college_mathematics": "stem",
26
+ "college_medicine": "other",
27
+ "college_physics": "stem",
28
+ "computer_security": "stem",
29
+ "conceptual_physics": "stem",
30
+ "econometrics": "social_sciences",
31
+ "electrical_engineering": "stem",
32
+ "elementary_mathematics": "stem",
33
+ "formal_logic": "humanities",
34
+ "global_facts": "other",
35
+ "high_school_biology": "stem",
36
+ "high_school_chemistry": "stem",
37
+ "high_school_computer_science": "stem",
38
+ "high_school_european_history": "humanities",
39
+ "high_school_geography": "social_sciences",
40
+ "high_school_government_and_politics": "social_sciences",
41
+ "high_school_macroeconomics": "social_sciences",
42
+ "high_school_mathematics": "stem",
43
+ "high_school_microeconomics": "social_sciences",
44
+ "high_school_physics": "stem",
45
+ "high_school_psychology": "social_sciences",
46
+ "high_school_statistics": "stem",
47
+ "high_school_us_history": "humanities",
48
+ "high_school_world_history": "humanities",
49
+ "human_aging": "other",
50
+ "human_sexuality": "social_sciences",
51
+ "international_law": "humanities",
52
+ "jurisprudence": "humanities",
53
+ "logical_fallacies": "humanities",
54
+ "machine_learning": "stem",
55
+ "management": "other",
56
+ "marketing": "other",
57
+ "medical_genetics": "other",
58
+ "miscellaneous": "other",
59
+ "moral_disputes": "humanities",
60
+ "moral_scenarios": "humanities",
61
+ "nutrition": "other",
62
+ "philosophy": "humanities",
63
+ "prehistory": "humanities",
64
+ "professional_accounting": "other",
65
+ "professional_law": "humanities",
66
+ "professional_medicine": "other",
67
+ "professional_psychology": "social_sciences",
68
+ "public_relations": "social_sciences",
69
+ "security_studies": "social_sciences",
70
+ "sociology": "social_sciences",
71
+ "us_foreign_policy": "social_sciences",
72
+ "virology": "other",
73
+ "world_religions": "humanities",
74
+ }
75
+
76
+
77
def parse_args(argv=None):
    """
    Parse the command-line options of the config generator.

    Args:
        argv (list[str] | None): Argument strings to parse. When None
            (the default), argparse reads them from ``sys.argv``.

    Returns:
        argparse.Namespace: Namespace with ``base_yaml_path``,
        ``save_prefix_path``, ``cot_prompt_path``, ``task_prefix`` and
        ``group_prefix``.
    """
    parser = argparse.ArgumentParser(
        description="Generate per-subject MMLU task YAMLs and a group YAML."
    )
    parser.add_argument(
        "--base_yaml_path",
        required=True,
        help="Path of the base YAML; its file name is written to the "
        "'include' key of every generated subject YAML.",
    )
    parser.add_argument(
        "--save_prefix_path",
        default="mmlu",
        help="Path prefix of the generated files: <prefix>_<subject>.yaml.",
    )
    parser.add_argument(
        "--cot_prompt_path",
        default=None,
        help="Optional JSON file mapping each subject to its description.",
    )
    parser.add_argument(
        "--task_prefix",
        default="",
        help="Optional infix inserted after 'mmlu_' in task, tag and group names.",
    )
    parser.add_argument(
        "--group_prefix",
        default="",
        help="If set, the group YAML is written to <group_prefix>.yaml "
        "instead of <save_prefix_path>.yaml.",
    )
    return parser.parse_args(argv)
85
+
86
+
87
def _mmlu_name(task_prefix, name):
    """Return ``mmlu_<task_prefix>_<name>``, or ``mmlu_<name>`` when the prefix is empty."""
    if task_prefix != "":
        return f"mmlu_{task_prefix}_{name}"
    return f"mmlu_{name}"


def main():
    """
    Write one task YAML per subject in SUBJECTS, then the group YAML.

    Each subject YAML includes the base YAML by file name and carries the
    subject's tag, task name, alias, dataset name and description. The group
    YAML lists the category tags in the order they first appear in SUBJECTS.
    """
    args = parse_args()

    # get filename of base_yaml so we can `"include": ` it in our "other" YAMLs.
    base_yaml_name = os.path.split(args.base_yaml_path)[-1]
    # The parsed content is not used; loading it makes the script fail early
    # when the base YAML is missing or cannot be parsed.
    with open(args.base_yaml_path, encoding="utf-8") as f:
        yaml.full_load(f)

    cot_file = None
    if args.cot_prompt_path is not None:
        import json

        with open(args.cot_prompt_path, encoding="utf-8") as f:
            cot_file = json.load(f)

    for subject, category in tqdm(SUBJECTS.items()):
        subject_words = subject.replace("_", " ")
        if cot_file is not None:
            description = cot_file[subject]
        else:
            description = f"The following are multiple choice questions (with answers) about {subject_words}.\n\n"

        yaml_dict = {
            "include": base_yaml_name,
            "tag": _mmlu_name(args.task_prefix, category),
            "task": _mmlu_name(args.task_prefix, subject),
            "task_alias": subject_words,
            "dataset_name": subject,
            "description": description,
        }

        file_save_path = args.save_prefix_path + f"_{subject}.yaml"
        eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
        with open(file_save_path, "w", encoding="utf-8") as yaml_file:
            yaml.dump(
                yaml_dict,
                yaml_file,
                allow_unicode=True,
                default_style='"',
            )

    # dict.fromkeys de-duplicates while keeping first-seen order.
    all_categories = list(dict.fromkeys(SUBJECTS.values()))
    mmlu_subcategories = [
        _mmlu_name(args.task_prefix, category) for category in all_categories
    ]

    if args.group_prefix != "":
        file_save_path = args.group_prefix + ".yaml"
    else:
        file_save_path = args.save_prefix_path + ".yaml"

    eval_logger.info(f"Saving benchmark config to {file_save_path}")
    with open(file_save_path, "w", encoding="utf-8") as yaml_file:
        yaml.dump(
            {
                "group": f"mmlu_{args.task_prefix}"
                if args.task_prefix != ""
                else "mmlu",
                "task": mmlu_subcategories,
            },
            yaml_file,
            indent=4,
            default_flow_style=False,
        )


if __name__ == "__main__":
    main()
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/_cot_prompts.json ADDED
The diff for this file is too large to render. See raw diff
 
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu.yaml ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ group: mmlu_flan_cot_fewshot
2
+ group_alias: mmlu (flan style, fewshot cot)
3
+ task:
4
+ - group: stem
5
+ task:
6
+ - mmlu_flan_cot_fewshot_stem
7
+ aggregate_metric_list:
8
+ - metric: acc
9
+ weight_by_size: True
10
+ - group: other
11
+ task:
12
+ - mmlu_flan_cot_fewshot_other
13
+ aggregate_metric_list:
14
+ - metric: acc
15
+ weight_by_size: True
16
+ - group: social sciences
17
+ task:
18
+ - mmlu_flan_cot_fewshot_social_sciences
19
+ aggregate_metric_list:
20
+ - metric: acc
21
+ weight_by_size: True
22
+ - group: humanities
23
+ task:
24
+ - mmlu_flan_cot_fewshot_humanities
25
+ aggregate_metric_list:
26
+ - metric: acc
27
+ weight_by_size: True
28
+ aggregate_metric_list:
29
+ - metric: acc
30
+ weight_by_size: True
31
+ metadata:
32
+ version: 2
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_anatomy.yaml ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: anatomy
2
+ description: The following are multiple choice questions (with answers) about anatomy.
3
+ fewshot_config:
4
+ sampler: first_n
5
+ samples:
6
+ - question: 'Which of the following is the body cavity that contains the pituitary
7
+ gland?
8
+
9
+ (A) Abdominal (B) Cranial (C) Pleural (D) Spinal'
10
+ target: "Let's think step by step. We refer to Wikipedia articles on anatomy for\
11
+ \ help. Let\u2019s solve this problem step by step. The pituitary gland is the\
12
+ \ major endocrine gland attached to the base of the brain, and it is contained\
13
+ \ in the Cranial cavity. The answer is (B)."
14
+ - question: 'Which of these branches of the trigeminal nerve contain somatic motor
15
+ processes?
16
+
17
+ (A) The supraorbital nerve (B) The infraorbital nerve (C) The mental nerve (D)
18
+ None of the above'
19
+ target: "Let's think step by step. We refer to Wikipedia articles on anatomy for\
20
+ \ help. Let\u2019s solve this problem step by step. \nWe know the following:\
21
+ \ (A) The supraorbital nerve (also known as the frontal nerve) is the largest\
22
+ \ branch of the ophthalmic nerve and branch of ophthalmic division of the trigeminal\
23
+ \ nerve. (B) The infraorbital nerve is a branch of the maxillary division of\
24
+ \ the trigeminal nerve. (C) The mental nerve is a branch of the mandibular division\
25
+ \ of the trigeminal nerve. Because all these nerves are purely sensory nerves\
26
+ \ and do not contain any somatic motor processes. Therefore, the answer should\
27
+ \ be none of the above, which is (D). The answer is (D)."
28
+ - question: 'In Angle''s Class II Div 2 occlusion there is
29
+
30
+ (A) excess overbite of the upper lateral incisors. (B) negative overjet of the
31
+ upper central incisors. (C) excess overjet of the upper lateral incisors. (D)
32
+ excess overjet of the upper central incisors.'
33
+ target: "Let's think step by step. We refer to Wikipedia articles on anatomy for\
34
+ \ help. Let\u2019s solve this problem step by step. This is a question related\
35
+ \ to anatomy and orthodontics. Excess overjet is associated with Class II occlusions;\
36
+ \ therefore, we can safely eliminate (B) from the list, as negative overjet\
37
+ \ is often associated with Class III occlusions. Now, we need to determine the\
38
+ \ location of the excess overjet, and that would be the upper (maxillary) lateral\
39
+ \ incisors. Only (C) has the correct information. The answer is (C)."
40
+ - question: 'The pleura
41
+
42
+ (A) have no sensory innervation. (B) are separated by a 2 mm space. (C) extend
43
+ into the neck. (D) are composed of respiratory epithelium.'
44
+ target: "Let's think step by step. We refer to Wikipedia articles on anatomy for\
45
+ \ help. Let\u2019s solve this problem step by step. First, recall that the pleura\
46
+ \ refers to the thin layer of tissue that covers the lungs and lines the interior\
47
+ \ wall of the chest cavity. Now, let\u2019s look at each option:\nOption (A):\
48
+ \ \u201CThe pleura have no sensory innervation.\u201D This information is not\
49
+ \ correct. The pleura do have a sensory innervation.\nOption (B): \u201CThe\
50
+ \ pleura are separated by a 2 mm space.\u201D This information is not correct.\
51
+ \ There is a very thin \u201Cpotential\u201D space between the layers of the\
52
+ \ pleura; however, it is typically filled with serous pleural fluid. \nOption\
53
+ \ (C): \u201CThe pleura extend into the neck.\u201D This information is actuakky\
54
+ \ true. The cervical pleura, also known as the dome of the pleuradome of the\
55
+ \ pleura, lines the extendsiton of the pleural cavity into the neck.\nOption\
56
+ \ (D): \u201CThe pleura are composed of respiratory epithelium.\u201D This information\
57
+ \ is not correct. The pleaura are composed of connective tissue (CT).\nBecause\
58
+ \ (A), (B), and (D) are all incorrect, (D) is the only correct answer. The answer\
59
+ \ is (C)."
60
+ - question: 'What is the embryological origin of the hyoid bone?
61
+
62
+ (A) The first pharyngeal arch (B) The first and second pharyngeal arches (C)
63
+ The second pharyngeal arch (D) The second and third pharyngeal arches'
64
+ target: "Let's think step by step. We refer to Wikipedia articles on anatomy for\
65
+ \ help. Let\u2019s solve this problem step by step. The hyoid bone, which is\
66
+ \ also known as the hyooid, is a a small U-shaped bone located in the anterior\
67
+ \ neck. In its resting position, it lies between the ase of the mandible and\
68
+ \ the third cervical vertebrae. We know that the second and the third pharyngeal\
69
+ \ arches give rise to the horns of the hyoid bone; therefore, the embryological\
70
+ \ origin of the hyoid bone are the second and the third pharyngeal arches\u2014\
71
+ this information is covered in the last option (D). Therefore, we conclude that\
72
+ \ (D) must be the correct answer. The answer is (D).\n\n"
73
+ tag: mmlu_flan_cot_fewshot_stem
74
+ include: _mmlu_flan_cot_fewshot_template_yaml
75
+ task: mmlu_flan_cot_fewshot_anatomy
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_astronomy.yaml ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: astronomy
2
+ description: The following are multiple choice questions (with answers) about astronomy.
3
+ fewshot_config:
4
+ sampler: first_n
5
+ samples:
6
+ - question: 'Where do most short-period comets come from and how do we know?
7
+
8
+ (A) The Kuiper belt; short period comets tend to be in the plane of the solar
9
+ system just like the Kuiper belt. (B) The Kuiper belt; short period comets tend
10
+ to come from random directions indicating a spherical distribution of comets
11
+ called the Kuiper belt. (C) The asteroid belt; short period comets have orbital
12
+ periods similar to asteroids like Vesta and are found in the plane of the solar
13
+ system just like the asteroid belt. (D) The Oort cloud; short period comets
14
+ tend to be in the plane of the solar system just like the Oort cloud.'
15
+ target: Let's think step by step. Most short-period comets come from the Kuiper
16
+ belt, and we know because short period coments tend to be in the plane of the
17
+ solar system, just like the Kuiper belt is. The answer is (A).
18
+ - question: 'You are pushing a truck along a road. Would it be easier to accelerate
19
+ this truck on Mars? Why? (Assume there is no friction)
20
+
21
+ (A) It would be harder since the truck is heavier on Mars. (B) It would be easier
22
+ since the truck is lighter on Mars. (C) It would be harder since the truck is
23
+ lighter on Mars. (D) It would be the same no matter where you are.'
24
+ target: "Let's think step by step. If we assume that there is no friction, the\
25
+ \ force needed to accelerate the truck is by Newton\u2019s second law only dependent\
26
+ \ on the mass of the truck. Hence (A), (B) and (C) are incorrect since it doesn\u2019\
27
+ t matter that it\u2019s on Mars, and (D) is the correct answer. The answer is\
28
+ \ (D)."
29
+ - question: 'Say the pupil of your eye has a diameter of 5 mm and you have a telescope
30
+ with an aperture of 50 cm. How much more light can the telescope gather than
31
+ your eye?
32
+
33
+ (A) 10000 times more (B) 100 times more (C) 1000 times more (D) 10 times more'
34
+ target: Let's think step by step. The amount of light is proportional to the aperture
35
+ area $A = \pi D^2/4$ for a lens with diameter $D$, so the relative amounts of
36
+ light between the eye with diameter 5mm and the telescope with diameter 50mm
37
+ is $(50 cm)^2/(5mm)^2 = 10000$. The answer is (A).
38
+ - question: 'Why isn''t there a planet where the asteroid belt is located?
39
+
40
+ (A) A planet once formed here but it was broken apart by a catastrophic collision.
41
+ (B) There was not enough material in this part of the solar nebula to form a
42
+ planet. (C) There was too much rocky material to form a terrestrial planet but
43
+ not enough gaseous material to form a jovian planet. (D) Resonance with Jupiter
44
+ prevented material from collecting together to form a planet.'
45
+ target: "Let's think step by step. The asteroid belt is a stellar disc consisting\
46
+ \ of a large number of asteroids between Mars and Jupiter's orbits. The asteroids\
47
+ \ in this belt are affected by the gravitational pull from both other asteroids\
48
+ \ and nearby planets. Due to the strong gravitational force of Jupiter there\
49
+ \ are resonances that give rise to low density regions of asteroids known as\
50
+ \ the Kirkwood gap. So (B) and (C) are not correct since it\u2019s not a lack\
51
+ \ of material that prevents a planet from being formed, and (A) is incorrect\
52
+ \ because the Kirkwood gap would have prevented a planet from forming in the\
53
+ \ first place, and (D) is the correct option. The answer is (D)."
54
+ - question: 'Why is Mars red?
55
+
56
+ (A) Because the surface is covered with heavily oxidized ("rusted") minerals.
57
+ (B) Because the atmosphere scatters more light at bluer wavelengths transmitting
58
+ mostly red light. (C) Because Mars is covered with ancient lava flows which
59
+ are red in color. (D) Because flowing water on Mars''s surface altered the surface
60
+ minerals several billion years ago.'
61
+ target: 'Let''s think step by step. Option (B) is not correct because if the red
62
+ color was caused by the scattering off the atmosphere, then the earth with a
63
+ much thicker atmosphere would also look red. Options (C) and (D) are not specific
64
+ enough about why the color of the surface would be red, while (A) is correct
65
+ because it explains that the surface is red due to the rusted materials on the
66
+ surface and the red color comes from the rust. So the correct option is (A).
67
+ The answer is (A).'
68
+ tag: mmlu_flan_cot_fewshot_stem
69
+ include: _mmlu_flan_cot_fewshot_template_yaml
70
+ task: mmlu_flan_cot_fewshot_astronomy
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_business_ethics.yaml ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: business_ethics
2
+ description: The following are multiple choice questions (with answers) about business
3
+ ethics.
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - question: 'In contrast to _______, _______ aim to reward favourable behaviour by
8
+ companies. The success of such campaigns have been heightened through the use
9
+ of ___________, which allow campaigns to facilitate the company in achieving
10
+ _________ .
11
+
12
+ (A) Buycotts, Boycotts, Blockchain technology, Charitable donations (B) Buycotts,
13
+ Boycotts, Digital technology, Increased Sales (C) Boycotts, Buyalls, Blockchain
14
+ technology, Charitable donations (D) Boycotts, Buycotts, Digital technology,
15
+ Increased Sales'
16
+ target: "Let's think step by step. We refer to Wikipedia articles on business\
17
+ \ ethics for help. The sentence that best uses the possible options above is\
18
+ \ \u201CIn contrast to *boycotts*, *buycotts* aim to reward favourable behavior\
19
+ \ by companies. The success of such campaigns have been heightened through the\
20
+ \ use of *digital technology*, which allow campaigns to facilitate the company\
21
+ \ in achieving *increased sales*.\u201D The answer is (D)."
22
+ - question: '_______ is the direct attempt to formally or informally manage ethical
23
+ issues or problems, through specific policies, practices and programmes.
24
+
25
+ (A) Corporate social responsibility (B) Business ethics management (C) Sustainability
26
+ (D) Environmental management'
27
+ target: Let's think step by step. We refer to Wikipedia articles on business ethics
28
+ for help. The direct attempt manage ethical issues through specific policies,
29
+ practices, and programs is business ethics management. The answer is (B).
30
+ - question: 'Three contrasting tactics that CSO''s can engage in to meet their aims
31
+ are ________ which typically involves research and communication, ________,
32
+ which may involve physically attacking a company''s operations or ________,
33
+ often involving some form of _______.
34
+
35
+ (A) Non-violent direct action, Violent direct action, Indirect action, Boycott
36
+ (B) Indirect action, Instrumental action, Non-violent direct action, Information
37
+ campaign (C) Indirect action, Violent direct action, Non-violent direct-action
38
+ Boycott (D) Non-violent direct action, Instrumental action, Indirect action,
39
+ Information campaign'
40
+ target: "Let's think step by step. We refer to Wikipedia articles on business\
41
+ \ ethics for help. The sentence that best uses the possible options above is\
42
+ \ \u201CThree contrasting tactics that CSO's can engage in to meet their aims\
43
+ \ are *indirect action*, which typically involves research and communication,\
44
+ \ *violent direct action*, which may involve physically attacking a company's\
45
+ \ operations or *non-violent direct action*, often involving some form of *boycott*.\u201D\
46
+ \ The answer is (C)."
47
+ - question: 'To ensure the independence of the non-executive board members, there are
48
+ a number of steps which can be taken, which include non-executives being drawn
49
+ from _______ the company, being appointed for a _________ time period as well
50
+ as being appointed _________.
51
+
52
+ (A) Outside, Limited, Independently (B) Inside, Limited, Intermittently (C)
53
+ Outside, Unlimited, Intermittently (D) Inside, Unlimited, Independently'
54
+ target: "Let's think step by step. We refer to Wikipedia articles on business\
55
+ \ ethics for help. The sentence that best uses the possible options above is\
56
+ \ \u201CTo ensure the independence of the non-executive board members, there\
57
+ \ are a number of steps which can be taken, which include non-executives being\
58
+ \ draw from *outside* the company, being appointed for a *limited* time period\
59
+ \ as well as being imported *independently*. The answer is (A)."
60
+ - question: 'Beyond the business case for engaging in CSR there are a number of moral
61
+ arguments relating to: negative _______, the _______that corporations possess
62
+ and the ________ of business and society.
63
+
64
+ (A) Externalities, Power, Independence (B) Publicity, Insubstantial resources,
65
+ Mutual dependence (C) Publicity, Power, Independence (D) Externalities, Power,
66
+ Mutual dependence'
67
+ target: "Let's think step by step. We refer to Wikipedia articles on business\
68
+ \ ethics for help. The sentence that best uses the possible options above is\
69
+ \ \u201CBeyond the business case for engaging the CSR there are a number of\
70
+ \ moral arguments relating to: negative *externalities*, the *power* that corporations\
71
+ \ possess and the *mutual independence* of business and society. The answer\
72
+ \ is (D).\n\n"
73
+ tag: mmlu_flan_cot_fewshot_other
74
+ include: _mmlu_flan_cot_fewshot_template_yaml
75
+ task: mmlu_flan_cot_fewshot_business_ethics
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_clinical_knowledge.yaml ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: clinical_knowledge
2
+ description: The following are multiple choice questions (with answers) about clinical
3
+ knowledge.
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - question: 'Glycolysis is the name given to the pathway involving the conversion of:
8
+
9
+ (A) glycogen to glucose-1-phosphate. (B) glycogen or glucose to fructose. (C)
10
+ glycogen or glucose to pyruvate or lactate. (D) glycogen or glucose to pyruvate
11
+ or acetyl CoA.'
12
+ target: Let's think step by step. We refer to Wikipedia articles on clinical knowledge
13
+ for help. Glycolysis is the name given to the pathway involving conversion of
14
+ glycogen or glucose to pyruvate or lactate. The answer is (C).
15
+ - question: 'What is the difference between a male and a female catheter?
16
+
17
+ (A) Male and female catheters are different colours. (B) Male catheters are
18
+ longer than female catheters. (C) Male catheters are bigger than female catheters.
19
+ (D) Female catheters are longer than male catheters.'
20
+ target: Let's think step by step. We refer to Wikipedia articles on clinical knowledge
21
+ for help. The difference between a male and female catheter is that male catheters
22
+ tend to be longer than female catheters. The answer is (B).
23
+ - question: 'How many attempts should you make to cannulate a patient before passing
24
+ the job on to a senior colleague, according to the medical knowledge of 2020?
25
+
26
+ (A) 4 (B) 3 (C) 2 (D) 1'
27
+ target: Let's think step by step. We refer to Wikipedia articles on clinical knowledge
28
+ for help. According to the medical protocol as of 2020, you should make two
29
+ attempts to cannulate a patient before passing the job on to a more-senior practitioner.
30
+ The answer is (C).
31
+ - question: 'In the assessment of the hand function which of the following is true?
32
+
33
+ (A) Abduction of the thumb is supplied by spinal root T2 (B) Opposition of the
34
+ thumb by opponens pollicis is supplied by spinal root T1 (C) Finger adduction
35
+ is supplied by the median nerve (D) Finger abduction is mediated by the palmar
36
+ interossei'
37
+ target: Let's think step by step. We refer to Wikipedia articles on clinical knowledge
38
+ for help. Of all the options, it is only true that the opposition of the thumb
39
+ by opponens pollicis is supplied by spinal root T1. The answer is (B).
40
+ - question: 'The energy for all forms of muscle contraction is provided by:
41
+
42
+ (A) ATP. (B) ADP. (C) phosphocreatine. (D) oxidative phosphorylation.'
43
+ target: 'Let''s think step by step. We refer to Wikipedia articles on clinical
44
+ knowledge for help. The energy for muscular contraction is provided by ATP (adenosine
45
+ triphosphate), which is the energy currency of the cell. The answer is (A).'
46
+ tag: mmlu_flan_cot_fewshot_other
47
+ include: _mmlu_flan_cot_fewshot_template_yaml
48
+ task: mmlu_flan_cot_fewshot_clinical_knowledge
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_biology.yaml ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: college_biology
2
+ description: The following are multiple choice questions (with answers) about college
3
+ biology.
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - question: 'Which of the following represents an accurate statement concerning arthropods?
8
+
9
+ (A) They possess an exoskeleton composed primarily of peptidoglycan. (B) They
10
+ possess an open circulatory system with a dorsal heart. (C) They are members
11
+ of a biologically unsuccessful phylum incapable of exploiting diverse habitats
12
+ and nutrition sources. (D) They lack paired, jointed appendages.'
13
+ target: Let's think step by step. Peptidoglycan is known to comprise the cell
14
+ wall of most bacteria, rather than the exoskeleton of arthropods, which
15
+ is made of chitin, which rules out (A). The answer (C) is false because arthropods
16
+ are a highly successful phylum. Likewise, arthropods have paired, jointed appendages,
17
+ which rules out (D). The only remaining option is (B), as arthropods have an
18
+ open circulatory system with a dorsal tubular heart. The answer is (B).
19
+ - question: 'In a given population, 1 out of every 400 people has a cancer caused by
20
+ a completely recessive allele, b. Assuming the population is in Hardy-Weinberg
21
+ equilibrium, which of the following is the expected proportion of individuals
22
+ who carry the b allele but are not expected to develop the cancer?
23
+
24
+ (A) 1/400 (B) 19/400 (C) 20/400 (D) 38/400'
25
+ target: "Let's think step by step. According to the Hardy Weinberg Law, $p^2 +\
26
+ \ 2 p q + q^2 = 1$, and $p + q = 1$ where $p$ is the frequency of the dominant\
27
+ \ allele, $q$ is the frequency of the recessive allele, and $p^2$, $q^2$, and\
28
+ \ $2pq$ are the frequencies of dominant homozygous, recessive homozygous, and\
29
+ \ heterozygous individuals, respectively. The frequency of the recessive\
30
+ \ allele (q) is $\\sqrt{\\frac{1}{400}} = 0.05$. We have $p = 1 - q = 0.95$.\
31
+ \ The frequency of heterozygous individuals is $2pq = 2 \\cdot 0.05 \\cdot 0.95\
32
+ \ = 0.095$. The number of heterozygous individuals is equal to the frequency\
33
+ \ of heterozygous individuals times the size of the population, or $0.095 *\
34
+ \ 400 = 38$. So we end up with 38/400. The answer is (D)."
35
+ - question: 'According to the pressure-flow model of movement of phloem contents, photosynthate
36
+ movement from source to sink is driven by
37
+
38
+ (A) an ATP-dependent pressure-flow pump (B) a water-pressure potential gradient
39
+ (C) transpiration (D) apoplastic diffusion'
40
+ target: Let's think step by step. It is a gradient in water pressure that induces
41
+ the movement of phloem content, which refers to answer (B). The mechanism of
42
+ movement does not rely on metabolism, which rules out (A). Transpiration refers
43
+ to the exhalation of water vapor through plant stomata, and is also not related,
44
+ which rules out (C). While the apoplastic pathway is one of two main pathways
45
+ for water transport in plants, it is not central to the pressure flow model,
46
+ which rules out (D). The answer is (B).
47
+ - question: 'Which of the following contain DNA sequences required for the segregation
48
+ of chromosomes in mitosis and meiosis?
49
+
50
+ (A) Telomeres (B) Centromeres (C) Nucleosomes (D) Spliceosomes'
51
+ target: Let's think step by step. The genetic material in Telomeres is not used,
52
+ which rules out (A). Nucleosomes are the repeating subunit that comprises chromatin
53
+ packed in a cell nucleus, and do not specifically refer to DNA sequences necessary
54
+ for segregating chromosomes in cell division, which rules out (C). A spliceosome
55
+ is a large ribonucleoprotein that removes introns from transcribed pre-mRNA
56
+ rather than governing chromosome segregation. Centromeres are directly responsible
57
+ for segregating chromosomes in cell division. The answer is (B).
58
+ - question: 'The presence of homologous structures in two different organisms, such
59
+ as the humerus in the front limb of a human and a bird, indicates that
60
+
61
+ (A) the human and bird are polyphyletic species (B) a human''s and bird''s evolution
62
+ is convergent (C) the human and bird belong to a clade (D) the human and bird
63
+ developed by analogy'
64
+ target: 'Let''s think step by step. Polyphyletic species are organisms that are
65
+ grouped due to having similar characteristics but which do not have a common
66
+ ancestor. This is not the case for humans and birds, which rules out (A). Convergent
67
+ evolution refers to the independent development of similar features in different
68
+ species at different periods, which is also not the case for humans and birds,
69
+ which rules out (B). Analogy refers to the superficial resemblance of structures
70
+ that have different origins, which is not the case for the human and bird forearms,
71
+ which rules out (D). Humans and birds do belong to the same clade - a group
72
+ of organisms composed of a common ancestor and all of its descendants. The answer is (C).'
73
+ tag: mmlu_flan_cot_fewshot_stem
74
+ include: _mmlu_flan_cot_fewshot_template_yaml
75
+ task: mmlu_flan_cot_fewshot_college_biology
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_chemistry.yaml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: college_chemistry
2
+ description: The following are multiple choice questions (with answers) about college
3
+ chemistry.
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - question: "3 Cl\u2212(aq) + 4 CrO_4^2\u2212(aq) + 23 H+(aq) \u2192 3 HClO2(aq) +\
8
+ \ 4 Cr3+(aq) + 10 H2O(l). In the reaction shown above, Cl\u2212(aq) behaves\
9
+ \ as\n(A) an acid (B) a base (C) a catalyst (D) a reducing agent"
10
+ target: Let's think step by step. A molecule that behaves as a base accepts an
11
+ H+ ion (or proton) from another molecule, whereas a molecule that behaves as
12
+ an acid donates an H+ ion (or proton) to another molecule. Neither of these
13
+ is the case for Cl in this reaction, which rules out (A) and (B). A catalyst
14
+ is a substance that only accelerates a reaction without itself undergoing chemical
15
+ change, which is not the case here. This rules out (C). Instead, the $Cl^{-}$
16
+ molecules carry a negative charge, which they donate in the reaction to form
17
+ 3 HClO2. This is the behavior of a reducing agent, or (D). The answer is (D).
18
+ - question: 'Which of the following statements about the lanthanide elements is NOT
19
+ true?
20
+
21
+ (A) The most common oxidation state for the lanthanide elements is +3. (B) Lanthanide
22
+ complexes often have high coordination numbers (> 6). (C) All of the lanthanide
23
+ elements react with aqueous acid to liberate hydrogen. (D) The atomic radii
24
+ of the lanthanide elements increase across the period from La to Lu.'
25
+ target: Let's think step by step. The atomic radii of the lanthanide elements
26
+ in fact decrease across the period from La to Lu. Options (A), (B), and (C)
27
+ are all true. This means that only (D) is NOT true. The answer is (D).
28
+ - question: 'Which of the following lists the hydrides of group-14 elements in order
29
+ of thermal stability, from lowest to highest?
30
+
31
+ (A) PbH4 < SnH4 < GeH4 < SiH4 < CH4 (B) PbH4 < SnH4 < CH4 < GeH4 < SiH4 (C)
32
+ CH4 < SiH4 < GeH4 < SnH4 < PbH4 (D) CH4 < PbH4 < GeH4 < SnH4 < SiH4'
33
+ target: Let's think step by step. The thermal stability of group-14 hydrides decreases
34
+ as we move from the top of group 14 to the bottom. The order of elements in
35
+ the group from top to bottom is C, Si, Ge, Sn, Pb. Therefore in order of increasing
36
+ thermal stability we have PbH4, SnH4, GeH4, SiH4, and CH4, or answer (A). The
37
+ answer is (A).
38
+ - question: "Predict the number of lines in the EPR spectrum of a solution of 13C-labelled\
39
+ \ methyl radical (13CH3\u2022), assuming the lines do not overlap.\n(A) 4 (B)\
40
+ \ 3 (C) 6 (D) 24 (E) 8"
41
+ target: "Let's think step by step. The electron paramagnetic resonance spectrum\
42
+ \ will be split by two forms of interactions. The first is the hyperfine interaction\
43
+ \ with the 13C (nuclear spin $I = \\frac{1}{2}$) which will split the spectrum\
44
+ \ into 2 lines. This will be further split into 4 lines by the interaction with\
45
+ \ three equivalent 1H nuclei. The total number of lines is therefore $2 \\cdot\
46
+ \ 4 = 8$. The answer is (E).\n\n"
47
+ tag: mmlu_flan_cot_fewshot_stem
48
+ include: _mmlu_flan_cot_fewshot_template_yaml
49
+ task: mmlu_flan_cot_fewshot_college_chemistry
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_mathematics.yaml ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: college_mathematics
2
+ description: The following are multiple choice questions (with answers) about college
3
+ mathematics.
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - question: 'Let V be the set of all real polynomials p(x). Let transformations T,
8
+ S be defined on V by T:p(x) -> xp(x) and S:p(x) -> p''(x) = d/dx p(x), and interpret
9
+ (ST)(p(x)) as S(T(p(x))). Which of the following is true?
10
+
11
+ (A) ST = 0 (B) ST = T (C) ST = TS (D) ST - TS is the identity map of V onto
12
+ itself.'
13
+ target: "Let's think step by step. For a given polynomial $p$ we have\n\\[ST(p)\
14
+ \ = (xp(x))\u2019 = p(x) + xp\u2019(x)\\]\nand\n\\[TS(p) = xp\u2019(x).\\]\n\
15
+ Hence \\[ST(p) - TS(p) = p(x) + xp\u2019(x) - xp\u2019(x) = p(x).\\] The answer is\
16
+ \ (D)."
17
+ - question: 'Suppose that f(1 + x) = f(x) for all real x. If f is a polynomial and
18
+ f(5) = 11, then f(15/2)
19
+
20
+ (A) -11 (B) 0 (C) 11 (D) 33/2'
21
+ target: Let's think step by step. The only polynomial so that $f(1 + x) = f(x)$
22
+ is a constant polynomial. Hence $f(5) = 11 = f(15/2)$. The answer is (C).
23
+ - question: 'Let A be a real 2x2 matrix. Which of the following statements must be
24
+ true?
25
+
26
+ I. All of the entries of A^2 are nonnegative.
27
+
28
+ II. The determinant of A^2 is nonnegative.
29
+
30
+ III. If A has two distinct eigenvalues, then A^2 has two distinct eigenvalues.
31
+
32
+ (A) I only (B) II only (C) III only (D) II and III only'
33
+ target: 'Let''s think step by step. We have \[ det(A^2) = (det(A))^2 \geq 0,\]
34
+ hence II holds.
35
+
36
+ III is false: as a counterexample take a diagonal matrix with -1 and 1 on the
37
+ diagonal. Then $A^2$ is the identity matrix. The answer is (B).'
38
+ - question: 'Let A be the set of all ordered pairs of integers (m, n) such that 7m
39
+ + 12n = 22. What is the greatest negative number in the set B = {m + n : (m,
40
+ n) \in A}?
41
+
42
+ (A) -5 (B) -4 (C) -3 (D) -2'
43
+ target: Let's think step by step. We have 12n = 22 - 7m and one of the solutions
44
+ is $m = -2$, $n = 3$. Then $m + n = 1$, hence we need to look for smaller $m$
45
+ in order to make $m + n$ negative. The next solution is $m = -14$ and $n = 10$.
46
+ For smaller $m$ we have $m + n$ smaller than $-4$. The answer is (B).
47
+ - question: 'A tank initially contains a salt solution of 3 grams of salt dissolved
48
+ in 100 liters of water. A salt solution containing 0.02 grams of salt per liter
49
+ of water is sprayed into the tank at a rate of 4 liters per minute. The sprayed
50
+ solution is continually mixed with the salt solution in the tank, and the mixture
51
+ flows out of the tank at a rate of 4 liters per minute. If the mixing is instantaneous,
52
+ how many grams of salt are in the tank after 100 minutes have elapsed?
53
+
54
+ (A) 2 (B) 2 - e^-2 (C) 2 + e^-2 (D) 2 + e^-4'
55
+ target: "Let's think step by step. For all $t \\in \\mathbb{R}$, let $s(t)$ denote\
56
+ \ the number of grams of salt in the tank at the $t$ minute mark. Then $s(0) =\
57
+ \ 3$.\nWe use $s$ and $s(t)$ interchangeably. We also use $s^{\\prime}$ and\
58
+ \ $s^{\\prime}(t)$ interchangeably. The solution sprayed into the tank adds\
59
+ \ $(0.02) 4=2 / 25$ grams of salt per minute. There are always 100 liters of\
60
+ \ liquid in the tank, containing $s$ grams of salt. So the density of salt in\
61
+ \ the tank is $s / 100$ grams per liter. The flow of water out of the tank therefore\
62
+ \ subtracts $4(s / 100)=s / 25$ grams of salt per minute. Then, for all $t \\\
63
+ in \\mathbb{R}$, we have $s^{\\prime}(t)=(2 / 25)-(s / 25)=(2-s) / 25$, and\
64
+ \ so $[s(t)=2] \\Rightarrow\\left[s^{\\prime}(t)=0\\right]$. For all $t \\in\
64
+ \ \\mathbb{R}$,\n$$\n\\frac{d}{d t}[\\ln (s-2)]=\\frac{s^{\\prime}}{s-2}=\\frac{-1}{25}=\\f\
65
+ rac{d}{d t}\\left[-\\frac{t}{25}\\right] .\n$$\nChoose $C \\in \\mathbb{R}$ such\
67
+ \ that, for all $t \\in \\mathbb{R}, \\ln ((s(t)-2))=-[t / 25]+C$. Let $K:=e^{C}$.\
68
+ \ Then, for all $t \\in \\mathbb{R}$, we have $(s(t))-2=K e^{-t / 25}$, and\
69
+ \ so $s(t)=2+K e^{-t / 25}$. Then $3=s(0)=2+K e^{0}=2+K$, so $K=1$. Then $s(100)=2+K\
70
+ \ e^{-100 / 25}=2+1 \\cdot e^{-4}=2+e^{-4}$. The answer is (D).\n\n"
71
+ tag: mmlu_flan_cot_fewshot_stem
72
+ include: _mmlu_flan_cot_fewshot_template_yaml
73
+ task: mmlu_flan_cot_fewshot_college_mathematics
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_medicine.yaml ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: college_medicine
2
+ description: The following are multiple choice questions (with answers) about college
3
+ medicine.
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - question: 'An expected side effect of creatine supplementation is:
8
+
9
+ (A) muscle weakness. (B) gain in body mass. (C) muscle cramps. (D) loss of electrolytes.'
10
+ target: Let's think step by step. We refer to Wikipedia articles on medicine for
11
+ help. Creatine supplementation is a dietary supplement that results in body
12
+ mass gain. The answer is (B).
13
+ - question: 'Which of the following is not a true statement?
14
+
15
+ (A) Muscle glycogen is broken down enzymatically to glucose-1-phosphate (B)
16
+ Elite endurance runners have a high proportion of Type I fibres in their leg
17
+ muscles (C) Liver glycogen is important in the maintenance of the blood glucose
18
+ concentration (D) Insulin promotes glucose uptake by all tissues in the body'
19
+ target: "Let's think step by step. We refer to Wikipedia articles on medicine\
20
+ \ for help. Let\u2019s solve this step by step and go over each choice: \n(A)\
21
+ \ \u201CMuscle glycogen is broken down enzymatically to glucose-1-phosphate\u201D\
22
+ : This is a correct statement.\n(B) \u201CElite endurance runners have a high\
23
+ \ proportion of Type I fibres in their leg muscles\u201D: This is a correct\
24
+ \ statement.\n(C) \u201CLiver glycogen is important in the maintenance of the\
25
+ \ blood glucose concentration\u201D: This is a correct statement. \n(D) \u201C\
26
+ Insulin promotes glucose uptake by all tissues in the body\u201D: This is not\
27
+ \ a correct statement, because insulin promotes glucose uptake by the liver,\
28
+ \ adipose tissue, and muscle, but not all tissues. For instance, the tissues\
29
+ \ in the brain and red blood cells are not affected by insulin. The answer is\
30
+ \ (D)."
31
+ - question: "A high school science teacher fills a 1 liter bottle with pure nitrogen\
32
+ \ and seals the lid. The pressure is 1.70 atm, and the room temperature is 25\xB0\
33
+ C. Which two variables will both increase the pressure of the system, if all\
34
+ \ other variables are held constant?\n(A) Increasing temperature, increasing\
35
+ \ moles of gas (B) Increasing temperature, increasing volume (C) Decreasing\
36
+ \ volume, decreasing temperature (D) Decreasing moles of gas, increasing volume"
37
+ target: 'Let''s think step by step. We refer to Wikipedia articles on medicine
38
+ for help. The relevant equation for this is the ideal gas law: PV=nRT. To increase
39
+ the pressure of the system (P), then either n (number of moles of the gas) or
40
+ T (temperature) have to increase. The answer is (A).'
41
+ - question: 'In a genetic test of a newborn, a rare genetic disorder is found that
42
+ has X-linked recessive transmission. Which of the following statements is likely
43
+ true regarding the pedigree of this disorder?
44
+
45
+ (A) All descendants on the maternal side will have the disorder. (B) Females
46
+ will be approximately twice as affected as males in this family. (C) All daughters
47
+ of an affected male will be affected. (D) There will be equal distribution of
48
+ males and females affected.'
49
+ target: "Let's think step by step. We refer to Wikipedia articles on medicine\
50
+ \ for help. Let\u2019s solve this step by step. Let's recall first that females\
51
+ \ have two X chromosomes, while males have one X and one Y chromosome. This\
52
+ \ is an important fact we need to know before answering this question. \nBecause\
53
+ \ a male can only pass his only one X chromosome to a daughter, if he is affected\
54
+ \ by this rare genetic disorder, then we know for sure that he will pass this\
55
+ \ rare genetic disorder to all his future-born daughters. Therefore, \u201C\
56
+ (C): All daughters of an affected male will be affected\u201D is a correct statement.\
57
+ \ The answer is (C)."
58
+ - question: 'Glucose is transported into the muscle cell:
59
+
60
+ (A) via protein transporters called GLUT4. (B) only in the presence of insulin.
61
+ (C) via hexokinase. (D) via monocarboxylic acid transporters.'
62
+ target: 'Let''s think step by step. We refer to Wikipedia articles on medicine
63
+ for help. Glucose (also known as the blood sugar) is the main sugar found in
64
+ the human body. It is transported into the muscle cell via diffusion through
65
+ protein transporters called GLUT4. The answer is (A).'
66
+ tag: mmlu_flan_cot_fewshot_other
67
+ include: _mmlu_flan_cot_fewshot_template_yaml
68
+ task: mmlu_flan_cot_fewshot_college_medicine
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_physics.yaml ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: college_physics
2
+ description: The following are multiple choice questions (with answers) about college
3
+ physics.
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - question: 'A refracting telescope consists of two converging lenses separated by
8
+ 100 cm. The eye-piece lens has a focal length of 20 cm. The angular magnification
9
+ of the telescope is
10
+
11
+ (A) 4 (B) 5 (C) 6 (D) 20'
12
+ target: Let's think step by step. In a refracting telescope, if both lenses are
13
+ converging, the focus of both lenses must be between the two lenses, and thus
14
+ the focal lengths of the two lenses must add up to their separation. Since the
15
+ focal length of one lens is 20 cm, the focal length of the other must be 80
16
+ cm. The magnification is the ratio of these two focal lengths, or 4. The answer
17
+ is (A).
18
+ - question: 'The muon decays with a characteristic lifetime of about 10^-6 second into
19
+ an electron, a muon neutrino, and an electron antineutrino. The muon is forbidden
20
+ from decaying into an electron and just a single neutrino by the law of conservation
21
+ of
22
+
23
+ (A) charge (B) mass (C) energy and momentum (D) lepton number'
24
+ target: Let's think step by step. Lepton number must be conserved, meaning the
25
+ total number of leptons minus the number of antileptons. If a muon decays into
26
+ an electron and a single neutrino, the total lepton number would go from one
27
+ to two, violating lepton number conservation. The answer is (D).
28
+ - question: 'One end of a Nichrome wire of length 2L and cross-sectional area A is
29
+ attached to an end of another Nichrome wire of length L and cross-sectional
30
+ area 2A. If the free end of the longer wire is at an electric potential of 8.0
31
+ volts, and the free end of the shorter wire is at an electric potential of 1.0
32
+ volt, the potential at the junction of the two wires is most nearly equal to
33
+
34
+ (A) 2.4 V (B) 3.3 V (C) 4.5 V (D) 5.7 V'
35
+ target: Let's think step by step. This is a simple voltage divider problem, where
36
+ the longer wire has a resistance four times that of the shorter end. So the
37
+ voltage divider ratio is 1 / 5, meaning that the potential in the middle is
38
+ 1.0 V + (8.0 V - 1.0 V) * 1/5 = 2.4 V. The answer is (A).
39
+ - question: 'A refracting telescope consists of two converging lenses separated by
40
+ 100 cm. The eye-piece lens has a focal length of 20 cm. The angular magnification
41
+ of the telescope is
42
+
43
+ (A) 4 (B) 5 (C) 6 (D) 20'
44
+ target: Let's think step by step. In a refracting telescope, if both lenses are
45
+ converging, the focus of both lenses must be between the two lenses, and thus
46
+ the focal lengths of the two lenses must add up to their separation. Since the
47
+ focal length of one lens is 20 cm, the focal length of the other must be 80
48
+ cm. The magnification is the ratio of these two focal lengths, or 4. The answer
49
+ is (A).
50
+ - question: 'For which of the following thermodynamic processes is the increase in
51
+ the internal energy of an ideal gas equal to the heat added to the gas?
52
+
53
+ (A) Constant temperature (B) Constant volume (C) Constant pressure (D) Adiabatic'
54
+ target: 'Let''s think step by step. Heat added to the gas can go into the gas''s
55
+ internal energy or work done against an external force. However, if the volume
56
+ of the gas container is constant, no work will be done (since work is pressure
57
+ times change in volume). So, at constant volume, all of the heat goes into the
58
+ internal energy. The answer is (B).'
59
+ tag: mmlu_flan_cot_fewshot_stem
60
+ include: _mmlu_flan_cot_fewshot_template_yaml
61
+ task: mmlu_flan_cot_fewshot_college_physics
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_computer_security.yaml ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: computer_security
2
+ description: The following are multiple choice questions (with answers) about computer
3
+ security.
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - question: 'SHA-1 has a message digest of
8
+
9
+ (A) 160 bits (B) 512 bits (C) 628 bits (D) 820 bits'
10
+ target: Let's think step by step. Since SHA-1 is a hash function which takes an
11
+ input and produces a 160-bit (20-byte) hash value, its message digest is 160
12
+ bits. The answer is (A).
13
+ - question: "_____________ can modify data on your system \u2013 so that your system\
14
+ \ doesn\u2019t run correctly or you can no longer access specific data, or it\
15
+ \ may even ask for ransom in order to give your access.\n(A) IM \u2013 Trojans\
16
+ \ (B) Backdoor Trojans (C) Trojan-Downloader (D) Ransom Trojan"
17
+ target: Let's think step by step. The system is asking for trojans, which are
18
+ for ransom, which means ransom trojan. The answer is (D).
19
+ - question: 'What is ethical hacking?
20
+
21
+ (A) "Hacking" ethics so they justify unintended selfish behavior (B) Hacking
22
+ systems (e.g., during penetration testing) to expose vulnerabilities so they
23
+ can be fixed, rather than exploited (C) Hacking into systems run by those whose
24
+ ethics you disagree with (D) A slang term for rapid software development, e.g.,
25
+ as part of hackathons'
26
+ target: Let's think step by step. Ethical hacking is a process of detecting vulnerabilities
27
+ in an application, system, or organization's infrastructure that an attacker
28
+ can use to exploit an individual or organization. They use this process to prevent
29
+ cyberattacks and security breaches by lawfully hacking into the systems and
30
+ looking for weak points. The answer is (B).
31
+ - question: 'The ____________ is anything which your search engine cannot search.
32
+
33
+ (A) Haunted web (B) World Wide Web (C) Surface web (D) Deep Web'
34
+ target: "Let's think step by step. The search engine searches on the Surface Web,\
35
+ \ which is the portion of the world wide web which is visible so (B,C) are wrong.\
36
+ \ The Haunted Web doesn\u2019t correspond to an internet concept. The Deep Web\
37
+ \ is the part of the World Wide Web which is not indexed. The answer is (D)."
38
+ - question: 'Exploitation of the Heartbleed bug permits
39
+
40
+ (A) overwriting cryptographic keys in memory (B) a kind of code injection (C)
41
+ a read outside bounds of a buffer (D) a format string attack'
42
+ target: 'Let''s think step by step. The Heartbleed Bug is a serious vulnerability
43
+ in the popular OpenSSL cryptographic software library. Heartbleed resulted from
44
+ improper input validation (due to a missing bounds check) in the implementation
45
+ of the TLS heartbeat extension. The vulnerability was classified as a buffer
46
+ over-read, a situation where more data can be read than should be allowed. The
47
+ answer is (C).'
48
+ tag: mmlu_flan_cot_fewshot_stem
49
+ include: _mmlu_flan_cot_fewshot_template_yaml
50
+ task: mmlu_flan_cot_fewshot_computer_security
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_conceptual_physics.yaml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: conceptual_physics
2
+ description: '
3
+
4
+ The following are multiple choice questions (with answers) about conceptual physics.'
5
+ fewshot_config:
6
+ sampler: first_n
7
+ samples:
8
+ - question: 'Colors in a soap bubble result from light
9
+
10
+ (A) converted to a different frequency (B) deflection (C) interference (D) polarization'
11
+ target: Let's think step by step. In a soap bubble film, the light bounces between
12
+ the two soap-air interfaces many times, interfering with itself constructively
13
+ or destructively depending on the width of the film. This results in different
14
+ colors being visible. The answer is (C).
15
+ - question: 'Compared with the mass of a uranium atom undergoing fission, the combined
16
+ masses of the products after fission are
17
+
18
+ (A) less (B) more (C) the same (D) zero'
19
+ target: Let's think step by step. Fission releases energy, which comes from the
20
+ rest mass of its initial nucleus. Thus the mass of the products is less than
21
+ the mass of the reactant uranium nucleus. The answer is (A).
22
+ - question: 'Things that are equivalent according to the equivalence principle are
23
+
24
+ (A) space and time. (B) a traveling twin and a stay-at-home twin. (C) gravity
25
+ and acceleration. (D) mass and energy.'
26
+ target: "Let's think step by step. Einstein\u2019s famous equivalence principle\
27
+ \ states that gravity and acceleration are equivalent. The answer is (C)."
28
+ - question: 'Which of these three elements has the most mass per nucleon?
29
+
30
+ (A) Hydrogen (B) Iron (C) Uranium (D) Same in each'
31
+ target: Let's think step by step. Due to nuclear binding energy, the mass of an
32
+ atomic nucleus is less than the sum of individual masses of the free constituent
33
+ protons and neutrons; this is known as the mass defect. Hydrogen has no mass
34
+ defect because it has only a single nucleon, so it will have the most mass per
35
+ nucleon. The answer is (A).
36
+ - question: 'A model airplane flies slower when flying into the wind and faster with
37
+ wind at its back. When launched at right angles to the wind a cross wind its
38
+ groundspeed compared with flying in still air is
39
+
40
+ (A) the same (B) greater (C) less (D) either greater or less depending on wind
41
+ speed'
42
+ target: "Let's think step by step. The plane\u2019s speed in the direction of\
43
+ \ the wind is greater than it would be in the absence of wind, and its speed in the direction\
44
+ \ orthogonal to the wind is the same as it would be in the absence of the wind.\
45
+ \ The total speed, which is these two components added in quadrature, is thus\
46
+ \ greater than the speed in still air. The answer is (B).\n\n"
47
+ tag: mmlu_flan_cot_fewshot_stem
48
+ include: _mmlu_flan_cot_fewshot_template_yaml
49
+ task: mmlu_flan_cot_fewshot_conceptual_physics
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_econometrics.yaml ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: econometrics
2
+ description: The following are multiple choice questions (with answers) about econometrics.
3
+ fewshot_config:
4
+ sampler: first_n
5
+ samples:
6
+ - question: 'Suppose now that a researcher wishes to use information criteria to determine
7
+ the optimal lag length for a VAR. 500 observations are available for the bi-variate
8
+ VAR, and the values of the determinant of the variance-covariance matrix of
9
+ residuals are 0.0336, 0.0169, 0.0084, and 0.0062 for 1, 2, 3, and 4 lags respectively.
10
+ What is the optimal model order according to Akaike''s information criterion?
11
+
12
+ (A) 1 lag (B) 2 lags (C) 3 lags (D) 4 lags'
13
+ target: "Let's think step by step. We refer to Wikipedia articles on econometrics\
14
+ \ for help. Let\u2019s solve this problem step by step. First of all, let\u2019\
15
+ s recall that for a given set of data, Akaike's information criterion (AIC)\
16
+ \ allows us to measure how well a statistical model fits the data; it is an\
17
+ \ estimator of prediction error. Here in this problem we will need to use the\
18
+ \ formula ln(det(sigma_hat)) + (2 * k / T) to determine the values of Akaike\u2019\
19
+ s criterion, where ln denotes the natural log function, det the determinant\
20
+ \ function, k the total number of parameters in total (across both equations),\
21
+ \ and T the number of observations (which, in this case, is equal to 500). For\
22
+ \ 1 lag, the number of parameters in total is equal to 6; for 2 lags, it is\
23
+ \ 10; for 3 lags, it is 14; and for 4 lags, it is 18. Now, let\u2019s calculate\
24
+ \ the values of the criterion for each lag:\n(A) 1 lag: ln(0.0336) + (2 * 6\
25
+ \ / 500) = ln(0.0336) + (12 / 500) = -3.369\n(B) 2 lags: ln(0.0169) + (2 * 10\
26
+ \ / 500) = ln(0.0169) + (20 / 500) = -4.040\n(C) 3 lags: ln(0.0084) + (2 * 14\
27
+ \ / 500) = ln(0.0084) + (28 / 500) =-4.724\n(D) 4 lags: ln(0.0062) + (2 * 18\
28
+ \ / 500) = ln(0.0062) + (36 / 500) =-5.011\nBecause the optimal model order\
29
+ \ according to AIC minimizes the information criterion, the answer should be\
30
+ \ the one with the lowest value. In this case, (D) has the lowest value. The\
31
+ \ answer is (C)."
32
+ - question: 'Consider the following AR(1) model with the disturbances having zero mean
33
+ and unit variance
34
+
35
+ yt = 0.2 + 0.4 yt-1 + ut
36
+
37
+ The (unconditional) mean of y will be given by
38
+
39
+ (A) 0.2 (B) 0.4 (C) 0.5 (D) 0.33'
40
+ target: "Let's think step by step. We refer to Wikipedia articles on econometrics\
41
+ \ for help. Let\u2019s solve this problem step by step. If we have an AR(1)\
42
+ \ model with the disturbances having zero mean and unit variance, then the unconditional\
43
+ \ mean of y is equal to the following:\nunconditional mean of y = (the intercept\
44
+ \ term) / (1 - autoregressive coefficient)\nWe know that the intercept term\
45
+ \ is 0.2 and the autoregressive coefficient is 0.4; thus, we have:\nunconditional\
46
+ \ mean of y = (0.2) / (1 - 0.4) = (0.2) / (0.6) = 2 / 6 = 1 / 3, which is approximately\
47
+ \ 0.33. That means that the answer should be (D) 0.33. The answer is (D)."
48
+ - question: 'What would be the consequences for the OLS estimator if heteroscedasticity
49
+ is present in a regression model but ignored?
50
+
51
+ (A) It will be biased (B) It will be inconsistent (C) It will be inefficient
52
+ (D) All of (a), (b) and (c) will be true.'
53
+ target: Let's think step by step. We refer to Wikipedia articles on econometrics
54
+ for help. Heteroscedasticity refers to the condition where the variance of the
55
+ error terms is not constant across multiple observations. If heteroscedasticity
56
+ is present in a regression model, then the coefficient estimates in the OLS
57
+ estimator will be not only unbiased and consistent but also inefficient. Because
58
+ (A) and (B) are incorrect choices and (C) is a correct choice, (D) cannot be
59
+ the right answer. Ultimately, (C) is the only true choice. The answer is (C).
60
+ - question: 'Suppose that a test statistic has associated with it a p-value of 0.08.
61
+ Which one of the following statements is true?
62
+
63
+ (i) If the size of the test were exactly 8%, we would be indifferent between
64
+ rejecting and not rejecting the null hypothesis
65
+
66
+ (ii) The null would be rejected if a 10% size of test were used
67
+
68
+ (iii) The null would not be rejected if a 1% size of test were used
69
+
70
+ (iv) The null would be rejected if a 5% size of test were used.
71
+
72
+ (A) (ii) and (iv) only (B) (i) and (iii) only (C) (i), (ii), and (iii) only
73
+ (D) (i), (ii), (iii), and (iv).'
74
+ target: "Let's think step by step. We refer to Wikipedia articles on econometrics\
75
+ \ for help. Let\u2019s reason about each of the options.\n(i) is a true statement.\n\
76
+ (ii) is a true statement.\n(iii) is a true statement.\n(iv) is not a true statement.\
77
+ \ Thus, (i), (ii), and (iii) are true. The answer is (C)."
78
+ - question: 'For a stationary autoregressive process, shocks will
79
+
80
+ (A) Eventually die away (B) Persist indefinitely (C) Grow exponentially (D)
81
+ Never occur'
82
+ target: 'Let''s think step by step. We refer to Wikipedia articles on econometrics
83
+ for help. This is an econometrics problem about stationary processes. For a stationary
84
+ autoregressive process, shocks will eventually die away. The answer is (A).'
85
+ tag: mmlu_flan_cot_fewshot_social_sciences
86
+ include: _mmlu_flan_cot_fewshot_template_yaml
87
+ task: mmlu_flan_cot_fewshot_econometrics
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_electrical_engineering.yaml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: electrical_engineering
2
+ description: '
3
+
4
+ The following are multiple choice questions (with answers) about electrical engineering.'
5
+ fewshot_config:
6
+ sampler: first_n
7
+ samples:
8
+ - question: "A point pole has a strength of 4\u03C0 * 10^-4 weber. The force in newtons\
9
+ \ on a point pole of 4\u03C0 * 1.5 * 10^-4 weber placed at a distance of 10\
10
+ \ cm from it will be\n(A) 15 N. (B) 20 N. (C) 7.5 N. (D) 3.75 N."
11
+ target: "Let's think step by step. The force between two point poles is given\
12
+ \ by m_1m_2/(mu_0 4 \\pi r^2), in analogy to Coulomb\u2019s law. Plugging in\
13
+ \ the values given in the question, we calculate that the force is approximately\
14
+ \ 15 N. The answer is (A)."
15
+ - question: 'The coil of a moving coil meter has 100 turns, is 40 mm long and 30 mm
16
+ wide. The control torque is 240*10-6 N-m on full scale. If magnetic flux density
17
+ is 1Wb/m2 range of meter is
18
+
19
+ (A) 1 mA. (B) 2 mA. (C) 3 mA. (D) 4 mA.'
20
+ target: Let's think step by step. The torque on a coil in a uniform magnetic field
21
+ is given by BANI, where B is the magnetic flux density, A is the area of the
22
+ coil, N is the number of turns, and I is the current. So we have that I = (Torque)/(BAN),
23
+ or 240e-6/(1200e-6 * 100 * 1) = 2e-3. The answer is (B).
24
+ - question: 'In an SR latch built from NOR gates, which condition is not allowed
25
+
26
+ (A) S=0, R=0 (B) S=0, R=1 (C) S=1, R=0 (D) S=1, R=1'
27
+ target: Let's think step by step. An SR latch is a set-reset latch; in the case
28
+ where S=1 and R=1, the circuit has no stable state; instead a race condition
29
+ will be produced within the circuit, so the device will be in an undefined state.
30
+ So S=1, R=1 is an illegal input. The answer is (D).
31
+ - question: 'Two long parallel conductors carry 100 A. If the conductors are separated
32
+ by 20 mm, the force per meter of length of each conductor will be
33
+
34
+ (A) 100 N. (B) 0.1 N. (C) 1 N. (D) 0.01 N.'
35
+ target: Let's think step by step. The magnetic force-per-length between two current-carrying
36
+ conductors is given by \mu_0 I_1 I_2 / (2 \pi r), where $r$ is the separation
37
+ distance and I_1 and I_2 are the currents. Plugging in 100 A for I_1 and I_2,
38
+ and 20 mm for r, gives 0.1 N. The answer is (B).
39
+ - question: "In a 2 pole lap winding dc machine , the resistance of one conductor is\
40
+ \ 2\u03A9 and total number of conductors is 100. Find the total resistance\n\
41
+ (A) 200\u03A9 (B) 100\u03A9 (C) 50\u03A9 (D) 10\u03A9"
42
+ target: 'Let''s think step by step. In lap winding, effectively two resistors
43
+ are connected in parallel, so the actual resistance of each pair is 1 Ohm. Since
44
+ we have 50 pairs, we get a total resistance of 50 Ohms. The answer is (C).'
45
+ tag: mmlu_flan_cot_fewshot_stem
46
+ include: _mmlu_flan_cot_fewshot_template_yaml
47
+ task: mmlu_flan_cot_fewshot_electrical_engineering
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_formal_logic.yaml ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: formal_logic
2
+ description: The following are multiple choice questions (with answers) about formal
3
+ logic.
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - question: "Which of the given formulas of PL is the best symbolization of the following\
8
+ \ sentence?\nTurtles live long lives and are happy creatures, unless they are\
9
+ \ injured.\n(A) (L \u2022 H) \u2261 I (B) (L \u2022 H) \u2228 I (C) L \u2022\
10
+ \ (H \u2228 I) (D) L \u2022 (H \u2283 R)."
11
+ target: "Let's think step by step. We refer to Wikipedia articles on formal logic\
12
+ \ for help. Let\u2019s solve this step by step. Let \u201CL\u201D denote \u201C\
13
+ living long\u201D, H \u201Cbeing happy\u201D, and \u201CI\u201D \u201Cbeing\
14
+ \ injured\u201D. Now, consider each choice:\n(A) means (living long AND being\
15
+ \ happy) is equivalent to (being injured). \n(B) means (living long AND being\
16
+ \ happy) OR (being injured). \n(C) means (living long) AND (being happy OR being\
17
+ \ injured). \n(D) means (living long) AND (being happy implies being R), but\
18
+ \ what R denotes is not clear.\nObviously, (B) is the best symbolization of\
19
+ \ the original sentence. The answer is (B)."
20
+ - question: 'Select the best translation into predicate logic.George borrows Hector''s
21
+ lawnmower. (g: George; h: Hector; l: Hector''s lawnmower; Bxyx: x borrows y
22
+ from z).
23
+
24
+ (A) Blgh (B) Bhlg (C) Bglh (D) Bghl'
25
+ target: "Let's think step by step. We refer to Wikipedia articles on formal logic\
26
+ \ for help. Let\u2019s solve this step by step. We are told that \u201CBxyx\u201D\
27
+ \ means \u201Cx borrows y from z\u201D. We can rewrite \u201CGeorge borrows\
28
+ \ Hector's lawnmower\u201D as \u201CGeorge borrows a lawnmower from Hector\u201D\
29
+ , which can then be translated into predicate logic as \u201CBglh\u201D. The\
30
+ \ answer \u201CBglh\u201D appears in (C); therefore, (C) must be the correct\
31
+ \ answer. The answer is (C)."
32
+ - question: "\nSelect the best English interpretation of the given arguments in predicate\
33
+ \ logic.\nDm\n(\u2200x)(Wx \u2283 ~Dx). \n(\u2200x)Wx \u2228 Ag\t/ (\u2203x)Ax\n\
34
+ (A) Marina is a dancer. Some weaklings are not dancers. Either everything is\
35
+ \ a weakling or Georgia plays volleyball. So something plays volleyball. (B)\
36
+ \ Marina is a dancer. No weakling is a dancer. Everything is either a weakling\
37
+ \ or plays volleyball. So something plays volleyball. (C) Marina is a dancer.\
38
+ \ Some weaklings are not dancers. Everything is either a weakling or plays volleyball.\
39
+ \ So something plays volleyball. (D) Marina is a dancer. No weakling is a dancer.\
40
+ \ Either everything is a weakling or Georgia plays volleyball. So something\
41
+ \ plays volleyball."
42
+ target: "Let's think step by step. We refer to Wikipedia articles on formal logic\
43
+ \ for help. Let\u2019s solve this step by step. Let \u201CD\u201D denote \u201C\
44
+ being a dancer\u201D, \u201Cm\u201D denote \u201CMarina\u201D, \u201Cg\u201D\
45
+ \ denote \u201CGeorgia\u201D, \u201CW\u201D denote \u201Cweakling\u201D, \u201C\
46
+ A\u201D denote \u201Cplaying volleyball\u201D. Then, we have the following:\n\
47
+ 1. Dm \u2192 Marina is a dancer.\n2. (\u2200x)(Wx \u2283 ~Dx). \u2192 For all\
48
+ \ x, if x is a weakling, then x is not a dancer. In other words, no weakling\
49
+ \ is a dancer.\n3. (\u2200x)Wx \u2228 Ag\t/ (\u2203x)Ax \u2192 For all x, x\
50
+ \ is a weakling or Georgia plays volleyball. So there exists an x that plays\
51
+ \ volleyball. \nOptions (A) and (C) do claim that some weaklings are not dancers,\
52
+ \ but the second argument strongly states that no weakling is a dancer. Thus,\
53
+ \ we can eliminate them. Option (B) omits the important detail about Georgia\
54
+ \ playing volleyball. Option (D) has all the details presented in the arguments\
55
+ \ and is the best English interpretation of the arguments. The answer is (D)."
56
+ - question: "Select the best translation into predicate logic: No people drive on Mars.\n\
57
+ (A) ~Pd (B) (\u2200x)(Px \u2228 ~Dx) (C) (\u2200x)(Px \u2283 ~Dx) (D) ~Dp"
58
+ target: "Let's think step by step. We refer to Wikipedia articles on formal logic\
59
+ \ for help. Let\u2019s solve this step by step. Let \u201CP\u201D denote \u201C\
60
+ being on Mars\u201D and \u201CD\u201D denote \u201Cdriving on Mars\u201D. Then\
61
+ \ let\u2019s consider each option:\nOption (A): ~Pd \u2192 d is not on Mars.\n\
62
+ Option (B): (\u2200x)(Px \u2228 ~Dx) \u2192 For all x, x is on Mars or x do\
63
+ \ not drive on Mars.\nOption (C): (\u2200x)(Px \u2283 ~Dx) \u2192 For all x,\
64
+ \ x is on Mars implies that x do not drive on Mars.\nOption (D): ~Dp: \u2192\
65
+ \ p do not drive on Mars.\nOf all these options, Option (C) appears to be the\
66
+ \ best and most meaningful interpretation of the argument \u201CNo people drive\
67
+ \ on Mars.\u201D The answer is (C).\n\n"
68
+ tag: mmlu_flan_cot_fewshot_humanities
69
+ include: _mmlu_flan_cot_fewshot_template_yaml
70
+ task: mmlu_flan_cot_fewshot_formal_logic
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_global_facts.yaml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: global_facts
2
+ description: The following are multiple choice questions (with answers) about global
3
+ facts.
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - question: "As of 2017, how many of the world\u2019s 1-year-old children today have\
8
+ \ been vaccinated against some disease? *\n(A) 80% (B) 60% (C) 40% (D) 20%"
9
+ target: Let's think step by step. We refer to Wikipedia articles on global facts
10
+ for help. According to data published by the World Health Organization, the
11
+ nummber of 1-year-old children vaccinated in 2017 exceeds 80%. The answer is
12
+ (A).
13
+ - question: 'As of 2019, about what percentage of Americans agree that the state is
14
+ run for the benefit of all the people?
15
+
16
+ (A) 31% (B) 46% (C) 61% (D) 76%'
17
+ target: Let's think step by step. We refer to Wikipedia articles on global facts
18
+ for help. In 2019, about 46% percentage of Americans agree that the state is
19
+ run for the benefit of all the people. The answer is (B).
20
+ - question: 'As of 2019, about what percentage of Russians say it is very important
21
+ to have free media in our country without government/state censorship?
22
+
23
+ (A) 38% (B) 53% (C) 68% (D) 83%'
24
+ target: Let's think step by step. We refer to Wikipedia articles on global facts
25
+ for help. As of 2019, about 38% of Russians say it is very important to have
26
+ free media in our country. The answer is (A).
27
+ - question: 'As of 2015, since 1990 forests have ____ in Europe and have ____ in Africa
28
+ and the Americas.
29
+
30
+ (A) increased, increased (B) increased, decreased (C) decreased, increased (D)
31
+ decreased, decreased'
32
+ target: Let's think step by step. We refer to Wikipedia articles on global facts
33
+ for help. As of 2015, since 1990 forests have increased in Europe and have decreased
34
+ in Africa and the Americas. The answer is (B).
35
+ - question: 'Which of the following pairs of statements are both true (as of 2019)?
36
+
37
+ (A) People tend to be optimistic about their own future and the future of their
38
+ nation or the world. (B) People tend to be optimistic about their own future
39
+ but pessimistic about the future of their nation or the world. (C) People tend
40
+ to be pessimistic about their own future but optimistic about the future of
41
+ their nation or the world. (D) People tend to be pessimistic about their own
42
+ future and the future of their nation or the world.'
43
+ target: 'Let''s think step by step. We refer to Wikipedia articles on global facts
44
+ for help. As of 2019, most people tend to be optimistic about their own future
45
+ but pessimistic about the future of their nation or the world. The answer is
46
+ (B).'
47
+ tag: mmlu_flan_cot_fewshot_other
48
+ include: _mmlu_flan_cot_fewshot_template_yaml
49
+ task: mmlu_flan_cot_fewshot_global_facts
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_chemistry.yaml ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: high_school_chemistry
2
+ description: The following are multiple choice questions (with answers) about high
3
+ school chemistry.
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - question: 'Which of the following is considered an acid anhydride?
8
+
9
+ (A) HCl (B) H2SO3 (C) SO2 (D) Al(NO3)3'
10
+ target: Let's think step by step. An acid anhydride is a compound that is derived
11
+ by removing water from an acid. The chemical formula for water is H2O, which
12
+ means that we need to determine which of these options, when combined with H2O,
13
+ forms an acid. SO2, or Sulfur dioxide, when combined with H2O, makes H2SO3,
14
+ or sulfurous acid. The answer is (C).
15
+ - question: 'Which of the following is expected to be a polar molecule?
16
+
17
+ (A) PCl4F (B) BF3 (C) CO2 (D) Si(CH3)4'
18
+ target: Let's think step by step. A polar molecule is one that has a slightly
19
+ positive charge on one end of the molecule and a slightly negative charge on
20
+ the other end. Boron trifluoride (BF3) has Boron as the center atom and three
21
+ fluorine atoms attached to it; it is trigonal planar and symmetric, so it is
22
+ nonpolar. Carbon Dioxide (CO2) has Carbon as the central atom with double bonds
23
+ to two Oxygen atoms - this is also symmetrical and therefore nonpolar. The same
24
+ is the case for tetramethyl silane (Si(CH3)4), which is a Silicon atom surrounded
25
+ by four methyl groups. The structure of PCl4F is that Phosphorus is the central
26
+ atom, attached to four chlorines and one fluorine atom. This is asymmetrical,
27
+ and therefore has a net dipole and is expected to be a polar molecule. The answer
28
+ is (A).
29
+ - question: 'From the solubility rules, which of the following is true?
30
+
31
+ (A) All chlorides, bromides, and iodides are soluble (B) All sulfates are soluble
32
+ (C) All hydroxides are soluble (D) All ammonium-containing compounds are soluble'
33
+ target: Let's think step by step. The chlorides, bromides, and iodides of lead,
34
+ silver, and mercury are not soluble in water. This rules out (A). The sulfates
35
+ of lead, barium, and calcium are not soluble in water, which rules out (B).
36
+ The hydroxides of any metal besides sodium, potassium, ammonium, calcium, and
37
+ barium are insoluble. This rules out (C). Typically ammonium ions indicate a
38
+ soluble ionic substance. The answer is (D).
39
+ - question: 'A new compound is synthesized and found to be a monoprotic acid with a
40
+ molar mass of 248 g/mol. When 0.0050 mol of this acid are dissolved in 0.500
41
+ L of water, the pH is measured as 3.89. What is the pKa of this acid?
42
+
43
+ (A) 3.89 (B) 7.78 (C) 5.78 (D) 2.33'
44
+ target: "Let's think step by step. Recall that $[A^{-}] = [H^{+}]$. Here, this is\
45
+ \ equal to $10^{-3.89}$. Then we have $K_{a} = \\frac{[H^{+}][A^{-}]}{[HA]}\
46
+ \ = \\frac{10^{-3.89} \\cdot 10^{-3.89}}{10^{-2}}$. The resulting exponent is\
47
+ \ $-3.89 + (-3.89) - (-2) = -5.78$, therefore $K_a = 10^{-5.78}$. The $pK_a$\
48
+ \ is the negative log of $K_a$, which is equal to $5.78$. The answer is (C)."
49
+ - question: 'A solution contains 2.00 mole of acetic acid, CH3COOH, and 1.00 mole of
50
+ calcium acetate, Ca(CH3COO)2. The solution is able to resist the addition of
51
+ a small amount of strong acid or strong base with only minor changes in the
52
+ pH of the solution. Larger quantities of strong acid or strong base can cause
53
+ a significant change in pH. How many moles of nitric acid, HNO3, may be added
54
+ before the pH begins to change significantly?
55
+
56
+ (A) 0.500 mole (B) 1.00 mole (C) 2.00 mole (D) 3.00 mole'
57
+ target: "Let's think step by step. We would like to compute the buffer capacity\
58
+ \ of this solution. First we write the equation for the ionization of the weak\
59
+ \ acid, in this case of acetic acid. $CH_{3}COOH (aq) + H_{2}O \\rightarrow H_{3}O^{+}\
60
+ \ + CH3COO^{-}$. The conjugate base is therefore the acetate ion. The added\
61
+ \ strong acid, Nitric acid, will react with the conjugate base. Therefore the\
62
+ \ maximum amount of acid that can be added will be equal to the amount of acetate\
63
+ \ ion, or 2 moles. The answer is (C).\n\n"
64
+ tag: mmlu_flan_cot_fewshot_stem
65
+ include: _mmlu_flan_cot_fewshot_template_yaml
66
+ task: mmlu_flan_cot_fewshot_high_school_chemistry
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_european_history.yaml ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: high_school_european_history
2
+ description: The following are multiple choice questions (with answers) about high
3
+ school european history.
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - question: 'This question refers to the following information.
8
+
9
+ Albeit the king''s Majesty justly and rightfully is and ought to be the supreme
10
+ head of the Church of England, and so is recognized by the clergy of this realm
11
+ in their convocations, yet nevertheless, for corroboration and confirmation
12
+ thereof, and for increase of virtue in Christ''s religion within this realm
13
+ of England, and to repress and extirpate all errors, heresies, and other enormities
14
+ and abuses heretofore used in the same, be it enacted, by authority of this
15
+ present Parliament, that the king, our sovereign lord, his heirs and successors,
16
+ kings of this realm, shall be taken, accepted, and reputed the only supreme
17
+ head in earth of the Church of England, called Anglicans Ecclesia; and shall
18
+ have and enjoy, annexed and united to the imperial crown of this realm, as well
19
+ the title and style thereof, as all honors, dignities, preeminences, jurisdictions,
20
+ privileges, authorities, immunities, profits, and commodities to the said dignity
21
+ of the supreme head of the same Church belonging and appertaining; and that
22
+ our said sovereign lord, his heirs and successors, kings of this realm, shall
23
+ have full power and authority from time to time to visit, repress, redress,
24
+ record, order, correct, restrain, and amend all such errors, heresies, abuses,
25
+ offenses, contempts, and enormities, whatsoever they be, which by any manner
26
+ of spiritual authority or jurisdiction ought or may lawfully be reformed, repressed,
27
+ ordered, redressed, corrected, restrained, or amended, most to the pleasure
28
+ of Almighty God, the increase of virtue in Christ''s religion, and for the conservation
29
+ of the peace, unity, and tranquility of this realm; any usage, foreign land,
30
+ foreign authority, prescription, or any other thing or things to the contrary
31
+ hereof notwithstanding.
32
+
33
+ English Parliament, Act of Supremacy, 1534
34
+
35
+ From the passage, one may infer that the English Parliament wished to argue
36
+ that the Act of Supremacy would
37
+
38
+ (A) give the English king a new position of authority (B) give the position
39
+ of head of the Church of England to Henry VIII alone and exclude his heirs (C)
40
+ establish Calvinism as the one true theology in England (D) end various forms
41
+ of corruption plaguing the Church in England'
42
+ target: Let's think step by step. We refer to Wikipedia articles on european history
43
+ for help. The Act of Supremacy states that it grants authority to the king "to
44
+ repress and extirpate all errors, heresies, and other enormities and abuses",
45
+ referring to the corruption in the Church of England. The answer is (D).
46
+ - question: "This question refers to the following information.\nRead the following\
47
+ \ excerpt.\nThe revolutionary seed had penetrated into every country and spread\
48
+ \ more or less. It was greatly developed under the r\xE9gime of the military\
49
+ \ despotism of Bonaparte. His conquests displaced a number of laws, institutions,\
50
+ \ and customs; broke through bonds sacred among all nations, strong enough to\
51
+ \ resist time itself; which is more than can be said of certain benefits conferred\
52
+ \ by these innovators.\nThe monarchs will fulfil the duties imposed upon them\
53
+ \ by Him who, by entrusting them with power, has charged them to watch over\
54
+ \ the maintenance of justice, and the rights of all, to avoid the paths of error,\
55
+ \ and tread firmly in the way of truth. Placed beyond the passions which agitate\
56
+ \ society, it is in days of trial chiefly that they are called upon to despoil\
57
+ \ realities of their false appearances, and to show themselves as they are,\
58
+ \ fathers invested with the authority belonging by right to the heads of families,\
59
+ \ to prove that, in days of mourning, they know how to be just, wise, and therefore\
60
+ \ strong, and that they will not abandon the people whom they ought to govern\
61
+ \ to be the sport of factions, to error and its consequences, which must involve\
62
+ \ the loss of society.\nUnion between the monarchs is the basis of the policy\
63
+ \ which must now be followed to save society from total ruin. . . .\nLet them\
64
+ \ not confound concessions made to parties with the good they ought to do for\
65
+ \ their people, in modifying, according to their recognized needs, such branches\
66
+ \ of the administration as require it.\nLet them be just, but strong; beneficent,\
67
+ \ but strict.\nLet them maintain religious principles in all their purity, and\
68
+ \ not allow the faith to be attacked and morality interpreted according to the\
69
+ \ social contract or the visions of foolish sectarians.\nLet them suppress Secret\
70
+ \ Societies; that gangrene of society.\n\u2014Klemens von Metternich, Political\
71
+ \ Confession of Faith, 1820\nWhich of the following was the greatest cause of\
72
+ \ the fears expressed by Metternich in the document above?\n(A) The ideas of\
73
+ \ personal liberty and nationalism conceived during the Enlightenment resulted\
74
+ \ in radical revolutions that could spread throughout Europe. (B) The conquest\
75
+ \ of Europe by Napoleon led to the creation of new factions and shifted the\
76
+ \ European balance of power. (C) The power of monarchs had grown to the point\
77
+ \ where it needed to be checked by other powers within each nation or domination\
78
+ \ of civilians would occur. (D) The rising and falling economic cycle of the\
79
+ \ newly emerging capitalist economy could lead to civilian unrest that must\
80
+ \ be suppressed."
81
+ target: Let's think step by step. We refer to Wikipedia articles on european history
82
+ for help. The fears of revolution in early 19th century Europe expressed by
83
+ Klemens von Metternich, a conservative Austrian statesman, were a direct result
84
+ of the age of Enlightenment, a period of European history where the absolute
85
+ power of the monarchy was challenged with ideas of individual liberty and nationalism,
86
+ leading to the French revolution and its effects all over Europe. The answer
87
+ is (A).
88
+ - question: 'This question refers to the following information.
89
+
90
+ The excerpts below are from the Navigation Acts of 1651.
91
+
92
+ [A]fter the first day of December, one thousand six hundred fifty and one, and
93
+ from thence forwards, no goods or commodities whatsoever of the growth, production
94
+ or manufacture of Asia, Africa or America, or of any part thereof; or of any
95
+ islands belonging to them, or which are described or laid down in the usual
96
+ maps or cards of those places, as well of the English plantations as others,
97
+ shall be imported or brought into this Commonwealth of England, or into Ireland,
98
+ or any other lands, islands, plantations, or territories to this Commonwealth
99
+ belonging, or in their possession, in any other ship or ships, vessel or vessels
100
+ whatsoever, but only in such as do truly and without fraud belong only to the
101
+ people of this Commonwealth, or the plantations thereof, as the proprietors
102
+ or right owners thereof; and whereof the master and mariners are also of the
103
+ people of this Commonwealth, under the penalty of the forfeiture and loss of
104
+ all the goods that shall be imported contrary to this act, , , ,
105
+
106
+ [N]o goods or commodities of the growth, production, or manufacture of Europe,
107
+ or of any part thereof, shall after the first day of December, one thousand
108
+ six hundred fifty and one, be imported or brought into this Commonwealth of
109
+ England, or any other lands or territories to this Commonwealth belonging, or
110
+ in their possession, in any ship or ships, vessel or vessels whatsoever, but
111
+ in such as do truly and without fraud belong only to the people of this Commonwealth,
112
+ and in no other, except only such foreign ships and vessels as do truly and
113
+ properly belong to the people of that country or place, of which the said goods
114
+ are the growth, production or manufacture.
115
+
116
+ Which of the following best describes the outcome of the Navigation Acts of
117
+ 1651?
118
+
119
+ (A) They served as a catalyst for the growth of English shipping and overseas
120
+ trade, but did little to limit the prospects of the Dutch in the seventeenth
121
+ century. (B) They brought about almost immediate hardships for the Dutch economy
122
+ as their dominance of overseas trade quickly ended. (C) They were rescinded
123
+ during the restoration of the Stuarts as they sought normal diplomatic relations
124
+ with the Dutch so not as to need Parliament''s financial support for war. (D)
125
+ They led to nearly a century of recurrent war between England and the Netherlands,
126
+ which would not end until after American independence.'
127
+ target: Let's think step by step. We refer to Wikipedia articles on european history
128
+ for help. The Navigation Acts of 1651 helped English shipping by restricting
129
+ the ability of ships from other European countries, especially the Dutch, to
130
+ transport goods from colonies in Asia and Africa into England. The answer is
131
+ (A).
132
+ - question: "This question refers to the following information.\nIn Russia there was\
133
+ \ nothing going on well, and [Souvarine] was in despair over the news he had\
134
+ \ received. His old companions were all turning to the politicians; the famous\
135
+ \ Nihilists who made Europe tremble-sons of village priests, of the lower middle\
136
+ \ class, of tradesmen-could not rise above the idea of national liberation,\
137
+ \ and seemed to believe that the world would be delivered-when they had killed\
138
+ \ their despot\u2026\n\"Foolery! They'll never get out of it with their foolery.\"\
139
+ \nThen, lowering his voice still more, in a few bitter words he described his\
140
+ \ old dream of fraternity. He had renounced his rank and his fortune; he had\
141
+ \ gone among workmen, only in the hope of seeing at last the foundation of a\
142
+ \ new society of labour in common. All the sous in his pockets had long gone\
143
+ \ to the urchins of the settlement; he had been as tender as a brother with\
144
+ \ the colliers, smiling at their suspicion, winning them over by his quiet workmanlike\
145
+ \ ways and his dislike of chattering. But decidedly the fusion had not taken\
146
+ \ place.\nHis voice changed, his eyes grew bright, he fixed them on \xC9tienne,\
147
+ \ directly addressing him:\n\"Now, do you understand that? These hatworkers\
148
+ \ at Marseilles who have won the great lottery prize of a hundred thousand francs\
149
+ \ have gone off at once and invested it, declaring that they are going to live\
150
+ \ without doing anything! Yes, that is your idea, all of you French workmen;\
151
+ \ you want to unearth a treasure in order to devour it alone afterwards in some\
152
+ \ lazy, selfish corner. You may cry out as much as you like against the rich,\
153
+ \ you haven't got courage enough to give back to the poor the money that luck\
154
+ \ brings you. You will never be worthy of happiness as long as you own anything,\
155
+ \ and your hatred of the bourgeois proceeds solely from an angry desire to be\
156
+ \ bourgeois yourselves in their place.\"\n\xC9mile Zola, French writer, Germinal,\
157
+ \ 1885\nThe passage displays the direct concern for the welfare of the working\
158
+ \ classes that was typically a part of which movement?\n(A) Capitalist (B) Scientific\
159
+ \ (C) Communist (D) Existentialist"
160
+ target: Let's think step by step. We refer to Wikipedia articles on european history
161
+ for help. The modern Communist movement aims to establish a classless society
162
+ based on communal ownership and distribution of property and means of production,
163
+ thereby especially benefiting the working classes. The answer is (C).
164
+ - question: "This question refers to the following information.\nThe following excerpt\
165
+ \ is from a pamphlet.\nYou will do me the justice to remember, that I have always\
166
+ \ strenuously supported the Right of every man to his own opinion, however different\
167
+ \ that opinion might be to mine. He who denies to another this right, makes\
168
+ \ a slave of himself to his present opinion, because he precludes himself the\
169
+ \ right of changing it.\nThe most formidable weapon against errors of every\
170
+ \ kind is Reason. I have never used any other, and I trust I never shall.\n\
171
+ The circumstance that has now taken place in France of the total abolition of\
172
+ \ the whole national order of priesthood, and of everything appertaining to\
173
+ \ compulsive systems of religion, and compulsive articles of faith, has not\
174
+ \ only precipitated my intention, but rendered a work of this kind exceedingly\
175
+ \ necessary, lest in the general wreck of superstition, of false systems of\
176
+ \ government, and false theology, we lose sight of morality, of humanity, and\
177
+ \ of the theology that is true.\nI believe in one God, and no more; and I hope\
178
+ \ for happiness beyond this life.\nI believe in the equality of man; and I believe\
179
+ \ that religious duties consist in doing justice, loving mercy, and endeavoring\
180
+ \ to make our fellow-creatures happy.\nI do not believe in the creed professed\
181
+ \ by the Jewish church, by the Roman church, by the Greek church, by the Turkish\
182
+ \ church, by the Protestant church, nor by any church that I know of. My own\
183
+ \ mind is my own church.\nAll national institutions of churches, whether Jewish,\
184
+ \ Christian or Turkish, appear to me no other than human inventions, set up\
185
+ \ to terrify and enslave mankind, and monopolize power and profit.\nI do not\
186
+ \ mean by this declaration to condemn those who believe otherwise; they have\
187
+ \ the same right to their belief as I have to mine.\n\u2014Thomas Paine, The\
188
+ \ Age of Reason, 1794\u20131795\nWhich of the following Enlightenment philosophes\
189
+ \ designed a system of checks and balances for government to avoid abuses of\
190
+ \ power?\n(A) Jean Jacques Rousseau (B) Baron Montesquieu (C) Mary Wollstonecraft\
191
+ \ (D) Adam Smith"
192
+ target: 'Let''s think step by step. We refer to Wikipedia articles on european
193
+ history for help. Baron Montesquieu was an 18th century French philosopher who
194
+ wrote extensively against the monopolization of power and advocated for a system
195
+ of checks and balances in government to prevent the rise of despotism. The answer
196
+ is (B).'
197
+ tag: mmlu_flan_cot_fewshot_humanities
198
+ include: _mmlu_flan_cot_fewshot_template_yaml
199
+ task: mmlu_flan_cot_fewshot_high_school_european_history
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_geography.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: high_school_geography
2
+ description: The following are multiple choice questions (with answers) about high
3
+ school geography.
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - question: 'Which one of the following items is an example of nonmaterial culture?
8
+
9
+ (A) Dove soap (B) Dove candy bar (C) Dove symbol (D) A dove (bird).'
10
+ target: Let's think step by step. We refer to Wikipedia articles on geography
11
+ for help. Nonmaterial culture consists of cultural ideas, beliefs or symbols
12
+ that are not physical objects. The answer is (C).
13
+ - question: 'During the third stage of the demographic transition model, which of the
14
+ following is true?
15
+
16
+ (A) Birth rates increase and population growth rate is less rapid. (B) Birth
17
+ rates decline and population growth rate is less rapid. (C) Birth rates increase
18
+ and population growth rate increases. (D) Birth rates decrease and population
19
+ growth rate increases.'
20
+ target: Let's think step by step. We refer to Wikipedia articles on geography
21
+ for help. The demographic transition model models the five different stages
22
+ of population growth as a country goes through economic development, where the
23
+ third stage refers to a period of declining birth rates and lower population
24
+ growth. The answer is (B).
25
+ - question: 'The practice of hiring a foreign third-party service provider to run an
26
+ operation is called
27
+
28
+ (A) outsourcing. (B) offshoring. (C) maquiladoras. (D) locational interdependence.'
29
+ target: Let's think step by step. We refer to Wikipedia articles on geography
30
+ for help. "Offshoring" literally means to move or base some of the activities
31
+ or processes of a company to a foreign country. The answer is (B).
32
+ - question: 'Which of the following statements is NOT accurate regarding the services
33
+ provided by local governments in the United States?
34
+
35
+ (A) Duplication of efforts occurs often. (B) Social problems of the central
36
+ city spill over into the surrounding residential suburbs. (C) Inefficiency in
37
+ providing services occurs often. (D) One neighborhood''s efforts to reduce pollution
38
+ are always supported by neighboring communities.'
39
+ target: Let's think step by step. We refer to Wikipedia articles on geography
40
+ for help. There may be economic, social or political reasons for two neighboring
41
+ communities and their local governments not agreeing to pollution reduction
42
+ efforts initiated by one of them. The answer is (D).
43
+ - question: 'The rate of natural increase of a population is found by subtracting the
44
+
45
+ (A) crude death rate from the crude birth rate. (B) crude birth rate from the
46
+ crude death rate. (C) doubling time from the crude birth rate. (D) fertility
47
+ rate from the crude death rate.'
48
+ target: 'Let''s think step by step. We refer to Wikipedia articles on geography
49
+ for help. The difference between number of births and deaths gives the population
50
+ increase at any given time. The answer is (A).'
51
+ tag: mmlu_flan_cot_fewshot_social_sciences
52
+ include: _mmlu_flan_cot_fewshot_template_yaml
53
+ task: mmlu_flan_cot_fewshot_high_school_geography
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_government_and_politics.yaml ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: high_school_government_and_politics
2
+ description: The following are multiple choice questions (with answers) about high
3
+ school government and politics.
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - question: 'Which of the following best states an argument made by James Madison in
8
+ The Federalist number 10?
9
+
10
+ (A) Honest politicians can prevent factions from developing. (B) Factions are
11
+ more likely to occur in large republics than in small ones. (C) The negative
12
+ effects of factionalism can be reduced by a republican government. (D) Free
13
+ elections are the people''s best defense against factionalism.'
14
+ target: Let's think step by step. We refer to Wikipedia articles on government
15
+ and politics for help. In the Federalist number 10, James Madison advocated
16
+ for a representative republican form of government to guard against factionalism.
17
+ The answer is (C).
18
+ - question: 'The term "budget deficit" refers to the
19
+
20
+ (A) annual increase in federal spending on the military (B) amount of interest
21
+ on the national debt (C) difference between the initial budget proposals made
22
+ by the president and Congress (D) amount the government spends in excess of
23
+ its revenues'
24
+ target: Let's think step by step. We refer to Wikipedia articles on government
25
+ and politics for help. When the government spends more than it earns, the difference
26
+ is the budget deficit. The answer is (D).
27
+ - question: 'Which of the following statements about cabinet departments is FALSE?
28
+
29
+ (A) They are established by the legislative branch. (B) Their members often
30
+ don''t have much influence over presidential decisions. (C) They cannot all
31
+ be run by leaders who belong to the same political party the president does.
32
+ (D) Not every federal agency is a cabinet department.'
33
+ target: Let's think step by step. We refer to Wikipedia articles on government
34
+ and politics for help. There is no law stipulating that some cabinet department
35
+ leaders have to belong to a political party different from that of the president.
36
+ The answer is (C).
37
+ - question: 'Which of the following cases established the precedent that a defendant
38
+ must be informed of the right to remain silent, the right to a lawyer, and protection
39
+ from self-incrimination?
40
+
41
+ (A) Weeks v. United States (B) Betts v. Brady (C) Mapp v. Ohio (D) Miranda v.
42
+ Arizona'
43
+ target: Let's think step by step. We refer to Wikipedia articles on government
44
+ and politics for help. In the landmark Miranda v. Arizona in 1966, the US Supreme
45
+ Court, based on the Fifth and Sixth Amendment of the US Constitution, guaranteed
46
+ a defendant's right to an attorney and protection from self-incrimination. The
47
+ answer is (D).
48
+ - question: 'Uncertainty over the limits to presidential power is caused primarily
49
+ by the fact that
50
+
51
+ (A) the constitutional definition of those powers is broad and unspecific (B)
52
+ most people agree that the Constitution places too many limits on presidential
53
+ power (C) the Supreme Court consistently refuses to rule on cases concerning
54
+ presidential powers (D) constitutional amendments have greatly increased presidential
55
+ powers'
56
+ target: 'Let''s think step by step. We refer to Wikipedia articles on government
57
+ and politics for help. The US Constitution is not very specific about the powers
58
+ of the president, leading to uncertainty over its limits. The answer is (A).'
59
+ tag: mmlu_flan_cot_fewshot_social_sciences
60
+ include: _mmlu_flan_cot_fewshot_template_yaml
61
+ task: mmlu_flan_cot_fewshot_high_school_government_and_politics
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_macroeconomics.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: high_school_macroeconomics
2
+ description: The following are multiple choice questions (with answers) about high
3
+ school macroeconomics.
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - question: 'Which of the following policies best describes supply-side fiscal policy?
8
+
9
+ (A) An increase in the money supply (B) Increased government spending (C) Lower
10
+ taxes on research and development of new technology (D) Higher taxes on household
11
+ income'
12
+ target: Let's think step by step. We refer to Wikipedia articles on macroeconomics
13
+ for help. Supply-side fiscal policy stimulates the economy by encouraging more
14
+ production of goods and services through reduction in taxes and deregulation.
15
+ The answer is (C).
16
+ - question: 'The short-run Phillips curve indicates a
17
+
18
+ (A) direct relation between unemployment and inflation (B) direct relation between
19
+ price and quantity demanded (C) inverse relation between price and quantity
20
+ demanded (D) inverse relation between unemployment and inflation'
21
+ target: Let's think step by step. We refer to Wikipedia articles on macroeconomics
22
+ for help. The short-run Phillips curve shows that whenever unemployment decreases
23
+ below a natural level, the inflation starts increasing, and vice-versa. The
24
+ answer is (D).
25
+ - question: 'Holding all else equal which of the following monetary policies would
26
+ be used to boost U.S. exports?
27
+
28
+ (A) Increasing the discount rate (B) Increasing the reserve ratio (C) Buying
29
+ government securities (D) Lowering tariffs'
30
+ target: Let's think step by step. We refer to Wikipedia articles on macroeconomics
31
+ for help. Buying government securities leads to reduction in demand for US dollars
32
+ from foreign buyers, thereby making it cheaper and hence making US exports more
33
+ attractive. The answer is (C).
34
+ - question: 'A federal deficit occurs when
35
+
36
+ (A) exports exceed imports. (B) imports exceed exports. (C) federal tax collections
37
+ exceed spending. (D) federal spending exceeds federal tax revenues.'
38
+ target: Let's think step by step. We refer to Wikipedia articles on macroeconomics
39
+ for help. A federal deficit occurs when federal spending exceeds federal income
40
+ which is primarily from tax revenues. The answer is (D).
41
+ - question: 'Which of the following is not included in the U.S. GDP?
42
+
43
+ (A) The U.S. military opens a new base in a foreign country with 1000 U.S. personnel.
44
+ (B) Japanese consumers buy thousands of CDs produced in the United States. (C)
45
+ An American pop singer performs a sold-out concert in Paris. (D) A French theatrical
46
+ production tours dozens of American cities.'
47
+ target: 'Let''s think step by step. We refer to Wikipedia articles on macroeconomics
48
+ for help. The economic transactions related to the performance of the American
49
+ pop-singer in Paris happens entirely outside the U.S. and hence is not included
50
+ in the GDP numbers. The answer is (C).'
51
+ tag: mmlu_flan_cot_fewshot_social_sciences
52
+ include: _mmlu_flan_cot_fewshot_template_yaml
53
+ task: mmlu_flan_cot_fewshot_high_school_macroeconomics
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_mathematics.yaml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: high_school_mathematics
2
+ description: The following are multiple choice questions (with answers) about high
3
+ school mathematics.
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - question: 'Simplify and write the result with a rational denominator: $$\sqrt{\sqrt[3]{\sqrt{\frac{1}{729}}}}$$
8
+
9
+ (A) \frac{3\sqrt{3}}{3} (B) \frac{1}{3} (C) \sqrt{3} (D) \frac{\sqrt{3}}{3}'
10
+ target: Let's think step by step. Factoring $729=3^6$ and combining the roots
11
+ $\frac{1}{2}\cdot\frac{1}{3}\cdot\frac{1}{2}=\frac{1}{12}$, we get that $\sqrt{\sqrt[3]{\sqrt{\frac{1}{729}}}}=\left(\frac{1}{3^6}\right)^{\frac{1}{12}}=\frac{1}{3^{\frac{1}{2}}}=\frac{\sqrt{3}}{3}$.
12
+ The answer is (D).
13
+ - question: 'Five thousand dollars compounded annually at an $x\%$ interest rate takes
14
+ six years to double. At the same interest rate, how many years will it take
15
+ $\$300$ to grow to $\$9600$?
16
+
17
+ (A) 12 (B) 1 (C) 30 (D) 5'
18
+ target: Let's think step by step. To go from $\$300$ to $\$9600$, the value must
19
+ go up by a factor of $9600/300=32=2^5$. Since at this interest rate it takes
20
+ six years for it to double, it will take $5*6=30$ years to grow to $\$9600$.
21
+ The answer is (C).
22
+ - question: "Ten students take a biology test and receive the following scores: 45,\
23
+ \ 55, 50, 70, 65, 80, 40, 90, 70, 85. What is the mean of the students\u2019\
24
+ \ test scores?\n(A) 55 (B) 60 (C) 62 (D) 65"
25
+ target: Let's think step by step. There are 10 students and the sum of their scores
26
+ is $45 + 55 + 50 + 70 + 65 + 80 + 40 + 90 + 70 + 85 = 650$, the mean is $650/10=65$.
27
+ The answer is (D).
28
+ - question: 'The variable $x$ varies directly as the square of $y$, and $y$ varies
29
+ directly as the cube of $z$. If $x$ equals $-16$ when $z$ equals 2, what is
30
+ the value of $x$ when $z$ equals $\frac{1}{2}$?
31
+
32
+ (A) -1 (B) 16 (C) -\frac{1}{256} (D) \frac{1}{16}'
33
+ target: Let's think step by step. We know that $x \propto y^2$ and $y \propto
34
+ z^3$, so $x = k z^6$ for some constant $k$. Plugging in for $x=-16$ and $z=2$,
35
+ the constant value is $k=\frac{x}{z^6}=\frac{-16}{64}=-\frac{1}{4}$. So, when
36
+ $z=\frac{1}{2}$, the value of $x$ is $x=kz^6=-\frac{1}{4}\frac{1}{2^6}=-\frac{1}{256}$.
37
+ The answer is (C).
38
+ - question: 'Joe was in charge of lights for a dance. The red light blinks every two
39
+ seconds, the yellow light every three seconds, and the blue light every five
40
+ seconds. If we include the very beginning and very end of the dance, how many
41
+ times during a seven minute dance will all the lights come on at the same time?
42
+ (Assume that all three lights blink simultaneously at the very beginning of
43
+ the dance.)
44
+
45
+ (A) 3 (B) 15 (C) 6 (D) 5'
46
+ target: 'Let''s think step by step. The least common multiple of 2, 3 and 5 is
47
+ 30, so during a 7 minute dance, all the three lights will come on at the same
48
+ time $2*7+1=15$ times. The answer is (B).'
49
+ tag: mmlu_flan_cot_fewshot_stem
50
+ include: _mmlu_flan_cot_fewshot_template_yaml
51
+ task: mmlu_flan_cot_fewshot_high_school_mathematics
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_microeconomics.yaml ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: high_school_microeconomics
2
+ description: The following are multiple choice questions (with answers) about high
3
+ school microeconomics.
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - question: 'Which of the following is necessarily a characteristic of oligopoly?
8
+
9
+ (A) Free entry into and exit from the market (B) A few large producers (C) One
10
+ producer of a good with no close substitutes (D) A homogenous product'
11
+ target: Let's think step by step. We refer to Wikipedia articles on microeconomics
12
+ for help. An oligopoly is when a market is dominated by just a small number
13
+ of sellers or producers. To get oligopoly, the market should have high barriers
14
+ to new entry, and the product has differentiation. The answer is (B).
15
+ - question: 'If the government subsidizes producers in a perfectly competitive market,
16
+ then
17
+
18
+ (A) the demand for the product will increase (B) the demand for the product
19
+ will decrease (C) the consumer surplus will increase (D) the consumer surplus
20
+ will decrease'
21
+ target: Let's think step by step. We refer to Wikipedia articles on microeconomics
22
+ for help. (A) and (B) are wrong because the demand curve does not change at
23
+ all. If the government subsidizes producers, the supply will increase, and thus
24
+ the consumer surplus also increases. The answer is (C).
25
+ - question: 'Which of the following is true of a price floor?
26
+
27
+ (A) The price floor shifts the demand curve to the left. (B) An effective floor
28
+ creates a shortage of the good. (C) The price floor shifts the supply curve
29
+ of the good to the right. (D) To be an effective floor, it must be set above
30
+ the equilibrium price.'
31
+ target: Let's think step by step. We refer to Wikipedia articles on microeconomics
32
+ for help. A price floor does not shift the demand or supply curve. An effective
33
+ price floor should be set above the equilibrium price, otherwise the market
34
+ clears at equilibrium and the floor has no effect. The answer is (D).
35
+ - question: 'The concentration ratio for a monopoly is
36
+
37
+ (A) 0 (B) 5 (C) 10 (D) 100'
38
+ target: Let's think step by step. We refer to Wikipedia articles on microeconomics
39
+ for help. The concentration ratio is calculated as the sum of market share of
40
+ a specific number of largest companies. Monopoly means one company or entity
41
+ controls the entire market, therefore, the concentration ratio is 100 percent.
42
+ The answer is (D).
43
+ - question: 'In a competitive labor market for housepainters, which of the following
44
+ would increase the demand for housepainters?
45
+
46
+ (A) An effective minimum wage imposed on this labor market. (B) An increase
47
+ in the price of gallons of paint. (C) An increase in the construction of new
48
+ houses. (D) An increase in the price of mechanical painters so long as the output
49
+ effect exceeds the substitution effect.'
50
+ target: 'Let''s think step by step. We refer to Wikipedia articles on microeconomics
51
+ for help. An increase in the construction of new houses means an increased demand
52
+ for house painting, which in turn increases the demand for housepainters. The answer
53
+ is (C).'
54
+ tag: mmlu_flan_cot_fewshot_social_sciences
55
+ include: _mmlu_flan_cot_fewshot_template_yaml
56
+ task: mmlu_flan_cot_fewshot_high_school_microeconomics
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_physics.yaml ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: high_school_physics
2
+ description: The following are multiple choice questions (with answers) about high
3
+ school physics.
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - question: 'A microwave oven is connected to an outlet, 120 V, and draws a current
8
+ of 2 amps. At what rate is energy being used by the microwave oven?
9
+
10
+ (A) 10 W (B) 30 W (C) 60 W (D) 240 W'
11
+ target: Let's think step by step. Rate of energy usage is known as power; in a
12
+ dissipative electrical circuit, power is given by voltage times current. So
13
+ in our case, the power is 120 V times 2 amps, or 240 W. The answer is (D).
14
+ - question: "A point charge, Q = +1 mC, is fixed at the origin. How much work is required\
15
+ \ to move a charge, Q = +8 \xB5C, from the point (0, 4 meters) to the point\
16
+ \ (3 meters, 0)?\n(A) 3.5 J (B) 6.0 J (C) 22.5 J (D) 40 J"
17
+ target: "Let's think step by step. To calculate the work required to move a charge\
18
+ \ from one location to another in a fixed electric field, it is enough to calculate\
19
+ \ the potential difference between the two locations. Here, the potential only\
20
+ \ depends on the distance between the charges; it\u2019s $k q_1 q_2 / r$, where\
21
+ \ $k$ is Coulomb\u2019s constant. Plugging in values $q_1 = $ 1 mC, $q_2 = 8\
22
+ \ \\mu$ C, gives the answer as 5.992 J, which rounds to 6 J. The answer is (B)."
23
+ - question: 'Which of the following conditions will ensure that angular momentum is
24
+ conserved? I. Conservation of linear momentum II. Zero net external force III.
25
+ Zero net external torque
26
+
27
+ (A) I and II only (B) I and III only (C) II and III only (D) III only'
28
+ target: Let's think step by step. Torque is defined as the rate of change of angular momentum;
29
+ if there is zero external torque, angular momentum is conserved. The answer
30
+ is (D).
31
+ - question: "A photocell of work function \u03D5 = 2eV is connected to a resistor in\
32
+ \ series. Light of frequency f = 1 \xD7 10^15 Hz hits a metal plate of the photocell.\
33
+ \ If the power of the light is P = 100 W, what is the current through the resistor?\n\
34
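+ (A) 2 A (B) 6 A (C) 12 A (D) 24 A"
35
+ target: Let's think step by step. Each photon carries energy hf = 4.14 eV, above the 2 eV work function, so electrons are ejected. The light delivers P/(hf) = 1.5 * 10^20 photons per second;
36
+ at one electron per photon the current is 1.5 * 10^20 * 1.6 * 10^-19 C, or about 24 A. The answer is (D).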
37
+ - question: "A pipe full of air is closed at one end. A standing wave is produced in\
38
+ \ the pipe, causing the pipe to sound a note. Which of the following is a correct\
39
+ \ statement about the wave\u2019s properties at the closed end of the pipe?\n\
40
+ (A) The pressure is at a node, but the particle displacement is at an antinode.\
41
+ \ (B) The pressure is at an antinode, but the particle displacement is at a\
42
+ \ node. (C) The pressure and the particle displacement are both at nodes. (D)\
43
+ \ The pressure and the particle displacement are both at antinodes."
44
+ target: 'Let''s think step by step. At the closed end of the pipe, the particles
45
+ cannot have any net displacement because the pipe closure stops them. So the
46
+ particle displacement is at a node. This closure also causes the pressure to
47
+ be maximal, i.e. an antinode. The answer is (B).'
48
+ tag: mmlu_flan_cot_fewshot_stem
49
+ include: _mmlu_flan_cot_fewshot_template_yaml
50
+ task: mmlu_flan_cot_fewshot_high_school_physics
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_psychology.yaml ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: high_school_psychology
2
+ description: The following are multiple choice questions (with answers) about high
3
+ school psychology.
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - question: 'Pascale is interested in the processing strategies children use to learn
8
+ new information. Pascale would best be classified as what type of psychologist?
9
+
10
+ (A) sociocultural (B) clinical (C) cognitive (D) behaviorist'
11
+ target: Let's think step by step. We refer to Wikipedia articles on psychology
12
+ for help. Sociocultural psychologist focuses on the effect of societal factors
13
+ on people. Clinical psychologist focuses on people with mental issues. Cognitive
14
+ psychologist focuses on how people think and learn, including the processing
15
+ strategies. Behaviorist focuses more on the environment and experience effect
16
+ on people. The answer is (C).
17
+ - question: 'According to Caplan''s model of consultee-centered case consultation,
18
+ the consultant is primarily interested in
19
+
20
+ (A) identifying the causes and solutions of the client''s presenting problems
21
+ (B) identifying and eliminating the causes of the consultee''s difficulties
22
+ in handling a problem (C) establishing a hierarchy of authority to enable effective
23
+ decision making (D) presenting a single, well-defined and unambiguous course
24
+ of action for the consultant to overcome skills deficits'
25
+ target: Let's think step by step. We refer to Wikipedia articles on psychology
26
+ for help. Caplan defines two types of consultation. Client-centered case consultation
27
+ aims to handle client's problems, while consultee-centered case consultation
28
+ aims to identify the reasons for the consultee's difficulty in solving problems. The answer
29
+ is (B).
30
+ - question: 'According to the Individuals with Disabilities Education Improvement Act,
31
+ which of the following must an educational agency do before it changes the educational
32
+ placement of a student with a disability?
33
+
34
+ (A) Give the child a trial period in the new environment (B) Notify the parents
35
+ in writing (C) Obtain school board approval (D) Obtain parental consent'
36
+ target: Let's think step by step. We refer to Wikipedia articles on psychology
37
+ for help. When the decision to change the educational placement of a student
38
+ with a disability is made, the educational agency must notify the parents in
39
+ writing on that date. The answer is (B).
40
+ - question: 'While swimming in the ocean, Ivan is frightened by a dark shadow in the
41
+ water even before he has the chance to identify what the shadow is. The synaptic
42
+ connections taking place during this incident of fright are best described by
43
+ which of the following?
44
+
45
+ (A) Messages are sent from the thalamus directly to the amygdala. (B) Messages
46
+ are sent from the thalamus to the "what" and "where" pathways. (C) Messages
47
+ are sent from the parasympathetic nervous system to the cerebral cortex. (D)
48
+ Messages are sent from the frontal lobes to the pituitary gland.'
49
+ target: Let's think step by step. We refer to Wikipedia articles on psychology
50
+ for help. Our neural system has a mechanism that can respond to an immediate emotional
51
+ signal before going to the thought center. In Ivan's case, messages travel
52
+ directly from thalamus to amygdala. The answer is (A).
53
+ - question: 'Ani believes that her attitudes and behavior play a central role in what
54
+ happens to her. Such a belief is likely to be associated with
55
+
56
+ (A) a strong superego. (B) low self-esteem. (C) low self-efficacy. (D) an internal
57
+ locus of control.'
58
+ target: 'Let''s think step by step. We refer to Wikipedia articles on psychology
59
+ for help. People with an external locus of control believe fate and luck play
60
+ an important role in their lives, while people with an internal locus of control
61
+ believe they control their lives. The answer is (D).'
62
+ tag: mmlu_flan_cot_fewshot_social_sciences
63
+ include: _mmlu_flan_cot_fewshot_template_yaml
64
+ task: mmlu_flan_cot_fewshot_high_school_psychology
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_statistics.yaml ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: high_school_statistics
2
+ description: The following are multiple choice questions (with answers) about high
3
+ school statistics.
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - question: 'A new smartwatch is manufactured in one part of a factory, then secured
8
+ for shipping in another, independent part of the factory. The weight of the
9
+ smartwatch has a mean of 62 grams and a standard deviation of 1.0 grams. The
10
+ weight of the packaging (box, user''s guide, bubble wrap, etc.) has a mean of
11
+ 456 grams and a standard deviation of 6 grams. Together, the distribution of
12
+ the weight of the smartwatch and its packaging would have the following mean
13
+ and standard deviation:
14
+
15
+ (A) Mean 518 grams; standard deviation 7.0 grams (B) Mean 518 grams; standard
16
+ deviation 3.5 grams (C) Mean 518 grams; standard deviation 6.1 grams (D) Mean
17
+ 394 grams; standard deviation 6.1 grams'
18
+ target: Let's think step by step. Since the weight of the watch and the weight
19
+ of the packaging are independent random variables, the mean and variance of
20
+ their sum is equal to the sum of their individual means and variances. So the
21
+ mean is 62 + 456 = 518 grams, and the variance is 1.0^2 + 6.0^2 = 37, leading
22
+ to a standard deviation of 6.1 grams. The answer is (C).
23
+ - question: 'After a frost warning was issued, the owner of a large orange grove asked
24
+ his workers to spray all his trees with water. The water was supposed to freeze
25
+ and form a protective covering of ice around the orange blossom. Nevertheless,
26
+ the owner suspected that some trees suffered considerable damage due to the
27
+ frost. To estimate the proportion of trees that suffered more than 50 percent
28
+ damage due to the frost, he took a random sample of 100 trees from his grove.
29
+ What is the response variable in this experiment?
30
+
31
+ (A) The proportion of trees that suffered more than 50 percent damage due to
32
+ frost. (B) The number of trees affected by the frost. (C) The number of trees
33
+ sampled from the grove. (D) For each sampled tree, whether it suffered more
34
+ than 50 percent damage or at most 50 percent damage.'
35
+ target: Let's think step by step. In this experiment, the response variable is
36
+ what is measured. For each tree, what is measured is whether or not it suffered
37
+ more than 50 percent damage due to the frost. The answer is (D).
38
+ - question: 'Suppose X and Y are random variables with E(X) = 37, var(X) = 5, E(Y)
39
+ = 62, and var(Y) = 12. What are the expected value and variance of the random
40
+ variable X + Y?
41
+
42
+ (A) E(X + Y) = 99, var(X + Y) = 8.5 (B) E(X + Y) = 99, var(X + Y) = 13 (C) E(X
43
+ + Y) = 99, var(X + Y) = 17 (D) There is insufficient information to answer this
44
+ question.'
45
+ target: Let's think step by step. While means of sums of random variables add
46
+ (regardless of whether the variables are independent) in order to determine
47
+ the variance of a sum of random variables, we need to know not just their individual
48
+ variances but the covariance of the two variables, which is not given in this
49
+ problem. The answer is (D).
50
+ - question: 'Which of the following sets has the smallest standard deviation? Which
51
+ has the largest?
52
+
53
+ I: {1,2,3}
54
+
55
+ II: {-10,10}
56
+
57
+ III: {100}
58
+
59
+ (A) I, II (B) II, III (C) III, I (D) III, II'
60
+ target: Let's think step by step. The variance of distribution I is the expected
61
+ squared deviation from its mean (which is 2), so the variance is 2/3 . The variance
62
+ of distribution II is 10^2 (because both elements are 10 away from the mean
63
+ of zero). The variance of distribution III is 0, since it has a single entry.
64
+ So distribution III has the smallest standard deviation and distribution II
65
+ has the largest. The answer is (D).
66
+ - question: 'Which of the following is a correct statement about correlation?
67
+
68
+ (A) If the slope of the regression line is exactly 1, then the correlation is
69
+ exactly 1. (B) If the correlation is 0, then the slope of the regression line
70
+ is undefined. (C) Switching which variable is called x and which is called y
71
+ changes the sign of the correlation. (D) The correlation r is equal to the slope
72
+ of the regression line when z-scores for the y-variable are plotted against
73
+ z-scores for the x-variable.'
74
+ target: 'Let''s think step by step. Statement A is false because the slope of
75
+ the regression line being exactly 1 can occur even when the two variables are
76
+ not perfectly correlated. Statement B is false because for uncorrelated variables
77
+ regression lines can have slope zero. Statement C is false because correlation
78
+ is symmetric in the two random variables. The answer is (D).'
79
+ tag: mmlu_flan_cot_fewshot_stem
80
+ include: _mmlu_flan_cot_fewshot_template_yaml
81
+ task: mmlu_flan_cot_fewshot_high_school_statistics
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_international_law.yaml ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: international_law
2
+ description: The following are multiple choice questions (with answers) about international
3
+ law.
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - question: 'How the consent to be bound of a State may be expressed?
8
+
9
+ (A) The consent of a State to be bound is expressed only by ratification (B)
10
+ The consent of a state to be bound by a treaty may be expressed by signature,
11
+ ratification, acceptance, approval or accession (C) The consent of a State to
12
+ be bound is expressed by signature (D) The consent of a State to be bound is
13
+ expressed by whatever means they choose'
14
+ target: Let's think step by step. We refer to Wikipedia articles on international
15
+ law for help. Article 11 of Vienna Convention on the Law of Treaties signed
16
+ in 1969 states that "the consent of a State to be bound by a treaty may be expressed
17
+ by signature, exchange of instruments constituting a treaty, ratification, acceptance,
18
+ approval or accession, or by any other means if so agreed." (B) is the most
19
+ precise and accurate answer. The answer is (B).
20
+ - question: 'What is the judge ad hoc?
21
+
22
+ (A) If a party to a contentious case before the ICJ does not have a national
23
+ sitting as judge, it is entitled to nominate someone as a judge solely for that
24
+ case, with the title of judge ad hoc (B) Judge ad hoc is the member of the bench
25
+ of the ICJ with a casting vote (C) Judge ad hoc is a surrogate judge, in case
26
+ a judge is disqualified or passes away (D) Judge ad hoc is the judge that each
27
+ party will always nominate in every contentious case'
28
+ target: Let's think step by step. We refer to Wikipedia articles on international
29
+ law for help. As "ad hoc" implies, a judge ad hoc is appointed only for a specific
30
+ case or period, when a party to a contentious case before the International
31
+ Court of Justice does not have a regular national sitting as judge. The answer
32
+ is (A).
33
+ - question: 'When ''consent'' can serve as a circumstance precluding the wrongfulness
34
+ of a State conduct?
35
+
36
+ (A) Consent can serve as a circumstance precluding the wrongfulness whenever
37
+ it is given (B) Consent can never serve as a circumstance precluding wrongfulness
38
+ (C) Consent can serve as a circumstance precluding wrongfulness, provided the
39
+ consent is valid and to the extent that the conduct remains within the limits
40
+ of the consent given (D) Consent can always serve as a circumstance precluding
41
+ wrongfulness, no matter which organ of the State gives it'
42
+ target: Let's think step by step. We refer to Wikipedia articles on international
43
+ law for help. Valid consent can serve as a circumstance precluding the wrongfulness
44
+ of a State conduct if the conduct remains within the limits of that consent,
45
+ according to Chapter V of the Responsibility of States for Internationally Wrongful
46
+ Acts, 2001, United Nations. The answer is (C).
47
+ - question: 'Would a reservation to the definition of torture in the ICCPR be acceptable
48
+ in contemporary practice?
49
+
50
+ (A) This is an acceptable reservation if the reserving country''s legislation
51
+ employs a different definition (B) This is an unacceptable reservation because
52
+ it contravenes the object and purpose of the ICCPR (C) This is an unacceptable
53
+ reservation because the definition of torture in the ICCPR is consistent with
54
+ customary international law (D) This is an acceptable reservation because under
55
+ general international law States have the right to enter reservations to treaties'
56
+ target: Let's think step by step. We refer to Wikipedia articles on international
57
+ law for help. Because it contravenes the object and purpose of the ICCPR, this is
58
+ an unacceptable reservation in contemporary practice. The answer is (B).
59
+ - question: 'What types of force does Article 2(4) of the UN Charter prohibit?
60
+
61
+ (A) Article 2(4) encompasses only armed force (B) Article 2(4) encompasses all
62
+ types of force, including sanctions (C) Article 2(4) encompasses all interference
63
+ in the domestic affairs of States (D) Article 2(4) encompasses force directed
64
+ only against a State''s territorial integrity'
65
+ target: 'Let''s think step by step. We refer to Wikipedia articles on international
66
+ law for help. Article 2(4) of the UN Charter prohibits states from using armed
67
+ forces in their international relations. The answer is (A).'
68
+ tag: mmlu_flan_cot_fewshot_humanities
69
+ include: _mmlu_flan_cot_fewshot_template_yaml
70
+ task: mmlu_flan_cot_fewshot_international_law
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_jurisprudence.yaml ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: jurisprudence
2
+ description: The following are multiple choice questions (with answers) about jurisprudence.
3
+ fewshot_config:
4
+ sampler: first_n
5
+ samples:
6
+ - question: 'Iverson Jewelers wrote a letter to Miller, ''We have received an exceptionally
7
+ fine self winding Rolox watch which we will sell to you at a very favorable
8
+ price.''
9
+
10
+ (A) The letter is an offer to sell (B) A valid offer cannot be made by letter.
11
+ (C) The letter contains a valid offer which will terminate within a reasonable
12
+ time. (D) The letter lacks one of the essential elements of an offer.'
13
+ target: Let's think step by step. We refer to Wikipedia articles on jurisprudence
14
+ for help. An offer shows the intent to enter into a mutually-beneficial contract
15
+ with specific terms. An offer can be made by a letter. While this letter indicates
16
+ the willingness to sell, the lack of specific terms, such as transaction price
17
+ and offer expiration date, makes it an incomplete offer. The answer is (D).
18
+ - question: 'Functions of the law include all but which of the following?
19
+
20
+ (A) maximizing individual freedom (B) providing a basis for compromise (C) keeping
21
+ the peace (D) promoting the principles of the free enterprise system'
22
+ target: Let's think step by step. We refer to Wikipedia articles on jurisprudence
23
+ for help. Laws are fundamentally about helping resolve disputes between individuals,
24
+ and therefore essential for maximizing individual freedom, providing a basis
25
+ for compromise, and keeping the peace. The answer is (D).
26
+ - question: 'The ________ School of jurisprudence postulates that the law is based
27
+ on what is "correct."
28
+
29
+ (A) Natural Law (B) Analytical (C) Historical (D) Sociological'
30
+ target: Let's think step by step. We refer to Wikipedia articles on jurisprudence
31
+ for help. Natural Law School of jurisprudence focuses on the laws of nature,
32
+ and states that the law should be based on ethics, morals, and what is "correct".
33
+ Analytical deals with the law as it already exists, Historical postulates that
34
+ the law was found and not made, and Sociological studies how the law and society
35
+ impact each other. The answer is (A).
36
+ - question: 'Which word best summarizes Weber''s explanation of the development of
37
+ formally rational law?
38
+
39
+ (A) Authority. (B) Charisma. (C) Co-operation. (D) Capitalism.'
40
+ target: Let's think step by step. We refer to Wikipedia articles on jurisprudence
41
+ for help. Weber explained the development of formal rationality in laws as how
42
+ the modern society moved from tradition to rationality, where people decide
43
+ actions based less on how they were culturally done and more on expected utilities.
44
+ How rational individuals optimize efficiency of accomplishing tasks for higher
45
+ rewards is a core principle of Capitalism. The answer is (D).
46
+ - question: 'Which position does Rawls claim is the least likely to be adopted by the
47
+ POP (people in the original position)?
48
+
49
+ (A) The POP would choose equality above liberty. (B) The POP would opt for the
50
+ ''maximin'' strategy. (C) The POP would opt for the ''difference principle''.
51
+ (D) The POP would reject the ''system of natural liberty.'''
52
+ target: 'Let''s think step by step. We refer to Wikipedia articles on jurisprudence
53
+ for help. The POP would opt for the ''maximin'' strategy, opt for the ''difference
54
+ principle'', and reject the ''system of natural liberty'', but the POP would
55
+ not choose equality above liberty, since the POP assume both equal and free
56
+ citizens. The answer is (A).'
57
+ tag: mmlu_flan_cot_fewshot_humanities
58
+ include: _mmlu_flan_cot_fewshot_template_yaml
59
+ task: mmlu_flan_cot_fewshot_jurisprudence
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_machine_learning.yaml ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: machine_learning
2
+ description: The following are multiple choice questions (with answers) about machine
3
+ learning.
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - question: 'Which image data augmentation is most common for natural images?
8
+
9
+ (A) random crop and horizontal flip (B) random crop and vertical flip (C) posterization
10
+ (D) dithering'
11
+ target: Let's think step by step. Data augmentation is used to increase the diversity
12
+ of images in the training dataset. It is important that natural images are kept
13
+ natural after being augmented. Vertical flips of images are not natural, so
14
+ (B) is false. Posterization makes the image look like a poster and dithering
15
+ increases color depth. None of these two preserve the natural property. The
16
+ only natural data augmentation technique is (A). The answer is (A).
17
+ - question: "Traditionally, when we have a real-valued question attribute during decision-tree\
18
+ \ learning we consider a binary split according to whether the attribute is\
19
+ \ above or below some threshold. Pat suggests that instead we should just have\
20
+ \ a multiway split with one branch for each of the distinct values of the attribute.\
21
+ \ From the list below choose the single biggest problem with Pat\u2019s suggestion:\n\
22
+ (A) It is too computationally expensive. (B) It would probably result in a decision\
23
+ \ tree that scores badly on the training set and a testset. (C) It would probably\
24
+ \ result in a decision tree that scores well on the training set but badly on\
25
+ \ a testset. (D) It would probably result in a decision tree that scores well\
26
+ \ on a testset but badly on a training set."
27
+ target: "Let's think step by step. Because the question is real valued, it is unlikely\
28
+ \ that the same values appear both at training and test time. This means that\
29
+ \ while such a decision tree could yield good performance on the training data,\
30
+ \ when evaluated on the test data it will perform badly because the decision\
31
+ \ tree won\u2019t know what to do with numbers that did not appear in the training\
32
+ \ data. The answer is (C)."
33
+ - question: "You are reviewing papers for the World\u2019s Fanciest Machine Learning\
34
+ \ Conference, and you see submissions with the following claims. Which ones\
35
+ \ would you consider accepting?\n(A) My method achieves a training error lower\
36
+ \ than all previous methods! (B) My method achieves a test error lower than\
37
+ \ all previous methods! (Footnote: When regularisation parameter \u03BB is chosen\
38
+ \ so as to minimise test error.) (C) My method achieves a test error lower than\
39
+ \ all previous methods! (Footnote: When regularisation parameter \u03BB is chosen\
40
+ \ so as to minimise cross-validation error.) (D) My method achieves a cross-validation\
41
+ \ error lower than all previous methods! (Footnote: When regularisation parameter\
42
+ \ \u03BB is chosen so as to minimise cross-validation error.)"
43
+ target: "Let's think step by step. In machine learning, we train with some data\
44
+ \ and fixed hyperparameters and the training error can be arbitrarily low, so\
45
+ \ (A) can\u2019t be right. Then, one compares different hyperparameters by selecting\
46
+ \ the model with the lowest cross-validation error, this means that (B) and\
47
+ \ (D) are not the right procedure. The only relevant number after these is the\
48
+ \ test error and thus (C) is the right answer. The answer is (C)."
49
+ - question: 'A 6-sided die is rolled 15 times and the results are: side 1 comes up
50
+ 0 times; side 2: 1 time; side 3: 2 times; side 4: 3 times; side 5: 4 times;
51
+ side 6: 5 times. Based on these results, what is the probability of side 3 coming
52
+ up when using Add-1 Smoothing?
53
+
54
+ (A) 2.0/15 (B) 1.0/7 (C) 3.0/16 (D) 1.0/5'
55
+ target: 'Let''s think step by step. Add-1 smoothing adds the value of one to the
56
+ different counts and then normalizes the probabilities accordingly. The counts
57
+ after adding one will be: side 1 comes up 1 time; side 2: 2 times; side 3: 3
58
+ times; side 4: 4 times; side 5: 5 times; side 6: 6 times. The total number
58
+ of die rolls after smoothing will be 21, so the probability of rolling a three is 3/21 = 1/7.
60
+ The answer is (B).'
61
+ - question: 'To achieve an 0/1 loss estimate that is less than 1 percent of the true
62
+ 0/1 loss (with probability 95%), according to Hoeffding''s inequality the IID
63
+ test set must have how many examples?
64
+
65
+ (A) around 10 examples (B) around 100 examples (C) between 100 and 500 examples
66
+ (D) more than 1000 examples'
67
+ target: "Let's think step by step. By the Hoeffding\u2019s inequality, we expect\
68
+ \ that with 95% probability the in-sample and out-of-sample errors differ by\
69
+ \ epsilon when we have N samples if 2 exp(-2 epsilon^2 N)<0.05, this implies\
70
+ \ that N > -1/(2*epsilon**2) log ( 0.05/2 )= log (40)*5000. Since log(40)>1,\
71
+ \ we have that one needs more than 1000 examples. The answer is (D).\n\n"
72
+ tag: mmlu_flan_cot_fewshot_stem
73
+ include: _mmlu_flan_cot_fewshot_template_yaml
74
+ task: mmlu_flan_cot_fewshot_machine_learning
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_management.yaml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: management
2
+ description: The following are multiple choice questions (with answers) about management.
3
+ fewshot_config:
4
+ sampler: first_n
5
+ samples:
6
+ - question: 'How can organisational structures that are characterised by democratic
7
+ and inclusive styles of management be described?
8
+
9
+ (A) Hierarchical (B) Bureaucratic (C) Flat (D) Functional'
10
+ target: Let's think step by step. We refer to Wikipedia articles on management
11
+ for help. Flat organizational structures are characterized by democratic and
12
+ inclusive styles of management, and have few (if any) levels of management between
13
+ the workers and managers. The answer is (C).
14
+ - question: 'Hygiene factors are associated with which writer?
15
+
16
+ (A) Frederick Hertzberg (B) D.C. McClelland (C) Abraham Maslow (D) Douglas McGregor'
17
+ target: Let's think step by step. We refer to Wikipedia articles on management
18
+ for help. Hygiene factors include compensation, company policies, supervision,
19
+ interpersonal relations, and work environments. Hertzberg lists them as factors
20
+ that cannot motivate employees but can minimize job dissatisfaction. The answer
21
+ is (A).
22
+ - question: 'What characteristic is not a key feature of the ''open systems'' model
23
+ of management?
24
+
25
+ (A) Morale (B) Innovation (C) Growth resource (D) Adaptation'
26
+ target: Let's think step by step. We refer to Wikipedia articles on management
27
+ for help. The key characteristics of an open system in management include innovation,
28
+ growth resource, and adaptation, but do not include morale. The answer is (A).
29
+ - question: 'Which element of the cultural web forms regalia?
30
+
31
+ (A) Symbols (B) Rituals and routines (C) Power structures (D) Control systems'
32
+ target: Let's think step by step. We refer to Wikipedia articles on management
33
+ for help. The cultural web is a tool for mapping an organization's culture,
34
+ where symbols form the regalia that visually expresses the values that the organization
35
+ holds as important. The answer is (A).
36
+ - question: 'What are the two main dimensions of the Ohio Studies into leadership?
37
+
38
+ (A) Starting position and end position (B) Initial environment and changed environment
39
+ (C) Organisational structure and conditioning (D) Initiating structure and considerations'
40
+ target: 'Let''s think step by step. We refer to Wikipedia articles on management
41
+ for help. The Ohio State Leadership Studies conducted in the 1940s identified
42
+ initiating structure and consideration as the two main dimensions of leader
43
+ behavior. The answer is (D).'
44
+ tag: mmlu_flan_cot_fewshot_other
45
+ include: _mmlu_flan_cot_fewshot_template_yaml
46
+ task: mmlu_flan_cot_fewshot_management
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_marketing.yaml ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: marketing
2
+ description: The following are multiple choice questions (with answers) about marketing.
3
+ fewshot_config:
4
+ sampler: first_n
5
+ samples:
6
+ - question: 'Although the content and quality can be as controlled as direct mail,
7
+ response rates of this medium are lower because of the lack of a personal address
8
+ mechanism. This media format is known as:
9
+
10
+ (A) Care lines. (B) Direct mail. (C) Inserts. (D) Door to door.'
11
+ target: Let's think step by step. We refer to Wikipedia articles on marketing
12
+ for help. Door to door marketing delivers non-addressed items within all buildings
13
+ within a geographic area. While it can control the content and quality as well
14
+ as direct mail marketing, its response rate is lower because of the lack of
15
+ a personal address mechanism. The answer is (D).
16
+ - question: 'In an organization, the group of people tasked with buying decisions is
17
+ referred to as the _______________.
18
+
19
+ (A) Outsourcing unit. (B) Procurement centre. (C) Chief executive unit. (D)
20
+ Decision-making unit.'
21
+ target: Let's think step by step. We refer to Wikipedia articles on marketing
22
+ for help. In an organization, the group of people tasked with buying decisions
23
+ is referred to as the decision-making unit. The answer is (D).
24
+ - question: 'The single group within society that is most vulnerable to reference group
25
+ influence is:
26
+
27
+ (A) The older consumer who feels somewhat left out of things. (B) The married
28
+ women, many of whom feel a need for stability in their lives. (C) New immigrants
29
+ who really want to assimilate into their new culture. (D) Children, who base
30
+ most of their buying decisions on outside influences.'
31
+ target: Let's think step by step. We refer to Wikipedia articles on marketing
32
+ for help. Children, who base most of their buying decisions on outside influences,
33
+ are the single group within society that is most vulnerable to reference group
34
+ influence. The answer is (D).
35
+ - question: 'Which of the following is an assumption in Maslow''s hierarchy of needs?
36
+
37
+ (A) Needs are dependent on culture and also on social class. (B) Lower-level
38
+ needs must be at least partially satisfied before higher needs can affect behaviour.
39
+ (C) Needs are not prioritized or arranged in any particular order. (D) Satisfied
40
+ needs are motivators, and new needs emerge when current needs remain unmet.'
41
+ target: Let's think step by step. We refer to Wikipedia articles on marketing
42
+ for help. Maslow's hierarchy of needs, from the bottom upwards, are physiological
43
+ (food and clothing), safety, love and belonging needs, esteem, and self-actualization.
44
+ Lower-level needs must be at least partially satisfied before higher ones can
45
+ affect behavior. The answer is (B).
46
+ - question: '_____________ is a natural outcome when combining demographic and geographic
47
+ variables.
48
+
49
+ (A) Geodemographics (B) Product differentiation. (C) ANSOFF matrix. (D) Brand
50
+ management.'
51
+ target: 'Let''s think step by step. We refer to Wikipedia articles on marketing
52
+ for help. Geodemographics is a natural outcome when combining demographic and
53
+ geographic variables. The answer is (A).'
54
+ tag: mmlu_flan_cot_fewshot_other
55
+ include: _mmlu_flan_cot_fewshot_template_yaml
56
+ task: mmlu_flan_cot_fewshot_marketing
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_medical_genetics.yaml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: medical_genetics
2
+ description: The following are multiple choice questions (with answers) about medical
3
+ genetics.
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - question: 'The stage of meiosis in which chromosomes pair and cross over is:
8
+
9
+ (A) prophase I (B) metaphase I (C) prophase II (D) metaphase II'
10
+ target: Let's think step by step. We refer to Wikipedia articles on medical genetics
11
+ for help. Prophase I is the stage of meiosis where homologous chromosomes pair
12
+ with each other and exchange genetic material. The answer is (A).
13
+ - question: 'DNA ligase is
14
+
15
+ (A) an enzyme that joins fragments in normal DNA replication (B) an enzyme of
16
+ bacterial origin which cuts DNA at defined base sequences (C) an enzyme that
17
+ facilitates transcription of specific genes (D) an enzyme which limits the level
18
+ to which a particular nutrient reaches'
19
+ target: Let's think step by step. We refer to Wikipedia articles on medical genetics
20
+ for help. DNA ligase is a type of enzyme (EC 6.5.1.1) responsible for joining
21
+ DNA strands together by catalyzing a phosphodiester bond. The answer is (A).
22
+ - question: 'Which of the following conditions does not show multifactorial inheritance?
23
+
24
+ (A) Pyloric stenosis (B) Schizophrenia (C) Spina bifida (neural tube defects)
25
+ (D) Marfan syndrome'
26
+ target: Let's think step by step. We refer to Wikipedia articles on medical genetics
27
+ for help. Multifactorial inheritance is when more than a single factor is responsible
28
+ for causing a given trait or health problem. Genes cannot be the only factor.
29
+ Marfan syndrome, on the other hand, requires only one abnormal copy of
30
+ the Marfan gene, from one parent, to inherit the trait. The answer is (D).
31
+ - question: 'A gene showing codominance
32
+
33
+ (A) has both alleles independently expressed in the heterozygote (B) has one
34
+ allele dominant to the other (C) has alleles tightly linked on the same chromosome
35
+ (D) has alleles expressed at the same time in development'
36
+ target: Let's think step by step. We refer to Wikipedia articles on medical genetics
37
+ for help. Codominance, as it relates to genetics, refers to a type of genetic
38
+ inheritance where the phenotype of both the parents is easily observed in the
39
+ offspring. A heterozygote is an individual having two different alleles of a
40
+ gene. The answer is (A).
41
+ - question: 'Large triplet repeat expansions can be detected by:
42
+
43
+ (A) polymerase chain reaction. (B) single strand conformational polymorphism
44
+ analysis. (C) Southern blotting. (D) Western blotting.'
45
+ target: 'Let''s think step by step. We refer to Wikipedia articles on medical
46
+ genetics for help. A Southern blot is a method in molecular biology for detecting
47
+ specific DNA sequences in a sample. Large triplet repeat expansions are usually
48
+ detected with this method. The answer is (C).'
49
+ tag: mmlu_flan_cot_fewshot_other
50
+ include: _mmlu_flan_cot_fewshot_template_yaml
51
+ task: mmlu_flan_cot_fewshot_medical_genetics
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_miscellaneous.yaml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: miscellaneous
2
+ description: The following are multiple choice questions (with answers) about miscellaneous.
3
+ fewshot_config:
4
+ sampler: first_n
5
+ samples:
6
+ - question: 'Which of these songs was a Top 10 hit for the rock band The Police?
7
+
8
+ (A) ''Radio Ga-Ga'' (B) ''Ob-la-di Ob-la-da'' (C) ''De Do Do Do De Da Da Da''
9
+ (D) ''In-a-Gadda-Da-Vida'''
10
+ target: Let's think step by step. We refer to Wikipedia for help. Radio Ga-Ga
11
+ is by Queen. Ob-la-di Ob-la-da is by The Beatles. And In-a-Gadda-Da-Vida is
12
+ by Iron Butterfly. Leaving 'De Do Do Do De Da Da Da' as the only song by The
13
+ Police, and also a Top 10 hit. The answer is (C).
14
+ - question: 'What place is named in the title of the 1979 live album by rock legends
15
+ Cheap Trick?
16
+
17
+ (A) Budapest (B) Budokan (C) Bhutan (D) Britain'
18
+ target: Let's think step by step. We refer to Wikipedia for help. Nippon Budokan
19
+ is an indoor arena in Tokyo, Japan renowned for hosting rock music concerts
20
+ including Cheap Trick in 1978. 'Cheap Trick at Budokan' became the name of their
21
+ album. The answer is (B).
22
+ - question: 'What is produced during photosynthesis?
23
+
24
+ (A) hydrogen (B) nylon (C) oxygen (D) light'
25
+ target: Let's think step by step. We refer to Wikipedia for help. Photosynthesis
26
+ is the process in which green plants use the green pigment chlorophyll to synthesize
27
+ foods with water and carbon dioxide. Oxygen is the byproduct of this process.
28
+ The answer is (C).
29
+ - question: 'Who is the shortest man to ever win an NBA slam dunk competition?
30
+
31
+ (A) Anthony ''Spud'' Webb (B) Michael ''Air'' Jordan (C) Tyrone ''Muggsy'' Bogues
32
+ (D) Julius ''Dr J'' Erving'
33
+ target: Let's think step by step. We refer to Wikipedia for help. In 1986, Spud
34
+ Webb, standing only 5'7" became the shortest NBA player in history to win an
35
+ official slam dunk contest. The answer is (A).
36
+ - question: 'How many axles does a standard automobile have?
37
+
38
+ (A) one (B) two (C) four (D) eight'
39
+ target: 'Let''s think step by step. We refer to Wikipedia for help. Most cars
40
+ have two axles to rotate the wheels. The answer is (B).'
41
+ tag: mmlu_flan_cot_fewshot_other
42
+ include: _mmlu_flan_cot_fewshot_template_yaml
43
+ task: mmlu_flan_cot_fewshot_miscellaneous
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_scenarios.yaml ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: moral_scenarios
2
+ description: The following are multiple choice questions (with answers) about moral
3
+ scenarios.
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - question: 'For which of these two scenarios does the main character (who uses I/me/my)
8
+ do something clearly morally wrong, according to ordinary moral standards in
9
+ the US as of 2020? Scenario 1 | I took his wallet after he harassed me at the
10
+ bar. Scenario 2 | I made love to my newborn baby boy.
11
+
12
+ (A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not
13
+ wrong'
14
+ target: Let's think step by step. We refer to Wikipedia articles on moral scenarios
15
+ for help. Stealing from someone is not considered an ethical response to harassment.
16
+ And having sex with a minor is egregiously unethical. The answer is (A).
17
+ - question: 'For which of these two scenarios does the main character (who uses I/me/my)
18
+ do something clearly morally wrong, according to ordinary moral standards in
19
+ the US as of 2020? Scenario 1 | I asked for a cigarette from the guy outside
20
+ the store. Scenario 2 | When the security guard left his post, I pocketed a
21
+ rare coin.
22
+
23
+ (A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not
24
+ wrong'
25
+ target: Let's think step by step. We refer to Wikipedia articles on moral scenarios
26
+ for help. Asking for something is not morally wrong. Stealing is morally wrong.
27
+ The answer is (C).
28
+ - question: 'For which of these two scenarios does the main character (who uses I/me/my)
29
+ do something clearly morally wrong, according to ordinary moral standards in
30
+ the US as of 2020? Scenario 1 | During the house''s demolition, I pushed him
31
+ into a wall. Scenario 2 | When I choose a babysitter for my twin nieces, I don''t
32
+ care about her racial background.
33
+
34
+ (A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not
35
+ wrong'
36
+ target: Let's think step by step. We refer to Wikipedia articles on moral scenarios
37
+ for help. Physical violence is morally wrong. Being open to hiring people from
38
+ different racial backgrounds is an expected moral stance. The answer is (B).
39
+ - question: 'For which of these two scenarios does the main character (who uses I/me/my)
40
+ do something clearly morally wrong, according to ordinary moral standards in
41
+ the US as of 2020? Scenario 1 | I bought an old pick-up truck for work. Scenario
42
+ 2 | I looked up at the gorgeous waterfall and threw a rock at it.
43
+
44
+ (A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not
45
+ wrong'
46
+ target: Let's think step by step. We refer to Wikipedia articles on moral scenarios
47
+ for help. Neither of these are wrong as no one is wronged by buying a pick-up
48
+ truck or throwing a rock into a stream. The answer is (D).
49
+ - question: 'For which of these two scenarios does the main character (who uses I/me/my)
50
+ do something clearly morally wrong, according to ordinary moral standards in
51
+ the US as of 2020? Scenario 1 | I really loved my newborn baby boy. Scenario
52
+ 2 | The new girl is embarrassed to live in a trailer park, so I invite the class
53
+ to her house.
54
+
55
+ (A) Wrong, Wrong (B) Wrong, Not wrong (C) Not wrong, Wrong (D) Not wrong, Not
56
+ wrong'
57
+ target: 'Let''s think step by step. We refer to Wikipedia articles on moral scenarios
58
+ for help. Loving someone is not wrong. However, exposing something that someone
59
+ is embarrassed about could be considered quite mean. The answer is (C).'
60
+ tag: mmlu_flan_cot_fewshot_humanities
61
+ include: _mmlu_flan_cot_fewshot_template_yaml
62
+ task: mmlu_flan_cot_fewshot_moral_scenarios
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_nutrition.yaml ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: nutrition
2
+ description: The following are multiple choice questions (with answers) about nutrition.
3
+ fewshot_config:
4
+ sampler: first_n
5
+ samples:
6
+ - question: 'What is the first-line drug for patients with type 2 diabetes and obesity,
7
+ as of 2020?
8
+
9
+ (A) Acarbose (B) Metformin (C) Sulphonylureas (D) Insulin'
10
+ target: Let's think step by step. We refer to Wikipedia articles on nutrition
11
+ for help. Metformin (Fortamet, Glumetza, or others) is usually the first medication
12
+ prescribed for type 2 diabetes, as well as obesity. It works by lowering glucose
13
+ production in the liver and improving the body's sensitivity to insulin. The
14
+ answer is (B).
15
+ - question: 'Which of the following statements is correct (according to knowledge in
16
+ 2020)?
17
+
18
+ (A) Consumers with phenylketonuria must avoid the consumption of the sweetener
19
+ aspartame (B) Consumers with phenylketonuria must avoid the consumption of the
20
+ sweetener saccharin (C) Consumers with phenylketonuria must avoid the consumption
21
+ of the sweetener sucralose (D) Consumers with phenylketonuria must avoid the
22
+ consumption of the sweetener acesulfame K'
23
+ target: Let's think step by step. We refer to Wikipedia articles on nutrition
24
+ for help. People with phenylketonuria (PKU) cannot break down the amino acid
25
+ phenylalanine. As it builds up in the blood and brain it can lead to brain damage.
26
+ People with PKU should avoid foods that are converted to phenylalanine in the
27
+ body, such as aspartame. The answer is (A).
28
+ - question: 'Which of the following statements about iodine is correct, as of 2020?
29
+
30
+ (A) 50% of adults consume iodine at levels below the RNI (B) Dairy products
31
+ are a poor source of iodine (C) The iodine content of organic milk is generally
32
+ lower than the level in non-organic milk (D) UK dietary reference values recommend
33
+ an increase in iodine intake in pregnancy'
34
+ target: Let's think step by step. We refer to Wikipedia articles on nutrition
35
+ for help. Organic milk usually has less iodine content than non-organic milk.
36
+ The answer is (C).
37
+ - question: 'Which of the following is the most plausible explanation for the protective
38
+ effect of dietary fibre against cancer of the colon, as of 2020?
39
+
40
+ (A) Propionic acid, formed during colonic fibre fermentation inhibits liver
41
+ fatty acid synthesis (B) Butyric acid, formed during colonic fibre fermentation
42
+ stimulates "silencing" of the SLC5A8 tumour suppressor gene (C) None of these
43
+ options are correct (D) Butyric acid, formed during colonic fibre fermentation
44
+ stimulates anti-oxidant defences in the colon'
45
+ target: Let's think step by step. We refer to Wikipedia articles on nutrition
46
+ for help. Dietary fibre is inversely proportional to the risk of colorectal
47
+ cancer. This is presumed because butyric acid (BA) stimulates antioxidants which
48
+ help protect the colon from cancerous tumors. The answer is (D).
49
+ - question: 'In a cohort study, the risk ratio of developing diabetes was 0.86 when
50
+ comparing consumers of tea (the exposed) to those who did not drink tea (the
51
+ unexposed). Which one statement is correct (according to knowledge in 2020)?
52
+
53
+ (A) The tea drinkers have lower risk of developing diabetes. (B) The tea drinkers
54
+ have higher risk of developing diabetes. (C) Based on the information given
55
+ we cannot tell if the observed difference in disease risk is the result of chance.
56
+ (D) The risk ratio is close to the value one, so there is no difference in disease
57
+ risk between the two groups.'
58
+ target: 'Let''s think step by step. We refer to Wikipedia articles on nutrition
59
+ for help. The risk ratio is not sufficiently reduced that it could not be explained
60
+ by random chance given the study''s sample size. The answer is (C).'
61
+ tag: mmlu_flan_cot_fewshot_other
62
+ include: _mmlu_flan_cot_fewshot_template_yaml
63
+ task: mmlu_flan_cot_fewshot_nutrition
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_philosophy.yaml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: philosophy
2
+ description: The following are multiple choice questions (with answers) about philosophy.
3
+ fewshot_config:
4
+ sampler: first_n
5
+ samples:
6
+ - question: 'The study of reality in the broadest sense, an inquiry into the elemental
7
+ nature of the universe and the things in it, is known as _____.
8
+
9
+ (A) metaphysics (B) epistemology (C) quantum physics (D) axiology'
10
+ target: Let's think step by step. We refer to Wikipedia articles on philosophy
11
+ for help. Among the options, only metaphysics studies the nature of reality
12
+ and existence. The answer is (A).
13
+ - question: "According to Moore\u2019s \u201Cideal utilitarianism,\u201D the right\
14
+ \ action is the one that brings about the greatest amount of:\n(A) pleasure.\
15
+ \ (B) happiness. (C) good. (D) virtue."
16
+ target: Let's think step by step. We refer to Wikipedia articles on philosophy
17
+ for help. Moore's "ideal utilitarianism" states that one's actions should maximize
18
+ intrinsic goods. The answer is (C).
19
+ - question: 'Before Tolstoy''s Christian conversion, what was his perspective on the
20
+ meaning of life?
21
+
22
+ (A) optimist (B) satisfied (C) nominally religious (D) pessimist'
23
+ target: Let's think step by step. We refer to Wikipedia articles on philosophy
24
+ for help. Before his conversion, Tolstoy felt that life was uncertain, which
25
+ is a pessimist's point of view. The answer is (D).
26
+ - question: 'According to d''Holbach, people always act according to _____.
27
+
28
+ (A) free choices (B) dictates of the soul (C) necessary natural laws (D) undetermined
29
+ will'
30
+ target: Let's think step by step. We refer to Wikipedia articles on philosophy
31
+ for help. d'Holbach believes that people act according to necessary laws, and
32
+ it proves nothing about people's free will. The answer is (C).
33
+ - question: 'Psychological egoism is:
34
+
35
+ (A) an ethical theory about how we ought to behave. (B) a generalization concerning
36
+ the way people tend to behave. (C) a claim about human nature and the ways people
37
+ are capable of behaving. (D) none of the above.'
38
+ target: 'Let''s think step by step. We refer to Wikipedia articles on philosophy
39
+ for help. Psychological egoism suggests that one behaves based on what makes
40
+ one feel good, hence it is a claim about human nature and how humans are capable
41
+ of behaving. The answer is (C).'
42
+ tag: mmlu_flan_cot_fewshot_humanities
43
+ include: _mmlu_flan_cot_fewshot_template_yaml
44
+ task: mmlu_flan_cot_fewshot_philosophy
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_prehistory.yaml ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: prehistory
2
+ description: The following are multiple choice questions (with answers) about prehistory.
3
+ fewshot_config:
4
+ sampler: first_n
5
+ samples:
6
+ - question: 'What is the approximate mean cranial capacity of Homo erectus?
7
+
8
+ (A) under 650 cc (B) about 800 cc (C) just under 1000 cc (D) 1200 cc'
9
+ target: Let's think step by step. We refer to Wikipedia articles on prehistory
10
+ for help. The average cranium capacity of Homo erectus is less than 1000 cubic
11
+ cm. The answer is (C).
12
+ - question: 'According to Timothy Pauketat, the evidence for social stratification
13
+ and political power at Cahokia suggests:
14
+
15
+ (A) a center of Mississippian civilization with conditions similar to the rise
16
+ of early states. (B) the limitations of authority in a Native American society
17
+ of egalitarian foragers. (C) a simple chiefdom or perhaps a complex chiefdom
18
+ had evolved by A.D. 1500. (D) a center of Mississippian civilization with conditions
19
+ similar to societies on the Northwest Coast of North America.'
20
+ target: Let's think step by step. We refer to Wikipedia articles on prehistory
21
+ for help. Timothy Pauketat is known for his research on Cahokia, the center
22
+ of the Mississippian culture, where he found similar conditions to the rise
23
+ of early states. The answer is (A).
24
+ - question: 'Recent research on hominid species dating from the Middle Pliocene indicates
25
+ there was (as of 2020):
26
+
27
+ (A) a great amount of species diversity, or a single species that exhibited
28
+ a lot of diversity. (B) very little species diversity during this period and
29
+ very few hominids. (C) decreased species diversity due to a prolonged ice age
30
+ followed by a severe drought. (D) decreased species diversity but increased
31
+ numbers of hammerstones and flakes, indicating stone tool manufacture.'
32
+ target: Let's think step by step. We refer to Wikipedia articles on prehistory
33
+ for help. Recent research has recognized multiple hominid species from the Middle
34
+ Pliocene, meaning that there is a great amount of species diversity or diversity
35
+ in a single species. The answer is (A).
36
+ - question: 'Researchers now believe that the decline of the Maya was caused chiefly
37
+ by:
38
+
39
+ (A) a cataclysm of some kind, such as an earthquake, volcano, or tsunami. (B)
40
+ ecological degradation resulting from slash-and-burn farming techniques. (C)
41
+ endless wars between neighboring Mayan city-states. (D) practices of interbreeding
42
+ that led to a steep rise in congenital disorders.'
43
+ target: Let's think step by step. We refer to Wikipedia articles on prehistory
44
+ for help. Researchers believe that the Maya collapse was mainly caused by over-exploitation
45
+ of natural resources like the slash-and-burn farming techniques. The answer
46
+ is (B).
47
+ - question: 'The great Mayan king Pacal built temples in the city of Palenque in order
48
+ to:
49
+
50
+ (A) satisfy the powerful Mayan astronomer priests. (B) display his generosity
51
+ to the common people, since they were allowed to live in the temples. (C) frighten
52
+ away enemies, in particular the Spaniards. (D) legitimize his kingship, since
53
+ his father was not royal.'
54
+ target: 'Let''s think step by step. We refer to Wikipedia articles on prehistory
55
+ for help. Pacal built the temples as the funerary monument to legitimize his
56
+ kingship. The answer is (D).'
57
+ tag: mmlu_flan_cot_fewshot_humanities
58
+ include: _mmlu_flan_cot_fewshot_template_yaml
59
+ task: mmlu_flan_cot_fewshot_prehistory
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_accounting.yaml ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: professional_accounting
2
+ description: The following are multiple choice questions (with answers) about professional
3
+ accounting.
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - question: "An auditor traces the serial numbers on equipment to a nonissuer\u2019\
8
+ s subledger. Which of the following management assertions is supported by this\
9
+ \ test?\n(A) Valuation and allocation (B) Completeness (C) Rights and obligations\
10
+ \ (D) Presentation and disclosure"
11
+ target: Let's think step by step. We refer to Wikipedia articles on accounting
12
+ for help. The completeness assertion is tested by tracing supporting documents
13
+ to the record entries. The answer is (B).
14
+ - question: 'One hundred years ago, your great-great-grandmother invested $100 at 5%
15
+ yearly interest. What is the investment worth today?
16
+
17
+ (A) $13,000 (B) $600 (C) $15,000 (D) $28,000'
18
+ target: Let's think step by step. We refer to Wikipedia articles on accounting
19
+ for help. A $100 investment at 5% yearly interest is worth 100*(1.05)^100=13150
20
+ after 100 years, which is around $13,000. The answer is (A).
21
+ - question: 'On January 1, year 1, Alpha Co. signed an annual maintenance agreement
22
+ with a software provider for $15,000 and the maintenance period begins on March
23
+ 1, year 1. Alpha also incurred $5,000 of costs on January 1, year 1, related
24
+ to software modification requests that will increase the functionality of the
25
+ software. Alpha depreciates and amortizes its computer and software assets over
26
+ five years using the straight-line method. What amount is the total expense
27
+ that Alpha should recognize related to the maintenance agreement and the software
28
+ modifications for the year ended December 31, year 1?
29
+
30
+ (A) $5,000 (B) $13,500 (C) $16,000 (D) $20,000'
31
+ target: Let's think step by step. We refer to Wikipedia articles on accounting
32
+ for help. The maintenance period begins on March 1, so only 10 months of expenses
33
+ should be recognized, which is $15,000/12*10=$12,500. The software modification
34
+ cost is amortized over 5 years, so each year is $5,000/5=$1,000. So the total
35
+ expense is $12,500+$1,000=$13,500. The answer is (B).
36
+ - question: 'Krete is an unmarried taxpayer with income exclusively from wages. By
37
+ December 31, year 1, Krete''s employer has withheld $16,000 in federal income
38
+ taxes and Krete has made no estimated tax payments. On April 15, year 2, Krete
39
+ timely filed for an extension request to file her individual tax return, and
40
+ paid $300 of additional taxes. Krete''s year 1 tax liability was $16,500 when
41
+ she timely filed her return on April 30, year 2, and paid the remaining tax
42
+ liability balance. What amount would be subject to the penalty for underpayment
43
+ of estimated taxes?
44
+
45
+ (A) $0 (B) $500 (C) $1,650 (D) $16,500'
46
+ target: Let's think step by step. We refer to Wikipedia articles on accounting
47
+ for help. The tax due after withholding is $16,500-$16,000=$500, which is less
48
+ than $1000, hence there is no underpayment penalty of estimated taxes. The answer
49
+ is (A).
50
+ - question: 'Box a nongovernmental not-for-profit organization had the following transactions
51
+ during the year: Proceeds from sale of investments $80000 Purchase of property
52
+ plant and equipment $10000 Proceeds from long-term debt $100000 Loss on sale
53
+ of investment $5000 What amount should be reported as net cash provided by financing
54
+ activities in Box''s statement of cash flows?
55
+
56
+ (A) $70,000 (B) $75,000 (C) $80,000 (D) 100000'
57
+ target: 'Let''s think step by step. We refer to Wikipedia articles on accounting
58
+ for help. Among the four transactions, only Proceeds from long-term debt belongs
59
+ to the financing activities section of cashflow, hence the amount reported should
60
+ be $100000. The answer is (D).'
61
+ tag: mmlu_flan_cot_fewshot_other
62
+ include: _mmlu_flan_cot_fewshot_template_yaml
63
+ task: mmlu_flan_cot_fewshot_professional_accounting
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_law.yaml ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: professional_law
2
+ description: The following are multiple choice questions (with answers) about professional
3
+ law.
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - question: 'A son owed a creditor $5,000. The son''s father contacted the creditor
8
+ and told him that he wanted to pay the son''s debt. The father signed a document
9
+ that stated the father would pay the son''s debt at a rate of $500 a month for
10
+ 10 months. The creditor made no written or oral commitment to forbear to sue
11
+ the son to collect the $5,000 debt, and the father made no oral or written request
12
+ for any such forbearance. For the next five months, the father made and the
13
+ creditor accepted the $500 monthly payments as agreed. During that period, the
14
+ creditor, in fact, did forbear to take any legal action against the son. However,
15
+ the father then informed the creditor that he would make no further payments
16
+ on the debt. Which of the following is the most persuasive argument that the
17
+ father is liable to the creditor under the terms of their agreement?
18
+
19
+ (A) The father''s promise and the creditor''s reliance thereon, if proved, gave
20
+ rise to a valid claim by the creditor against the father based on the doctrine
21
+ of promissory estoppel. (B) Because it was foreseeable that the father''s promise
22
+ would induce the creditor to forbear taking any action against the son, such
23
+ forbearance was, as a matter of law, a bargained-for consideration for the father''s
24
+ promise. (C) The father''s five payments to the creditor totaling $2,500 manifested
25
+ a serious intent on the father''s part to be contractually bound, and such manifestation
26
+ is generally recognized as an effective substitute for consideration. (D) By
27
+ assuming the antecedent debt obligation that the son owed to the creditor, the
28
+ father became a surety whose promise to the creditor was enforceable, since
29
+ it was in writing and supported by adequate consideration. '
30
+ target: Let's think step by step. We refer to Wikipedia articles on law for help.
31
+ The doctrine of promissory estoppel stops a person from going back on a promise
32
+ in contract law, hence option (A) should be the most persuasive argument. The
33
+ answer is (A).
34
+ - question: 'A state has recently enacted a statute prohibiting the disposal of any
35
+ nuclear wastes within the state. This law does not contravene or conflict with
36
+ any federal statutes. A man operates a company in the state that is engaged
37
+ in the disposal of nuclear wastes. Subsequent to the passage of the state statute,
38
+ the man, not yet aware of the new law, entered into contracts with many out-of-state
39
+ firms to dispose of their nuclear wastes in the state. On account of this new
40
+ law, however, the man will be unable to perform these contracts. Assume that
41
+ the man has standing to challenge this state law. Which of the following presents
42
+ his strongest constitutional grounds to challenge the state law prohibiting
43
+ the disposal of nuclear wastes within the state?
44
+
45
+ (A) The commerce clause. (B) The equal protection clause of the Fourteenth Amendment.
46
+ (C) The privileges and immunities clause of Article IV, Section 2. (D) The contract
47
+ clause.'
48
+ target: Let's think step by step. We refer to Wikipedia articles on law for help.
49
+ The commerce clause states that Congress shall have the power to regulate commerce
50
+ with foreign Nations, and among the several States, and with the Indian Tribes.
51
+ The statute affects inter-state commerce which puts it into question. Hence
52
+ the man's strongest argument should be the commerce clause. The answer is (A).
53
+ - question: 'On October 1, 1980, a developer, owner of several hundred acres in a rural
54
+ county, drafted a general development plan for the area. The duly recorded plan
55
+ imposed elaborate limitations and restrictions upon the land in the plan, which
56
+ was to be developed as a residential district. The restrictions were to extend
57
+ to all persons acquiring any of the lots and to their heirs, assigns, and lessees.
58
+ It was further provided that all subsequent owners would be charged with due
59
+ notice of the restrictions. Among those restrictions in the general plan were
60
+ the following:(22) A franchise right is created in a strip of land 10 feet in
61
+ width along the rear of each lot for the use of public utility companies with
62
+ right of ingress and egress. (23) No house or structure of any kind shall be
63
+ built on the aforementioned strip of land running through the said blocks. In
64
+ 2000, a retiree purchased one of the lots, built a house, and erected a fence
65
+ in the rear of his property within the restricted area. In 2004, a teacher purchased
66
+ a lot adjacent to the retiree''s property and built a new house. Two years later,
67
+ a librarian purchased the lot that adjoined the teacher''s property. The three
68
+ deeds to those properties each contained references to the deed book where the
69
+ general plan was recorded. In 2008, the librarian began the construction of
70
+ a seven-foot post-and-rail fence along the line dividing his lot with the teacher''s,
71
+ and along the center of the area subject to the franchise right. Although the
72
+ teacher objected to its construction, the fence was completed. If the teacher
73
+ seeks a mandatory injunction to compel removal of the librarian''s fence, the
74
+ court will most likely
75
+
76
+ (A) grant relief, because the fence was in violation of the easement restriction.
77
+ (B) grant relief, because the encroachment of the fence violated the restriction
78
+ in the original plan. (C) deny relief, because the teacher failed to enforce
79
+ the restriction against the retiree. (D) deny relief, because the fence would
80
+ not be construed as "a structure" within the terms of the restriction. '
81
+ target: Let's think step by step. We refer to Wikipedia articles on law for help.
82
+ The restrictions in the original plan say no house or structure of any kind
83
+ shall be built on the aforementioned strip of land running through the said
84
+ blocks. Hence the court will most likely grant relief because the fence violated
85
+ the restriction in the original plan. The answer is (B).
86
+ - question: 'Judge took judicial notice of some facts at the beginning of the trial.
87
+ Which of the following is not an appropriate kind of fact for judicial notice?
88
+
89
+ (A) Indisputable facts. (B) Facts that have been asserted by individual political
90
+ organizations. (C) Facts recognized to be true by common knowledge. (D) Facts
91
+ capable of scientific verification.'
92
+ target: Let's think step by step. We refer to Wikipedia articles on law for help.
93
+ Among the options, facts that have been asserted by individual political organizations
94
+ is not an appropriate kind of fact for judicial notice. The answer is (B).
95
+ - question: 'A state legislature has recently enacted a statute making it a misdemeanor
96
+ to curse or revile or use obscene or opprobrious language toward or in reference
97
+ to a police officer performing his duties. A student at a state university organized
98
+ a demonstration on campus to protest the war. The rally was attended by a group
99
+ of 50 students who shouted anti-war messages at cars passing by. To show his
100
+ contempt for the United States, the student sewed the American flag to the rear
101
+ of his jeans. When a police officer saw the flag sown on the student''s jeans,
102
+ he approached and told him to remove the flag or he would be placed under arrest.
103
+ The student became angered and shouted at the police officer, "Listen, you bastard,
104
+ I''ll wear this rag anywhere I please. " The student was subsequently placed
105
+ under arrest and charged with violating the state statute. The student subsequently
106
+ brings suit in state court challenging the constitutionality of the statute.
107
+ The strongest constitutional argument for the student is that
108
+
109
+ (A) the statute is void for vagueness under the Fourteenth Amendment''s due
110
+ process clause. (B) the statute is invalid because it violates the petitioner''s
111
+ freedom of speech under the First Amendment. (C) the statute is an abridgment
112
+ of freedom of speech under the First Amendment because less restrictive means
113
+ are available for achieving the same purpose. (D) the statute is overbroad and
114
+ consequently invalid under the First and Fourteenth Amendments.'
115
+ target: 'Let''s think step by step. We refer to Wikipedia articles on law for
116
+ help. The Fourteenth Amendment further supports the First Amendment by establishing
117
+ a due process clause. Hence the strongest argument should be the statute is
118
+ overbroad and consequently invalid under the First and Fourteenth Amendments.
119
+ The answer is (D).'
120
+ tag: mmlu_flan_cot_fewshot_humanities
121
+ include: _mmlu_flan_cot_fewshot_template_yaml
122
+ task: mmlu_flan_cot_fewshot_professional_law
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_medicine.yaml ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: professional_medicine
2
+ description: The following are multiple choice questions (with answers) about professional
3
+ medicine.
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - question: "A 22-year-old male marathon runner presents to the office with the complaint\
8
+ \ of right-sided rib pain when he runs long distances. Physical examination\
9
+ \ reveals normal heart and lung findings and an exhalation dysfunction at ribs\_\
10
+ 4-5 on the right. Which of the following muscles or muscle groups will be most\
11
+ \ useful in correcting this dysfunction utilizing a direct method?\n(A) anterior\
12
+ \ scalene (B) latissimus dorsi (C) pectoralis minor (D) quadratus lumborum"
13
+ target: Let's think step by step. We refer to Wikipedia articles on medicine for
14
+ help. Among the options, only the pectoralis minor muscle originates from the outer
15
+ surfaces of the 3rd to 5th ribs. The answer is (C).
16
+ - question: "A 36-year-old male presents to the office with a\_3-week\_history of low\
17
+ \ back pain. He denies any recent trauma but says that he climbs in and out\
18
+ \ of his truck numerous times a day for his job. Examination of the patient\
19
+ \ in the prone position reveals a deep sacral sulcus on the left, a posterior\
20
+ \ inferior lateral angle on the right, and a lumbosacral junction that springs\
21
+ \ freely on compression. The most likely diagnosis is\n(A) left-on-left sacral\
22
+ \ torsion (B) left-on-right sacral torsion (C) right unilateral sacral flexion\
23
+ \ (D) right-on-right sacral torsion"
24
+ target: Let's think step by step. We refer to Wikipedia articles on medicine for
25
+ help. The deep sulcus on the left, a posterior ILA on the right, with a negative
26
+ spring test suggests a right-on-right sacral torsion. All other options have
27
+ a deep sulcus on the right. The answer is (D).
28
+ - question: "A 44-year-old man comes to the office because of a 3-day history of sore\
29
+ \ throat, nonproductive cough, runny nose, and frontal headache. He says the\
30
+ \ headache is worse in the morning and ibuprofen does provide some relief. He\
31
+ \ has not had shortness of breath. Medical history is unremarkable. He takes\
32
+ \ no medications other than the ibuprofen for pain. Vital signs are temperature\
33
+ \ 37.4\xB0C (99.4\xB0F), pulse 88/min, respirations 18/min, and blood pressure\
34
+ \ 120/84 mm Hg. Examination of the nares shows erythematous mucous membranes.\
35
+ \ Examination of the throat shows erythema and follicular lymphoid hyperplasia\
36
+ \ on the posterior oropharynx. There is no palpable cervical adenopathy. Lungs\
37
+ \ are clear to auscultation. Which of the following is the most likely cause\
38
+ \ of this patient's symptoms?\n(A) Allergic rhinitis (B) Epstein-Barr virus\
39
+ \ (C) Mycoplasma pneumonia (D) Rhinovirus"
40
+ target: Let's think step by step. We refer to Wikipedia articles on medicine for
41
+ help. The symptoms, especially the headache, suggest that the most likely cause
42
+ is Rhinovirus. Epstein-Barr virus will cause swollen lymph nodes but there is
43
+ no palpable cervical adenopathy. Lungs are clear to auscultation suggests it's
44
+ not Mycoplasma pneumonia. The answer is (D).
45
+ - question: 'A previously healthy 32-year-old woman comes to the physician 8 months
46
+ after her husband was killed in a car crash. Since that time, she has had a
47
+ decreased appetite and difficulty falling asleep. She states that she is often
48
+ sad and cries frequently. She has been rechecking the door lock five times before
49
+ leaving her house and has to count exactly five pieces of toilet paper before
50
+ she uses it. She says that she has always been a perfectionist but these urges
51
+ and rituals are new. Pharmacotherapy should be targeted to which of the following
52
+ neurotransmitters?
53
+
54
+ (A) Dopamine (B) Glutamate (C) Norepinephrine (D) Serotonin'
55
+ target: Let's think step by step. We refer to Wikipedia articles on medicine for
56
+ help. The patient feels sad and among the options, only Dopamine and Serotonin
57
+ can help increase positive emotions. Serotonin also affects digestion and metabolism,
58
+ which can help the patient's decreased appetite and sleep difficulty. The answer
59
+ is (D).
60
+ - question: "A 42-year-old man comes to the office for preoperative evaluation prior\
61
+ \ to undergoing adrenalectomy scheduled in 2 weeks. One month ago, he received\
62
+ \ care in the emergency department for pain over his right flank following a\
63
+ \ motor vehicle collision. At that time, blood pressure was 160/100 mm Hg and\
64
+ \ CT scan of the abdomen showed an incidental 10-cm left adrenal mass. Results\
65
+ \ of laboratory studies, including complete blood count, serum electrolyte concentrations,\
66
+ \ and liver function tests, were within the reference ranges. The patient otherwise\
67
+ \ had been healthy and had never been told that he had elevated blood pressure.\
68
+ \ He takes no medications. A follow-up visit in the office 2 weeks ago disclosed\
69
+ \ elevated urinary normetanephrine and metanephrine and plasma aldosterone concentrations.\
70
+ \ The patient was referred to a surgeon, who recommended the adrenalectomy.\
71
+ \ Today, vital signs are temperature 36.6\xB0C (97.9\xB0F), pulse 100/min, respirations\
72
+ \ 14/min, and blood pressure 170/95 mm Hg. Physical examination discloses no\
73
+ \ significant findings. Initial preoperative preparation should include treatment\
74
+ \ with which of the following?\n(A) Labetalol (B) A loading dose of potassium\
75
+ \ chloride (C) Nifedipine (D) Phenoxybenzamine"
76
+ target: 'Let''s think step by step. We refer to Wikipedia articles on medicine
77
+ for help. The symptoms and the adrenal mass suggested pheochromocytoma, and
78
+ the blood pressure indicates hypertension. Phenoxybenzamine is used to treat
79
+ hypertension caused by pheochromocytoma. The answer is (D).'
80
+ tag: mmlu_flan_cot_fewshot_other
81
+ include: _mmlu_flan_cot_fewshot_template_yaml
82
+ task: mmlu_flan_cot_fewshot_professional_medicine
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_psychology.yaml ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: professional_psychology
2
+ description: The following are multiple choice questions (with answers) about professional
3
+ psychology.
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - question: 'In the construction of a multiple regression equation for purposes of
8
+ prediction, the optimal combination of measures is one in which the predictors
9
+
10
+ (A) are uncorrelated with each other but are moderately correlated with the
11
+ criterion (B) have low correlations with each other and low correlations with
12
+ the criterion (C) are highly intercorrelated with each other and moderately
13
+ correlated with the criterion (D) have low correlations with the criterion but
14
+ are moderately correlated with each other'
15
+ target: Let's think step by step. We refer to Wikipedia articles on psychology
16
+ for help. The basis of multiple regression is to assess the relationship between
17
+ one continuous variable and a set of independent variables. So the predictors
18
+ should be uncorrelated with each other but are moderately correlated with the
19
+ criterion. The answer is (A).
20
+ - question: 'There are three ways to measure the Central Tendency: the Mean, the Median
21
+ and the Mode. From your knowledge about them, what is the mode?
22
+
23
+ (A) less sensitive to extreme scores than the mean (B) more useful for skewed
24
+ distributions (C) sensitive to extreme values and highly skewed distributions
25
+ (D) the most frequently occurring number'
26
+ target: Let's think step by step. We refer to Wikipedia articles on psychology
27
+ for help. The definition of mode is the most frequently occurring number. The
28
+ answer is (D).
29
+ - question: "Carl Jung believed that a client's transference:\n(A) is a fantasy that\
30
+ \ distracts the client from reality. (B) represents \u201Cmixed feelings\u201D\
31
+ \ toward the therapist. (C) \"is a form of \"\"acting out.\"\"\" (D) reflects\
32
+ \ the client\u2019s personal and collective unconscious."
33
+ target: Let's think step by step. We refer to Wikipedia articles on psychology
34
+ for help. Transference is a phenomenon that a person's feelings are unconsciously
35
+ redirected, so it reflects the client's personal and collective unconscious.
36
+ The answer is (D).
37
+ - question: "In terms of Hofstede\u2019s (1980) five cultural dimensions, the United\
38
+ \ States scores at the top of the scale on:\n(A) individualism. (B) individualism\
39
+ \ and power distance. (C) power distance and masculinity. (D) uncertainty avoidance."
40
+ target: Let's think step by step. We refer to Wikipedia articles on psychology
41
+ for help. US scores highest on individualism among the five cultural dimensions.
42
+ The answer is (A).
43
+ - question: 'One of your therapy clients asks your advice about a good weight- reduction
44
+ program. You have investigated the programs in the community and are enrolled
45
+ in the one you consider the best. This program offers a $50 bonus to its patrons
46
+ for each new person they bring into the program. Under these circumstances,
47
+ your most appropriate response would be to
48
+
49
+ (A) tell your client the pros and cons of each program you know about except
50
+ for the one in which you are enrolled (B) recommend to your client the program
51
+ in which you are enrolled and explain the $50 bonus you will receive (C) recommend
52
+ to your client the program in which you are enrolled and offer to have the $50
53
+ bonus credited to your client''s account in the program (D) tell your client
54
+ the pros and cons of each program you know about, but do not claim the $50 bonus
55
+ if your client enrolls in your program'
56
+ target: 'Let''s think step by step. We refer to Wikipedia articles on psychology
57
+ for help. Based on the circumstances, you should tell your client about the
58
+ pros and cons of each program, but it would be inappropriate to receive the
59
+ bonus, so you should not claim the $50 bonus. The answer is (D).'
60
+ tag: mmlu_flan_cot_fewshot_social_sciences
61
+ include: _mmlu_flan_cot_fewshot_template_yaml
62
+ task: mmlu_flan_cot_fewshot_professional_psychology
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_security_studies.yaml ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: security_studies
2
+ description: The following are multiple choice questions (with answers) about security
3
+ studies.
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - question: 'What are the frameworks of analysis within which terrorism has been considered
8
+ (as of 2020)?
9
+
10
+ (A) Competition between larger nations has resulted in some countries actively
11
+ supporting terrorist groups to undermine the strength of rival states. Terrorist
12
+ networks are extended patronage clubs maintained and paid for by their donor
13
+ states and are conceptualised as being like state actors, to be dealt with using
14
+ military force. (B) Globalization has enabled the internationalization of terrorist
15
+ activities by opening up their operational space, although coordination is still
16
+ managed from a geographical base. This suggests that terrorist groups are nationally
17
+ structured which means that terrorism cannot be considered in terms of a war
18
+ to be defeated militarily without having serious implications on the indigenous
19
+ population. (C) Terrorism can be viewed as a problem to be resolved by military
20
+ means (war on terrorism), by normal police techniques (terrorism as crime),
21
+ or as a medical problem with underlying causes and symptoms (terrorism as disease).
22
+ (D) Terrorism is viewed as a criminal problem. The criminalization of terrorism
23
+ has two important implications. Firstly, it suggests that terrorism can be eradicated
24
+ - terrorists can be caught and brought to trial by normal judicial proceedings
25
+ thereby removing the threat from society - and secondly, it suggests that preventative
26
+ crime techniques are applicable to prevent its development.'
27
+ target: "Let's think step by step. We refer to Wikipedia articles on security\
28
+ \ studies for help. (A) is wrong because it is not competition between larger\
29
+ \ nations that causes terrorism. \n(B) is wrong because globalization is not\
30
+ \ the cause of terrorism.\n(C) is correct because the US undertook the war on\
31
+ \ terrorism. \n(D) is wrong because preventative crime techniques will likely\
32
+ \ not end terrorism. The answer is (C)."
33
+ - question: 'Which of the following is the best lens through which to investigate the
34
+ role of child soldiers?
35
+
36
+ (A) Child soldiers are victims of combat that need re-education and rehabilitation.
37
+ (B) Children and their mothers are not active subjects in warfare and are best
38
+ considered as subjects in the private sphere. (C) Children are most often innocent
39
+ bystanders in war and are best used as signifiers of peace. (D) Children have
40
+ political subjecthood that is missed when they are considered as passive victims
41
+ of warfare.'
42
+ target: Let's think step by step. We refer to Wikipedia articles on security studies
43
+ for help. Child soldiers as a political topic can be missed when they are considered
44
+ passive victims of warfare. The answer is (D).
45
+ - question: 'How can we best describe the relationship between the state-centric approach
46
+ and the concept of human security?
47
+
48
+ (A) There are such wide divisions within the human security framework regarding
49
+ the nature of threats and referent objects that no widely applicable comparisons
50
+ between state-centric approaches and human security can be drawn. (B) By adopting
51
+ the framework of human security, the limitations of the realist state-centric
52
+ approach become evident. Whilst human security defines the referent object as
53
+ the person or population, state-centric approaches prioritise the security of
54
+ the state, de-prioritizing the pursuit of human security. (C) The state-centric
55
+ approach to security is a faction of human security, usually defined within
56
+ the broad school of human security. By being state-centric this approach prioritises
57
+ the individual as the referent object in security studies. (D) Both the state-centric
58
+ and human-centric approaches to security are mutually exclusive and offer a
59
+ sufficient analytic framework with which to understand the international security
60
+ system. It is therefore the role of security analysts to determine which of
61
+ these substantial concepts is correct, and which should be discarded.'
62
+ target: Let's think step by step. We refer to Wikipedia articles on security studies
63
+ for help. Human security focuses on a person or population whereas state-centric
64
+ approaches focus on the state while deprioritizing human security. The answer
65
+ is (B).
66
+ - question: 'In order to become securitized, a threat must be presented in which of
67
+ these ways?
68
+
69
+ (A) As an existential threat that requires immediate and extraordinary action,
70
+ posing a threat to the survival of the state or to societal security. (B) As
71
+ requiring immediate and extraordinary action by the state, threatening the survival
72
+ of a referent object and therefore warranting the use of measures not normally
73
+ employed in the political realm. (C) As an urgent threat to the survival of
74
+ the referent object, so serious that it legitimises the employment of extraordinary
75
+ action in response. (D) As an urgent threat to the survival of the audience
76
+ that requires extraordinary or emergency measures.'
77
+ target: Let's think step by step. We refer to Wikipedia articles on security studies
78
+ for help. To be securitized, a threat must be an urgent threat to the survival
79
+ of the referent object. The answer is (C).
80
+ - question: 'What distinguishes coercive diplomacy from military force?
81
+
82
+ (A) Compellence is another term for coercive diplomacy, but covering a narrower
83
+ set of criteria; compellence covers those threats aimed at initiating adversary
84
+ action. A threat to coerce a state to give up part of its territory would count
85
+ as coercive diplomacy, as long as that threat proactively initiates action before
86
+ reactive diplomacy is taken. (B) Coercive diplomacy constitutes the threats
87
+ of limited force to induce adversary''s incentive to comply with the coercer''s
88
+ demands. It is an influence strategy that is intended to obtain compliance:
89
+ the use of force to defeat an opponent first does not count. It leaves an element
90
+ of choice with the target to comply, or to continue. (C) Military force, or
91
+ the threat of military force, utilises fear to achieve strategic objectives.
92
+ Coercive diplomacy is differentiated from this approach, because it does not
93
+ use fear as a tool for coercing an adversary. (D) Coercive diplomacy is employed
94
+ to use force but to limit its effects on the international community. Coercive
95
+ diplomacy is an aggressive strategy that is intended to obtain compliance through
96
+ defeat. It does not leave an element of choice with the target, the target either
97
+ being forced to comply or engage in conflict. It seeks to control by imposing
98
+ compliance by removing any opportunity for negotiation or concession.'
99
+ target: 'Let''s think step by step. We refer to Wikipedia articles on security
100
+ studies for help. Coercive diplomacy uses the threat of force to induce the
101
+ opponent to comply with demands. The answer is (B).'
102
+ tag: mmlu_flan_cot_fewshot_social_sciences
103
+ include: _mmlu_flan_cot_fewshot_template_yaml
104
+ task: mmlu_flan_cot_fewshot_security_studies
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_us_foreign_policy.yaml ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: us_foreign_policy
2
+ description: The following are multiple choice questions (with answers) about us foreign
3
+ policy.
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - question: 'How did Donald Trump attack globalization in the 2016 campaign?
8
+
9
+ (A) Globalization had made men like him too rich (B) Globalization only benefited
10
+ certain American states, such as New York (C) Liberal elites had encouraged
11
+ globalization, while ''ordinary Americans'' lost jobs because of it (D) Globalization
12
+ encouraged damaging trade wars'
13
+ target: Let's think step by step. We refer to Wikipedia articles on us foreign
14
+ policy for help. Trump attacked globalization because he believed ordinary Americans
15
+ lost jobs due to it, and so he wanted to blame liberals who had encouraged it.
16
+ The answer is (C).
17
+ - question: 'How did NSC-68 change U.S. strategy?
18
+
19
+ (A) It globalized containment. (B) It militarized containment. (C) It called
20
+ for the development of the hydrogen bomb. (D) All of the above'
21
+ target: Let's think step by step. We refer to Wikipedia articles on us foreign
22
+ policy for help. NSC-68 outlined a variety of courses of action, including globalization
23
+ of containment, militarization of containment, and the development of the hydrogen
24
+ bomb. The answer is (D).
25
+ - question: 'How do Defensive Realism and Offensive Realism differ in their explanation
26
+ of state behaviour?
27
+
28
+ (A) Defensive realists place greater emphasis on the role of international institutions
29
+ (B) Defensive realists place less emphasis on geographical factors (C) Offensive
30
+ realists give more priority to the national interest than Defensive realists.
31
+ (D) Defensive realists believe states are security maximizers, while Offensive
32
+ realists believe states to be power maximizers'
33
+ target: Let's think step by step. We refer to Wikipedia articles on us foreign
34
+ policy for help. While defensive realism advocates that states are security
35
+ maximizers, offensive realists think of states as power maximizers. The answer
36
+ is (D).
37
+ - question: 'The realm of policy decisions concerned primarily with relations between
38
+ the United States and the rest of the world is known as
39
+
40
+ (A) terrorism policy. (B) economic policy. (C) foreign policy. (D) international
41
+ policy.'
42
+ target: Let's think step by step. We refer to Wikipedia articles on us foreign
43
+ policy for help. The topic of policy decisions concerned with relations between
44
+ the US and the rest of the world is known as foreign policy. The answer is (C).
45
+ - question: 'How did the 2008 financial crisis affect America''s international reputation?
46
+
47
+ (A) It damaged support for the US model of political economy and capitalism
48
+ (B) It created anger at the United States for exaggerating the crisis (C) It
49
+ increased support for American global leadership under President Obama (D) It
50
+ reduced global use of the US dollar'
51
+ target: 'Let''s think step by step. We refer to Wikipedia articles on us foreign
52
+ policy for help. The 2008 financial crisis damaged the international reputation
53
+ of the American model of political economy and capitalism. The answer is (A).'
54
+ tag: mmlu_flan_cot_fewshot_social_sciences
55
+ include: _mmlu_flan_cot_fewshot_template_yaml
56
+ task: mmlu_flan_cot_fewshot_us_foreign_policy
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_world_religions.yaml ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: world_religions
2
+ description: The following are multiple choice questions (with answers) about world
3
+ religions.
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - question: 'How can the Upanishads be characterized?
8
+
9
+ (A) Ritual texts (B) Philosophical texts (C) Hymns (D) Origin stories'
10
+ target: Let's think step by step. We refer to Wikipedia articles on world religions
11
+ for help. The Upanishads are the most recent part of Vedas (the oldest scriptures
12
+ in Hinduism) and supplied the basis of later Hindu philosophy. So they are philosophical
13
+ texts. The answer is (B).
14
+ - question: 'What is the Second Gem in Buddhism?
15
+
16
+ (A) The Dharma (B) The Sangha (C) The Buddha (D) The Bodhisattva'
17
+ target: Let's think step by step. We refer to Wikipedia articles on world religions
18
+ for help. The Second Gem in Buddhism is The Dharma. The answer is (A).
19
+ - question: 'Which Japanese government promoted a kind of national cult based on the
20
+ emperor and his associations with kami?
21
+
22
+ (A) Honen (B) Tanaka (C) Tokugawa (D) Meiji'
23
+ target: Let's think step by step. We refer to Wikipedia articles on world religions
24
+ for help. The promotion of a national cult based on the emperor and his associations
25
+ with Kami happened during the reign of Emperor Meiji (1852-1912). The answer
26
+ is (D).
27
+ - question: 'In which dynasty was the "Mandate of Heaven" developed to legitimatize
28
+ the new rulers?
29
+
30
+ (A) Shang (B) Zhou (C) Han (D) Xia'
31
+ target: Let's think step by step. We refer to Wikipedia articles on world religions
32
+ for help. The "Mandate of Heaven" was developed as an ancient Chinese philosophical
33
+ concept during the Zhou Dynasty (1046-256 BCE). The answer is (B).
34
+ - question: 'What is the sign of the covenant for Jewish males?
35
+
36
+ (A) The rainbow (B) Circumcision (C) A son (D) Bar mitzvah'
37
+ target: 'Let''s think step by step. We refer to Wikipedia articles on world religions
38
+ for help. In Judaism, the most distinctive sign of the covenant is circumcision
39
+ (brit milah). The answer is (B).'
40
+ tag: mmlu_flan_cot_fewshot_humanities
41
+ include: _mmlu_flan_cot_fewshot_template_yaml
42
+ task: mmlu_flan_cot_fewshot_world_religions
scripts/yans/lm-evaluation-harness/lm_eval/tasks/pile/README.md ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # The Pile
2
+
3
+ ### Paper
4
+ Title: The Pile: An 800GB Dataset of Diverse Text for Language Modeling
5
+
6
+ Abstract: https://arxiv.org/abs/2101.00027
7
+
8
+ The Pile is an 825 GiB diverse, open source language modelling data set that consists
9
+ of 22 smaller, high-quality datasets combined together. To score well on Pile
10
+ BPB (bits per byte), a model must be able to understand many disparate domains
11
+ including books, github repositories, webpages, chat logs, and medical, physics,
12
+ math, computer science, and philosophy papers.
13
+
14
+ Homepage: https://pile.eleuther.ai/
15
+
16
+ ### Citation
17
+ ```
18
+ @article{pile,
19
+ title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},
20
+ author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},
21
+ journal={arXiv preprint arXiv:2101.00027},
22
+ year={2020}
23
+ }
24
+ ```
25
+
26
+ ### Groups and Tasks
27
+
28
+ #### Groups
29
+
30
+ * `pile`
31
+
32
+ #### Tasks
33
+
34
+ * `pile_arxiv`
35
+ * `pile_bookcorpus2`
36
+ * `pile_books3`
37
+ * `pile_dm-mathematics`
38
+ * `pile_enron`
39
+ * `pile_europarl`
40
+ * `pile_freelaw`
41
+ * `pile_github`
42
+ * `pile_gutenberg`
43
+ * `pile_hackernews`
44
+ * `pile_nih-exporter`
45
+ * `pile_opensubtitles`
46
+ * `pile_openwebtext2`
47
+ * `pile_philpapers`
48
+ * `pile_pile-cc`
49
+ * `pile_pubmed-abstracts`
50
+ * `pile_pubmed-central`
51
+ * `pile_stackexchange`
52
+ * `pile_ubuntu-irc`
53
+ * `pile_uspto`
54
+ * `pile_wikipedia`
55
+ * `pile_youtubesubtitles`
56
+
57
+ ### Checklist
58
+
59
+ For adding novel benchmarks/datasets to the library:
60
+ * [ ] Is the task an existing benchmark in the literature?
61
+ * [ ] Have you referenced the original paper that introduced the task?
62
+ * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
63
+
64
+
65
+ If other tasks on this dataset are already supported:
66
+ * [ ] Is the "Main" variant of this task clearly denoted?
67
+ * [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
68
+ * [ ] Have you noted which, if any, published evaluation setups are matched by this variant?