diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/_continuation_template_yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/_continuation_template_yaml new file mode 100644 index 0000000000000000000000000000000000000000..264e27a5e8ebde9a203094c7cc9735ecf8ef3993 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/_continuation_template_yaml @@ -0,0 +1,13 @@ +dataset_path: hails/mmlu_no_train # a copy of `cais/mmlu` with no auxiliary_train split +output_type: multiple_choice +test_split: test +fewshot_split: dev +fewshot_config: + sampler: first_n +doc_to_text: "Question: {{question.strip()}}\nAnswer:" +doc_to_choice: "{{choices}}" +doc_to_target: "{{answer}}" +metadata: + version: 1.0 +dataset_kwargs: + trust_remote_code: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/_mmlu.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/_mmlu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c0cabf04b8ac1e1f9c809600214c589cfefbba79 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/_mmlu.yaml @@ -0,0 +1,32 @@ +group: mmlu_continuation +group_alias: mmlu (continuation) +task: + - group: stem + task: + - mmlu_continuation_stem + aggregate_metric_list: + - metric: acc + weight_by_size: True + - group: other + task: + - mmlu_continuation_other + aggregate_metric_list: + - metric: acc + weight_by_size: True + - group: social sciences + task: + - mmlu_continuation_social_sciences + aggregate_metric_list: + - metric: acc + weight_by_size: True + - group: humanities + task: + - mmlu_continuation_humanities + aggregate_metric_list: + - metric: acc + weight_by_size: True +aggregate_metric_list: + - metric: acc + weight_by_size: True +metadata: + version: 2 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_abstract_algebra.yaml 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_abstract_algebra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6f4e29c0fb5147d883ee993d95822dde10b69d4e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_abstract_algebra.yaml @@ -0,0 +1,6 @@ +"dataset_name": "abstract_algebra" +"description": "The following are questions (with answers) about abstract\ + \ algebra.\n\n" +"tag": "mmlu_continuation_stem" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_abstract_algebra" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_anatomy.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_anatomy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bc3de9c4e6679ba4c9f66494c908d99781adf5bb --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_anatomy.yaml @@ -0,0 +1,6 @@ +"dataset_name": "anatomy" +"description": "The following are questions (with answers) about anatomy.\n\ + \n" +"tag": "mmlu_continuation_stem" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_anatomy" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_astronomy.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_astronomy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..76aabcbfcf13a12e66e1af1daae2811b9b388fc8 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_astronomy.yaml @@ -0,0 +1,6 @@ +"dataset_name": "astronomy" +"description": "The following are questions (with answers) about astronomy.\n\ + \n" +"tag": "mmlu_continuation_stem" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_astronomy" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_business_ethics.yaml 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_business_ethics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e64d0920b9d1ac151712aac84a9e9c3f522c3c9f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_business_ethics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "business_ethics" +"description": "The following are questions (with answers) about business\ + \ ethics.\n\n" +"tag": "mmlu_continuation_other" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_business_ethics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_clinical_knowledge.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_clinical_knowledge.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e79805df6f73782f25be4a302c738b73ecd2f2a2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_clinical_knowledge.yaml @@ -0,0 +1,6 @@ +"dataset_name": "clinical_knowledge" +"description": "The following are questions (with answers) about clinical\ + \ knowledge.\n\n" +"tag": "mmlu_continuation_other" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_clinical_knowledge" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_college_biology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_college_biology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..936f6ffe49245d558c0ef8fdf04b600dc177c375 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_college_biology.yaml @@ -0,0 +1,6 @@ +"dataset_name": "college_biology" +"description": "The following are questions (with answers) about college\ + \ biology.\n\n" +"tag": "mmlu_continuation_stem" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_college_biology" diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_college_chemistry.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_college_chemistry.yaml new file mode 100644 index 0000000000000000000000000000000000000000..289364ee44351c3d1bcee1193563babe6abe2a63 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_college_chemistry.yaml @@ -0,0 +1,6 @@ +"dataset_name": "college_chemistry" +"description": "The following are questions (with answers) about college\ + \ chemistry.\n\n" +"tag": "mmlu_continuation_stem" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_college_chemistry" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_college_computer_science.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_college_computer_science.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c7d3c5696067f09f9a68fdd9c3f7a1002d264128 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_college_computer_science.yaml @@ -0,0 +1,6 @@ +"dataset_name": "college_computer_science" +"description": "The following are questions (with answers) about college\ + \ computer science.\n\n" +"tag": "mmlu_continuation_stem" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_college_computer_science" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_college_mathematics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_college_mathematics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2dbc0932f63c0782e106db5fc27e96da9d816dec --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_college_mathematics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "college_mathematics" +"description": "The following are questions (with answers) about college\ + 
\ mathematics.\n\n" +"tag": "mmlu_continuation_stem" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_college_mathematics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_college_medicine.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_college_medicine.yaml new file mode 100644 index 0000000000000000000000000000000000000000..38abd2426f844916087795c4cc04355d8d6c2776 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_college_medicine.yaml @@ -0,0 +1,6 @@ +"dataset_name": "college_medicine" +"description": "The following are questions (with answers) about college\ + \ medicine.\n\n" +"tag": "mmlu_continuation_other" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_college_medicine" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_college_physics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_college_physics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ee6b42584c834a5e92506650ee3aba58ed1cfd66 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_college_physics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "college_physics" +"description": "The following are questions (with answers) about college\ + \ physics.\n\n" +"tag": "mmlu_continuation_stem" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_college_physics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_computer_security.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_computer_security.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7ebb487dfbf634d390d2b2f9aa0e31e5a2f68fc6 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_computer_security.yaml @@ -0,0 +1,6 @@ +"dataset_name": 
"computer_security" +"description": "The following are questions (with answers) about computer\ + \ security.\n\n" +"tag": "mmlu_continuation_stem" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_computer_security" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_conceptual_physics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_conceptual_physics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7c554caf07da77e4a9bb0bea9672dfcee4777b91 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_conceptual_physics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "conceptual_physics" +"description": "The following are questions (with answers) about conceptual\ + \ physics.\n\n" +"tag": "mmlu_continuation_stem" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_conceptual_physics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_econometrics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_econometrics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..848ce4e1f0dbff32d304c28f3d60d453e591a30f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_econometrics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "econometrics" +"description": "The following are questions (with answers) about econometrics.\n\ + \n" +"tag": "mmlu_continuation_social_sciences" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_econometrics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_electrical_engineering.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_electrical_engineering.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d71dd16481a2bb5289ef5b713218dae0292bb11a --- /dev/null +++ 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_electrical_engineering.yaml @@ -0,0 +1,6 @@ +"dataset_name": "electrical_engineering" +"description": "The following are questions (with answers) about electrical\ + \ engineering.\n\n" +"tag": "mmlu_continuation_stem" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_electrical_engineering" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_elementary_mathematics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_elementary_mathematics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fe8aa09718cb8aef0dad48c21926f7dacc7b8ee9 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_elementary_mathematics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "elementary_mathematics" +"description": "The following are questions (with answers) about elementary\ + \ mathematics.\n\n" +"tag": "mmlu_continuation_stem" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_elementary_mathematics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_formal_logic.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_formal_logic.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eb5dbd2e505e3fb4604dd75f2d5fe1a35fce3391 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_formal_logic.yaml @@ -0,0 +1,6 @@ +"dataset_name": "formal_logic" +"description": "The following are questions (with answers) about formal\ + \ logic.\n\n" +"tag": "mmlu_continuation_humanities" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_formal_logic" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_global_facts.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_global_facts.yaml new file 
mode 100644 index 0000000000000000000000000000000000000000..280a50d2ee229b5f047a02024298474225203e54 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_global_facts.yaml @@ -0,0 +1,6 @@ +"dataset_name": "global_facts" +"description": "The following are questions (with answers) about global\ + \ facts.\n\n" +"tag": "mmlu_continuation_other" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_global_facts" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_biology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_biology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e518a5239a6da013ad31bfca284a3b7096bce840 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_biology.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_biology" +"description": "The following are questions (with answers) about high\ + \ school biology.\n\n" +"tag": "mmlu_continuation_stem" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_high_school_biology" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_chemistry.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_chemistry.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c38d60a7706306b215e156d4c27f05585945f7b4 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_chemistry.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_chemistry" +"description": "The following are questions (with answers) about high\ + \ school chemistry.\n\n" +"tag": "mmlu_continuation_stem" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_high_school_chemistry" diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_computer_science.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_computer_science.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5fe34f7af35456657c1acf40e05b3aaabc7893e8 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_computer_science.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_computer_science" +"description": "The following are questions (with answers) about high\ + \ school computer science.\n\n" +"tag": "mmlu_continuation_stem" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_high_school_computer_science" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_european_history.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_european_history.yaml new file mode 100644 index 0000000000000000000000000000000000000000..666c2742d1b762c103bbd02ff121676a047fb3e5 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_european_history.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_european_history" +"description": "The following are questions (with answers) about high\ + \ school european history.\n\n" +"tag": "mmlu_continuation_humanities" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_high_school_european_history" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_geography.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_geography.yaml new file mode 100644 index 0000000000000000000000000000000000000000..41f6caf3e7f3b762af7c0350ca9a73d39bede2b8 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_geography.yaml @@ -0,0 +1,6 @@ 
+"dataset_name": "high_school_geography" +"description": "The following are questions (with answers) about high\ + \ school geography.\n\n" +"tag": "mmlu_continuation_social_sciences" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_high_school_geography" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_government_and_politics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_government_and_politics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e80233dc891e6890a5dec384ed2fbe5b82aca094 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_government_and_politics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_government_and_politics" +"description": "The following are questions (with answers) about high\ + \ school government and politics.\n\n" +"tag": "mmlu_continuation_social_sciences" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_high_school_government_and_politics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_macroeconomics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_macroeconomics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ce7fa9d5e3caa8dd3ec8e25172afda5f997b6c0c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_macroeconomics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_macroeconomics" +"description": "The following are questions (with answers) about high\ + \ school macroeconomics.\n\n" +"tag": "mmlu_continuation_social_sciences" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_high_school_macroeconomics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_mathematics.yaml 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_mathematics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2598dcb38eb9f8fdacced20c57d62c83dacb8a40 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_mathematics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_mathematics" +"description": "The following are questions (with answers) about high\ + \ school mathematics.\n\n" +"tag": "mmlu_continuation_stem" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_high_school_mathematics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_microeconomics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_microeconomics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..96c414d3c411c6380cf83dca3b7aedc325598220 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_microeconomics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_microeconomics" +"description": "The following are questions (with answers) about high\ + \ school microeconomics.\n\n" +"tag": "mmlu_continuation_social_sciences" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_high_school_microeconomics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_physics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_physics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45ab0a539a02ae322f66db689d8eddf13c8b856a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_physics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_physics" +"description": "The following are questions (with answers) about high\ + \ school physics.\n\n" +"tag": 
"mmlu_continuation_stem" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_high_school_physics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_psychology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_psychology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..48dedf5c5ed94a836e0d802398ab05d7ab7db6ce --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_psychology.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_psychology" +"description": "The following are questions (with answers) about high\ + \ school psychology.\n\n" +"tag": "mmlu_continuation_social_sciences" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_high_school_psychology" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_statistics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_statistics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2ee2418c7ff5235c1e31cf381502f5b21db60230 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_statistics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_statistics" +"description": "The following are questions (with answers) about high\ + \ school statistics.\n\n" +"tag": "mmlu_continuation_stem" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_high_school_statistics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_us_history.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_us_history.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a00f16ceba2cfd3f313c8fe0d2df4a43e4bbe23d --- /dev/null +++ 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_us_history.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_us_history" +"description": "The following are questions (with answers) about high\ + \ school us history.\n\n" +"tag": "mmlu_continuation_humanities" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_high_school_us_history" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_world_history.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_world_history.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dc4cddf553bf0144b5d4ecc5eabe8efef0cf0367 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_high_school_world_history.yaml @@ -0,0 +1,6 @@ +"dataset_name": "high_school_world_history" +"description": "The following are questions (with answers) about high\ + \ school world history.\n\n" +"tag": "mmlu_continuation_humanities" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_high_school_world_history" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_human_aging.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_human_aging.yaml new file mode 100644 index 0000000000000000000000000000000000000000..314edeb6c26c6a6be2d819b7c66e047cd48f8933 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_human_aging.yaml @@ -0,0 +1,6 @@ +"dataset_name": "human_aging" +"description": "The following are questions (with answers) about human\ + \ aging.\n\n" +"tag": "mmlu_continuation_other" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_human_aging" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_human_sexuality.yaml 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_human_sexuality.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a1473819ab4307f1e02024a0828ad9803710a59b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_human_sexuality.yaml @@ -0,0 +1,6 @@ +"dataset_name": "human_sexuality" +"description": "The following are questions (with answers) about human\ + \ sexuality.\n\n" +"tag": "mmlu_continuation_social_sciences" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_human_sexuality" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_international_law.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_international_law.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5ea8944bcc109000525b90f26f1d0da914d17437 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_international_law.yaml @@ -0,0 +1,6 @@ +"dataset_name": "international_law" +"description": "The following are questions (with answers) about international\ + \ law.\n\n" +"tag": "mmlu_continuation_humanities" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_international_law" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_jurisprudence.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_jurisprudence.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fca1dda86cc382604ca1bcbc308e0062e08dfa80 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_jurisprudence.yaml @@ -0,0 +1,6 @@ +"dataset_name": "jurisprudence" +"description": "The following are questions (with answers) about jurisprudence.\n\ + \n" +"tag": "mmlu_continuation_humanities" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_jurisprudence" diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_logical_fallacies.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_logical_fallacies.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1b576f9fb3d0ce1d21e8d7543b56a539300be36a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_logical_fallacies.yaml @@ -0,0 +1,6 @@ +"dataset_name": "logical_fallacies" +"description": "The following are questions (with answers) about logical\ + \ fallacies.\n\n" +"tag": "mmlu_continuation_humanities" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_logical_fallacies" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_machine_learning.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_machine_learning.yaml new file mode 100644 index 0000000000000000000000000000000000000000..15fc3f4bdf0f34e96149ca2f8dddc90d037e8483 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_machine_learning.yaml @@ -0,0 +1,6 @@ +"dataset_name": "machine_learning" +"description": "The following are questions (with answers) about machine\ + \ learning.\n\n" +"tag": "mmlu_continuation_stem" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_machine_learning" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_management.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_management.yaml new file mode 100644 index 0000000000000000000000000000000000000000..575604e0acf52132d9e489a070d28fd761e739eb --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_management.yaml @@ -0,0 +1,6 @@ +"dataset_name": "management" +"description": "The following are questions (with answers) about management.\n\ + \n" +"tag": "mmlu_continuation_other" +"include": 
"_continuation_template_yaml" +"task": "mmlu_continuation_management" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_marketing.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_marketing.yaml new file mode 100644 index 0000000000000000000000000000000000000000..af715bee02cfe813b5f045670c8e46dda258e77d --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_marketing.yaml @@ -0,0 +1,6 @@ +"dataset_name": "marketing" +"description": "The following are questions (with answers) about marketing.\n\ + \n" +"tag": "mmlu_continuation_other" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_marketing" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_medical_genetics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_medical_genetics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3bf63614168f648497d046f015472497a2ac7553 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_medical_genetics.yaml @@ -0,0 +1,6 @@ +"dataset_name": "medical_genetics" +"description": "The following are questions (with answers) about medical\ + \ genetics.\n\n" +"tag": "mmlu_continuation_other" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_medical_genetics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_miscellaneous.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_miscellaneous.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f457800932ec2fba831a1d81e6ca4495816f981f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_miscellaneous.yaml @@ -0,0 +1,6 @@ +"dataset_name": "miscellaneous" +"description": "The following are questions (with answers) about miscellaneous.\n\ + \n" +"tag": 
"mmlu_continuation_other" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_miscellaneous" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_moral_disputes.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_moral_disputes.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0df1392d5baceb1a3dda1464acbb0b025a8428e8 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_moral_disputes.yaml @@ -0,0 +1,6 @@ +"dataset_name": "moral_disputes" +"description": "The following are questions (with answers) about moral\ + \ disputes.\n\n" +"tag": "mmlu_continuation_humanities" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_moral_disputes" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_moral_scenarios.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_moral_scenarios.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bea5e514b85a6ed83026a6fe9d399f92eb59ea99 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_moral_scenarios.yaml @@ -0,0 +1,6 @@ +"dataset_name": "moral_scenarios" +"description": "The following are questions (with answers) about moral\ + \ scenarios.\n\n" +"tag": "mmlu_continuation_humanities" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_moral_scenarios" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_nutrition.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_nutrition.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8db80340b2a9984cb8c3e41766e3f0e89af8f252 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_nutrition.yaml @@ -0,0 +1,6 @@ +"dataset_name": "nutrition" +"description": "The following are questions (with 
answers) about nutrition.\n\ + \n" +"tag": "mmlu_continuation_other" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_nutrition" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_philosophy.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_philosophy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..165de6c90ba1d4756c39e2f5605226dbeb86e314 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_philosophy.yaml @@ -0,0 +1,6 @@ +"dataset_name": "philosophy" +"description": "The following are questions (with answers) about philosophy.\n\ + \n" +"tag": "mmlu_continuation_humanities" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_philosophy" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_prehistory.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_prehistory.yaml new file mode 100644 index 0000000000000000000000000000000000000000..02c4ee7f8af1856f498b7a55c83e085782e36666 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_prehistory.yaml @@ -0,0 +1,6 @@ +"dataset_name": "prehistory" +"description": "The following are questions (with answers) about prehistory.\n\ + \n" +"tag": "mmlu_continuation_humanities" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_prehistory" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_professional_accounting.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_professional_accounting.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bb36a82b9c043b519379626f2d3618efdda9907b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_professional_accounting.yaml @@ -0,0 +1,6 @@ +"dataset_name": "professional_accounting" +"description": 
"The following are questions (with answers) about professional\ + \ accounting.\n\n" +"tag": "mmlu_continuation_other" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_professional_accounting" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_professional_law.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_professional_law.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ac9f2592f41a2bcae43da174d2eb969cf1805251 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_professional_law.yaml @@ -0,0 +1,6 @@ +"dataset_name": "professional_law" +"description": "The following are questions (with answers) about professional\ + \ law.\n\n" +"tag": "mmlu_continuation_humanities" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_professional_law" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_professional_medicine.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_professional_medicine.yaml new file mode 100644 index 0000000000000000000000000000000000000000..328c128377609327abe0460e2d4ab6af716d02c3 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_professional_medicine.yaml @@ -0,0 +1,6 @@ +"dataset_name": "professional_medicine" +"description": "The following are questions (with answers) about professional\ + \ medicine.\n\n" +"tag": "mmlu_continuation_other" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_professional_medicine" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_professional_psychology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_professional_psychology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0cca5bde048a23367aa2ccebc893e9fa71996d98 --- /dev/null +++ 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_professional_psychology.yaml @@ -0,0 +1,6 @@ +"dataset_name": "professional_psychology" +"description": "The following are questions (with answers) about professional\ + \ psychology.\n\n" +"tag": "mmlu_continuation_social_sciences" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_professional_psychology" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_public_relations.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_public_relations.yaml new file mode 100644 index 0000000000000000000000000000000000000000..700c407c2377d8d4d83bbf88d8f7a003a2e2900d --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_public_relations.yaml @@ -0,0 +1,6 @@ +"dataset_name": "public_relations" +"description": "The following are questions (with answers) about public\ + \ relations.\n\n" +"tag": "mmlu_continuation_social_sciences" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_public_relations" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_security_studies.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_security_studies.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4f5ef99e0f8fe8c98bc9994757d9cc6617e3550e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_security_studies.yaml @@ -0,0 +1,6 @@ +"dataset_name": "security_studies" +"description": "The following are questions (with answers) about security\ + \ studies.\n\n" +"tag": "mmlu_continuation_social_sciences" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_security_studies" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_sociology.yaml 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_sociology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e78621aaa547b419f4133b94ce8dcba00c407f5c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_sociology.yaml @@ -0,0 +1,6 @@ +"dataset_name": "sociology" +"description": "The following are questions (with answers) about sociology.\n\ + \n" +"tag": "mmlu_continuation_social_sciences" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_sociology" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_us_foreign_policy.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_us_foreign_policy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..989bb29aa095e83c2744011775864ef27258ca28 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_us_foreign_policy.yaml @@ -0,0 +1,6 @@ +"dataset_name": "us_foreign_policy" +"description": "The following are questions (with answers) about us\ + \ foreign policy.\n\n" +"tag": "mmlu_continuation_social_sciences" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_us_foreign_policy" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_virology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_virology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5c938190bdd755f411914905d5309daa6938f313 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_virology.yaml @@ -0,0 +1,6 @@ +"dataset_name": "virology" +"description": "The following are questions (with answers) about virology.\n\ + \n" +"tag": "mmlu_continuation_other" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_virology" diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_world_religions.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_world_religions.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f707670066d3f2db4554221a12a3983e2d8febf5 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/continuation/mmlu_world_religions.yaml @@ -0,0 +1,6 @@ +"dataset_name": "world_religions" +"description": "The following are questions (with answers) about world\ + \ religions.\n\n" +"tag": "mmlu_continuation_humanities" +"include": "_continuation_template_yaml" +"task": "mmlu_continuation_world_religions" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/default/mmlu_econometrics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/default/mmlu_econometrics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..609c20af2acbdd7ef36104dc97db97a40bfca6a5 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/default/mmlu_econometrics.yaml @@ -0,0 +1,7 @@ +"dataset_name": "econometrics" +"description": "The following are multiple choice questions (with answers) about econometrics.\n\ + \n" +"tag": "mmlu_social_sciences_tasks" +"include": "_default_template_yaml" +"task": "mmlu_econometrics" +"task_alias": "econometrics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml new file mode 100644 index 0000000000000000000000000000000000000000..be1ede3f69b3fe384e616ef96c0ad89701ddb67b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml @@ -0,0 +1,31 @@ +dataset_path: hails/mmlu_no_train # a copy of `cais/mmlu` with no auxiliary_train split +validation_split: validation +test_split: test 
+fewshot_split: dev +fewshot_config: + sampler: first_n +output_type: generate_until +doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step." +doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}" +filter_list: + - name: "get-answer" + filter: + - function: "regex" + regex_pattern: "(?<=answer is )(.*)(?=.)" + - function: "take_first" +generation_kwargs: + until: + - "" + do_sample: false + temperature: 0.0 +num_fewshot: 0 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 2.0 +dataset_kwargs: + trust_remote_code: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_abstract_algebra.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_abstract_algebra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6235d5c0997558a123258cba3dfdb4b844a2fb60 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_abstract_algebra.yaml @@ -0,0 +1,59 @@ +dataset_name: abstract_algebra +description: The following are multiple choice questions (with answers) about abstract + algebra. +fewshot_config: + sampler: first_n + samples: + - question: 'Statement 1 | Every element of a group generates a cyclic subgroup of + the group. Statement 2 | The symmetric group S_10 has 10 elements. + + (A) True, True (B) False, False (C) True, False (D) False, True' + target: Let's think step by step. A cyclic group is a group that is generated + by a single element. Hence a subgroup generated by a single element of a group + is cyclic and Statement 1 is True. The answer is (C). + - question: 'The symmetric group $S_n$ has $n!$ elements, + hence it is not true that $S_{10}$ + has 10 elements. + + Find the characteristic of the ring 2Z.
+ + (A) 0 (B) 3 (C) 12 (D) 30' + target: Let's think step by step. The characteristic of a ring R is $n$ if the + statement $ka = 0$ for all $a\in 2Z$ implies that $k$ is a multiple of $n$. + Assume that $ka = 0$ for all $a\in 2Z$ for some $k$. In particular $2k = 0$. + Hence $k=0$ and $n=0$. The answer is (A). + - question: 'Statement 1| Every function from a finite set onto itself must be one + to one. Statement 2 | Every subgroup of an abelian group is abelian. + + (A) True, True (B) False, False (C) True, False (D) False, True' + target: "Let's think step by step. Statement 1 is true. Let $S$ be a finite set.\ + \ If $f:S \\rightarrow S$ is an onto function, then $|S| = |f(S)|$. If $f$ was\ + \ not one to one, then for finite domain $S$ the image would have less than\ + \ $S$ elements, a contradiction.\nStatement 2 is true. Let $G$ be an abelian\ + \ group and $H$ be a subgroup of $G$. We need to show that $H$ is abelian. Let\ + \ $a,b \\in H$. Then $a,b \\in G$ and $ab=ba$. Since $G$ is abelian, $ab=ba$.\ + \ Since $H$ is a subgroup of $G$, $ab \\in H$. Therefore, $ab=ba$ and $H$ is\ + \ abelian. The answer is (A)." + - question: 'Statement 1 | If aH is an element of a factor group, then |aH| divides + |a|. Statement 2 | If H and K are subgroups of G then HK is a subgroup of G. + + (A) True, True (B) False, False (C) True, False (D) False, True' + target: Let's think step by step. Statement 2 is false. Let $H$ be a subgroup + of $S_3$ generated by the cycle $(1,2)$ and $K$ be a subgroup of $S_3$ generated + by the cycle $(1,3)$. Both $H$ and $K$ have two elements, the generators and + the identity. However $HK$ contains cycles (1,2), (1,3) and (2,3,1), but the + inverse of (2,3,1) is (2,1,3) and it does not belong to HK, hence HK is not + a subgroup. The answer is (B). + - question: 'Find all c in Z_3 such that Z_3[x]/(x^2 + c) is a field. + + (A) 0 (B) 1 (C) 2 (D) 3' + target: 'Let''s think step by step.
Z_3[x]/(x^2 + c) is a field if and only if + x^2 + c does not have roots in Z_3. That is x^2 + c != 0 for every x in Z_3. + If c = 0, then x^2 + c = x^2 has root 0. If c = 1 then x^2 + c = x^2 + 1 = 0 + + 1 for x = 0, 1 + 1 = 2 for x = 1 and 1 + 1 = 2 for x = 2, hence x^2 + 1 does + not have any roots. For c = 2 the polynomial x^2 + 2 has two roots at x = 1 + and x = 2. Hence Z_3[x]/(x^2 + c) is a field if and only if c = 1. The answer + is (B).' +tag: mmlu_flan_cot_fewshot_stem +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_abstract_algebra diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_computer_science.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_computer_science.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5eccde7c6be9620f63bbf3b1de42f32c8e121539 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_computer_science.yaml @@ -0,0 +1,180 @@ +dataset_name: college_computer_science +description: The following are multiple choice questions (with answers) about college + computer science. +fewshot_config: + sampler: first_n + samples: + - question: 'Which of the following regular expressions is equivalent to (describes + the same set of strings as) (a* + b)*(c + d)? + + (A) a*(c + d)+ b(c + d) + + (B) a*(c + d)* + b(c + d)* + + (C) a*(c + d)+ b*(c + d) + + (D) (a + b)*c +(a + b)*d' + target: 'Let''s think step by step. We know that: + + 1. (X* + Y)* = (X + Y)* + + 2. X(Y + Z)? = XY + XZ + + Using equation 1 we can rewrite (a* + b)*(c + d)? as: + + 3. (a + b)*(c + d)? + + Using equation 2 we can rewrite equation 3 as: + + (a + b)*c + (a + b)*d The answer is (D).' + - question: 'The Singleton design pattern is used to guarantee that only a single instance + of a class may be instantiated. Which of the following is (are) true of this + design pattern? + + I. 
The Singleton class has a static factory method to provide its instance. + + II. The Singleton class can be a subclass of another class. + + III. The Singleton class has a private constructor. + + (A) I only + + (B) II only + + (C) III only + + (D) I, II, and III' + target: 'Let''s think step by step. Statement I is a correct statement about a + Singleton, because a Singleton restricts instantiation to a single, static method. + Statement II is also correct, because there is no inherent restriction regarding + the inheritance of a Singleton. Statement III is also correct, because a Singletons + must be instantiated only once, so its constructor is made private to prevent + any construction except via its static factory method. + + Given these facts, statements I, II, and III are all correct. The answer is + (D).' + - question: 'A certain pipelined RISC machine has 8 general-purpose registers R0, R1, + . . . , R7 and supports the following operations: + + ADD Rs1, Rs2, Rd (Add Rs1 to Rs2 and put the sum in Rd) + + MUL Rs1, Rs2, Rd (Multiply Rs1 by Rs2 and put the product in Rd) + + An operation normally takes one cycle; however, an operation takes two cycles + if it produces a result required by the immediately following operation in an + operation sequence. + + Consider the expression AB + ABC + BC, where variables A, B, C are located in + registers R0, R1, R2. If the contents of these three registers must not be modified, + what is the minimum number of clock cycles required for an operation sequence + that computes the value of AB + ABC + BC? + + (A) 5 (B) 6 (C) 7 (D) 8' + target: 'Let''s think step by step. First, we are given that A is in R0, B is + in R1, and C is in R2. + + Next, we can see that we must compute three multiplies (AB, BC, and ABC) and + two adds (AB + ABC, (AB + ABC) + BC) to compute our final answer, resulting + in a minimum of five clock cycles. 
+ + Next, we can see that there is no way to avoid at least one pipeline stall when + computing our final answer, because to compute our final sum we must wait at + least one cycle for the results from the previous stage to be ready. Thus, our + minimum number of cycles must be 6. + + We can verify that we can create a solution that requires only six cycles as + follows: + + compute AB: MUL R0, R1, R3 + + compute BC: MUL R1, R2, R4 + + compute ABC: MUL R3, R4, R5 + + compute AB + BC: ADD R3, R4, R6 + + STALL + + compute AB + ABC + BC: ADD R5, R6, R7 + + So there are 6 cycles. The answer is (B).' + - question: 'A compiler generates code for the following assignment statement. + + G := (A + B) * C - (D + E) * F + + The target machine has a single accumulator and a single-address instruction + set consisting of instructions load, store, add, subtract, and multiply. For + the arithmetic operations, the left operand is taken from the accumulator and + the result appears in the accumulator. The smallest possible number of instructions + in the resulting code is + + (A) 5 (B) 6 (C) 7 (D) 9' + target: 'Let''s think step by step. We can compute the final answer with the following + sequence of operations: + + 1. LOAD D (accumulator = D) + + 2. ADD E (accumulator = D+E) + + 3. MUL F (accumulator = (D+E)*F) + + 4. STORE X (X = (D+E)*F) + + 5. LOAD A (accumulator = A) + + 6. ADD B (accumulator = A+B) + + 7. MUL C (accumulator = (A+B)*C) + + 8. SUB X (accumulator = (A+B)*C - (D+E)*F) + + 9. STORE G (G = (A+B)*C - (D+E)*F) + + This sequence takes 9 instructions. The answer is (D).' + - question: 'Consider a computer design in which multiple processors, each with a private + cache memory, share global memory using a single bus. This bus is the critical + system resource. Each processor can execute one instruction every 500 nanoseconds + as long as memory references are satisfied by its local cache. 
When a cache + miss occurs, the processor is delayed for an additional 2,000 nanoseconds. During + half of this additional delay, the bus is dedicated to serving the cache miss. + During the other half, the processor cannot continue, but the bus is free to + service requests from other processors. On average, each instruction requires + 2 memory references. On average, cache misses occur on 1 percent of references. + What proportion of the capacity of the bus would a single processor consume, + ignoring delays due to competition from other processors? + + (A) 1/50 (B) 1/27 (C) 1/25 (D) 2/27' + target: 'Let''s think step by step. We know that each instruction requires two + memory references per instruction, and that there is an average cache miss rate + of one percent. + + Thus a given processor has: + + (1 cache miss / 100 references) * (2 references / instruction) = + + (2 cache misses / 100 instructions), so: + + misses_per_instruction = 1 cache miss / 50 instructions. + + Next, we know that each instruction requires 500 nanoseconds when there is no + cache miss, and 500 + 2000 = 2500 nanoseconds when there is a cache miss. Thus: + + 50 instructions / (49 * 500) + (1 * 2500) nanoseconds, so: + + instructions_per_ns = 50 instructions / 27000 nanoseconds. + + Now, we know that each cache miss locks the bus for half of the 2000 nanosecond + cache miss delay, or 1000 nanoseconds, so: + + lock_ns_per_miss = 1000 nanoseconds / cache miss. + + Thus we can see that on average a single processor will lock the bus for: + + lock_ns_per_miss * misses_per_instruction * instructions_per_ns = + + (1000 nanoseconds / cache miss) * (1 cache miss / 50 instructions) * (50 instructions + / 27000 nanoseconds) = 1000 * (1/50) * (50/27000) = 1000/27000 = 1/27. The answer + is (B).' 
+tag: mmlu_flan_cot_fewshot_stem +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_college_computer_science diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_elementary_mathematics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_elementary_mathematics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1fb9eef860424b49adf108566edadaf1500dbf5f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_elementary_mathematics.yaml @@ -0,0 +1,77 @@ +dataset_name: elementary_mathematics +description: The following are multiple choice questions (with answers) about elementary + mathematics. +fewshot_config: + sampler: first_n + samples: + - question: 'Olivia used the rule "Add 11" to create the number pattern shown below. + 10, 21, 32, 43, 54. Which statement about the number pattern is true? + + (A) The 10th number in the pattern will be an even number. + + (B) The number pattern will never have two even numbers next to each other. + + (C) The next two numbers in the pattern will be an even number then an odd number. + + (D) If the number pattern started with an odd number then the pattern would + have only odd numbers in it.' + target: Let's think step by step. Choice A is incorrect because every even-numbered + term in the pattern is odd, and 10 is an even number. Choice B is correct, because + adding an odd number (in this case 11) to an odd number produces an even number, + and adding an odd number to an even number produces an odd number. Thus the + terms in the pattern will alternate between odd and even, so there will never + be two even numbers next to each other. Choice C is incorrect because the last + term in the example is even (54), and we know that the terms will alternate + between even and odd. 
Choice D is incorrect because the terms in the pattern + will alternate between odd and even, regardless of the value of the first term. + The answer is (B). + - question: 'The population of the city where Michelle was born is 145,826. What is + the value of the 5 in the number 145,826? + + (A) 5 thousands + + (B) 5 hundreds + + (C) 5 tens + + (D) 5 ones' + target: 'Let''s think step by step. Choice A is correct, because there are three + digits following the 5, so + + the 5 is in the thousands place. Thus the other choices are incorrect. The answer + is (A).' + - question: "A store sells 107 different colors of paint. They have 25 cans of each\ + \ color in storage. The number of cans of paint the store has in storage can\ + \ be found using the expression below. 107 \xD7 25. How many cans of paint does\ + \ the store have in storage?\n(A) 749\n(B) 2,675\n(C) 2,945\n(D) 4,250" + target: Let's think step by step. We can calculate 107 x 25 = (100 x 25) + (7 + x 25) = 2500 + 175 = 2675. The answer is (B). + - question: 'A total of 30 players will play basketball at a park. There will be exactly + 5 players on each team. Which statement correctly explains how to find the number + of teams needed? + + (A) Add 5 to 30 to find 35 teams. + + (B) Divide 30 by 5 to find 6 teams. + + (C) Multiply 30 and 5 to find 150 teams. + + (D) Subtract 5 from 30 to find 25 teams.' + target: Let's think step by step. We want to find the number of teams. We know + that there are 5 players/team, and 30 players. Thus to get the number of teams + we divide players by players/team, so 30 players / 5 players/team = 6 teams. + The answer is (B). + - question: 'Which expression is equivalent to 5 x 9? + + (A) (5 x 4) x (6 x 5) + + (B) (5 x 5) + (5 x 4) + + (C) (5 x 5) + (5 x 9) + + (D) (5 x 9) x (6 x 9)' + target: 'Let''s think step by step. We know that 9 = (5 + 4), so 5 x 9 = 5 x (5 + + 4) = (5 x 5) + (5 x 4). The answer is (B).' 
+tag: mmlu_flan_cot_fewshot_stem +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_elementary_mathematics diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_biology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_biology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0dfb19f924761c6dd56cc6b3b9ada38b5bf473e0 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_biology.yaml @@ -0,0 +1,69 @@ +dataset_name: high_school_biology +description: The following are multiple choice questions (with answers) about high + school biology. +fewshot_config: + sampler: first_n + samples: + - question: "In animal cells, which of the following represents the most likely pathway\ + \ that a secretory protein takes as it is synthesized in a cell?\n(A) Plasma\ + \ membrane\u2013Golgi apparatus\u2013ribosome\u2013secretory vesicle\u2013rough\ + \ ER (B) Ribosome\u2013Golgi apparatus\u2013rough ER\u2013secretory vesicle\u2013\ + plasma membrane (C) Plasma membrane\u2013Golgi apparatus\u2013ribosome\u2013\ + secretory vesicle\u2013rough ER (D) Ribosome\u2013rough ER\u2013Golgi apparatus\u2013\ + secretory vesicle\u2013plasma membrane" + target: Let's think step by step. Protein synthesis starts at the ribosome, so + we can eliminate (A) and (C). The ribosome is often in the endoplasmic reticulum + and moves from there to the Golgi apparatus, where it is modified and packaged + into a vesicle. The vesicle then floats to the plasma membrane and is secreted. + The answer is (D). + - question: "A mutation in a bacterial enzyme changed a previously polar amino acid\ + \ into a nonpolar amino acid. This amino acid was located at a site distant\ + \ from the enzyme\u2019s active site. 
How might this mutation alter the enzyme\u2019\ + s substrate specificity?\n(A) By changing the enzyme\u2019s pH optimum (B) By\ + \ changing the enzyme\u2019s location in the cell (C) By changing the shape\ + \ of the protein (D) An amino acid change away from the active site cannot alter\ + \ the enzyme\u2019s substrate specificity." + target: Let's think step by step. A change in an amino acid leads to a change + in the primary structure of the protein. A change in the primary structure may + lead to a change in the secondary and the tertiary structure of the protein. + A change in the tertiary structure means a change in the shape of the protein, + so (C) has to be correct. Since the change does not affect the active site of + the enzyme, we do not expect the activity of the enzyme to be affected. The + answer is (C). + - question: 'Which of the following is not a way to form recombinant DNA? + + (A) Translation (B) Conjugation (C) Specialized transduction (D) Transformation' + target: 'Let''s think step by step. The introduction of foreign DNA or RNA into + bacteria or eukaryotic cells is a common technique in molecular biology and + scientific research. There are multiple ways foreign DNA can be introduced into + cells including transformation, transduction, conjugation, and transfection. + In contrast, (A) is not a way to form DNA: during translation the ribosomes + synthesize proteins from RNA. The answer is (A).' + - question: 'Homologous structures are often cited as evidence for the process of natural + selection. All of the following are examples of homologous structures EXCEPT + + (A) the wings of a bird and the wings of a bat (B) the flippers of a whale and + the arms of a man (C) the pectoral fins of a porpoise and the flippers of a + seal (D) the forelegs of an insect and the forelimbs of a dog' + target: "Let's think step by step. 
Homologous structures are similar\ + \ physical features in organisms that share a common ancestor but\ + \ different functions. Comparisons (B) and (C) are clearly homologous because\ + \ they share a common ancestor and the structures serve different purposes.\ + \ Bat wings and bird wings are also homologous, while they are both wings, the\ + \ forelimbs serve different purposes. Insects and dogs are very far ancestors\ + \ since one is vertebrate while the other is invertebrate and the forelimbs\ + \ serve the same purpose, so they are not homologous. The answer is (D)." + - question: 'Which of the following is not known to be involved in the control of cell + division? + + (A) Cyclins (B) Protein kinases (C) Checkpoints (D) Fibroblast cells' + target: 'Let''s think step by step. Normal cells move through the cell cycle in + a regulated way. At the checkpoint stage, they use information about their own + internal state and cues from the environment around them to decide whether to + proceed with cell division. Cues like these act by changing the activity of + core cell cycle regulators inside the cell. The most common regulators are cyclins + and cyclin-dependent kinases. Fibroblast cells do not play any role in cell + division. The answer is (D).'
+tag: mmlu_flan_cot_fewshot_stem +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_high_school_biology diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_computer_science.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_computer_science.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6f21030ae880ec0d6d42f0e2618c312b55b82549 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_computer_science.yaml @@ -0,0 +1,84 @@ +dataset_name: high_school_computer_science +description: The following are multiple choice questions (with answers) about high + school computer science. +fewshot_config: + sampler: first_n + samples: + - question: 'Which of the following is an example of the use of a device on the Internet + of Things (IoT) ? + + (A) A car alerts a driver that it is about to hit an object. (B) A hiker uses + a G P S watch to keep track of her position. (C) A refrigerator orders milk + from an online delivery service when the milk in the refrigerator is almost + gone. (D) A runner uses a watch with optical sensors to monitor his heart rate.' + target: Let's think step by step. The term Internet of Things (IoT) refers to + common devices which are connected to the internet, enabling new functionality. + Choice A is incorrect because it does not describe an internet connected device. + In choice B, the watch is only described as having GPS functionality but no + internet connectivity. Choice C describes a common device (a refrigerator) which + has internet connectivity enabling new functionality (online ordering). Choice + D does not mention internet connectivity for the watch, only optical sensors. + The answer is (C). + - question: 'Many Web browsers allow users to open anonymous windows. 
During a browsing + session in an anonymous window, the browser does not record a browsing history + or a list of downloaded files. When the anonymous window is exited, cookies + created during the session are deleted. Which of the following statements about + browsing sessions in an anonymous window is true? + + (A) The activities of a user browsing in an anonymous window will not be visible + to people who monitor the user''s network, such as the system administrator. + (B) Items placed in a Web store''s shopping cart for future purchase during + the anonymous browsing session will not be saved on the user''s computer. (C) + A user will not be able to log in to e-mail or social media accounts during + the anonymous browsing session. (D) A user browsing in an anonymous window will + be protected from viruses launched from any web sites visited or files downloaded.' + target: "Let's think step by step. Choice A is incorrect as it only describes\ + \ network traffic, which an anonymous browser does not change. Choice B is correct\ + \ as it correctly describes how an anonymous browser will prevent saving data\ + \ on the user\u2019s computer after the session is ended. Choice C is incorrect\ + \ because an anonymous browser will not prevent logging in to email or social\ + \ media accounts. Choice D is incorrect because an anonymous browser in itself\ + \ performs no virus protection. The answer is (B)." + - question: "In the program below, the initial value of X is 5 and the initial value\ + \ of Y is 10.\nIF (X < 0){\n DISPLAY (\"Foxtrot\")\n} ELSE {\n IF (X > Y){\n\ + \ DISPLAY (\"Hotel\")\n } ELSE {\n IF (Y > 0){\n DISPLAY (\"November\")\n\ + \ } ELSE {\n DISPLAY (\"Yankee\")\n }\n }\n}\nWhat is displayed as a result\ + \ of running the program?\n(A) Foxtrot (B) Hotel (C) November (D) Yankee" + target: Let's think step by step. Because X has the value 5, the first conditional + IF (X < 0) is false, so we move to the first ELSE clause. 
Because X is 5 and + Y is 10, the second conditional IF (X > Y) is false, so we move to the following + ELSE clause. Since Y is 10, the conditional IF (Y > 0) is true, so the command + DISPLAY ("November") is executed. The answer is (C). + - question: 'What is the output of "abc"[::-1] in Python 3? + + (A) Error (B) abc (C) cba (D) c' + target: Let's think step by step. We know that the slicing operator [::-1] takes + all of the elements in the string in reverse order, so we reverse the order + of the string "abc", resulting in "cba". The answer is (C). + - question: "A list of numbers has n elements, indexed from 1 to n. The following algorithm\ + \ is intended to display the number of elements in the list that have a value\ + \ greater than 100. The algorithm uses the variables count and position. Steps\ + \ 3 and 4 are missing.\n Step 1: Set count to 0 and position to 1.\n Step 2:\ + \ If the value of the element at index position is greater than 100, increase\ + \ the value of count by 1.\n Step 3: (missing step)\n Step 4: (missing step)\n\ + \ Step 5: Display the value of count.\nWhich of the following could be used\ + \ to replace steps 3 and 4 so that the algorithm works as intended?\n(A) Step\ + \ 3: Increase the value of position by 1.\n Step 4: Repeat steps 2 and 3 until\ + \ the value of count is greater than 100.\n(B) Step 3: Increase the value of\ + \ position by 1.\n Step 4: Repeat steps 2 and 3 until the value of position\ + \ is greater than n.\n(C) Step 3: Repeat step 2 until the value of count is\ + \ greater than 100.\n Step 4: Increase the value of position by 1.\n(D) Step\ + \ 3: Repeat step 2 until the value of position is greater than n.\n Step 4:\ + \ Increase the value of count by 1." + target: 'Let''s think step by step. Choice A is incorrect, because its Step 4 + has an incorrect termination condition, stopping when count is greater than + 100. We need to stop after inspecting all elements in the list. 
Choice B is + correct because it correctly increments both count and position, and correctly + repeats these steps and terminates when all elements in the list have been inspected. + Choice C is incorrect because it incorrectly increments the variable count until + its value is greater than 100, regardless of the elements in the list. Choice + D is incorrect because its step 3 does not increment the value of position, + so it will repeat forever. The answer is (B).' +tag: mmlu_flan_cot_fewshot_stem +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_high_school_computer_science diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_us_history.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_us_history.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8cea5109f6570086dce3cf1815dc50f1889d80ad --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_us_history.yaml @@ -0,0 +1,156 @@ +dataset_name: high_school_us_history +description: The following are multiple choice questions (with answers) about high + school us history. +fewshot_config: + sampler: first_n + samples: + - question: "This question refers to the following information.\nI come not to urge\ + \ personal claims, nor to seek individual benefits; I appear as the advocate\ + \ of those who cannot plead their own cause; I come as the friend of those who\ + \ are deserted, oppressed, and desolate. In the Providence of God, I am the\ + \ voice of the maniac whose piercing cries from the dreary dungeons of your\ + \ jails penetrate not your Halls of Legislation. I am the Hope of the poor crazed\ + \ beings who pine in the cells, and stalls, and cages, and waste rooms of your\ + \ poor-houses. 
I am the Revelation of hundreds of wailing, suffering creatures,\ + \ hidden in your private dwellings, and in pens and cabins\u2014shut out, cut\ + \ off from all healing influences, from all mind-restoring cares.\u2026 Could\ + \ their melancholy histories be spread before you as revealed to my grieved\ + \ spirit during the last three months, how promptly, how earnestly would you\ + \ search out the most approved means of relief; how trifling, how insignificant,\ + \ by comparison, would appear the sacrifices you are asked to make; how would\ + \ a few dimes and dollars, gathered from each citizen, diminish in value as\ + \ a possession, compared with the certain benefits and vast good to be secured\ + \ for the suffering insane...by the consecration and application of a sufficient\ + \ fund to the construction of a suitable hospital.\u2026\n\u2014Dorothea Dix,\ + \ Memorial Soliciting a State Hospital for the Protection and Cure of the Insane,\n\ + Submitted to the General Assembly of North Carolina, November 1848\nDorothea\ + \ Dix can best be compared to whom?\n(A) Abigail Adams (B) Clara Barton (C)\ + \ Shirley Temple (D) Hillary Clinton" + target: Let's think step by step. We refer to Wikipedia articles on us history + for help. Both Dorothea Dix and Clara Barton are American nurses. The answer + is (B). + - question: "This question refers to the following information.\n\"As our late Conduct\ + \ at the Conestoga Manor and Lancaster have occasioned much Speculation & a\ + \ great diversity of Sentiments in this and neighboring Governments; some vindicating\ + \ & others condemning it; some charitably alleviating the Crime, & others maliciously\ + \ painting it in the most odious & detestable Colours, we think it our duty\ + \ to lay before the Publick, the whole Matter as it appeared, & still appears,\ + \ to us. . . 
.\n\"If these things are not sufficient to prove an unjustifiable\ + \ Attachment in the Quakers to the Indians Savages, a fixed Resolution to befriend\ + \ them & an utter insensibility to human Distresses, let us consider a few more\ + \ recent Facts. When we found the last Summer that we were likely to get no\ + \ Assistance from the Government, some Volunteers went out at our own Expense,\ + \ determined to drive our Enemies from our Borders; & when we came near to the\ + \ great Island, we understood that a Number of their Warriors had gone out against\ + \ our Frontiers. Upon this we returned and came up with them and fought with\ + \ them at the Munfey Hill where we lost some of our Men & killed some of their\ + \ Warriors & thereby saved our Frontiers from this Story in another Expedition.\ + \ But no sooner had we destroyed their Provisions on the great Island, & ruined\ + \ their trade with the good People at Bethlehem, but these very Indians, who\ + \ were justly suspected of having murdered our Friends in Northampton County,\ + \ were by the Influence of some Quakers taken under the Protection of the Government\ + \ to screen them from the Resentments of the Friends and Relations of the Murdered,\ + \ & to support them thro the Winter.\"\n\u2014\"Apology of the Paxton Boys\"\ + \ (pamphlet), 1764 (Note: \"apology\" in this context should be read as an explanation,\ + \ not an admission of guilt or regret.\nThe sentiments expressed in the explanation\ + \ above reflect which of the ongoing tensions during the colonial period of\ + \ American history?\n(A) Tensions between British policies and the aspirations\ + \ of North American colonists. (B) Tensions between American Indians allied\ + \ with the French and those allied with the British. (C) Tensions between freed\ + \ African Americans and white planters. (D) Tensions between backcountry settlers\ + \ and elites within colonial America." + target: Let's think step by step. 
We refer to Wikipedia articles on us history + for help. After the French and Indian War, the Scotch-Irish settlers attacked + American Indians. After the attacks on the Conestoga, about 250 Paxton Boys + present their grievances to the Pennsylvania legislature. As mentioned in the + information, the Paxton Boys cited resentment at local elites. The answer is + (D). + - question: "This question refers to the following information.\nOur leaders talk about\ + \ stopping aggression from the north, but this was a struggle among groups of\ + \ Vietnamese until we intervened. We seem bent upon saving the Vietnamese from\ + \ Ho Chi Minh even if we have to kill them and demolish their country to do\ + \ it. As the native people survey bombed-out villages, women and children burned\ + \ by napalm, rice crops destroyed and cities overrun with our military personnel,\ + \ they are doubtless saying secretly of the Vietcong guerillas and of the American\ + \ forces, \"A plague on both your houses.\" \u2026 Stop the bombing, north and\ + \ south, end search and destroy offensive sweeps, and confine our military action\ + \ to holding operations on the ground. Bombing the north has failed to halt\ + \ or seriously check the flow of troops to the south and may, in fact, have\ + \ prompted a much greater war effort by Hanoi.\n\u2014Senator George McGovern,\ + \ \"The Lessons of Vietnam,\" April 25, 1967\nWhich of the following opinions\ + \ from the 1960s most directly reflects the perspective of George McGovern's\ + \ speech?\n(A) Americans must maximize their technological edge in Vietnam.\ + \ (B) American bombing in Vietnam is step by step leading to progress in the\ + \ war. (C) American bombing in Vietnam is a failure. (D) America must not give\ + \ in to defeatism about the war in Vietnam." + target: Let's think step by step. 
"Stop the bombing" and "Bombing the north has failed to halt or seriously + check the flow of troops to the south" indicate that the perspective of George + McGovern's speech is that American bombing in Vietnam is a failure. The answer + is (C). + - question: "This question refers to the following information.\n\"In the new Code\ + \ of Laws which I suppose it will be necessary for you to make I desire you\ + \ would Remember the Ladies, and be more generous and favorable to them than\ + \ your ancestors. Do not put such unlimited power into the hands of the Husbands.\ + \ Remember all Men would be tyrants if they could. If particular care and attention\ + \ is not paid to the Ladies we are determined to foment a Rebellion, and will\ + \ not hold ourselves bound by any Laws in which we have no voice, or Representation.\"\ + \nAbigail Adams, in a letter to John Adams, 1776\n\"Special legislation for\ + \ woman has placed us in a most anomalous position. Women invested with the\ + \ rights of citizens in one section\u2014voters, jurors, office-holders\u2014\ + crossing an imaginary line, are subjects in the next. In some States, a married\ + \ woman may hold property and transact business in her own name; in others,\ + \ her earnings belong to her husband. In some States, a woman may testify against\ + \ her husband, sue and be sued in the courts; in others, she has no redress\ + \ in case of damage to person, property, or character. In case of divorce on\ + \ account of adultery in the husband, the innocent wife is held to possess no\ + \ right to children or property, unless by special decree of the court. But\ + \ in no State of the Union has the wife the right to her own person, or to any\ + \ part of the joint earnings of the co-partnership during the life of her husband.\ + \ In some States women may enter the law schools and practice in the courts;\ + \ in others they are forbidden. 
In some universities girls enjoy equal educational\ + \ advantages with boys, while many of the proudest institutions in the land\ + \ deny them admittance, though the sons of China, Japan and Africa are welcomed\ + \ there. But the privileges already granted in the several States are by no\ + \ means secure.\"\nSusan B. Anthony, \"Declaration of Rights for Women,\" July\ + \ 4, 1876\nThe sentiments expressed in the second excerpt by Susan B. Anthony\ + \ are most likely in support of\n(A) the Equal Rights Amendment (B) universal\ + \ suffrage (C) states' rights (D) prohibition" + target: Let's think step by step. We refer to Wikipedia articles on us history + for help. The above information mentioned that women are in an anomalous position + in terms of legislation. Women's earnings do not belong to themselves, or they + cannot testify against her husbands. Susan believes women should have equal + legal rights as men. The answer is (B). + - question: 'This question refers to the following information. + + "Society in every state is a blessing, but government even in its best state + is but a necessary evil; in its worst state an intolerable one; for when we + suffer, or are exposed to the same miseries by a government, which we might + expect in a country without government, our calamity is heightened by reflecting + that we furnish the means by which we suffer. Government, like dress, is the + badge of lost innocence; the palaces of kings are built on the ruins of the + bowers of paradise. For were the impulses of conscience clear, uniform, and + irresistibly obeyed, man would need no other lawgiver; but that not being the + case, he finds it necessary to surrender up a part of his property to furnish + means for the protection of the rest; and this he is induced to do by the same + prudence which in every other case advises him out of two evils to choose the + least. 
Wherefore, security being the true design and end of government, it unanswerably + follows that whatever form thereof appears most likely to ensure it to us, with + the least expense and greatest benefit, is preferable to all others." + + Thomas Paine, Common Sense, 1776 + + Which of the following "miseries" alluded to above were most condemned by Anti-Federalists + of the post-Revolutionary era? + + (A) Organized response to Bacon''s Rebellion (B) Federal response to Shays''s + Rebellion (C) Federal response to the Whiskey Rebellion (D) Federal response + to Pontiac''s Rebellion' + target: 'Let''s think step by step. We refer to Wikipedia articles on us history + for help. Anti-Federalists do not believe centralized government power, and + suspect Washington''s military response to Whiskey Rebellion. Bacon''s Rebellion + and Pontiac''s Rebellion happen before the Revolution and they can be ruled + out. The answer is (C).' +tag: mmlu_flan_cot_fewshot_humanities +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_high_school_us_history diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_world_history.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_world_history.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2fbdaf05c137270f4ff4207e7c6ce81c2a34d30c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_world_history.yaml @@ -0,0 +1,100 @@ +dataset_name: high_school_world_history +description: The following are multiple choice questions (with answers) about high + school world history. +fewshot_config: + sampler: first_n + samples: + - question: "This question refers to the following information.\n\"At least one of\ + \ the [world's] societies would have to somehow enormously increase its productivity\ + \ [in order to achieve global hegemony]. 
That quantum jump would have to be\ + \ made before the various scientific, technological, agricultural, and industrial\ + \ revolutions on which our post-quantum-leap world rests. It could only be accomplished\ + \ by exploiting the ecosystems, mineral resources, and human assets of whole\ + \ continents outside the lands of the society making the jump. Western Europe\ + \ did just that by means of its brutality and guns and, more important, by geographical\ + \ and ecological luck.\"\nCopyright \xA9 2015 Cambridge University Press.\n\ + Alfred Crosby, historian, Ecological Imperialism, 2004\nThe \"quantum jump\"\ + \ mentioned in the passage most directly contributed to which of the following\ + \ developments in the period 1450\u20131750 C.E.?\n(A) A breakdown in trade\ + \ routes through the collapse of the established state structure (B) An increase\ + \ in the population of the world through more plentiful supplies of food (C)\ + \ The spread of Chinese and Indian belief systems across the world (D) An increase\ + \ in social unrest" + target: Let's think step by step. We refer to Wikipedia articles on world history + for help. The "quantum jump" mentioned in the passage refers to the conquest + of the New World and the Columbian Exchange. Choice (A) and (C) did not happen + in history. Choice (C) refers to the human assets. The answer is (B). + - question: "This question refers to the following information.\n\"The struggle against\ + \ neo-colonialism is not aimed at excluding the capital of the developed world\ + \ from operating in less developed countries. It is aimed at preventing the\ + \ financial power of the developed countries being used in such a way as to\ + \ impoverish the less developed.\nNon-alignment, as practiced by Ghana and many\ + \ other countries, is based on co-operation with all States whether they be\ + \ capitalist, socialist or have a mixed economy. 
Such a policy, therefore, involves\ + \ foreign investment from capitalist countries, but it must be invested in accordance\ + \ with a national plan drawn up by the government of the non-aligned State with\ + \ its own interests in mind. The issue is not what return the foreign investor\ + \ receives on his investments\u2026The question is one of power. A State in\ + \ the grip of neo-colonialism is not master of its own destiny.\"\nKwame Nkrumah,\ + \ Neo-Colonialism, 1965\nWhich of the following provides the best context for\ + \ Nkrumah's writings?\n(A) The Industrial Revolution (B) Decolonization (C)\ + \ Regional Free Trade Associations (D) Autarky" + target: Let's think step by step. We refer to Wikipedia articles on world history + for help. The passage expresses a point that the successful fight against neo-colonialism + was in danger and the newly independent nations like Ghana may be re-colonized + via financial power of the developed countries. The answer is (B). + - question: "This question refers to the following information.\n\"Indeed, as both\ + \ the fatwas of distinguished [scholars] who base their opinion on reason and\ + \ tradition alike and the consensus of the Sunni community agree that the ancient\ + \ obligation of extirpation, extermination, and expulsion of evil innovation\ + \ must be the aim of our exalted aspiration, for \"Religious zeal is a victory\ + \ for the Faith of God the Beneficent\"; then, in accordance with the words\ + \ of the Prophet (Peace upon him!) 
\"Whosoever introduces evil innovation into\ + \ our order must be expelled\" and \"Whosoever does aught against our order\ + \ must be expelled,\" action has become necessary and exigent\u2026\"\nLetter\ + \ from Ottoman Sultan Selim I to Safavid Shah Ismail I, 1514\nThe letter from\ + \ Selim I is most clearly an example of which of the following?\n(A) The maintenance\ + \ of military supremacy at all costs (B) Expanding tensions between religious\ + \ sects (C) Factors that brought about the collapse of the Ottoman Empire (D)\ + \ Peacemaking efforts among the Islamic empires" + target: Let's think step by step. We refer to Wikipedia articles on world history + for help. The passage is an example of expanding tensions between Selim and + Ismail. In the passage the Selim references the fatwa and the consensus of the + Sunni community to against whosoever introduces evil. The answer is (B). + - question: 'This question refers to the following information. + + "The real grievance of the worker is the insecurity of his existence; he is + not sure that he will always have work, he is not sure that he will always be + healthy, and he foresees that he will one day be old and unfit to work. If he + falls into poverty, even if only through a prolonged illness, he is then completely + helpless, left to his own devices, and society does not currently recognize + any real obligation towards him beyond the usual help for the poor, even if + he has been working all the time ever so faithfully and diligently. The usual + help for the poor, however, leaves a lot to be desired, especially in large + cities, where it is very much worse than in the country." + + Otto von Bismarck, 1884 + + Otto von Bismarck likely made this speech in reaction to which of the following + issues? 
+ + (A) Social acceptance of child labor (B) Declining life expectancy in Germany + (C) Criticisms of German trade tariffs (D) Negative effects attributed to industrial + capitalism' + target: Let's think step by step. We refer to Wikipedia articles on world history + for help. The passage talks about the grievance of the work under the industrial + capitalism. The answer is (D). + - question: "This question refers to the following information.\nHe contains all works\ + \ and desires and all perfumes and all tastes. He enfolds the whole universe\ + \ and in silence is loving to all. This is the Spirit that is in my heart, this\ + \ is Brahman. To him I shall come when I go beyond this life, and to him will\ + \ come he who has faith and doubts not.\n\u2014The Upanishads, India, c. 1000\ + \ BCE\nTo which religion does the speaker most likely belong?\n(A) Hinduism\ + \ (B) Buddhism (C) Shintoism (D) Zoroastrianism" + target: 'Let''s think step by step. We refer to Wikipedia articles on world history + for help. Brahman refers to the ultimate reality of all things in the Hindu + religion. In contrast, Buddhism does not have a concept of supreme God. The + answer is (A).' +tag: mmlu_flan_cot_fewshot_humanities +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_high_school_world_history diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_aging.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_aging.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3eec010845fa68ad974bdb7cd922a0028365d96e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_aging.yaml @@ -0,0 +1,42 @@ +dataset_name: human_aging +description: The following are multiple choice questions (with answers) about human + aging. 
+fewshot_config: + sampler: first_n + samples: + - question: 'All other things being equal, which of the following persons is more likely + to show osteoporosis? + + (A) An older Hispanic American woman (B) An older African American woman (C) + An older Asian American woman (D) An older Native American woman' + target: Let's think step by step. We refer to Wikipedia articles on human aging + for help. Although osteoporosis can occur at any age, the risk is higher for + older people. It is most common in Asian and non-Hispanic white women. The answer + is (C). + - question: 'The finding that adults tend to remember events from their adolescence + better than from other periods in their lives is referred to as the + + (A) Adolescence advantage (B) Reminiscence bump (C) Memorial memorial (D) Quadratic + retrieval spike' + target: Let's think step by step. We refer to Wikipedia articles on human aging + for help. Reminiscence bump is a phenomenon that older adults tend to recollect + events during their young ages. People usually have a period of childhood amnesia + from birth to around age 5, and a reminiscence bump between 10 and 30. The answer + is (B). + - question: 'Which element in tobacco smoke is responsible for cancers? + + (A) Nicotine (B) Tar (C) Carbon monoxide (D) Smoke particles' + target: Let's think step by step. We refer to Wikipedia articles on human aging + for help. The benzene, acrylamide and acrylonitrile in tar interact with the + lungs and cause DNA mutations in cells of the lungs, and lead to cancer. The + answer is (B). + - question: 'When older adults move to a new state after retirement, which of the following + is the more likely destination? + + (A) Texas (B) California (C) Hawaii (D) Vermont' + target: 'Let''s think step by step. We refer to Wikipedia articles on human aging + for help. Texas does not have state tax, and has low cost of living compared + with the other three options. The answer is (A).' 
+tag: mmlu_flan_cot_fewshot_other +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_human_aging diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_sexuality.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_sexuality.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dab78f0c5ec1042d23240bb71f59b212885585aa --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_sexuality.yaml @@ -0,0 +1,50 @@ +dataset_name: human_sexuality +description: The following are multiple choice questions (with answers) about human + sexuality. +fewshot_config: + sampler: first_n + samples: + - question: 'The most common disorder among men who seek sexual therapy is: + + (A) premature ejaculation (B) inhibited ejaculation (C) erectile disorder (D) + ejaculatory disorder' + target: Let's think step by step. We refer to Wikipedia articles on human sexuality + for help. The most common type of arousal disorder for men is erectile dysfunction, + meaning that a person is not able to get any physical satisfaction from sexual + activity although he may be interested in it. The answer is (C). + - question: 'A woman who knows she has active herpes and untreated syphilis but continues + to have sex without informing her partners of her condition has, in psychoanalytic + terms: + + (A) a strong ego (B) a weak superego (C) a weak id (D) a strong superego' + target: Let's think step by step. We refer to Wikipedia articles on human sexuality + for help. A person with weak superego tends to be delinquent, criminal or have + antisocial personality. The action of the woman who knows she has active venereal + disease but still have sex with her partners indicate she may has antisocial + personality. The answer is (B). 
+ - question: 'The nature of homosexual activities that occur during preadolescence include + all but which of the following? + + (A) sexual intercourse (B) circle jerks (C) exhibitionism (D) touching each + other''s genitals' + target: Let's think step by step. We refer to Wikipedia articles on human sexuality + for help. Sexual intercourse is prohibited during preadolescence. The answer + is (A). + - question: 'Women''s ability to have multiple orgasms is primarily due to: + + (A) the fact that they do not have a refractory period. (B) the response of + the inner layers of the vagina. (C) having alternating orgasms in different + locations. (D) the G-Spot.' + target: Let's think step by step. We refer to Wikipedia articles on human sexuality + for help. The refractory period is the time when a person is not able to be + erect or is not interested in sex. The answer is (A). + - question: 'Morning sickness is typically a problem: + + (A) during the first trimester (B) during the second trimester (C) during the + third trimester (D) all through the pregnancy' + target: 'Let''s think step by step. We refer to Wikipedia articles on human sexuality + for help. Morning sickness usually begins by nine weeks after conception, corresponding + to the first trimester. The answer is (A).' 
+tag: mmlu_flan_cot_fewshot_social_sciences +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_human_sexuality diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_logical_fallacies.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_logical_fallacies.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dd72cab8cf8ea0553a633810e59c0ba4059c2176 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_logical_fallacies.yaml @@ -0,0 +1,61 @@ +dataset_name: logical_fallacies +description: The following are multiple choice questions (with answers) about logical + fallacies. +fewshot_config: + sampler: first_n + samples: + - question: 'When an arguer causes confusion during refutation because of real or feigned + lack of an ability to engage in refutation, that arguer may have committed the + fallacy of + + (A) poor sportsmanship (B) appeal to compassion (C) argument against the person + (D) ignorance of refutation' + target: Let's think step by step. We refer to Wikipedia articles on logical fallacies + for help. Ignorance of refutation, one of Aristotle's original list of logical + fallacies in his Organon, is when someone causes confusion in an argument through + real or feigned inability to engage in refutation, in order to win the argument. + The answer is (D). + - question: 'The complex question fallacy consists of + + (A) arguing something is inferior just because it doesn''t do something it was + never intended to do. (B) including more than one claim in the proposition and + treating proof for one claim as proof for all the claims. (C) drawing a conclusion + before examining the evidence, and only considering evidence that supports that + conclusion. (D) asking a question that includes either an unproven assumption + or more than one question, thus making a straightforward yes or no answer meaningless.' 
+ target: Let's think step by step. We refer to Wikipedia articles on logical fallacies + for help. The complex question fallacy is when someone makes a single yes or + no answer to a question meaningless, by including either an unproven assumption + or many questions. The latter is also known as the many questions fallacy. The + answer is (D). + - question: 'Arguing that what is true of the parts must be true of the whole is the + fallacy of... + + (A) Division (B) Composition (C) Appeal to the person (D) Appeal to ignorance' + target: Let's think step by step. We refer to Wikipedia articles on logical fallacies + for help. Fallacy of composition occurs when someone argues what is true of + the parts must be true of the whole. The answer is (B). + - question: 'Which of the following is true of a valid categorical syllogism? + + (A) The minor premise must deny the antecedent (B) The major premise must affirm + the consequent (C) The middle term must be used in at least one premise in a + universal or unqualified sense (D) All of the above' + target: 'Let''s think step by step. We refer to Wikipedia articles on logical + fallacies for help. A valid categorical syllogism must satisfy several conditions: + (1) the syllogism must have exactly three terms (2) every term of the syllogism + must be used twice exactly, (3) a term may be used only once in any premise, + and (4) the middle term must be used in at least one premise in a universal + or unqualified sense, etc. Only (C) is true. The answer is (C).' + - question: 'If someone attacks the character of an opposing arguer, instead of responding + to that opponent''s arguments, the first person has probably committed which + of the following fallacies? + + (A) tu quoque (B) horse laugh (C) argument against the person (D) ignoratio + elenchi' + target: 'Let''s think step by step. We refer to Wikipedia articles on logical + fallacies for help. 
The argument against the person fallacy occurs when someone + irrelevantly attacks the character of an opposing arguer, instead of addressing + that opponent''s arguments. The answer is (C).' +tag: mmlu_flan_cot_fewshot_humanities +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_logical_fallacies diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_disputes.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_disputes.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45a92e075582d6c0e2eb11c0310f87fc5debb4bb --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_disputes.yaml @@ -0,0 +1,64 @@ +dataset_name: moral_disputes +description: The following are multiple choice questions (with answers) about moral + disputes. +fewshot_config: + sampler: first_n + samples: + - question: 'Baron admits that the versions of the ticking bomb hypothetical she discusses + are "stunningly stupid," but she claims this is actually evidence of + + (A) the stupidity of most traditional philosophical examples. (B) a general + lack of intelligence among people with advanced degrees. (C) the wrongness of + torture. (D) the readiness on the part of many intelligent people to see torture + as the best solution to deal with terrorism.' + target: Let's think step by step. We refer to Wikipedia articles on moral disputes + for help. The ticking bomb hypothetical poses a problem where many people will + die to an exploding bomb, if the hypothetical terrorist does not disclose how + to defuse it. Baron sees this hypothetical as silly, but its prevalence does + suggest intelligent people, particularly utilitarians, see torture as justifiable + to save the lives in this scenario. The answer is (D). + - question: 'A fertilized ovum is also known as + + (A) a zygote. (B) an embryo. (C) a viability. (D) a blastocyst.' 
+ target: Let's think step by step. We refer to Wikipedia articles on moral disputes + for help. Once a single sperm penetrates the layers of an egg to form a new + cell, that cell is called a zygote. The answer is (A). + - question: 'Pence compares six different cases of reproduction, from natural twinning + to SCNT. What conclusion does he draw from this comparison? + + (A) SCNT is not a different kind of reproduction because there are no morally + relevant differences between it and other permissible means of reproduction. + (B) Because there is a low risk of harm for natural twinning, there will be + a low risk of harm for SCNT. (C) both A and B (D) neither A nor B' + target: Let's think step by step. We refer to Wikipedia articles on moral disputes + for help. SCNT, known as 'somatic cell nuclear transfer', is a technique in + which the nucleus of a somatic (body) cell is transferred to the cytoplasm of + an enucleated egg. This is used in farm animal cloning. Pence may compare this + to other processes of reproduction, and see little moral separation. The answer + is (A). + - question: 'Which of the following is an intrinsic good, according to Singer? + + (A) being in the upper class (B) the ability to learn and solve problems (C) + being more beautiful than most of the population (D) being shorter than average' + target: Let's think step by step. We refer to Wikipedia articles on moral disputes + for help. Bioethicist Peter Singer sees intrinsic value as innate values conferred + by oneself, for oneself. Inanimate objects can be beautiful, short, or have + some valuable criteria, but capabilities are intrinsically good. The answer + is (B). + - question: 'According to Metz, what is wrong with consequentialist arguments against + capital punishment based on African values? + + (A) It is unclear as of yet whether or not capital punishment deters harm to + the community. 
(B) It is unclear as of yet whether or not capital punishment + deters harm to any individuals. (C) Consequentialism is not supported by African + values. (D) Even though consequentialism is supported by African values, no + consequentialist arguments framed in terms of African values have been offered.' + target: 'Let''s think step by step. We refer to Wikipedia articles on moral disputes + for help. Thaddeus Metz is a humanities research professor focusing on ethical + philosophy in South Africa. Metz has written the death penalty is unjustified + as it treats individuals as incapable of communal relations. It is unclear that + capital punishment is to the benefit of, or a deterrent of harm to the community. + The answer is (A).' +tag: mmlu_flan_cot_fewshot_humanities +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_moral_disputes diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_public_relations.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_public_relations.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fe384b1e2b7d19c216f8344d5c249f2c16dc723b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_public_relations.yaml @@ -0,0 +1,55 @@ +dataset_name: public_relations +description: The following are multiple choice questions (with answers) about public + relations. +fewshot_config: + sampler: first_n + samples: + - question: 'Earth Hour was a campaign launched by which organization? + + (A) Greenpeace (B) The UN (C) Oxfam (D) World Wildlife Fund' + target: Let's think step by step. We refer to Wikipedia articles on public relations + for help. Earth Hour is a worldwide movement organized and launched by the World + Wildlife Fund. The answer is (D). 
+ - question: 'In issues management, what is the most proactive approach to addressing + negative or misleading information posted online about your organization? + + (A) Buy domain names that could be used by opposition groups. (B) Post anonymous + comments on blogs to combat this information. (C) Prepare a news release that + discredits the inaccurate information. (D) Make policy changes to address complaints + highlighted on these sites.' + target: Let's think step by step. We refer to Wikipedia articles on public relations + for help. In issues management, the most proactive approach to addressing negative + or misleading information posted online is to make policy changes to address + complaints highlighted on those sites. The answer is (D). + - question: 'At which stage in the planning process would a situation analysis be carried + out? + + (A) Defining the program (B) Planning the program (C) Taking action and implementing + ideas (D) Evaluation of the program' + target: Let's think step by step. We refer to Wikipedia articles on public relations + for help. Situation analyses are typically carried out during the planning process + stage of defining the program. The answer is (A). + - question: 'Which of these statements is true of the Vatican in 2010 at the time of + the accusations of child abuse cover-ups? + + (A) There was a coordinated media response. (B) Consistent messages were communicated. + (C) Criticisms were taken as attacks on the Catholic Church. (D) The credibility + of the Vatican was upheld.' + target: Let's think step by step. We refer to Wikipedia articles on public relations + for help. In 2010 when there were accusations of child abuse cover-ups, the + Vatican took those criticisms as attacks on the Catholic Church. The answer + is (C). + - question: 'What should a public relations media practitioner do if she does not know + the answer to a reporter''s question? + + (A) Give the reporter other information she is certain is correct. 
(B) Say that + the information is ''off the record'' and will be disseminated later. (C) Say + ''I don''t know'' and promise to provide the information later. (D) Say ''no + comment,'' rather than appear uninformed.' + target: 'Let''s think step by step. We refer to Wikipedia articles on public relations + for help. If a public relations media practitioner does not know the answer + to a reporter''s question, they should say ''I don''t know'' and offer to provide + the information later. The answer is (C).' +tag: mmlu_flan_cot_fewshot_social_sciences +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_public_relations diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_sociology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_sociology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4229d64785ded8673d421a9fb1571d0cce705a93 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_sociology.yaml @@ -0,0 +1,58 @@ +dataset_name: sociology +description: The following are multiple choice questions (with answers) about sociology. +fewshot_config: + sampler: first_n + samples: + - question: 'Which of the following is not a problem associated with official statistics + on strike action? + + (A) most strikes go unnoticed by employers and the mass media (B) not all industrial + disputes will be reported by the employer (C) the definition of strikes excludes + those that involve fewer than ten workers or last less than one day (D) it is + hard to compare strikes that were measured in different ways' + target: Let's think step by step. We refer to Wikipedia articles on sociology + for help. 
Official statistics on strike action can be problematic because not + all industrial disputes will be reported by employers, the definition of strikes + excludes those that involve fewer than ten workers or last less than one day, + and it is hard to compare strikes that were measured in different ways. Thus, + (A) is not a problem associated with official statistics on strike action. The + answer is (A). + - question: 'What does Berger (1963) describe as a metaphor for social reality? + + (A) a fairground ride (B) a circus (C) a puppet theatre (D) a ballet' + target: Let's think step by step. We refer to Wikipedia articles on sociology + for help. Berger describes social reality using the metaphor of a puppet theatre. + The answer is (C). + - question: 'The term ''hegemony'' refers to: + + (A) the tendency for the working class not to realize their own interests (B) + a dominant ideology that legitimates economic, political and cultural power + (C) a form of dual consciousness based on ideology and everyday experiences + (D) a mode of payment given for outstanding topiary' + target: Let's think step by step. We refer to Wikipedia articles on sociology + for help. Hegemony refers to a dominant ideology that legitimates economic, + political, and cultural power. The answer is (B). + - question: 'The shift from ''civil religion'' to ''common religion'' means that: + + (A) the increasing bureaucracy of the state has made religion only a marginal + part of our lives (B) despite the weakening of traditional authority, our everyday + lives and ''common sense'' remain shaped by religious beliefs and values (C) + religious participation in collective worship may have declined, but people + still practise their faiths in private (D) people are much more likely to discuss + their religious beliefs in public, informal settings' + target: Let's think step by step. We refer to Wikipedia articles on sociology + for help. 
The shift from civil religion to common religion means that despite + the weakening of traditional authority, our everyday lives and common sense + remain shaped by religious beliefs and values. The answer is (B). + - question: 'Which of the following did the post-war welfare state of 1948 not aim + to provide: + + (A) free health care and education for all (B) a minimum wage (C) full employment + (D) universal welfare' + target: 'Let''s think step by step. We refer to Wikipedia articles on sociology + for help. The post-war welfare state of 1948 aimed to provide free healthcare + and education, full employment, and universal welfare. But it did not aim to + provide a minimum wage. The answer is (B).' +tag: mmlu_flan_cot_fewshot_social_sciences +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_sociology diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_virology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_virology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0e70f0ee24cc946a1cfbc51cb87d4fde20d8171c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_virology.yaml @@ -0,0 +1,45 @@ +dataset_name: virology +description: The following are multiple choice questions (with answers) about virology. +fewshot_config: + sampler: first_n + samples: + - question: 'The median survival time to AIDS and death was established by following: + + (A) Seroprevalent HIV-infected individuals (B) Seronegatives (C) Seroconverters + (D) High-risk seronegatives' + target: Let's think step by step. We refer to Wikipedia articles on virology for + help. The median survival time to AIDS and death was established as a result + of the development of seroconverters. The answer is (C). + - question: 'Which of the following is a morphological characteristic of the paramyxoviruses. 
+ + (A) Fragile viruses often visualised with RNA spewing from the inside (B) Elongate + viruses (C) Icosahedral viruses with envelope (D) Very large viruses' + target: Let's think step by step. We refer to Wikipedia articles on virology for + help. Paramyxoviruses are fragile viruses often visualised with RNA spewing + from the inside. The answer is (A). + - question: 'The most important goal of a behavioral intervention is: + + (A) Change in behavior (B) Comprehensive coverage (C) Effective use of behavioral + theory (D) Sustained behavior change' + target: Let's think step by step. We refer to Wikipedia articles on virology for + help. The primary goal of a behavioral intervention is to cause sustained behavior + change. The answer is (D). + - question: 'A key factor facilitating the application of nested case-control studies + from the MACS was: + + (A) Data collection (B) Establishment of a repository of biologic specimens + (C) Participant interest (D) Administration of the questionnaire by staff' + target: Let's think step by step. We refer to Wikipedia articles on virology for + help. The Multicenter AIDS Cohort Study's use of nested case-control studies + was facilitated by the establishment of a repository of biologic specimens. + The answer is (B). + - question: 'Why are parvoviruses a highly impactful parasite? + + (A) Because they have no nucleic acid (B) They require a helper virus (C) Only + replicate in dividing cells (D) Can integrate into host chromosomes' + target: 'Let''s think step by step. We refer to Wikipedia articles on virology + for help. Parvoviruses are highly impactful because they do not have nucleic + acid. The answer is (A).' +tag: mmlu_flan_cot_fewshot_other +include: _mmlu_flan_cot_fewshot_template_yaml +task: mmlu_flan_cot_fewshot_virology