diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/gpqa/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/gpqa/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7c148d9fbec8be41fd89a01aa8590deabd2c4cad --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/gpqa/README.md @@ -0,0 +1,59 @@ +# GPQA + +### Paper + +Title: GPQA: A Graduate-Level Google-Proof Q&A Benchmark + +Abstract: https://arxiv.org/abs/2311.12022 + +We present GPQA, a challenging dataset of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry. We ensure that the questions are high-quality and extremely difficult: experts who have or are pursuing PhDs in the corresponding domains reach 65% accuracy (74% when discounting clear mistakes the experts identified in retrospect), while highly skilled non-expert validators only reach 34% accuracy, despite spending on average over 30 minutes with unrestricted access to the web (i.e., the questions are “Google-proof”). The questions are also difficult for state-of-the-art AI systems, with our strongest GPT-4–based baseline achieving 39% accuracy. If we are to use future AI systems to help us answer very hard questions—for example, when developing new scientific knowledge—we need to develop *scalable oversight* methods that enable humans to supervise their outputs, which may be difficult even if the supervisors are themselves skilled and knowledgeable. The difficulty of GPQA both for skilled non-experts and frontier AI systems should enable realistic scalable oversight experiments, which we hope can help devise ways for human experts to reliably get truthful information from AI systems that surpass human capabilities. 
+ +Homepage: `https://github.com/idavidrein/gpqa/tree/main` + +### Citation + +``` +@misc{rein2023gpqa, + title={GPQA: A Graduate-Level Google-Proof Q&A Benchmark}, + author={David Rein and Betty Li Hou and Asa Cooper Stickland and Jackson Petty and Richard Yuanzhe Pang and Julien Dirani and Julian Michael and Samuel R. Bowman}, + year={2023}, + eprint={2311.12022}, + archivePrefix={arXiv}, + primaryClass={cs.AI} +} +``` + +This dataset is gated, so you will have to accept the terms of use at https://huggingface.co/datasets/Idavidrein/gpqa and login via `huggingface-cli login` using your HF Hub token before running this task. + +### Groups, Tags, and Tasks + +#### Groups + +None + +#### Tags + +* `gpqa`: runs all GPQA variants. + +#### Tasks + +* `gpqa_{main, diamond, extended}_zeroshot` +* `gpqa_{main, diamond, extended}_n_shot` +* `gpqa_{main, diamond, extended}_generative_n_shot` +* `gpqa_{main, diamond, extended}_cot_zeroshot` +* `gpqa_{main, diamond, extended}_cot_n_shot` + +### Checklist + +For adding novel benchmarks/datasets to the library: + +* [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? + * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: + +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? 
def main() -> None:
    """Write the per-subset task configs for the ``cot_n_shot`` setting.

    For every GPQA subset (``extended``, ``diamond``, ``main``) a file named
    ``gpqa_<subset>_cot_n_shot.yaml`` is written to the current working
    directory. Each file starts with a "generated" marker comment followed by
    a YAML mapping with the ``include``, ``task`` and ``dataset_name`` keys.
    Existing files are overwritten.
    """
    subset = ["extended", "diamond", "main"]
    setting = "cot_n_shot"
    for task in tqdm(subset):
        file_name = f"gpqa_{task}_{setting}.yaml"
        # Mode "w" truncates an existing file and never raises
        # FileExistsError, so no handler for that exception is needed.
        with open(file_name, "w", encoding="utf-8") as f:
            f.write("# Generated by _generate_configs.py\n")
            yaml.dump(
                {
                    "include": f"_gpqa_{setting}_yaml",
                    "task": f"gpqa_{task}_{setting}",
                    "dataset_name": f"gpqa_{task}",
                },
                f,
            )


if __name__ == "__main__":
    main()
Answer the final question yourself, following the format of the previous questions exactly.\n" +doc_to_text: "Question: {{Question}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nLet's think step by step: " +doc_to_target: answer +filter_list: + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "(?<=The answer is )(.*)(?=.)" + - function: "take_first" + - name: "flexible-extract" + filter: + - function: "multi_choice_regex" + group_select: -1 + ignore_case: true + ignore_punctuation: true + regex_pattern: "(\\([A-Z]\\))" + - function: "take_first" +generation_kwargs: + until: + - "" + do_sample: false + temperature: 0.0 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 2.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/gpqa/cot_n_shot/gpqa_diamond_cot_n_shot.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/gpqa/cot_n_shot/gpqa_diamond_cot_n_shot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..24e5f4f90f1f770f9f792e4aeef51e08d3aa08d9 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/gpqa/cot_n_shot/gpqa_diamond_cot_n_shot.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: gpqa_diamond +include: _gpqa_cot_n_shot_yaml +task: gpqa_diamond_cot_n_shot diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/gpqa/cot_n_shot/gpqa_extended_cot_n_shot.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/gpqa/cot_n_shot/gpqa_extended_cot_n_shot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..002ede9a82110e3679bf3e1e958ded4342e408e3 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/gpqa/cot_n_shot/gpqa_extended_cot_n_shot.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: gpqa_extended +include: _gpqa_cot_n_shot_yaml +task: gpqa_extended_cot_n_shot diff --git 
def preprocess(text):
    """Clean one raw answer string before it is shown as an option.

    Args:
        text: Raw answer text, or ``None``.

    Returns:
        ``" "`` when ``text`` is ``None``. Otherwise the text with surrounding
        whitespace stripped, every `` [title]`` replaced by ``". "``, every
        ``[...]`` span removed and double spaces replaced by a single space.
    """
    if text is None:
        return " "
    text = text.strip()
    text = text.replace(" [title]", ". ")
    text = re.sub(r"\[.*?\]", "", text)
    # The substitutions above can leave two adjacent spaces behind.
    text = text.replace("  ", " ")
    return text


# Annotations are quoted so they are not evaluated when the function is defined.
def process_docs(dataset: "datasets.Dataset") -> "datasets.Dataset":
    """Turn each GPQA record into four shuffled options plus the gold letter.

    Args:
        dataset: Object whose ``map`` method is called with the per-record
            transform. Each record must provide ``"Incorrect Answer 1"``,
            ``"Incorrect Answer 2"``, ``"Incorrect Answer 3"`` and
            ``"Correct Answer"``.

    Returns:
        Whatever ``dataset.map`` returns. The transform yields the keys
        ``choice1`` .. ``choice4``, ``choices`` (the same four strings as a
        list) and ``answer`` (``"(A)"`` .. ``"(D)"``).
    """
    # Dedicated fixed-seed generator: the option order no longer depends on
    # the state of the global ``random`` module at call time.
    rng = random.Random(42)

    def _process_doc(doc):
        correct = preprocess(doc["Correct Answer"])
        choices = [
            preprocess(doc["Incorrect Answer 1"]),
            preprocess(doc["Incorrect Answer 2"]),
            preprocess(doc["Incorrect Answer 3"]),
            correct,
        ]

        rng.shuffle(choices)
        correct_answer_index = choices.index(correct)

        return {
            "choice1": choices[0],
            "choice2": choices[1],
            "choice3": choices[2],
            "choice4": choices[3],
            "choices": list(choices),
            # 65 is ord("A"): index 0..3 maps to the letters A..D.
            "answer": f"({chr(65 + correct_answer_index)})",
        }

    return dataset.map(_process_doc)
# Shared template for the zero-shot chain-of-thought GPQA tasks.
# Included by gpqa_{main,diamond,extended}_cot_zeroshot.yaml, which only set
# `dataset_name` and `task`.
dataset_path: Idavidrein/gpqa
tag: gpqa
output_type: generate_until
process_docs: !function utils.process_docs
training_split: train
# Because huggingface dataset only has train split
validation_split: train
test_split: null
doc_to_text: "What is the correct answer to this question:{{Question}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nLet's think step by step: "
doc_to_target: answer
filter_list:
  - name: "strict-match"
    filter:
      - function: "regex"
        # NOTE(review): the trailing lookahead `(?=.)` uses an unescaped dot,
        # so it accepts any character, not only a literal full stop - confirm
        # this is intended before relying on strict-match scores.
        regex_pattern: "(?<=The answer is )(.*)(?=.)"
      - function: "take_first"
  - name: "flexible-extract"
    filter:
      - function: "multi_choice_regex"
        group_select: -1
        ignore_case: true
        ignore_punctuation: true
        regex_pattern: "(\\([A-Z]\\))"
      - function: "take_first"
generation_kwargs:
  until:
    # An empty stop string is a substring of every output, so it cannot act as
    # a meaningful stop sequence; use the end-of-sequence marker instead.
    - "</s>"
  do_sample: false
  temperature: 0.0
num_fewshot: 0
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
metadata:
  version: 1.0
def preprocess(text):
    """Clean one raw answer string before it is shown as an option.

    Args:
        text: Raw answer text, or ``None``.

    Returns:
        ``" "`` when ``text`` is ``None``. Otherwise the text with surrounding
        whitespace stripped, every `` [title]`` replaced by ``". "``, every
        ``[...]`` span removed and double spaces replaced by a single space.
    """
    if text is None:
        return " "
    text = text.strip()
    text = text.replace(" [title]", ". ")
    text = re.sub(r"\[.*?\]", "", text)
    # The substitutions above can leave two adjacent spaces behind.
    text = text.replace("  ", " ")
    return text


# Annotations are quoted so they are not evaluated when the function is defined.
def process_docs(dataset: "datasets.Dataset") -> "datasets.Dataset":
    """Turn each GPQA record into four shuffled options plus the gold letter.

    Args:
        dataset: Object whose ``map`` method is called with the per-record
            transform. Each record must provide ``"Incorrect Answer 1"``,
            ``"Incorrect Answer 2"``, ``"Incorrect Answer 3"`` and
            ``"Correct Answer"``.

    Returns:
        Whatever ``dataset.map`` returns. The transform yields the keys
        ``choice1`` .. ``choice4``, ``choices`` (the same four strings as a
        list) and ``answer`` (``"(A)"`` .. ``"(D)"``).
    """
    # Dedicated fixed-seed generator: the option order no longer depends on
    # the state of the global ``random`` module at call time.
    rng = random.Random(42)

    def _process_doc(doc):
        correct = preprocess(doc["Correct Answer"])
        choices = [
            preprocess(doc["Incorrect Answer 1"]),
            preprocess(doc["Incorrect Answer 2"]),
            preprocess(doc["Incorrect Answer 3"]),
            correct,
        ]

        rng.shuffle(choices)
        correct_answer_index = choices.index(correct)

        return {
            "choice1": choices[0],
            "choice2": choices[1],
            "choice3": choices[2],
            "choice4": choices[3],
            "choices": list(choices),
            # 65 is ord("A"): index 0..3 maps to the letters A..D.
            "answer": f"({chr(65 + correct_answer_index)})",
        }

    return dataset.map(_process_doc)
# Shared template for the generative few-shot GPQA tasks.
# Included by gpqa_{main,diamond,extended}_generative_n_shot.yaml, which only
# set `dataset_name` and `task`.
dataset_path: Idavidrein/gpqa
tag: gpqa
output_type: generate_until
process_docs: !function utils.process_docs
training_split: train
# Because huggingface dataset only has train split
validation_split: train
test_split: null
description: "Here are some example questions from experts. Answer the final question yourself, following the format of the previous questions exactly.\n"
doc_to_text: "Question: {{Question}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nAnswer:"
doc_to_target: answer
filter_list:
  - name: "strict-match"
    filter:
      - function: "regex"
        # NOTE(review): the trailing lookahead `(?=.)` uses an unescaped dot,
        # so it accepts any character, not only a literal full stop - confirm
        # this is intended before relying on strict-match scores.
        regex_pattern: "(?<=The answer is )(.*)(?=.)"
      - function: "take_first"
  - name: "flexible-extract"
    filter:
      - function: "multi_choice_regex"
        group_select: -1
        ignore_case: true
        ignore_punctuation: true
        regex_pattern: "(\\([A-Z]\\))"
      - function: "take_first"
generation_kwargs:
  until:
    # An empty stop string is a substring of every output, so it cannot act as
    # a meaningful stop sequence; use the end-of-sequence marker instead.
    - "</s>"
    - "Question:"
    - "<|im_end|>"
  temperature: 0.0
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
metadata:
  version: 2.0
def preprocess(text):
    """Clean one raw answer string before it is shown as an option.

    Args:
        text: Raw answer text, or ``None``.

    Returns:
        ``" "`` when ``text`` is ``None``. Otherwise the text with surrounding
        whitespace stripped, every `` [title]`` replaced by ``". "``, every
        ``[...]`` span removed and double spaces replaced by a single space.
    """
    if text is None:
        return " "
    text = text.strip()
    text = text.replace(" [title]", ". ")
    text = re.sub(r"\[.*?\]", "", text)
    # The substitutions above can leave two adjacent spaces behind.
    text = text.replace("  ", " ")
    return text


# Annotations are quoted so they are not evaluated when the function is defined.
def process_docs(dataset: "datasets.Dataset") -> "datasets.Dataset":
    """Turn each GPQA record into four shuffled options plus the gold letter.

    Args:
        dataset: Object whose ``map`` method is called with the per-record
            transform. Each record must provide ``"Incorrect Answer 1"``,
            ``"Incorrect Answer 2"``, ``"Incorrect Answer 3"`` and
            ``"Correct Answer"``.

    Returns:
        Whatever ``dataset.map`` returns. The transform yields the keys
        ``choice1`` .. ``choice4``, ``choices`` (the same four strings as a
        list) and ``answer`` (``"(A)"`` .. ``"(D)"``).
    """
    # Dedicated fixed-seed generator: the option order no longer depends on
    # the state of the global ``random`` module at call time.
    rng = random.Random(42)

    def _process_doc(doc):
        correct = preprocess(doc["Correct Answer"])
        choices = [
            preprocess(doc["Incorrect Answer 1"]),
            preprocess(doc["Incorrect Answer 2"]),
            preprocess(doc["Incorrect Answer 3"]),
            correct,
        ]

        rng.shuffle(choices)
        correct_answer_index = choices.index(correct)

        return {
            "choice1": choices[0],
            "choice2": choices[1],
            "choice3": choices[2],
            "choice4": choices[3],
            "choices": list(choices),
            # 65 is ord("A"): index 0..3 maps to the letters A..D.
            "answer": f"({chr(65 + correct_answer_index)})",
        }

    return dataset.map(_process_doc)
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/gpqa/n_shot/_gpqa_n_shot_yaml @@ -0,0 +1,21 @@ +dataset_path: Idavidrein/gpqa +tag: gpqa +output_type: multiple_choice +process_docs: !function utils.process_docs +training_split: train +# Because huggingface dataset only has train split +validation_split: train +test_split: null +description: "Here are some example questions from experts. Answer the final question yourself, following the format of the previous questions exactly.\n" +doc_to_text: "Question: {{Question}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nAnswer:" +doc_to_target: answer +doc_to_choice: ["(A)", "(B)", "(C)", "(D)"] +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 2.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/gpqa/n_shot/gpqa_diamond_n_shot.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/gpqa/n_shot/gpqa_diamond_n_shot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3043a7e53647ff72d535abc113dfccebaa1bd43c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/gpqa/n_shot/gpqa_diamond_n_shot.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: gpqa_diamond +include: _gpqa_n_shot_yaml +task: gpqa_diamond_n_shot diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/gpqa/n_shot/gpqa_extended_n_shot.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/gpqa/n_shot/gpqa_extended_n_shot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5d16b505b355bccb3d6fd70eb16b307c12d06a09 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/gpqa/n_shot/gpqa_extended_n_shot.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: gpqa_extended +include: _gpqa_n_shot_yaml +task: gpqa_extended_n_shot diff --git 
def preprocess(text):
    """Clean one raw answer string before it is shown as an option.

    Args:
        text: Raw answer text, or ``None``.

    Returns:
        ``" "`` when ``text`` is ``None``. Otherwise the text with surrounding
        whitespace stripped, every `` [title]`` replaced by ``". "``, every
        ``[...]`` span removed and double spaces replaced by a single space.
    """
    if text is None:
        return " "
    text = text.strip()
    text = text.replace(" [title]", ". ")
    text = re.sub(r"\[.*?\]", "", text)
    # The substitutions above can leave two adjacent spaces behind.
    text = text.replace("  ", " ")
    return text


# Fixed-seed generator used for shuffling the options. It lives at module
# level, so its state carries over between process_docs calls.
rng = random.Random(42)


# Annotations are quoted so they are not evaluated when the function is defined.
def process_docs(dataset: "datasets.Dataset") -> "datasets.Dataset":
    """Turn each GPQA record into four shuffled options plus the gold letter.

    Args:
        dataset: Object whose ``map`` method is called with the per-record
            transform. Each record must provide ``"Incorrect Answer 1"``,
            ``"Incorrect Answer 2"``, ``"Incorrect Answer 3"`` and
            ``"Correct Answer"``.

    Returns:
        Whatever ``dataset.map`` returns. The transform yields the keys
        ``choice1`` .. ``choice4`` and ``answer`` (``"(A)"`` .. ``"(D)"``).
    """

    def _process_doc(doc):
        correct = preprocess(doc["Correct Answer"])
        choices = [
            preprocess(doc["Incorrect Answer 1"]),
            preprocess(doc["Incorrect Answer 2"]),
            preprocess(doc["Incorrect Answer 3"]),
            correct,
        ]

        rng.shuffle(choices)
        correct_answer_index = choices.index(correct)

        return {
            "choice1": choices[0],
            "choice2": choices[1],
            "choice3": choices[2],
            "choice4": choices[3],
            # 65 is ord("A"): index 0..3 maps to the letters A..D.
            "answer": f"({chr(65 + correct_answer_index)})",
        }

    return dataset.map(_process_doc)
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/gpqa/zeroshot/_gpqa_zeroshot_yaml @@ -0,0 +1,21 @@ +dataset_path: Idavidrein/gpqa +tag: gpqa +output_type: multiple_choice +process_docs: !function utils.process_docs +training_split: train +# Because huggingface dataset only has train split +validation_split: train +test_split: null +doc_to_text: "What is the correct answer to this question:{{Question}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nAnswer:" +doc_to_target: answer +doc_to_choice: ["(A)", "(B)", "(C)", "(D)"] +num_fewshot: 0 +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/gpqa/zeroshot/gpqa_diamond_zeroshot.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/gpqa/zeroshot/gpqa_diamond_zeroshot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c3a7921c30b3ff09e82aacb4c0e915010f698966 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/gpqa/zeroshot/gpqa_diamond_zeroshot.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: gpqa_diamond +include: _gpqa_zeroshot_yaml +task: gpqa_diamond_zeroshot diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/gpqa/zeroshot/gpqa_extended_zeroshot.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/gpqa/zeroshot/gpqa_extended_zeroshot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5e7347f11154351ad4560200a3f3bf54106a1a8f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/gpqa/zeroshot/gpqa_extended_zeroshot.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: gpqa_extended +include: _gpqa_zeroshot_yaml +task: gpqa_extended_zeroshot diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/gpqa/zeroshot/gpqa_main_zeroshot.yaml 
def preprocess(text):
    """Clean one raw answer string before it is shown as an option.

    Args:
        text: Raw answer text, or ``None``.

    Returns:
        ``" "`` when ``text`` is ``None``. Otherwise the text with surrounding
        whitespace stripped, every `` [title]`` replaced by ``". "``, every
        ``[...]`` span removed and double spaces replaced by a single space.
    """
    if text is None:
        return " "
    text = text.strip()
    text = text.replace(" [title]", ". ")
    text = re.sub(r"\[.*?\]", "", text)
    # The substitutions above can leave two adjacent spaces behind.
    text = text.replace("  ", " ")
    return text


# Annotations are quoted so they are not evaluated when the function is defined.
def process_docs(dataset: "datasets.Dataset") -> "datasets.Dataset":
    """Turn each GPQA record into four shuffled options plus the gold letter.

    Args:
        dataset: Object whose ``map`` method is called with the per-record
            transform. Each record must provide ``"Incorrect Answer 1"``,
            ``"Incorrect Answer 2"``, ``"Incorrect Answer 3"`` and
            ``"Correct Answer"``.

    Returns:
        Whatever ``dataset.map`` returns. The transform yields the keys
        ``choice1`` .. ``choice4`` and ``answer`` (``"(A)"`` .. ``"(D)"``).
    """
    # Dedicated fixed-seed generator: the option order no longer depends on
    # the state of the global ``random`` module at call time.
    rng = random.Random(42)

    def _process_doc(doc):
        correct = preprocess(doc["Correct Answer"])
        choices = [
            preprocess(doc["Incorrect Answer 1"]),
            preprocess(doc["Incorrect Answer 2"]),
            preprocess(doc["Incorrect Answer 3"]),
            correct,
        ]

        rng.shuffle(choices)
        correct_answer_index = choices.index(correct)

        return {
            "choice1": choices[0],
            "choice2": choices[1],
            "choice3": choices[2],
            "choice4": choices[3],
            # 65 is ord("A"): index 0..3 maps to the letters A..D.
            "answer": f"({chr(65 + correct_answer_index)})",
        }

    return dataset.map(_process_doc)
def generate_yaml_content(vocab_name: str, level: str) -> dict:
    """Build the config mapping for one vocabulary/difficulty pair.

    Args:
        vocab_name: Vocabulary identifier, e.g. ``"icd9cm"`` or ``"atc"``.
        level: Difficulty level, e.g. ``"easy"``.

    Returns:
        A dict with the keys ``dataset_name``, ``tag``, ``include``, ``task``
        and ``task_alias`` derived from the two arguments.
    """
    subset = f"{vocab_name}_{level}"
    return {
        "dataset_name": subset,
        "tag": f"med_concepts_qa_{vocab_name}_tasks",
        "include": "_default_template_yaml",
        "task": f"med_concepts_qa_{subset}",
        "task_alias": subset,
    }


def generate_yaml_files(
    vocab_names: List[str], levels: List[str], file_name_prefix: str
):
    """Write one YAML task config per (vocabulary, level) combination.

    Files are named ``<file_name_prefix>_<vocab_name>_<level>.yaml`` and are
    created relative to the current working directory, overwriting any
    existing file of the same name. A progress line is printed per file.

    Args:
        vocab_names: Vocabulary identifiers to generate configs for.
        levels: Difficulty levels to generate for every vocabulary.
        file_name_prefix: Prefix of every generated file name.
    """
    for vocab_name in vocab_names:
        for level in levels:
            yaml_content = generate_yaml_content(vocab_name, level)
            filename = f"{file_name_prefix}_{vocab_name}_{level}.yaml"
            with open(filename, "w", encoding="utf-8") as yaml_file:
                yaml.dump(yaml_content, yaml_file, default_flow_style=False)
            # Report only after the file has been closed.
            print(f"Generated {filename}")


if __name__ == "__main__":
    generate_yaml_files(
        vocab_names=["icd9cm", "icd10cm", "icd9proc", "icd10proc", "atc"],
        levels=["easy", "medium", "hard"],
        file_name_prefix="med_concepts_qa",
    )
0000000000000000000000000000000000000000..c4afc1bdfe7c9bf453ebed1fa38f21f94e6906ad --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/_med_concepts_qa_icd10cm.yaml @@ -0,0 +1,6 @@ +group: med_concepts_qa_icd10cm +task: + - med_concepts_qa_icd10cm_tasks +aggregate_metric_list: + - metric: acc + aggregation: mean diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/_med_concepts_qa_icd10proc.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/_med_concepts_qa_icd10proc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..407ea4088df28c4370f2797adb47c01b440caebc --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/_med_concepts_qa_icd10proc.yaml @@ -0,0 +1,6 @@ +group: med_concepts_qa_icd10proc +task: + - med_concepts_qa_icd10proc_tasks +aggregate_metric_list: + - metric: acc + aggregation: mean diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_atc_hard.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_atc_hard.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a54d27d97690886c2d0fb716b48dd0daf1a4bbab --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_atc_hard.yaml @@ -0,0 +1,5 @@ +dataset_name: atc_hard +include: _default_template_yaml +tag: med_concepts_qa_atc_tasks +task: med_concepts_qa_atc_hard +task_alias: atc_hard diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd10cm_easy.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd10cm_easy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0fe8da7ddd08f2b8a51777f06c359ef32f7c2106 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd10cm_easy.yaml @@ -0,0 +1,5 @@ +dataset_name: 
icd10cm_easy +include: _default_template_yaml +tag: med_concepts_qa_icd10cm_tasks +task: med_concepts_qa_icd10cm_easy +task_alias: icd10cm_easy diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd10cm_hard.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd10cm_hard.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2b291b0eb42fad609ea3e49a384f8fd55a48ff71 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd10cm_hard.yaml @@ -0,0 +1,5 @@ +dataset_name: icd10cm_hard +include: _default_template_yaml +tag: med_concepts_qa_icd10cm_tasks +task: med_concepts_qa_icd10cm_hard +task_alias: icd10cm_hard diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd10proc_easy.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd10proc_easy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..301f559d7c1cef808e880a809f2a0a91ad9ca227 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd10proc_easy.yaml @@ -0,0 +1,5 @@ +dataset_name: icd10proc_easy +include: _default_template_yaml +tag: med_concepts_qa_icd10proc_tasks +task: med_concepts_qa_icd10proc_easy +task_alias: icd10proc_easy diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd9proc_easy.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd9proc_easy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..459fa464ffead9d78810394484ca8cc49cf67d18 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd9proc_easy.yaml @@ -0,0 +1,5 @@ +dataset_name: icd9proc_easy +include: _default_template_yaml +tag: med_concepts_qa_icd9proc_tasks +task: med_concepts_qa_icd9proc_easy 
+task_alias: icd9proc_easy diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd9proc_hard.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd9proc_hard.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5424623c308011cf0891a83ca7bbd9a971e26a0d --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd9proc_hard.yaml @@ -0,0 +1,5 @@ +dataset_name: icd9proc_hard +include: _default_template_yaml +tag: med_concepts_qa_icd9proc_tasks +task: med_concepts_qa_icd9proc_hard +task_alias: icd9proc_hard diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_anatomy.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_anatomy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1e4acc8c3810cc6f8bbe32216b8d0cdfa9463135 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_anatomy.yaml @@ -0,0 +1,7 @@ +"dataset_name": "answer_only_anatomy" +"description": "The following are multiple choice questions (with answers) about anatomy.\n\ + \n" +"tag": "mmlusr_answer_only_stem_tasks" +"include": "_mmlusr_a_yml" +"task": "mmlusr_answer_only_anatomy" +"task_alias": "anatomy" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_business_ethics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_business_ethics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1e836e31c55e8cb6b203049f75c13aaf57870a6f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_business_ethics.yaml @@ -0,0 +1,7 @@ +"dataset_name": "answer_only_business_ethics" +"description": "The following are multiple choice questions (with answers) about business\ + \ ethics.\n\n" +"tag": 
"mmlusr_answer_only_other_tasks" +"include": "_mmlusr_a_yml" +"task": "mmlusr_answer_only_business_ethics" +"task_alias": "business ethics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_clinical_knowledge.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_clinical_knowledge.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1ef709675c2c7016d2aabbcde3bb3ff3dcafcbcc --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_clinical_knowledge.yaml @@ -0,0 +1,7 @@ +"dataset_name": "answer_only_clinical_knowledge" +"description": "The following are multiple choice questions (with answers) about clinical\ + \ knowledge.\n\n" +"tag": "mmlusr_answer_only_other_tasks" +"include": "_mmlusr_a_yml" +"task": "mmlusr_answer_only_clinical_knowledge" +"task_alias": "clinical knowledge" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_college_biology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_college_biology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b967895a7017094a9b5f78ad9ddbeff7a56ab6f9 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_college_biology.yaml @@ -0,0 +1,7 @@ +"dataset_name": "answer_only_college_biology" +"description": "The following are multiple choice questions (with answers) about college\ + \ biology.\n\n" +"tag": "mmlusr_answer_only_stem_tasks" +"include": "_mmlusr_a_yml" +"task": "mmlusr_answer_only_college_biology" +"task_alias": "college biology" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_college_chemistry.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_college_chemistry.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..8dd100e7bc82bcc44d7f121e0dab7852dbc69074 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_college_chemistry.yaml @@ -0,0 +1,7 @@ +"dataset_name": "answer_only_college_chemistry" +"description": "The following are multiple choice questions (with answers) about college\ + \ chemistry.\n\n" +"tag": "mmlusr_answer_only_stem_tasks" +"include": "_mmlusr_a_yml" +"task": "mmlusr_answer_only_college_chemistry" +"task_alias": "college chemistry" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_college_medicine.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_college_medicine.yaml new file mode 100644 index 0000000000000000000000000000000000000000..132e0b6041126c058b61a3ddcb27e51383134c77 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_college_medicine.yaml @@ -0,0 +1,7 @@ +"dataset_name": "answer_only_college_medicine" +"description": "The following are multiple choice questions (with answers) about college\ + \ medicine.\n\n" +"tag": "mmlusr_answer_only_other_tasks" +"include": "_mmlusr_a_yml" +"task": "mmlusr_answer_only_college_medicine" +"task_alias": "college medicine" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_computer_security.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_computer_security.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ba3d60d51b852e27637d1c1ab43af1d4da7fc057 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_computer_security.yaml @@ -0,0 +1,7 @@ +"dataset_name": "answer_only_computer_security" +"description": "The following are multiple choice questions (with answers) about computer\ + \ security.\n\n" +"tag": "mmlusr_answer_only_stem_tasks" +"include": 
"_mmlusr_a_yml" +"task": "mmlusr_answer_only_computer_security" +"task_alias": "computer security" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_conceptual_physics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_conceptual_physics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e0a84ecc3c5015d2338cfffc88877e39b9be7c75 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_conceptual_physics.yaml @@ -0,0 +1,7 @@ +"dataset_name": "answer_only_conceptual_physics" +"description": "The following are multiple choice questions (with answers) about conceptual\ + \ physics.\n\n" +"tag": "mmlusr_answer_only_stem_tasks" +"include": "_mmlusr_a_yml" +"task": "mmlusr_answer_only_conceptual_physics" +"task_alias": "conceptual physics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_electrical_engineering.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_electrical_engineering.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ab695e6ab46ea036cdccd91487c47aca35f1a99a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_electrical_engineering.yaml @@ -0,0 +1,7 @@ +"dataset_name": "answer_only_electrical_engineering" +"description": "The following are multiple choice questions (with answers) about electrical\ + \ engineering.\n\n" +"tag": "mmlusr_answer_only_stem_tasks" +"include": "_mmlusr_a_yml" +"task": "mmlusr_answer_only_electrical_engineering" +"task_alias": "electrical engineering" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_elementary_mathematics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_elementary_mathematics.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..dff9fbf25b49d3b59d0ae716cd4bf3a210547742 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_elementary_mathematics.yaml @@ -0,0 +1,7 @@ +"dataset_name": "answer_only_elementary_mathematics" +"description": "The following are multiple choice questions (with answers) about elementary\ + \ mathematics.\n\n" +"tag": "mmlusr_answer_only_stem_tasks" +"include": "_mmlusr_a_yml" +"task": "mmlusr_answer_only_elementary_mathematics" +"task_alias": "elementary mathematics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_formal_logic.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_formal_logic.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e26ed865bc94d0923f7f1103d89f556a914df8aa --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_formal_logic.yaml @@ -0,0 +1,7 @@ +"dataset_name": "answer_only_formal_logic" +"description": "The following are multiple choice questions (with answers) about formal\ + \ logic.\n\n" +"tag": "mmlusr_answer_only_humanities_tasks" +"include": "_mmlusr_a_yml" +"task": "mmlusr_answer_only_formal_logic" +"task_alias": "formal logic" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_biology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_biology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..41ed53cb9a75af634f1b241da2345985f72b65e5 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_biology.yaml @@ -0,0 +1,7 @@ +"dataset_name": "answer_only_high_school_biology" +"description": "The following are multiple choice questions (with answers) about high\ + \ school biology.\n\n" +"tag": "mmlusr_answer_only_stem_tasks" 
+"include": "_mmlusr_a_yml" +"task": "mmlusr_answer_only_high_school_biology" +"task_alias": "high school biology" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_chemistry.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_chemistry.yaml new file mode 100644 index 0000000000000000000000000000000000000000..95a3303f3de284b7759a08dba7440398d009296f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_chemistry.yaml @@ -0,0 +1,7 @@ +"dataset_name": "answer_only_high_school_chemistry" +"description": "The following are multiple choice questions (with answers) about high\ + \ school chemistry.\n\n" +"tag": "mmlusr_answer_only_stem_tasks" +"include": "_mmlusr_a_yml" +"task": "mmlusr_answer_only_high_school_chemistry" +"task_alias": "high school chemistry" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_computer_science.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_computer_science.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e665fb3400855e8eefc2b8d8dc8054bb0b48279e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_computer_science.yaml @@ -0,0 +1,7 @@ +"dataset_name": "answer_only_high_school_computer_science" +"description": "The following are multiple choice questions (with answers) about high\ + \ school computer science.\n\n" +"tag": "mmlusr_answer_only_stem_tasks" +"include": "_mmlusr_a_yml" +"task": "mmlusr_answer_only_high_school_computer_science" +"task_alias": "high school computer science" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_geography.yaml 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_geography.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8a4980060190c2d9870a7da722d8a28aa3d2f3af --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_geography.yaml @@ -0,0 +1,7 @@ +"dataset_name": "answer_only_high_school_geography" +"description": "The following are multiple choice questions (with answers) about high\ + \ school geography.\n\n" +"tag": "mmlusr_answer_only_social_sciences_tasks" +"include": "_mmlusr_a_yml" +"task": "mmlusr_answer_only_high_school_geography" +"task_alias": "high school geography" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_macroeconomics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_macroeconomics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..95e35cd8b177c7aab73d90f85f29812d4c2d69fb --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_macroeconomics.yaml @@ -0,0 +1,7 @@ +"dataset_name": "answer_only_high_school_macroeconomics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school macroeconomics.\n\n" +"tag": "mmlusr_answer_only_social_sciences_tasks" +"include": "_mmlusr_a_yml" +"task": "mmlusr_answer_only_high_school_macroeconomics" +"task_alias": "high school macroeconomics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_microeconomics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_microeconomics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e3af9a2c79bf4f5c090a309138139350d40581be --- /dev/null +++ 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_microeconomics.yaml @@ -0,0 +1,7 @@ +"dataset_name": "answer_only_high_school_microeconomics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school microeconomics.\n\n" +"tag": "mmlusr_answer_only_social_sciences_tasks" +"include": "_mmlusr_a_yml" +"task": "mmlusr_answer_only_high_school_microeconomics" +"task_alias": "high school microeconomics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_psychology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_psychology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..df77619cbb8d2b8ccc42083beac223920833e3ec --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_psychology.yaml @@ -0,0 +1,7 @@ +"dataset_name": "answer_only_high_school_psychology" +"description": "The following are multiple choice questions (with answers) about high\ + \ school psychology.\n\n" +"tag": "mmlusr_answer_only_social_sciences_tasks" +"include": "_mmlusr_a_yml" +"task": "mmlusr_answer_only_high_school_psychology" +"task_alias": "high school psychology" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_statistics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_statistics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2119fb39d1cf03028a61f6f57cac66343ba308ba --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_statistics.yaml @@ -0,0 +1,7 @@ +"dataset_name": "answer_only_high_school_statistics" +"description": "The following are multiple choice questions (with answers) about high\ + \ school statistics.\n\n" +"tag": 
"mmlusr_answer_only_stem_tasks" +"include": "_mmlusr_a_yml" +"task": "mmlusr_answer_only_high_school_statistics" +"task_alias": "high school statistics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_us_history.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_us_history.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2287ae457a1f4013ac607955f0050d7ce7f1caad --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_us_history.yaml @@ -0,0 +1,7 @@ +"dataset_name": "answer_only_high_school_us_history" +"description": "The following are multiple choice questions (with answers) about high\ + \ school us history.\n\n" +"tag": "mmlusr_answer_only_humanities_tasks" +"include": "_mmlusr_a_yml" +"task": "mmlusr_answer_only_high_school_us_history" +"task_alias": "high school us history" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_international_law.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_international_law.yaml new file mode 100644 index 0000000000000000000000000000000000000000..05e482d1683ab518d7b7258ee99b8cc532af0f00 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_international_law.yaml @@ -0,0 +1,7 @@ +"dataset_name": "answer_only_international_law" +"description": "The following are multiple choice questions (with answers) about international\ + \ law.\n\n" +"tag": "mmlusr_answer_only_humanities_tasks" +"include": "_mmlusr_a_yml" +"task": "mmlusr_answer_only_international_law" +"task_alias": "international law" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_jurisprudence.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_jurisprudence.yaml new file mode 100644 
index 0000000000000000000000000000000000000000..73edd6cb293f97e2ae112c159602fc6f57caeb24 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_jurisprudence.yaml @@ -0,0 +1,7 @@ +"dataset_name": "answer_only_jurisprudence" +"description": "The following are multiple choice questions (with answers) about jurisprudence.\n\ + \n" +"tag": "mmlusr_answer_only_humanities_tasks" +"include": "_mmlusr_a_yml" +"task": "mmlusr_answer_only_jurisprudence" +"task_alias": "jurisprudence" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_logical_fallacies.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_logical_fallacies.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ab18c9270e07ff3101fcb25e219f94e1ff80aef0 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_logical_fallacies.yaml @@ -0,0 +1,7 @@ +"dataset_name": "answer_only_logical_fallacies" +"description": "The following are multiple choice questions (with answers) about logical\ + \ fallacies.\n\n" +"tag": "mmlusr_answer_only_humanities_tasks" +"include": "_mmlusr_a_yml" +"task": "mmlusr_answer_only_logical_fallacies" +"task_alias": "logical fallacies" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_machine_learning.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_machine_learning.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1b833c706f24750d113100414d7b8857c936899e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_machine_learning.yaml @@ -0,0 +1,7 @@ +"dataset_name": "answer_only_machine_learning" +"description": "The following are multiple choice questions (with answers) about machine\ + \ learning.\n\n" +"tag": "mmlusr_answer_only_stem_tasks" +"include": 
"_mmlusr_a_yml" +"task": "mmlusr_answer_only_machine_learning" +"task_alias": "machine learning" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_management.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_management.yaml new file mode 100644 index 0000000000000000000000000000000000000000..26ec67401daf8f5e3af5f6c4b1ca8937927a6850 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_management.yaml @@ -0,0 +1,7 @@ +"dataset_name": "answer_only_management" +"description": "The following are multiple choice questions (with answers) about management.\n\ + \n" +"tag": "mmlusr_answer_only_other_tasks" +"include": "_mmlusr_a_yml" +"task": "mmlusr_answer_only_management" +"task_alias": "management" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_miscellaneous.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_miscellaneous.yaml new file mode 100644 index 0000000000000000000000000000000000000000..12153929983be65dcbcbc59b195414607dd1ecae --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_miscellaneous.yaml @@ -0,0 +1,7 @@ +"dataset_name": "answer_only_miscellaneous" +"description": "The following are multiple choice questions (with answers) about miscellaneous.\n\ + \n" +"tag": "mmlusr_answer_only_other_tasks" +"include": "_mmlusr_a_yml" +"task": "mmlusr_answer_only_miscellaneous" +"task_alias": "miscellaneous" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_moral_scenarios.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_moral_scenarios.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dee1c01eb1b9ca1cb85ae2c0ba8d0638a78c9515 --- /dev/null +++ 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_moral_scenarios.yaml @@ -0,0 +1,7 @@ +"dataset_name": "answer_only_moral_scenarios" +"description": "The following are multiple choice questions (with answers) about moral\ + \ scenarios.\n\n" +"tag": "mmlusr_answer_only_humanities_tasks" +"include": "_mmlusr_a_yml" +"task": "mmlusr_answer_only_moral_scenarios" +"task_alias": "moral scenarios" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_philosophy.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_philosophy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..538dea756ccc6b97d3071922fa1a2f67d5daede8 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_philosophy.yaml @@ -0,0 +1,7 @@ +"dataset_name": "answer_only_philosophy" +"description": "The following are multiple choice questions (with answers) about philosophy.\n\ + \n" +"tag": "mmlusr_answer_only_humanities_tasks" +"include": "_mmlusr_a_yml" +"task": "mmlusr_answer_only_philosophy" +"task_alias": "philosophy" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_professional_accounting.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_professional_accounting.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b9f45995cbf30cf4453f30ed47a644718349c695 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_professional_accounting.yaml @@ -0,0 +1,7 @@ +"dataset_name": "answer_only_professional_accounting" +"description": "The following are multiple choice questions (with answers) about professional\ + \ accounting.\n\n" +"tag": "mmlusr_answer_only_other_tasks" +"include": "_mmlusr_a_yml" +"task": "mmlusr_answer_only_professional_accounting" +"task_alias": "professional accounting" diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_professional_law.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_professional_law.yaml new file mode 100644 index 0000000000000000000000000000000000000000..caccccf0ded8e25f7c92e4788885583f5f014168 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_professional_law.yaml @@ -0,0 +1,7 @@ +"dataset_name": "answer_only_professional_law" +"description": "The following are multiple choice questions (with answers) about professional\ + \ law.\n\n" +"tag": "mmlusr_answer_only_humanities_tasks" +"include": "_mmlusr_a_yml" +"task": "mmlusr_answer_only_professional_law" +"task_alias": "professional law" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_professional_psychology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_professional_psychology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..58a9fc2d31f1c9758e0fbb9b90f13922cfb20247 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_professional_psychology.yaml @@ -0,0 +1,7 @@ +"dataset_name": "answer_only_professional_psychology" +"description": "The following are multiple choice questions (with answers) about professional\ + \ psychology.\n\n" +"tag": "mmlusr_answer_only_social_sciences_tasks" +"include": "_mmlusr_a_yml" +"task": "mmlusr_answer_only_professional_psychology" +"task_alias": "professional psychology" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_security_studies.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlusr/answer_only/answer_only_security_studies.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5e72f02f5595c91afbb06a97414bc78866c47203 --- /dev/null +++ 
def process_docs(dataset: "datasets.Dataset") -> "datasets.Dataset":
    """Convert each row's numeric answer to a letter and gather its choices.

    Args:
        dataset: Object whose ``map`` method is called with the per-row
            function. Each row must provide ``question``, ``choice1`` ..
            ``choice4`` and an ``answer`` that ``int()`` accepts.

    Returns:
        Whatever ``dataset.map`` returns for the per-row function, which
        yields ``questions``, ``choices`` (a four-item list) and ``answer``
        (one of ``"A"``..``"D"``).

    Raises:
        IndexError: If a row's ``answer`` is not in the range 0-3.
    """
    answer_letters = ("A", "B", "C", "D")

    def _helper(doc):
        # The answer is taken to be a position 0-3 rather than a letter.
        answer_index = int(doc["answer"])
        # Reject negatives explicitly: plain indexing would wrap around
        # (-1 -> "D") and silently mislabel the row.
        if not 0 <= answer_index < len(answer_letters):
            raise IndexError(
                f"answer index {answer_index} is outside 0-{len(answer_letters) - 1}"
            )

        return {
            # NOTE(review): the key is "questions" (plural) while the value is
            # read from "question" - confirm the task template expects this.
            "questions": doc["question"],
            "choices": [
                doc["choice1"],
                doc["choice2"],
                doc["choice3"],
                doc["choice4"],
            ],
            "answer": answer_letters[answer_index],
        }

    return dataset.map(_helper)