diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/_cot_zeroshot_template_yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/_cot_zeroshot_template_yaml new file mode 100644 index 0000000000000000000000000000000000000000..925a3bba61c82afaff42476ff4a4b3ebb04b6ddb --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/_cot_zeroshot_template_yaml @@ -0,0 +1,26 @@ +dataset_path: lukaemon/bbh +output_type: generate_until +test_split: test +doc_to_target: "{{target}}" +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + # ignore_punctuation: true + regexes_to_ignore: + - "\\.$" + - "," + - "\\\\" + - "\n" + - '"' +generation_kwargs: + until: + - "" + - "Q:" + - "<|im_end|>" + do_sample: false + temperature: 0.0 +num_fewshot: 0 +metadata: + version: 3.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/causal_judgement.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/causal_judgement.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2bf47baad136dc6d44eaec82d6fdf1520c3a114b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/causal_judgement.yaml @@ -0,0 +1,18 @@ +"dataset_name": "causal_judgement" +"description": "Answer questions about causal attribution.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step." 
+"include": "_cot_zeroshot_template_yaml" +"task": "bbh_cot_zeroshot_causal_judgement" + +filter_list: + - name: "flexible-extract" + filter: + - function: "regex" + group_select: -1 + regex_pattern: "\\b(Yes|No|yes|no)\\b" + - function: "take_first" + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))" + - function: "take_first" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/disambiguation_qa.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/disambiguation_qa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..725a70ecfc08b89c3fb9766e854bd48995fcc1f3 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/disambiguation_qa.yaml @@ -0,0 +1,20 @@ +"dataset_name": "disambiguation_qa" +"description": "Clarify the meaning of sentences with ambiguous pronouns.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step." 
+"include": "_cot_zeroshot_template_yaml" +"task": "bbh_cot_zeroshot_disambiguation_qa" + +filter_list: + - name: "flexible-extract" + filter: + - function: !function utils.MultiChoiceRegexFilter + group_select: -1 + ignore_case: true + ignore_punctuation: true + regex_pattern: "(\\([A-Z]\\))" + - function: "take_first" + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))" + - function: "take_first" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/logical_deduction_five_objects.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/logical_deduction_five_objects.yaml new file mode 100644 index 0000000000000000000000000000000000000000..946030a0062d9697b4c6e72f236b21971c5e28b4 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/logical_deduction_five_objects.yaml @@ -0,0 +1,19 @@ +"dataset_name": "logical_deduction_five_objects" +"description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step." 
+"include": "_cot_zeroshot_template_yaml" +"task": "bbh_cot_zeroshot_logical_deduction_five_objects" +filter_list: + - name: "flexible-extract" + filter: + - function: !function utils.MultiChoiceRegexFilter + group_select: -1 + ignore_case: true + ignore_punctuation: true + regex_pattern: "(\\([A-Z]\\))" + - function: "take_first" + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))" + - function: "take_first" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/logical_deduction_seven_objects.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/logical_deduction_seven_objects.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f92f4bc5aaf86db30f4decaeee2f374b76107028 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/logical_deduction_seven_objects.yaml @@ -0,0 +1,19 @@ +"dataset_name": "logical_deduction_seven_objects" +"description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step." 
+"include": "_cot_zeroshot_template_yaml" +"task": "bbh_cot_zeroshot_logical_deduction_seven_objects" +filter_list: + - name: "flexible-extract" + filter: + - function: !function utils.MultiChoiceRegexFilter + group_select: -1 + ignore_case: true + ignore_punctuation: true + regex_pattern: "(\\([A-Z]\\))" + - function: "take_first" + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))" + - function: "take_first" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/logical_deduction_three_objects.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/logical_deduction_three_objects.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d1451828848c37156e53177765ce6941ff67b6eb --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/logical_deduction_three_objects.yaml @@ -0,0 +1,19 @@ +"dataset_name": "logical_deduction_three_objects" +"description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step." 
+"include": "_cot_zeroshot_template_yaml" +"task": "bbh_cot_zeroshot_logical_deduction_three_objects" +filter_list: + - name: "flexible-extract" + filter: + - function: !function utils.MultiChoiceRegexFilter + group_select: -1 + ignore_case: true + ignore_punctuation: true + regex_pattern: "(\\([A-Z]\\))" + - function: "take_first" + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))" + - function: "take_first" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/movie_recommendation.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/movie_recommendation.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c1b68b8b881ca929d284094fa129bca064bc08e4 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/movie_recommendation.yaml @@ -0,0 +1,19 @@ +"dataset_name": "movie_recommendation" +"description": "Recommend movies similar to the given list of movies.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step." 
+"include": "_cot_zeroshot_template_yaml" +"task": "bbh_cot_zeroshot_movie_recommendation" +filter_list: + - name: "flexible-extract" + filter: + - function: !function utils.MultiChoiceRegexFilter + group_select: -1 + ignore_case: true + ignore_punctuation: true + regex_pattern: "(\\([A-Z]\\))" + - function: "take_first" + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))" + - function: "take_first" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/navigate.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/navigate.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2f1fee3159ded8988e798ab8f19f464de7ae0a69 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/navigate.yaml @@ -0,0 +1,17 @@ +"dataset_name": "navigate" +"description": "Given a series of navigation instructions, determine whether one would end up back at the starting point.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step." 
+"include": "_cot_zeroshot_template_yaml" +"task": "bbh_cot_zeroshot_navigate" +filter_list: + - name: "flexible-extract" + filter: + - function: "regex" + group_select: -1 + regex_pattern: "\\b(Yes|No|yes|no)\\b" + - function: "take_first" + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))" + - function: "take_first" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/object_counting.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/object_counting.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9ee7720332c6b67048f1545c3f97adce06d2be2e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/object_counting.yaml @@ -0,0 +1,17 @@ +"dataset_name": "object_counting" +"description": "Questions that involve enumerating objects and asking the model to count them.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step." 
+"include": "_cot_zeroshot_template_yaml" +"task": "bbh_cot_zeroshot_object_counting" +filter_list: + - name: "flexible-extract" + filter: + - function: !function utils.NumberParseRegexFilter + group_select: -1 + regex_pattern: "([-0-9]+)" + - function: "take_first" + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))" + - function: "take_first" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/ruin_names.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/ruin_names.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cf865e89a6e8ea5b6d6d691cae600401d495bc82 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/ruin_names.yaml @@ -0,0 +1,19 @@ +"dataset_name": "ruin_names" +"description": "Select the humorous edit that 'ruins' the input movie or musical artist name.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step." 
+"include": "_cot_zeroshot_template_yaml" +"task": "bbh_cot_zeroshot_ruin_names" +filter_list: + - name: "flexible-extract" + filter: + - function: !function utils.MultiChoiceRegexFilter + group_select: -1 + ignore_case: true + ignore_punctuation: true + regex_pattern: "(\\([A-Z]\\))" + - function: "take_first" + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))" + - function: "take_first" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/salient_translation_error_detection.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/salient_translation_error_detection.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d7d72eadc3bbd2c026c9a62dc237f90c725dacf7 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/salient_translation_error_detection.yaml @@ -0,0 +1,19 @@ +"dataset_name": "salient_translation_error_detection" +"description": "Detect the type of error in an English translation of a German source sentence.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step." 
+"include": "_cot_zeroshot_template_yaml" +"task": "bbh_cot_zeroshot_salient_translation_error_detection" +filter_list: + - name: "flexible-extract" + filter: + - function: !function utils.MultiChoiceRegexFilter + group_select: -1 + ignore_case: true + ignore_punctuation: true + regex_pattern: "(\\([A-Z]\\))" + - function: "take_first" + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))" + - function: "take_first" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/snarks.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/snarks.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eb338a1b5e0cbcd5541449aa5129d37ce1f2e12d --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/snarks.yaml @@ -0,0 +1,19 @@ +"dataset_name": "snarks" +"description": "Determine which of two sentences is sarcastic.\n\nAccording to Cambridge University Dictionary, sarcasm is \"the use of remarks that clearly mean the opposite of what they say, made in order to hurt someone's feelings or to criticize something in a humorous way.\" Sarcastic sentences often contain satirical or ironic utterances, hyperboles, ambivalent or witty remarks.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step." 
+"include": "_cot_zeroshot_template_yaml" +"task": "bbh_cot_zeroshot_snarks" +filter_list: + - name: "flexible-extract" + filter: + - function: !function utils.MultiChoiceRegexFilter + group_select: -1 + ignore_case: true + ignore_punctuation: true + regex_pattern: "(\\([A-Z]\\))" + - function: "take_first" + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))" + - function: "take_first" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/sports_understanding.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/sports_understanding.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1046bfe81928a4f09bddadd03a9062704c5dc357 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/sports_understanding.yaml @@ -0,0 +1,21 @@ +"dataset_name": "sports_understanding" +"description": "Determine whether an artificially constructed sentence relating to sports is plausible or not.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step." 
+"include": "_cot_zeroshot_template_yaml" +"task": "bbh_cot_zeroshot_sports_understanding" + +filter_list: + - name: "flexible-extract" + filter: + - function: !function utils.MapRegexFilter + group_select: -1 + ignore_case: true + regex_pattern_to_value: + \b(no|not plausible)\b: "no" + \b(yes|plausible)\b: "yes" + - function: "take_first" + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))" + - function: "take_first" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/temporal_sequences.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/temporal_sequences.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c7b949ada5ad2a8293869ed3c29fff9b419e0870 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/temporal_sequences.yaml @@ -0,0 +1,19 @@ +"dataset_name": "temporal_sequences" +"description": "Task description: Answer questions about which times certain events could have occurred.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step." 
+"include": "_cot_zeroshot_template_yaml" +"task": "bbh_cot_zeroshot_temporal_sequences" +filter_list: + - name: "flexible-extract" + filter: + - function: !function utils.MultiChoiceRegexFilter + group_select: -1 + ignore_case: true + ignore_punctuation: true + regex_pattern: "(\\([A-Z]\\))" + - function: "take_first" + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))" + - function: "take_first" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/tracking_shuffled_objects_seven_objects.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/tracking_shuffled_objects_seven_objects.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4635d7cabaa250aa1c255c8d9d80cf8f8c87e9b6 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/tracking_shuffled_objects_seven_objects.yaml @@ -0,0 +1,19 @@ +"dataset_name": "tracking_shuffled_objects_seven_objects" +"description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step." 
+"include": "_cot_zeroshot_template_yaml" +"task": "bbh_cot_zeroshot_tracking_shuffled_objects_seven_objects" +filter_list: + - name: "flexible-extract" + filter: + - function: !function utils.MultiChoiceRegexFilter + group_select: -1 + ignore_case: true + ignore_punctuation: true + regex_pattern: "(\\([A-Z]\\))" + - function: "take_first" + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))" + - function: "take_first" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/tracking_shuffled_objects_three_objects.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/tracking_shuffled_objects_three_objects.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1edf0c0537c71510cf781ae1acbab9829eeed883 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/tracking_shuffled_objects_three_objects.yaml @@ -0,0 +1,19 @@ +"dataset_name": "tracking_shuffled_objects_three_objects" +"description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step." 
+"include": "_cot_zeroshot_template_yaml" +"task": "bbh_cot_zeroshot_tracking_shuffled_objects_three_objects" +filter_list: + - name: "flexible-extract" + filter: + - function: !function utils.MultiChoiceRegexFilter + group_select: -1 + ignore_case: true + ignore_punctuation: true + regex_pattern: "(\\([A-Z]\\))" + - function: "take_first" + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))" + - function: "take_first" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/utils.py b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a3c63df468de9edb7f6c04e31c17db5af9b92be3 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/utils.py @@ -0,0 +1,224 @@ +import collections +import re +import sys +import unicodedata + +from lm_eval.filters.extraction import Filter, RegexFilter + + +class ExtendedRegexFilter(RegexFilter): + punct_tbl = dict.fromkeys( + i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P") + ) + + def __init__( + self, + regex_pattern: str = r"#### (\-?[0-9\.\,]+)", + group_select=0, + fallback: str = "[invalid]", + ignore_case=False, + ignore_punctuation=False, + regexes_to_ignore=None, + ) -> None: + super().__init__(regex_pattern, group_select, fallback) + self.ignore_case = ignore_case + self.ignore_punctuation = ignore_punctuation + self.regexes_to_ignore = regexes_to_ignore + + def filter_ignores(self, st): + if self.regexes_to_ignore is not None: + for s in self.regexes_to_ignore: + st = re.sub(s, "", st) + + if self.ignore_case: + st = st.lower() + + if self.ignore_punctuation: + # https://stackoverflow.com/a/266162 + st = st.translate(self.punct_tbl) + return st + + def find_match(self, regex, resp, convert_dict={}): + match = 
regex.findall(resp) + if match: + match = match[self.group_select] + if isinstance(match, tuple): + match = [m for m in match if m][0] + match = match.strip() + if match and match in convert_dict: + match = convert_dict[match] + return match + + +class MapRegexFilter(ExtendedRegexFilter): + def __init__( + self, + regex_pattern_to_value: dict = {}, + group_select=0, + fallback: str = "[invalid]", + ignore_case=False, + ignore_punctuation=False, + regexes_to_ignore=None, + ) -> None: + """ + regex_pattern_to_value: Match the regex pattern and change the result into the value + group_select: Selects the (group_select)th match from the findall result. We use the whole regex_patterns, concatenated by | + ignore_case: Lowers the case of response before matching with the given regex + ignore_punctuation: Remove the punctuation before matching with the given regex + regexes_to_ignore: Remove these regexes before matching with the given regex + """ + super().__init__( + "|".join(list(regex_pattern_to_value.keys())), + group_select, + fallback, + ignore_case, + ignore_punctuation, + regexes_to_ignore, + ) + self.regex_to_value = { + re.compile(r): v for r, v in regex_pattern_to_value.items() + } + + def apply(self, resps, docs): + filtered_resps = [] + + for r in resps: + filtered = [] + for resp in r: + whole_match_considering_group_select = self.find_match( + self.regex, self.filter_ignores(resp) + ) + if whole_match_considering_group_select: + for regex, mapped_value in self.regex_to_value.items(): + match = self.find_match( + regex, + self.filter_ignores(whole_match_considering_group_select), + ) + if match: + match = mapped_value + break + if not whole_match_considering_group_select or not match: + match = self.fallback + + filtered.append(match) + filtered_resps.append(filtered) + + return filtered_resps + + +class NumberParseRegexFilter(ExtendedRegexFilter): + def apply(self, resps, docs): + # here, we assume we have a list, in which each element is + # a list of 
model responses for some particular input/target pair. + # so we process each of these (same input/target response sets) + # independently (and keep them a list.) + filtered_resps = [] + import regex + from word2number import w2n + + # https://www.reddit.com/r/regex/comments/11a38uk/parsing_numbers_written_out_as_english_words + english_number_regex = regex.compile( + "((?:(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?:|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion)(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?:|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion|[^\S\r\n]|,|and|&)+)?(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion))" + ) + + for r in resps: + filtered = [] + for resp in r: + match = self.find_match(self.regex, resp) + if not match: + match = self.find_match(english_number_regex, resp.lower()) + if match: + match = str(w2n.word_to_num(match)) + if not match: + match = self.fallback + filtered.append(match) + filtered_resps.append(filtered) + + return filtered_resps + + +class WordSortFilter(Filter): + """ """ + + def apply(self, resps, docs): + filtered_resps = [] + + for r, doc in zip(resps, docs): + words = doc["input"].split("List:")[1].strip().split() + regex = re.compile("|".join([f"\\b{w}\\b" for w in words])) + filtered = [] + for resp in r: + match = regex.findall(resp) + match.reverse() + ordered_words = reversed( + collections.OrderedDict(zip(match, [None] * len(match))) + ) + filtered.append(" ".join(ordered_words)) + filtered_resps.append(filtered) + + return filtered_resps + + +class MultiChoiceRegexFilter(ExtendedRegexFilter): + def __init__(self, *args, **kwargs): + """ + regex_pattern: The basic regex pattern to use. 
If fails to match, we will use the customized match procedure + - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response. + - step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices. + group_select: Selects the (group_select)th match from the findall result. + ignore_case: Ignores the case during step 1 matching + ignore_punctuation: Remove the punctuation during step 1 matching + regexes_to_ignore: Remove these regexes during step 1 matching + """ + super().__init__(*args, **kwargs) + + def apply(self, resps, docs): + # here, we assume we have a list, in which each element is + # a list of model responses for some particular input/target pair. + # so we process each of these (same input/target response sets) + # independently (and keep them a list.) + + filtered_resps = [] + + for r, doc in zip(resps, docs): + fallback_regexes = [] + choice_to_alpha = {} + next_alpha = "A" + + without_paren_fallback_regexes = [] + without_paren_to_target = {} + + multiple_choices_regex = re.compile(r"\([A-Z]\)([^\n^(]*)") + match = multiple_choices_regex.findall(doc["input"]) + for m in match: + m = self.filter_ignores(m.strip()) + fallback_regexes.append(f"{re.escape(m)}") + choice_to_alpha[m] = f"({next_alpha})" + + without_paren_fallback_regexes.append(next_alpha) + without_paren_to_target[next_alpha] = f"({next_alpha})" + + next_alpha = chr(ord(next_alpha) + 1) + fallback_regex = re.compile("|".join(fallback_regexes)) + without_paren_fallback_regex = "|".join(without_paren_fallback_regexes) + without_paren_fallback_regex = re.compile( + f":[\s]*({without_paren_fallback_regex})" + ) + + filtered = [] + for resp in r: + match = self.find_match(self.regex, resp) + if not match: + match = self.find_match( + fallback_regex, self.filter_ignores(resp), choice_to_alpha + ) + if not match: + match = self.find_match( + without_paren_fallback_regex, resp, without_paren_to_target + ) + if not match: + match = 
self.fallback + filtered.append(match) + filtered_resps.append(filtered) + + return filtered_resps diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/web_of_lies.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/web_of_lies.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a3227304a29461497f6c9acd08965dda481b95f5 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/web_of_lies.yaml @@ -0,0 +1,20 @@ +"dataset_name": "web_of_lies" +"description": "Evaluate a random boolean function expressed as a word problem.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step." +"include": "_cot_zeroshot_template_yaml" +"task": "bbh_cot_zeroshot_web_of_lies" +filter_list: + - name: "flexible-extract" + filter: + - function: !function utils.MapRegexFilter + group_select: -1 + ignore_case: true + regex_pattern_to_value: + \b(no|does not tell the truth|is not telling the truth)\b: "no" + \b(yes|tells the truth|is telling the truth)\b: "yes" + - function: "take_first" + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))" + - function: "take_first" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/word_sorting.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/word_sorting.yaml new file mode 100644 index 0000000000000000000000000000000000000000..258add09a083b508197d5ea614f388d6cca53a40 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/cot_zeroshot/word_sorting.yaml @@ -0,0 +1,15 @@ +"dataset_name": "word_sorting" +"description": "Sort a list of words.\n\n" +"doc_to_text": "Q: {{input}}\nA: Let's think step by step." 
+"include": "_cot_zeroshot_template_yaml" +"task": "bbh_cot_zeroshot_word_sorting" +filter_list: + - name: "flexible-extract" + filter: + - function: !function utils.WordSortFilter + - function: "take_first" + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))" + - function: "take_first" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/_bbh_fewshot.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/_bbh_fewshot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..13aa6d68e7c45085835d2733cb1b08207b922819 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/_bbh_fewshot.yaml @@ -0,0 +1,35 @@ +group: bbh_fewshot +task: + - bbh_fewshot_boolean_expressions + - bbh_fewshot_causal_judgement + - bbh_fewshot_date_understanding + - bbh_fewshot_disambiguation_qa + - bbh_fewshot_dyck_languages + - bbh_fewshot_formal_fallacies + - bbh_fewshot_geometric_shapes + - bbh_fewshot_hyperbaton + - bbh_fewshot_logical_deduction_five_objects + - bbh_fewshot_logical_deduction_seven_objects + - bbh_fewshot_logical_deduction_three_objects + - bbh_fewshot_movie_recommendation + - bbh_fewshot_multistep_arithmetic_two + - bbh_fewshot_navigate + - bbh_fewshot_object_counting + - bbh_fewshot_penguins_in_a_table + - bbh_fewshot_reasoning_about_colored_objects + - bbh_fewshot_ruin_names + - bbh_fewshot_salient_translation_error_detection + - bbh_fewshot_snarks + - bbh_fewshot_sports_understanding + - bbh_fewshot_temporal_sequences + - bbh_fewshot_tracking_shuffled_objects_five_objects + - bbh_fewshot_tracking_shuffled_objects_seven_objects + - bbh_fewshot_tracking_shuffled_objects_three_objects + - bbh_fewshot_web_of_lies + - bbh_fewshot_word_sorting +aggregate_metric_list: + - metric: exact_match + aggregation: mean + weight_by_size: true +metadata: + version: 3.0 
diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml new file mode 100644 index 0000000000000000000000000000000000000000..cd59edb53dc4318a2ab10a7c245bf99c8c463c26 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml @@ -0,0 +1,20 @@ +dataset_path: lukaemon/bbh +output_type: generate_until +test_split: test +doc_to_target: "{{target}}" +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + # ignore_case: true + # ignore_punctuation: true +generation_kwargs: + until: + - "" + - "Q" + - "\n\n" + do_sample: false + temperature: 0.0 +num_fewshot: 3 +metadata: + version: 2.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/boolean_expressions.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/boolean_expressions.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4f211ad4695d91cb7015e1ec0c64f8235ff910c2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/boolean_expressions.yaml @@ -0,0 +1,19 @@ +dataset_name: boolean_expressions +description: 'Evaluate the result of a random Boolean expression. 
+ + + ' +doc_to_text: 'Q: {{input}} + + A:' +fewshot_config: + sampler: first_n + samples: + - input: not ( ( not not True ) ) is + target: 'False' + - input: True and False and not True and True is + target: 'False' + - input: not not ( not ( False ) ) is + target: 'True' +include: _fewshot_template_yaml +task: bbh_fewshot_boolean_expressions diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/causal_judgement.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/causal_judgement.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f04b352a3c9e14c1c34955698752da4ff7b8abdf --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/causal_judgement.yaml @@ -0,0 +1,67 @@ +dataset_name: causal_judgement +description: 'Answer questions about causal attribution. + + + ' +doc_to_text: 'Q: {{input}} + + A:' +fewshot_config: + sampler: first_n + samples: + - input: 'How would a typical person answer each of the following questions about + causation? + + Frank T., had an ongoing dispute with his neighbor over a stretch of land and + one day decided to shoot his neighbor in the body. Frank T. had no experience + with guns, his hand slipped on the barrel of the gun, and the shot went wild. + Nonetheless, the bullet bounced off a large boulder several feet away and hit + the neighbor''s body, causing significant injury. Did Frank T. intentionally + shoot his neighbor in the body? + + Options: + + - Yes + + - No' + target: 'No' + - input: 'How would a typical person answer each of the following questions about + causation? + + Suzy and Billy are working on a project that is very important for our nation''s + security. The boss tells them both: "Be sure that you are here at exactly 9 + am. It is absolutely essential that you arrive at that time." Both Billy and + Suzy arrive at 9 am. As it happens, there was a motion detector installed in + the room where they arrived. 
The motion detector was set up to be triggered + if at least one person appeared in the room at the same time. So the motion + detector went off. Did Billy cause the motion detector to go off? + + Options: + + - Yes + + - No' + target: 'Yes' + - input: 'How would a typical person answer each of the following questions about + causation? + + George and his sister Lena reunite at their parents'' house for Thanksgiving. + Whereas George just got into medical school, Lena is unhappy in her marriage + and recently lost her job. Over the course of the day, George and Lena get into + a number of heated arguments. Later in the afternoon they play a game of darts. + They split the first two games, and the third game is close until the end. Who + will win comes down to George''s last shot. If he hits a high point region, + he wins; if he hits a low point region, Lena wins. George thinks of the difficult + time Lena is having, and he really wants to let her win. He aims the dart at + the low point region. He sets up his shot and the dart lands in the low point + region. After his shot, Lena wins the game and is very happy. Did George hit + the low point region intentionally? + + Options: + + - Yes + + - No' + target: 'Yes' +include: _fewshot_template_yaml +task: bbh_fewshot_causal_judgement diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/date_understanding.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/date_understanding.yaml new file mode 100644 index 0000000000000000000000000000000000000000..41b6364cc5f34fae75eb83dc4a836bf6114cfaaf --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/date_understanding.yaml @@ -0,0 +1,60 @@ +dataset_name: date_understanding +description: 'Infer the date from context. + + + ' +doc_to_text: 'Q: {{input}} + + A:' +fewshot_config: + sampler: first_n + samples: + - input: 'Today is Christmas Eve of 1937. What is the date 10 days ago in MM/DD/YYYY? 
+ + Options: + + (A) 12/14/2026 + + (B) 12/14/1950 + + (C) 12/14/2007 + + (D) 12/14/1937 + + (E) 07/14/1938 + + (F) 12/14/1988' + target: (D) + - input: 'Tomorrow is 11/12/2019. What is the date one year ago from today in MM/DD/YYYY? + + Options: + + (A) 09/04/2018 + + (B) 11/11/2018 + + (C) 08/25/2018 + + (D) 11/02/2018 + + (E) 11/04/2018' + target: (B) + - input: 'Jane and John married on Jan 2, 1958. It is their 5-year anniversary today. + What is the date tomorrow in MM/DD/YYYY? + + Options: + + (A) 01/11/1961 + + (B) 01/03/1963 + + (C) 01/18/1961 + + (D) 10/14/1960 + + (E) 01/03/1982 + + (F) 12/03/1960' + target: (B) +include: _fewshot_template_yaml +task: bbh_fewshot_date_understanding diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/disambiguation_qa.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/disambiguation_qa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..40dae76fb6d6e9f71f2bbbeb09ab6be084be5b8b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/disambiguation_qa.yaml @@ -0,0 +1,53 @@ +dataset_name: disambiguation_qa +description: 'Clarify the meaning of sentences with ambiguous pronouns. + + + ' +doc_to_text: 'Q: {{input}} + + A:' +fewshot_config: + sampler: first_n + samples: + - input: 'In the following sentences, explain the antecedent of the pronoun (which + thing the pronoun refers to), or state that it is ambiguous. + + Sentence: The chief told the counselor that they took the day off. + + Options: + + (A) The chief took the day off + + (B) The counselor took the day off + + (C) Ambiguous' + target: (A) + - input: 'In the following sentences, explain the antecedent of the pronoun (which + thing the pronoun refers to), or state that it is ambiguous. + + Sentence: The manager sent a message to the secretary, but he didn''t reply + yet. 
+ + Options: + + (A) The secretary didn''t reply yet + + (B) The manager didn''t reply yet + + (C) Ambiguous' + target: (A) + - input: 'In the following sentences, explain the antecedent of the pronoun (which + thing the pronoun refers to), or state that it is ambiguous. + + Sentence: Bailey will plan to meet the director at his office + + Options: + + (A) It will be Bailey''s office + + (B) It will be the director''s office + + (C) Ambiguous' + target: (C) +include: _fewshot_template_yaml +task: bbh_fewshot_disambiguation_qa diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/dyck_languages.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/dyck_languages.yaml new file mode 100644 index 0000000000000000000000000000000000000000..52e2cb8a1217e6da389b2e185768310124b8d812 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/dyck_languages.yaml @@ -0,0 +1,23 @@ +dataset_name: dyck_languages +description: 'Correctly close a Dyck-n word. + + + ' +doc_to_text: 'Q: {{input}} + + A:' +fewshot_config: + sampler: first_n + samples: + - input: 'Complete the rest of the sequence, making sure that the parentheses are + closed properly. Input: [ { [' + target: '] } ]' + - input: 'Complete the rest of the sequence, making sure that the parentheses are + closed properly. Input: < > ( ( [ [ ( { } ) [ < > ] ]' + target: '] ) )' + - input: 'Complete the rest of the sequence, making sure that the parentheses are + closed properly. 
Input: < [ < [ { < [ ] < { } > > } ] > { { ( ) } { < [ < > + ] > }' + target: '} ] >' +include: _fewshot_template_yaml +task: bbh_fewshot_dyck_languages diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/formal_fallacies.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/formal_fallacies.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7187072d048ca95bb55624b24d8dd26ce7a4efec --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/formal_fallacies.yaml @@ -0,0 +1,60 @@ +dataset_name: formal_fallacies +description: 'Distinguish deductively valid arguments from formal fallacies. + + + ' +doc_to_text: 'Q: {{input}} + + A:' +fewshot_config: + sampler: first_n + samples: + - input: '"It is not always easy to see who is related to whom -- and in which ways. + The following argument pertains to this question: To begin with, Lesley is a + close friend of Fernando. Moreover, being a close friend of Fernando or a schoolmate + of Lowell is sufficient for being a great-grandfather of Leroy. It follows that + Lesley is a great-grandfather of Leroy." + + Is the argument, given the explicitly stated premises, deductively valid or + invalid? + + Options: + + - valid + + - invalid' + target: valid + - input: '"It is not always easy to see who is related to whom -- and in which ways. + The following argument pertains to this question: Whoever is not a great-grandfather + of Clyde is a stepbrother of Brian. Being an ancestor of Dana is sufficient + for not being a great-grandfather of Clyde. We may conclude: Everyone who is + an ancestor of Dana is a stepbrother of Brian, too." + + Is the argument, given the explicitly stated premises, deductively valid or + invalid? + + Options: + + - valid + + - invalid' + target: valid + - input: '"It is not always easy to grasp who is consuming which products. 
The following + argument pertains to this question: Every infrequent user of Paul Mitchell shampoo + is either a rare consumer of Nioxin shampoo or a loyal buyer of Caress soap, + or both. No regular consumer of Lush soap is a rare consumer of Nioxin shampoo + and, in the same time, a loyal buyer of Caress soap. It follows that whoever + is an infrequent user of Paul Mitchell shampoo is not a regular consumer of + Lush soap." + + Is the argument, given the explicitly stated premises, deductively valid or + invalid? + + Options: + + - valid + + - invalid' + target: invalid +include: _fewshot_template_yaml +task: bbh_fewshot_formal_fallacies diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/geometric_shapes.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/geometric_shapes.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fb03f1f299c1a5ae3756ed003540a728e8eaf2a2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/geometric_shapes.yaml @@ -0,0 +1,89 @@ +dataset_name: geometric_shapes +description: 'Name geometric shapes from their SVG paths. 
+ + + ' +doc_to_text: 'Q: {{input}} + + A:' +fewshot_config: + sampler: first_n + samples: + - input: 'This SVG path element + draws a + + Options: + + (A) circle + + (B) heptagon + + (C) hexagon + + (D) kite + + (E) line + + (F) octagon + + (G) pentagon + + (H) rectangle + + (I) sector + + (J) triangle' + target: (F) + - input: 'This SVG path element draws a + + Options: + + (A) circle + + (B) heptagon + + (C) hexagon + + (D) kite + + (E) line + + (F) octagon + + (G) pentagon + + (H) rectangle + + (I) sector + + (J) triangle' + target: (G) + - input: 'This SVG path element draws a + + Options: + + (A) circle + + (B) heptagon + + (C) hexagon + + (D) kite + + (E) line + + (F) octagon + + (G) pentagon + + (H) rectangle + + (I) sector + + (J) triangle' + target: (D) +include: _fewshot_template_yaml +task: bbh_fewshot_geometric_shapes diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/hyperbaton.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/hyperbaton.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9af7516e1a5171c3976c55edbefa3db638414657 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/hyperbaton.yaml @@ -0,0 +1,37 @@ +dataset_name: hyperbaton +description: 'Order adjectives correctly in English sentences. 
+ + + ' +doc_to_text: 'Q: {{input}} + + A:' +fewshot_config: + sampler: first_n + samples: + - input: 'Which sentence has the correct adjective order: + + Options: + + (A) rubber terrible ship + + (B) terrible rubber ship' + target: (B) + - input: 'Which sentence has the correct adjective order: + + Options: + + (A) repulsive small Brazilian exercise ship + + (B) Brazilian repulsive exercise small ship' + target: (A) + - input: 'Which sentence has the correct adjective order: + + Options: + + (A) blue gold wonderful square shoe + + (B) wonderful square blue gold shoe' + target: (B) +include: _fewshot_template_yaml +task: bbh_fewshot_hyperbaton diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/logical_deduction_five_objects.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/logical_deduction_five_objects.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cb9615adadb500461497605ed03aa5dbbf68ed1d --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/logical_deduction_five_objects.yaml @@ -0,0 +1,55 @@ +dataset_name: logical_deduction_five_objects +description: 'A logical deduction task which requires deducing the order of a sequence + of objects. + + + ' +doc_to_text: 'Q: {{input}} + + A:' +fewshot_config: + sampler: first_n + samples: + - input: 'The following paragraphs each describe a set of three objects arranged + in a fixed order. The statements are logically consistent within each paragraph. + In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished + above Amy. Eli finished below Amy. + + Options: + + (A) Amy finished last + + (B) Eli finished last + + (C) Eve finished last' + target: (B) + - input: 'The following paragraphs each describe a set of three objects arranged + in a fixed order. The statements are logically consistent within each paragraph. + On a shelf, there are three books: a white book, a green book, and an orange + book. 
The green book is to the right of the white book. The orange book is the + rightmost. + + Options: + + (A) The white book is the leftmost + + (B) The green book is the leftmost + + (C) The orange book is the leftmost' + target: (A) + - input: 'The following paragraphs each describe a set of three objects arranged + in a fixed order. The statements are logically consistent within each paragraph. + On a shelf, there are three books: a red book, a gray book, and a white book. + The white book is to the left of the gray book. The red book is the second from + the left. + + Options: + + (A) The red book is the leftmost + + (B) The gray book is the leftmost + + (C) The white book is the leftmost' + target: (C) +include: _fewshot_template_yaml +task: bbh_fewshot_logical_deduction_five_objects diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/logical_deduction_seven_objects.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/logical_deduction_seven_objects.yaml new file mode 100644 index 0000000000000000000000000000000000000000..393c080c17ba34ae9a79bbca62460334a3606366 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/logical_deduction_seven_objects.yaml @@ -0,0 +1,55 @@ +dataset_name: logical_deduction_seven_objects +description: 'A logical deduction task which requires deducing the order of a sequence + of objects. + + + ' +doc_to_text: 'Q: {{input}} + + A:' +fewshot_config: + sampler: first_n + samples: + - input: 'The following paragraphs each describe a set of three objects arranged + in a fixed order. The statements are logically consistent within each paragraph. + In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished + above Amy. Eli finished below Amy. + + Options: + + (A) Amy finished last + + (B) Eli finished last + + (C) Eve finished last' + target: (B) + - input: 'The following paragraphs each describe a set of three objects arranged + in a fixed order. 
The statements are logically consistent within each paragraph. + On a shelf, there are three books: a white book, a green book, and an orange + book. The green book is to the right of the white book. The orange book is the + rightmost. + + Options: + + (A) The white book is the leftmost + + (B) The green book is the leftmost + + (C) The orange book is the leftmost' + target: (A) + - input: 'The following paragraphs each describe a set of three objects arranged + in a fixed order. The statements are logically consistent within each paragraph. + On a shelf, there are three books: a red book, a gray book, and a white book. + The white book is to the left of the gray book. The red book is the second from + the left. + + Options: + + (A) The red book is the leftmost + + (B) The gray book is the leftmost + + (C) The white book is the leftmost' + target: (C) +include: _fewshot_template_yaml +task: bbh_fewshot_logical_deduction_seven_objects diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/logical_deduction_three_objects.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/logical_deduction_three_objects.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ed91b04bd0277772fff2db7969ba36a3a7c27ad8 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/logical_deduction_three_objects.yaml @@ -0,0 +1,55 @@ +dataset_name: logical_deduction_three_objects +description: 'A logical deduction task which requires deducing the order of a sequence + of objects. + + + ' +doc_to_text: 'Q: {{input}} + + A:' +fewshot_config: + sampler: first_n + samples: + - input: 'The following paragraphs each describe a set of three objects arranged + in a fixed order. The statements are logically consistent within each paragraph. + In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished + above Amy. Eli finished below Amy. 
+ + Options: + + (A) Amy finished last + + (B) Eli finished last + + (C) Eve finished last' + target: (B) + - input: 'The following paragraphs each describe a set of three objects arranged + in a fixed order. The statements are logically consistent within each paragraph. + On a shelf, there are three books: a white book, a green book, and an orange + book. The green book is to the right of the white book. The orange book is the + rightmost. + + Options: + + (A) The white book is the leftmost + + (B) The green book is the leftmost + + (C) The orange book is the leftmost' + target: (A) + - input: 'The following paragraphs each describe a set of three objects arranged + in a fixed order. The statements are logically consistent within each paragraph. + On a shelf, there are three books: a red book, a gray book, and a white book. + The white book is to the left of the gray book. The red book is the second from + the left. + + Options: + + (A) The red book is the leftmost + + (B) The gray book is the leftmost + + (C) The white book is the leftmost' + target: (C) +include: _fewshot_template_yaml +task: bbh_fewshot_logical_deduction_three_objects diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/movie_recommendation.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/movie_recommendation.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e65854181dfa091bff1fc59f697b5fad7c32ae45 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/movie_recommendation.yaml @@ -0,0 +1,57 @@ +dataset_name: movie_recommendation +description: 'Recommend movies similar to the given list of movies. 
+ + + ' +doc_to_text: 'Q: {{input}} + + A:' +fewshot_config: + sampler: first_n + samples: + - input: 'Find a movie similar to Star Wars Episode IV - A New Hope, Indiana Jones + and the Last Crusade, Star Wars Episode V - The Empire Strikes Back, The Big + Lebowski: + + Options: + + (A) Tetsuo + + (B) the Ironman + + (C) The Princess Bride + + (D) The Barkley Marathons The Race That Eats Its Young + + (E) Bug' + target: (C) + - input: 'Find a movie similar to Twister, The Silence of the Lambs, Independence + Day, Braveheart: + + Options: + + (A) They Shoot Horses + + (B) Don''t They + + (C) Forrest Gump + + (D) The Salton Sea + + (E) Extreme Days' + target: (C) + - input: 'Find a movie similar to Minority Report, Total Recall, Inside Out, Forrest + Gump: + + Options: + + (A) Phenomena + + (B) Lilting + + (C) Catwoman + + (D) Edge of Tomorrow' + target: (D) +include: _fewshot_template_yaml +task: bbh_fewshot_movie_recommendation diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/multistep_arithmetic_two.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/multistep_arithmetic_two.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b95964e1ff6f42d52c543b0a2622972584888856 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/multistep_arithmetic_two.yaml @@ -0,0 +1,19 @@ +dataset_name: multistep_arithmetic_two +description: 'Solve multi-step arithmetic problems. 
+ + + ' +doc_to_text: 'Q: {{input}} + + A:' +fewshot_config: + sampler: first_n + samples: + - input: ((-5 + 9 * -4 - 0) * (4 + -7 + 0 * -5)) = + target: '123' + - input: ((-9 * 7 * 7 * -9) + (4 * -9 - 8 - -4)) = + target: '3929' + - input: ((-3 + 5 * 8 * -4) - (9 - 8 * -7 + -9)) = + target: '-219' +include: _fewshot_template_yaml +task: bbh_fewshot_multistep_arithmetic_two diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/navigate.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/navigate.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d3fb25ae184c9751ae78ad9aed3705e47a485bb3 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/navigate.yaml @@ -0,0 +1,43 @@ +dataset_name: navigate +description: 'Given a series of navigation instructions, determine whether one would + end up back at the starting point. + + + ' +doc_to_text: 'Q: {{input}} + + A:' +fewshot_config: + sampler: first_n + samples: + - input: 'If you follow these instructions, do you return to the starting point? + Turn left. Turn around. Turn left. Take 7 steps. Take 2 steps. Take 4 steps. + Take 8 steps. + + Options: + + - Yes + + - No' + target: 'No' + - input: 'If you follow these instructions, do you return to the starting point? + Turn around. Take 1 step. Take 6 steps. Turn around. Take 6 steps. Take 9 steps. + Take 1 step. + + Options: + + - Yes + + - No' + target: 'No' + - input: 'If you follow these instructions, do you return to the starting point? + Always face forward. Take 2 steps right. Take 9 steps left. Take 7 steps right. 
+ + Options: + + - Yes + + - No' + target: 'Yes' +include: _fewshot_template_yaml +task: bbh_fewshot_navigate diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/object_counting.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/object_counting.yaml new file mode 100644 index 0000000000000000000000000000000000000000..acfe04ef80d5fdccb61b68778c9a79da6cadd1dd --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/object_counting.yaml @@ -0,0 +1,23 @@ +dataset_name: object_counting +description: 'Questions that involve enumerating objects and asking the model to count + them. + + + ' +doc_to_text: 'Q: {{input}} + + A:' +fewshot_config: + sampler: first_n + samples: + - input: I have a blackberry, a clarinet, a nectarine, a plum, a strawberry, a banana, + a flute, an orange, and a violin. How many fruits do I have? + target: '6' + - input: I have an orange, a raspberry, two peaches, a blackberry, an apple, a grape, + a nectarine, and three plums. How many fruits do I have? + target: '11' + - input: I have a lettuce head, a head of broccoli, an onion, a stalk of celery, + two carrots, a garlic, and a yam. How many vegetables do I have? + target: '8' +include: _fewshot_template_yaml +task: bbh_fewshot_object_counting diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/penguins_in_a_table.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/penguins_in_a_table.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7599a8f0c771e25456e683aad384de673fb54db0 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/penguins_in_a_table.yaml @@ -0,0 +1,71 @@ +dataset_name: penguins_in_a_table +description: 'Answer questions about a table of penguins and their attributes. 
+ + + ' +doc_to_text: 'Q: {{input}} + + A:' +fewshot_config: + sampler: first_n + samples: + - input: 'Here is a table where the first line is a header and each subsequent line + is a penguin: name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, + 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15 For example: the age of Louis + is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm. We now add + a penguin to the table: + + James, 12, 90, 12 + + How many penguins are less than 8 years old? + + Options: + + (A) 1 + + (B) 2 + + (C) 3 + + (D) 4 + + (E) 5' + target: (B) + - input: 'Here is a table where the first line is a header and each subsequent line + is a penguin: name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, + 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15 For example: the age of Louis + is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm. Which is + the youngest penguin? + + Options: + + (A) Louis + + (B) Bernard + + (C) Vincent + + (D) Gwen + + (E) James' + target: (B) + - input: 'Here is a table where the first line is a header and each subsequent line + is a penguin: name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, + 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15 For example: the age of Louis + is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm. What is + the name of the second penguin sorted by alphabetic order? 
+ + Options: + + (A) Louis + + (B) Bernard + + (C) Vincent + + (D) Gwen + + (E) James' + target: (D) +include: _fewshot_template_yaml +task: bbh_fewshot_penguins_in_a_table diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/reasoning_about_colored_objects.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/reasoning_about_colored_objects.yaml new file mode 100644 index 0000000000000000000000000000000000000000..963a2c708d9e691e27cf6beb73642cb68be08144 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/reasoning_about_colored_objects.yaml @@ -0,0 +1,119 @@ +dataset_name: reasoning_about_colored_objects +description: 'Answer extremely simple questions about the colors of objects on a surface. + + + ' +doc_to_text: 'Q: {{input}} + + A:' +fewshot_config: + sampler: first_n + samples: + - input: 'On the nightstand, there is a red pencil, a purple mug, a burgundy keychain, + a fuchsia teddy bear, a black plate, and a blue stress ball. What color is the + stress ball? + + Options: + + (A) red + + (B) orange + + (C) yellow + + (D) green + + (E) blue + + (F) brown + + (G) magenta + + (H) fuchsia + + (I) mauve + + (J) teal + + (K) turquoise + + (L) burgundy + + (M) silver + + (N) gold + + (O) black + + (P) grey + + (Q) purple + + (R) pink' + target: (E) + - input: 'On the table, you see a bunch of objects arranged in a row: a purple paperclip, + a pink stress ball, a brown keychain, a green scrunchiephone charger, a mauve + fidget spinner, and a burgundy pen. What is the color of the object directly + to the right of the stress ball? 
+ + Options: + + (A) red + + (B) orange + + (C) yellow + + (D) green + + (E) blue + + (F) brown + + (G) magenta + + (H) fuchsia + + (I) mauve + + (J) teal + + (K) turquoise + + (L) burgundy + + (M) silver + + (N) gold + + (O) black + + (P) grey + + (Q) purple + + (R) pink' + target: (F) + - input: 'On the nightstand, you see the following items arranged in a row: a teal + plate, a burgundy keychain, a yellow scrunchiephone charger, an orange mug, + a pink notebook, and a grey cup. How many non-orange items do you see to the + left of the teal item? + + Options: + + (A) zero + + (B) one + + (C) two + + (D) three + + (E) four + + (F) five + + (G) six' + target: (A) +include: _fewshot_template_yaml +task: bbh_fewshot_reasoning_about_colored_objects diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/ruin_names.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/ruin_names.yaml new file mode 100644 index 0000000000000000000000000000000000000000..982e86677930806cff7e17c51838c460a4052b66 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/ruin_names.yaml @@ -0,0 +1,53 @@ +dataset_name: ruin_names +description: 'Select the humorous edit that ''ruins'' the input movie or musical artist + name. + + + ' +doc_to_text: 'Q: {{input}} + + A:' +fewshot_config: + sampler: first_n + samples: + - input: 'Which of the following is a humorous edit of this artist or movie name: + ''whitesnake''? + + Options: + + (A) whitesnape + + (B) whitesnapke + + (C) whitesnuake + + (D) mwhitesnake' + target: (A) + - input: 'Which of the following is a humorous edit of this artist or movie name: + ''one of our dinosaurs is missing''? + + Options: + + (A) ofne of our dinosaurs is missing + + (B) one af our dinosaurs is missing + + (C) one of our dinosaurs is pissing + + (D) one of our dinosaur is missing' + target: (C) + - input: 'Which of the following is a humorous edit of this artist or movie name: + ''counting crows''? 
+ + Options: + + (A) countingy crows + + (B) counting cows + + (C) courting crows + + (D) coutnting crows' + target: (B) +include: _fewshot_template_yaml +task: bbh_fewshot_ruin_names diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/salient_translation_error_detection.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/salient_translation_error_detection.yaml new file mode 100644 index 0000000000000000000000000000000000000000..685f6ae089294d8c20c8296af3e10ebba95522df --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/salient_translation_error_detection.yaml @@ -0,0 +1,77 @@ +dataset_name: salient_translation_error_detection +description: 'Detect the type of error in an English translation of a German source + sentence. + + + ' +doc_to_text: 'Q: {{input}} + + A:' +fewshot_config: + sampler: first_n + samples: + - input: 'The following translations from German to English contain a particular + error. That error will be one of the following types: Named Entities: An entity + (names, places, locations, etc.) is changed to a different entity. Numerical + Values: Numerical values (ordinals or cardinals), dates, and/or units are changed. + Modifiers or Adjectives: The modifiers and adjectives pertaining to a noun are + changed. Negation or Antonyms: Introduce or remove a negation or change comparatives + to their antonyms. Facts: Trivial factual errors not pertaining to the above + classes are introduced in the translations. Dropped Content: A significant clause + in the translation is removed. Please identify that error. Source: In der Liste + der Baudenkmale in Lenzen (Elbe) sind alle Baudenkmale der brandenburgischen + Stadt Lenzen (Elbe) und ihrer Ortsteile aufgelistet. + + Translation: In the list of architectural monuments in Lenzen all architectural + monuments of the Brandenburg city of Lenzen and its districts are listed. 
+ + The translation contains an error pertaining to + + Options: + + (A) Modifiers or Adjectives + + (B) Numerical Values + + (C) Negation or Antonyms + + (D) Named Entities + + (E) Dropped Content + + (F) Facts' + target: (D) + - input: "The following translations from German to English contain a particular\ + \ error. That error will be one of the following types: Named Entities: An entity\ + \ (names, places, locations, etc.) is changed to a different entity. Numerical\ + \ Values: Numerical values (ordinals or cardinals), dates, and/or units are\ + \ changed. Modifiers or Adjectives: The modifiers and adjectives pertaining\ + \ to a noun are changed. Negation or Antonyms: Introduce or remove a negation\ + \ or change comparatives to their antonyms. Facts: Trivial factual errors not\ + \ pertaining to the above classes are introduced in the translations. Dropped\ + \ Content: A significant clause in the translation is removed. Please identify\ + \ that error. Source: Auf dieser Seite sind die Baudenkm\xE4ler der oberbayerischen\ + \ Gro\xDFen Kreisstadt Landsberg am Lech zusammengestellt.\nTranslation: On\ + \ this page are compiled the architectural monuments of the town of Landsberg\ + \ am Lech.\nThe translation contains an error pertaining to\nOptions:\n(A) Modifiers\ + \ or Adjectives\n(B) Numerical Values\n(C) Negation or Antonyms\n(D) Named Entities\n\ + (E) Dropped Content\n(F) Facts" + target: (E) + - input: "The following translations from German to English contain a particular\ + \ error. That error will be one of the following types: Named Entities: An entity\ + \ (names, places, locations, etc.) is changed to a different entity. Numerical\ + \ Values: Numerical values (ordinals or cardinals), dates, and/or units are\ + \ changed. Modifiers or Adjectives: The modifiers and adjectives pertaining\ + \ to a noun are changed. Negation or Antonyms: Introduce or remove a negation\ + \ or change comparatives to their antonyms. 
Facts: Trivial factual errors not\ + \ pertaining to the above classes are introduced in the translations. Dropped\ + \ Content: A significant clause in the translation is removed. Please identify\ + \ that error. Source: \u0141eba ist eine Kleinstadt und ein Badeort im Powiat\ + \ L\u0119borski der polnischen Woiwodschaft Pommern.\nTranslation: Eba is not\ + \ a small town and seaside resort in the Powiat L\xE9borski county of the Pomeranian\ + \ Voivodeship of Poland.\nThe translation contains an error pertaining to\n\ + Options:\n(A) Modifiers or Adjectives\n(B) Numerical Values\n(C) Negation or\ + \ Antonyms\n(D) Named Entities\n(E) Dropped Content\n(F) Facts" + target: (C) +include: _fewshot_template_yaml +task: bbh_fewshot_salient_translation_error_detection diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/snarks.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/snarks.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c73fdbf8442e1d3eaeacc56cbf8cfd25d8b9c36a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/snarks.yaml @@ -0,0 +1,47 @@ +dataset_name: snarks +description: 'Determine which of two sentences is sarcastic. + + + According to Cambridge University Dictionary, sarcasm is "the use of remarks that + clearly mean the opposite of what they say, made in order to hurt someone''s feelings + or to criticize something in a humorous way." Sarcastic sentences often contain + satirical or ironic utterances, hyperboles, ambivalent or witty remarks. + + + ' +doc_to_text: 'Q: {{input}} + + A:' +fewshot_config: + sampler: first_n + samples: + - input: 'Which statement is sarcastic? + + Options: + + (A) Yes, because having interests and actively researching them is a huge waste + + (B) Yes, because having interests and actively researching them is a huge deal' + target: (A) + - input: 'Which statement is sarcastic? 
+ + Options: + + (A) No one is going to disagree with you on this. Avoiding ad hominem attacks + really help your case + + (B) No one is going to disagree with you on this. Ad hominem attacks really + help your case' + target: (B) + - input: 'Which statement is sarcastic? + + Options: + + (A) Consistency in the league''s punishments? What do you think this is supposed + to be, politics? + + (B) Consistency in the league''s punishments? What do you think this is supposed + to be, moral?' + target: (A) +include: _fewshot_template_yaml +task: bbh_fewshot_snarks diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/sports_understanding.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/sports_understanding.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6cee72f18e1a4785a655eedb59f9b51fa1b43283 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/sports_understanding.yaml @@ -0,0 +1,22 @@ +dataset_name: sports_understanding +description: 'Determine whether an artificially constructed sentence relating to sports + is plausible or not. + + + ' +doc_to_text: 'Q: {{input}} + + A:' +fewshot_config: + sampler: first_n + samples: + - input: Is the following sentence plausible? "Bam Adebayo scored a reverse layup + in the Western Conference Finals." + target: 'yes' + - input: Is the following sentence plausible? "Santi Cazorla scored a touchdown." + target: 'no' + - input: Is the following sentence plausible? "DeMar DeRozan was called for the + goal tend." 
+ target: 'yes' +include: _fewshot_template_yaml +task: bbh_fewshot_sports_understanding diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/temporal_sequences.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/temporal_sequences.yaml new file mode 100644 index 0000000000000000000000000000000000000000..51af58f5994b7b8827e12d63c8ad9f3314ea7849 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/temporal_sequences.yaml @@ -0,0 +1,104 @@ +dataset_name: temporal_sequences +description: 'Task description: Answer questions about which times certain events + could have occurred. + + + ' +doc_to_text: 'Q: {{input}} + + A:' +fewshot_config: + sampler: first_n + samples: + - input: 'Today, Emily went to the museum. Between what times could they have gone? + + We know that: + + Emily woke up at 1pm. + + Elizabeth saw Emily reading at the library from 2pm to 4pm. + + Jessica saw Emily watching a movie at the theater from 4pm to 5pm. + + Leslie saw Emily waiting at the airport from 5pm to 6pm. + + William saw Emily buying clothes at the mall from 6pm to 7pm. + + The museum was closed after 7pm. + + Between what times could Emily have gone to the museum? + + Options: + + (A) 1pm to 2pm + + (B) 6pm to 7pm + + (C) 5pm to 6pm + + (D) 2pm to 4pm' + target: (A) + - input: 'Today, Elizabeth went to the amusement park. Between what times could + they have gone? + + We know that: + + Elizabeth woke up at 7am. + + David saw Elizabeth fixing their computer at the electronic store from 1pm to + 2pm. + + Sarah saw Elizabeth playing tennis at the tennis court from 2pm to 3pm. + + Susan saw Elizabeth walking towards the Statue of Liberty from 3pm to 6pm. + + Andrew saw Elizabeth taking photos near the Eiffel Tower from 6pm to 9pm. + + Emily saw Elizabeth getting a coffee at the cafe from 9pm to 10pm. + + The amusement park was closed after 10pm. + + Between what times could Elizabeth have gone to the amusement park? 
+ + Options: + + (A) 7am to 1pm + + (B) 9pm to 10pm + + (C) 1pm to 2pm + + (D) 3pm to 6pm' + target: (A) + - input: 'Today, Tiffany went to the beach. Between what times could they have gone? + + We know that: + + Tiffany woke up at 5am. + + Betty saw Tiffany getting a coffee at the cafe from 5am to 6am. + + Jessica saw Tiffany working at the office from 6am to 9am. + + John saw Tiffany stretching at a yoga studio from 9am to 12pm. + + Sean saw Tiffany sitting on a rooftop from 12pm to 2pm. + + Sarah saw Tiffany playing tennis at the tennis court from 2pm to 3pm. + + The beach was closed after 4pm. + + Between what times could Tiffany have gone to the beach? + + Options: + + (A) 9am to 12pm + + (B) 12pm to 2pm + + (C) 5am to 6am + + (D) 3pm to 4pm' + target: (D) +include: _fewshot_template_yaml +task: bbh_fewshot_temporal_sequences diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/tracking_shuffled_objects_five_objects.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/tracking_shuffled_objects_five_objects.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e97ba0e45df33d3676a6f31b5e5f64597b5da4ee --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/tracking_shuffled_objects_five_objects.yaml @@ -0,0 +1,62 @@ +dataset_name: tracking_shuffled_objects_five_objects +description: 'A task requiring determining the final positions of a set of objects + given their initial positions and a description of a sequence of swaps. + + + ' +doc_to_text: 'Q: {{input}} + + A:' +fewshot_config: + sampler: first_n + samples: + - input: 'Alice, Bob, and Claire are playing a game. At the start of the game, they + are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire + has a pink ball. + + As the game progresses, pairs of players trade balls. First, Claire and Alice + swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. 
+ At the end of the game, Bob has the + + Options: + + (A) yellow ball + + (B) blue ball + + (C) pink ball' + target: (A) + - input: 'Alice, Bob, and Claire are playing a game. At the start of the game, they + are each holding a ball: Alice has a white ball, Bob has a purple ball, and + Claire has a pink ball. + + As the game progresses, pairs of players trade balls. First, Bob and Alice swap + balls. Then, Bob and Claire swap balls. Finally, Bob and Alice swap balls. At + the end of the game, Alice has the + + Options: + + (A) white ball + + (B) purple ball + + (C) pink ball' + target: (C) + - input: 'Alice, Bob, and Claire are dancers at a square dance. At the start of + a song, they each have a partner: Alice is dancing with Lola, Bob is dancing + with Rodrigo, and Claire is dancing with Patrick. + + Throughout the song, the dancers often trade partners. First, Alice and Bob + switch partners. Then, Claire and Bob switch partners. Finally, Bob and Alice + switch partners. At the end of the dance, Alice is dancing with + + Options: + + (A) Lola + + (B) Rodrigo + + (C) Patrick' + target: (C) +include: _fewshot_template_yaml +task: bbh_fewshot_tracking_shuffled_objects_five_objects diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/tracking_shuffled_objects_seven_objects.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/tracking_shuffled_objects_seven_objects.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1ddfc6d2a3a01ee6a234f0d8c5c215344d0a4915 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/tracking_shuffled_objects_seven_objects.yaml @@ -0,0 +1,62 @@ +dataset_name: tracking_shuffled_objects_seven_objects +description: 'A task requiring determining the final positions of a set of objects + given their initial positions and a description of a sequence of swaps. 
+ + + ' +doc_to_text: 'Q: {{input}} + + A:' +fewshot_config: + sampler: first_n + samples: + - input: 'Alice, Bob, and Claire are playing a game. At the start of the game, they + are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire + has a pink ball. + + As the game progresses, pairs of players trade balls. First, Claire and Alice + swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. + At the end of the game, Bob has the + + Options: + + (A) yellow ball + + (B) blue ball + + (C) pink ball' + target: (A) + - input: 'Alice, Bob, and Claire are playing a game. At the start of the game, they + are each holding a ball: Alice has a white ball, Bob has a purple ball, and + Claire has a pink ball. + + As the game progresses, pairs of players trade balls. First, Bob and Alice swap + balls. Then, Bob and Claire swap balls. Finally, Bob and Alice swap balls. At + the end of the game, Alice has the + + Options: + + (A) white ball + + (B) purple ball + + (C) pink ball' + target: (C) + - input: 'Alice, Bob, and Claire are dancers at a square dance. At the start of + a song, they each have a partner: Alice is dancing with Lola, Bob is dancing + with Rodrigo, and Claire is dancing with Patrick. + + Throughout the song, the dancers often trade partners. First, Alice and Bob + switch partners. Then, Claire and Bob switch partners. Finally, Bob and Alice + switch partners. 
At the end of the dance, Alice is dancing with + + Options: + + (A) Lola + + (B) Rodrigo + + (C) Patrick' + target: (C) +include: _fewshot_template_yaml +task: bbh_fewshot_tracking_shuffled_objects_seven_objects diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/tracking_shuffled_objects_three_objects.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/tracking_shuffled_objects_three_objects.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1ac29184a33c1f804e1aaef2dc32ecf04adb8d98 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/tracking_shuffled_objects_three_objects.yaml @@ -0,0 +1,62 @@ +dataset_name: tracking_shuffled_objects_three_objects +description: 'A task requiring determining the final positions of a set of objects + given their initial positions and a description of a sequence of swaps. + + + ' +doc_to_text: 'Q: {{input}} + + A:' +fewshot_config: + sampler: first_n + samples: + - input: 'Alice, Bob, and Claire are playing a game. At the start of the game, they + are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire + has a pink ball. + + As the game progresses, pairs of players trade balls. First, Claire and Alice + swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. + At the end of the game, Bob has the + + Options: + + (A) yellow ball + + (B) blue ball + + (C) pink ball' + target: (A) + - input: 'Alice, Bob, and Claire are playing a game. At the start of the game, they + are each holding a ball: Alice has a white ball, Bob has a purple ball, and + Claire has a pink ball. + + As the game progresses, pairs of players trade balls. First, Bob and Alice swap + balls. Then, Bob and Claire swap balls. Finally, Bob and Alice swap balls. 
At + the end of the game, Alice has the + + Options: + + (A) white ball + + (B) purple ball + + (C) pink ball' + target: (C) + - input: 'Alice, Bob, and Claire are dancers at a square dance. At the start of + a song, they each have a partner: Alice is dancing with Lola, Bob is dancing + with Rodrigo, and Claire is dancing with Patrick. + + Throughout the song, the dancers often trade partners. First, Alice and Bob + switch partners. Then, Claire and Bob switch partners. Finally, Bob and Alice + switch partners. At the end of the dance, Alice is dancing with + + Options: + + (A) Lola + + (B) Rodrigo + + (C) Patrick' + target: (C) +include: _fewshot_template_yaml +task: bbh_fewshot_tracking_shuffled_objects_three_objects diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/web_of_lies.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/web_of_lies.yaml new file mode 100644 index 0000000000000000000000000000000000000000..791310326f025c73aef4bef9cfe070b19d131033 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/web_of_lies.yaml @@ -0,0 +1,25 @@ +dataset_name: web_of_lies +description: 'Evaluate a random boolean function expressed as a word problem. + + + ' +doc_to_text: 'Q: {{input}} + + A:' +fewshot_config: + sampler: first_n + samples: + - input: 'Question: Fidel tells the truth. Jerry says Fidel tells the truth. Vina + says Jerry tells the truth. Millicent says Vina lies. Raymond says Millicent + lies. Does Raymond tell the truth?' + target: 'Yes' + - input: 'Question: Kristian lies. Millie says Kristian lies. Maybelle says Millie + tells the truth. Fidel says Maybelle lies. Leda says Fidel lies. Does Leda tell + the truth?' + target: 'Yes' + - input: 'Question: Kristian tells the truth. Michaela says Kristian lies. Raymond + says Michaela tells the truth. Osvaldo says Raymond tells the truth. Jamey says + Osvaldo tells the truth. Does Jamey tell the truth?' 
+ target: 'No' +include: _fewshot_template_yaml +task: bbh_fewshot_web_of_lies diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/word_sorting.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/word_sorting.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a4de6439cbaa4b5c11d98d42cb5a0018d2be5489 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/bbh/fewshot/word_sorting.yaml @@ -0,0 +1,21 @@ +dataset_name: word_sorting +description: 'Sort a list of words. + + + ' +doc_to_text: 'Q: {{input}} + + A:' +fewshot_config: + sampler: first_n + samples: + - input: 'Sort the following words alphabetically: List: oven costume counterpart' + target: costume counterpart oven + - input: 'Sort the following words alphabetically: List: hypochlorite ponderosa + phone credulity' + target: credulity hypochlorite phone ponderosa + - input: 'Sort the following words alphabetically: List: newt arson parthia seismography + mugho aspect census' + target: arson aspect census mugho newt parthia seismography +include: _fewshot_template_yaml +task: bbh_fewshot_word_sorting diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kobest/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kobest/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5a160da77140f37244dde849f42ab5b3f223a0a4 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kobest/README.md @@ -0,0 +1,37 @@ +# KoBEST + +### Paper +Title: `KOBEST: Korean Balanced Evaluation of Significant Tasks` + +Abstract: https://arxiv.org/abs/2204.04541 + +A well-formulated benchmark plays a critical role in spurring advancements in the natural language processing (NLP) field, as it allows objective and precise evaluation of diverse models. 
As modern language models (LMs) have become more elaborate and sophisticated, more difficult benchmarks that require linguistic knowledge and reasoning have been proposed. However, most of these benchmarks only support English, and great effort is necessary to construct benchmarks for other low resource languages. To this end, we propose a new benchmark named Korean balanced evaluation of significant tasks (KoBEST), which consists of five Korean-language downstream tasks. Professional Korean linguists designed the tasks that require advanced Korean linguistic knowledge. Moreover, our data is purely annotated by humans and thoroughly reviewed to guarantee high data quality. We also provide baseline models and human performance results. Our dataset is available on the Huggingface. + + +Homepage: https://huggingface.co/datasets/skt/kobest_v1 + +### Groups and Tasks + +#### Groups + +- `kobest` + +#### Tasks + +- `kobest_boolq` +- `kobest_copa` +- `kobest_hellaswag` +- `kobest_sentineg` +- `kobest_wic` + + +### Citation + +@misc{ + author={Dohyeong Kim, Myeongjun Jang, Deuk Sin Kwon, Eric Davis}, + title={KOBEST: Korean Balanced Evaluation of Significant Tasks}, + DOI={https://doi.org/10.48550/arXiv.2204.04541}, + publisher={arXiv}, + year={2022}, + month={Apr} +} diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kobest/kobest_boolq.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kobest/kobest_boolq.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cb35d063b7b607036671f3272e196b4ec907baf2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kobest/kobest_boolq.yaml @@ -0,0 +1,23 @@ +tag: + - kobest +task: kobest_boolq +dataset_path: skt/kobest_v1 +dataset_name: boolq +output_type: multiple_choice +training_split: train +validation_split: validation +test_split: test +doc_to_text: "{{paragraph}} 질문: {{question}} 답변: " +doc_to_target: "{{label}}" +doc_to_choice: ["아니오", "예"] +metric_list: + - metric: acc + 
aggregation: mean + higher_is_better: True + - metric: f1 + aggregation: !function utils.macro_f1_score + average: macro + hf_evaluate: true + higher_is_better: True +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kobest/kobest_copa.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kobest/kobest_copa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2a6a931a8089d0ac3beaf732311db74b7d1fb9b1 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kobest/kobest_copa.yaml @@ -0,0 +1,23 @@ +tag: + - kobest +task: kobest_copa +dataset_path: skt/kobest_v1 +dataset_name: copa +output_type: multiple_choice +training_split: train +validation_split: validation +test_split: test +doc_to_text: !function utils.copa_doc_to_text +doc_to_target: !function utils.copa_doc_to_target +doc_to_choice: !function utils.copa_doc_to_choice +metric_list: + - metric: acc + aggregation: mean + higher_is_better: True + - metric: f1 + aggregation: !function utils.macro_f1_score + average: macro + hf_evaluate: true + higher_is_better: True +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kobest/kobest_hellaswag.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kobest/kobest_hellaswag.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e463b81737a92d4c0d875123c824836ac2cde837 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kobest/kobest_hellaswag.yaml @@ -0,0 +1,27 @@ +tag: + - kobest +task: kobest_hellaswag +dataset_path: skt/kobest_v1 +dataset_name: hellaswag +training_split: train +validation_split: validation +output_type: multiple_choice +test_split: test +doc_to_text: "{{query}}" +doc_to_target: "{{label}}" +process_docs: !function utils.hellaswag_process_doc +doc_to_choice: "choices" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: True + - metric: acc_norm + aggregation: mean + higher_is_better: True + 
- metric: f1 + aggregation: !function utils.macro_f1_score + average: macro + hf_evaluate: true + higher_is_better: True +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kobest/kobest_sentineg.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kobest/kobest_sentineg.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e5eed0d41780dc1bccd8adbc91441e423c2f5d69 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kobest/kobest_sentineg.yaml @@ -0,0 +1,25 @@ +tag: + - kobest +task: kobest_sentineg +dataset_path: skt/kobest_v1 +dataset_name: sentineg +output_type: multiple_choice +training_split: train +validation_split: validation +test_split: test +doc_to_text: !function utils.sentineg_doc_to_text +doc_to_target: "{{label}}" +doc_to_choice: ["부정", "긍정"] +metric_list: + - metric: acc + aggregation: mean + higher_is_better: True + - metric: f1 + aggregation: !function utils.macro_f1_score + average: macro + hf_evaluate: true + higher_is_better: True +metadata: + version: 1.0 +dataset_kwargs: + trust_remote_code: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kobest/kobest_wic.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kobest/kobest_wic.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7d2ea140a8af0e7d207f8dfb0f7209f081a6d6b5 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kobest/kobest_wic.yaml @@ -0,0 +1,25 @@ +tag: + - kobest +task: kobest_wic +dataset_path: skt/kobest_v1 +dataset_name: wic +output_type: multiple_choice +training_split: train +validation_split: validation +test_split: test +doc_to_text: !function utils.wic_doc_to_text +doc_to_target: "{{label}}" +doc_to_choice: ['아니오', '예'] +metric_list: + - metric: acc + aggregation: mean + higher_is_better: True + - metric: f1 + aggregation: !function utils.macro_f1_score + average: macro + hf_evaluate: true + higher_is_better: True +metadata: + version: 1.0 
+dataset_kwargs: + trust_remote_code: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kobest/utils.py b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kobest/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..fb1c5ef9495d141f6a712ff6a163041f55118738 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kobest/utils.py @@ -0,0 +1,49 @@ +from datasets import Dataset + + +def copa_doc_to_text(doc: dict) -> str: + connector = {"원인": " 왜냐하면", "결과": " 그래서"}[doc["question"].strip()] + return f"""{doc["premise"]} {connector}""" + + +def copa_doc_to_target(doc: dict) -> str: + correct_choice = doc["alternative_1"] if doc["label"] == 0 else doc["alternative_2"] + return f"""{correct_choice}""" + + +def copa_doc_to_choice(doc: dict) -> list: + return [f"""{doc["alternative_1"]}""", f"""{doc["alternative_2"]}"""] + + +def sentineg_doc_to_text(doc: dict): + return f"""문장: {doc["sentence"]} 긍부정:""" + + +def wic_doc_to_text(doc: dict) -> str: + return f"""문장1: {doc["context_1"]} 문장2: {doc["context_2"]} 두 문장에서 {doc["word"]}가 같은 뜻으로 쓰였나?""" + + +def hellaswag_process_doc(doc: Dataset) -> Dataset: + def preprocessor(dataset): + return { + "query": f"""문장: {dataset["context"]}""", + "choices": [ + dataset["ending_1"], + dataset["ending_2"], + dataset["ending_3"], + dataset["ending_4"], + ], + "gold": int(dataset["label"]), + } + + return doc.map(preprocessor) + + +def macro_f1_score(items): + from sklearn.metrics import f1_score + + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="macro") + return fscore diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/lambada/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/lambada/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ac2b92b553c35a5dc070017b6bebb643e314d64e --- /dev/null +++ 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/lambada/README.md @@ -0,0 +1,39 @@ +# LAMBADA + +### Paper +Title: `The LAMBADA dataset: Word prediction requiring a broad discourse context` + +Abstract: https://arxiv.org/pdf/1606.06031.pdf + +LAMBADA is a dataset to evaluate the capabilities of computational models for text +understanding by means of a word prediction task. LAMBADA is a collection of narrative +passages sharing the characteristic that human subjects are able to guess their last +word if they are exposed to the whole passage, but not if they only see the last +sentence preceding the target word. To succeed on LAMBADA, computational models +cannot simply rely on local context, but must be able to keep track of information +in the broader discourse. + +Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI + +### Groups and Tasks + +#### Groups + +- `lambada` + +#### Tasks + +- `lambada_openai` +- `lambada_standard` + + +### Citation + +@misc{ + author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel}, + title={The LAMBADA dataset}, + DOI={10.5281/zenodo.2630551}, + publisher={Zenodo}, + year={2016}, + month={Aug} +} diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/lambada/lambada_openai.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/lambada/lambada_openai.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8808b5fdf278fe6ad541129455c6c8d4cd1b52bc --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/lambada/lambada_openai.yaml @@ -0,0 +1,22 @@ +tag: + - lambada +task: lambada_openai +dataset_path: EleutherAI/lambada_openai +dataset_name: default +output_type: loglikelihood +test_split: test +doc_to_text: "{{text.split(' ')[:-1]|join(' ')}}" +doc_to_target: "{{' '+text.split(' ')[-1]}}" +should_decontaminate: true +doc_to_decontamination_query: 
"{{text}}" +metric_list: + - metric: perplexity + aggregation: perplexity + higher_is_better: false + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 +dataset_kwargs: + trust_remote_code: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/lambada/lambada_standard.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/lambada/lambada_standard.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f76c8dfe3e234abee806219011f9bc5ac30c893f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/lambada/lambada_standard.yaml @@ -0,0 +1,21 @@ +tag: + - lambada +task: lambada_standard +dataset_path: lambada +dataset_name: null +output_type: loglikelihood +validation_split: validation +test_split: test +doc_to_text: "{{text.split(' ')[:-1]|join(' ')}}" +doc_to_target: "{{' '+text.split(' ')[-1]}}" +should_decontaminate: true +doc_to_decontamination_query: "{{text}}" +metric_list: + - metric: perplexity + aggregation: perplexity + higher_is_better: false + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/tinyBenchmarks/tinyArc.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/tinyBenchmarks/tinyArc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..03b6dd9443fc886e3c01d40a53c8220e3e0f978a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/tinyBenchmarks/tinyArc.yaml @@ -0,0 +1,19 @@ +task: tinyArc +dataset_path: tinyBenchmarks/tinyAI2_arc +dataset_name: ARC-Challenge +output_type: multiple_choice +training_split: train +validation_split: validation +test_split: test +num_fewshot: 25 +doc_to_text: "Question: {{question}}\nAnswer:" +doc_to_target: "{{choices.label.index(answerKey)}}" +doc_to_choice: "{{choices.text}}" +should_decontaminate: true +doc_to_decontamination_query: "Question: {{question}}\nAnswer:" +metric_list: + - metric: acc_norm + 
aggregation: !function agg_functions.agg_gpirt_arc + higher_is_better: true +metadata: + version: 0.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/tinyBenchmarks/tinyBenchmarks.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/tinyBenchmarks/tinyBenchmarks.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0f1bf8b62879734326b9088269fd4321e74160a5 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/tinyBenchmarks/tinyBenchmarks.yaml @@ -0,0 +1,16 @@ +group: tinyBenchmarks +task: + - task: tinyArc + num_fewshot: 25 + - task: tinyGSM8k + num_fewshot: 5 + - task: tinyMMLU + num_fewshot: 0 + - task: tinyWinogrande + num_fewshot: 5 + - task: tinyHellaswag + num_fewshot: 10 + - task: tinyTruthfulQA + num_fewshot: 0 +metadata: + version: 0.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/tinyBenchmarks/utils_hellaswag.py b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/tinyBenchmarks/utils_hellaswag.py new file mode 100644 index 0000000000000000000000000000000000000000..c083ba998d161caee3cee33e0edc6c46375991b5 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/tinyBenchmarks/utils_hellaswag.py @@ -0,0 +1,28 @@ +import re + +import datasets + + +""" This code mirrors the utils of the original hellaswag task """ + + +def preprocess(text): + text = text.strip() + # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag. + text = text.replace(" [title]", ". 
") + text = re.sub("\\[.*?\\]", "", text) + text = text.replace(" ", " ") + return text + + +def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: + def _process_doc(doc): + ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize() + out_doc = { + "query": preprocess(doc["activity_label"] + ": " + ctx), + "choices": [preprocess(ending) for ending in doc["endings"]], + "gold": int(doc["label"]), + } + return out_doc + + return dataset.map(_process_doc) diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/tinyBenchmarks/utils_truthfulqa.py b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/tinyBenchmarks/utils_truthfulqa.py new file mode 100644 index 0000000000000000000000000000000000000000..e404d096a2adec920bcf5d0f654a9f8789585a3f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/tinyBenchmarks/utils_truthfulqa.py @@ -0,0 +1,176 @@ +import datasets +import numpy as np +import sacrebleu +from rouge_score import rouge_scorer, scoring + + +""" This code mirrors the utils of the original truthful_qa task """ + +ROUGE_SCORER = None + + +def process_results_mc2(doc, results): + lls, is_greedy = zip(*results) + + # Split on the first `0` as everything before it is true (`1`). + split_idx = list(doc["mc2_targets"]["labels"]).index(0) + # Compute the normalized probability mass for the correct answer. + ll_true, ll_false = lls[:split_idx], lls[split_idx:] + p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false)) + p_true = p_true / (sum(p_true) + sum(p_false)) + + return {"acc": sum(p_true)} + + +def process_docs_gen(dataset: datasets.Dataset) -> datasets.Dataset: + return dataset.map(preprocess_function) + + +def preprocess_function(examples): + def _format_answers(answers): + formatted_answers = [] + for answer in answers: + answer = answer.strip() + if len(answer): + # Add a period after all answers. 
+ if answer[-1] != ".": + formatted_answers.append(answer + ".") + else: + formatted_answers.append(answer) + return formatted_answers + + incorrect_answers = _format_answers(examples["incorrect_answers"]) + correct_answers = _format_answers(examples["correct_answers"]) + if "I have no comment." not in correct_answers: + correct_answers.append("I have no comment.") + return { + "question": examples["question"].strip(), + "correct_answers": correct_answers, + "incorrect_answers": incorrect_answers, + } + + +def process_results_gen(doc, results): + completion = results[0] + true_refs, false_refs = doc["correct_answers"], doc["incorrect_answers"] + all_refs = true_refs + false_refs + + # Process the sentence-level BLEURT, BLEU, and ROUGE for similarity measures. + + # # BLEURT + # bleurt_scores_true = self.bleurt.compute( + # predictions=[completion] * len(true_refs), references=true_refs + # )["scores"] + # bleurt_scores_false = self.bleurt.compute( + # predictions=[completion] * len(false_refs), references=false_refs + # )["scores"] + # bleurt_correct = max(bleurt_scores_true) + # bleurt_incorrect = max(bleurt_scores_false) + # bleurt_max = bleurt_correct + # bleurt_diff = bleurt_correct - bleurt_incorrect + # bleurt_acc = int(bleurt_correct > bleurt_incorrect) + + # BLEU + bleu_scores = [bleu([[ref]], [completion]) for ref in all_refs] + bleu_correct = np.nanmax(bleu_scores[: len(true_refs)]) + bleu_incorrect = np.nanmax(bleu_scores[len(true_refs) :]) + bleu_max = bleu_correct + bleu_diff = bleu_correct - bleu_incorrect + bleu_acc = int(bleu_correct > bleu_incorrect) + + # ROUGE-N + rouge_scores = [rouge([ref], [completion]) for ref in all_refs] + # ROUGE-1 + rouge1_scores = [score["rouge1"] for score in rouge_scores] + rouge1_correct = np.nanmax(rouge1_scores[: len(true_refs)]) + rouge1_incorrect = np.nanmax(rouge1_scores[len(true_refs) :]) + rouge1_max = rouge1_correct + rouge1_diff = rouge1_correct - rouge1_incorrect + rouge1_acc = int(rouge1_correct > 
rouge1_incorrect) + # ROUGE-2 + rouge2_scores = [score["rouge2"] for score in rouge_scores] + rouge2_correct = np.nanmax(rouge2_scores[: len(true_refs)]) + rouge2_incorrect = np.nanmax(rouge2_scores[len(true_refs) :]) + rouge2_max = rouge2_correct + rouge2_diff = rouge2_correct - rouge2_incorrect + rouge2_acc = int(rouge2_correct > rouge2_incorrect) + # ROUGE-L + rougeL_scores = [score["rougeLsum"] for score in rouge_scores] + rougeL_correct = np.nanmax(rougeL_scores[: len(true_refs)]) + rougeL_incorrect = np.nanmax(rougeL_scores[len(true_refs) :]) + rougeL_max = rougeL_correct + rougeL_diff = rougeL_correct - rougeL_incorrect + rougeL_acc = int(rougeL_correct > rougeL_incorrect) + + return { + # "bleurt_max": bleurt_max, + # "bleurt_acc": bleurt_acc, + # "bleurt_diff": bleurt_diff, + "bleu_max": bleu_max, + "bleu_acc": bleu_acc, + "bleu_diff": bleu_diff, + "rouge1_max": rouge1_max, + "rouge1_acc": rouge1_acc, + "rouge1_diff": rouge1_diff, + "rouge2_max": rouge2_max, + "rouge2_acc": rouge2_acc, + "rouge2_diff": rouge2_diff, + "rougeL_max": rougeL_max, + "rougeL_acc": rougeL_acc, + "rougeL_diff": rougeL_diff, + } + + +def bleu(refs, preds): + """ + Returns `t5` style BLEU scores. See the related implementation: + https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41 + + :param refs: + A `list` of `list` of reference `str`s. + :param preds: + A `list` of predicted `str`s. + """ + score = sacrebleu.corpus_bleu( + preds, + refs, + smooth_method="exp", + smooth_value=0.0, + force=False, + lowercase=False, + tokenize="intl", + use_effective_order=False, + ).score + return score + + +def rouge(refs, preds): + """ + Returns `t5` style ROUGE scores. See the related implementation: + https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68 + + :param refs: + A `list` of reference `strs`. 
+ :param preds: + A `list` of predicted `strs`. + """ + rouge_types = ["rouge1", "rouge2", "rougeLsum"] + global ROUGE_SCORER + if ROUGE_SCORER is None: + # init RougeScorer once (https://github.com/EleutherAI/lm-evaluation-harness/issues/1692)--rouge_types are constant + ROUGE_SCORER = rouge_scorer.RougeScorer(rouge_types) + scorer = ROUGE_SCORER + # Add newlines between sentences to correctly compute `rougeLsum`. + + def _prepare_summary(summary): + summary = summary.replace(" . ", ".\n") + return summary + + # Accumulate confidence intervals. + aggregator = scoring.BootstrapAggregator() + for ref, pred in zip(refs, preds): + ref = _prepare_summary(ref) + pred = _prepare_summary(pred) + aggregator.add_scores(scorer.score(ref, pred)) + result = aggregator.aggregate() + return {type: result[type].mid.fmeasure * 100 for type in rouge_types} diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3acbde5fc2c11eaaba4eeaaa3858b88d72c645bf --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/README.md @@ -0,0 +1,84 @@ +# XStoryCloze + +### Paper + +Title: `Few-shot Learning with Multilingual Language Models` + +Abstract: https://arxiv.org/abs/2112.10668 + +XStoryCloze consists of the professionally translated version of the [English StoryCloze dataset](https://cs.rochester.edu/nlp/rocstories/) (Spring 2016 version) to 10 non-English languages. This dataset is released by Meta AI. 
+ +Homepage: https://github.com/facebookresearch/fairseq/pull/4820 + + +### Citation + +``` +@article{DBLP:journals/corr/abs-2112-10668, + author = {Xi Victoria Lin and + Todor Mihaylov and + Mikel Artetxe and + Tianlu Wang and + Shuohui Chen and + Daniel Simig and + Myle Ott and + Naman Goyal and + Shruti Bhosale and + Jingfei Du and + Ramakanth Pasunuru and + Sam Shleifer and + Punit Singh Koura and + Vishrav Chaudhary and + Brian O'Horo and + Jeff Wang and + Luke Zettlemoyer and + Zornitsa Kozareva and + Mona T. Diab and + Veselin Stoyanov and + Xian Li}, + title = {Few-shot Learning with Multilingual Language Models}, + journal = {CoRR}, + volume = {abs/2112.10668}, + year = {2021}, + url = {https://arxiv.org/abs/2112.10668}, + eprinttype = {arXiv}, + eprint = {2112.10668}, + timestamp = {Tue, 04 Jan 2022 15:59:27 +0100}, + biburl = {https://dblp.org/rec/journals/corr/abs-2112-10668.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +``` + +### Groups and Tasks + +#### Groups + +* `xstorycloze` + +#### Tasks + +* `xstorycloze_ar`: Arabic +* `xstorycloze_en`: English +* `xstorycloze_es`: Spanish +* `xstorycloze_eu`: Basque +* `xstorycloze_hi`: Hindi +* `xstorycloze_id`: Indonesian +* `xstorycloze_my`: Burmese +* `xstorycloze_ru`: Russian +* `xstorycloze_sw`: Swahili +* `xstorycloze_te`: Telugu +* `xstorycloze_zh`: Chinese + + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [ ] Is the task an existing benchmark in the literature? + * [ ] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? 
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/_xstorycloze.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/_xstorycloze.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0b9b41be5db54451d69bde525e53b495271a49f6 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/_xstorycloze.yaml @@ -0,0 +1,19 @@ +group: xstorycloze +task: + - xstorycloze_ar + - xstorycloze_en + - xstorycloze_es + - xstorycloze_eu + - xstorycloze_hi + - xstorycloze_id + - xstorycloze_my + - xstorycloze_ru + - xstorycloze_sw + - xstorycloze_te + - xstorycloze_zh +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_ar.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_ar.yaml new file mode 100644 index 0000000000000000000000000000000000000000..836a4cb05484e21179331e98daf9893abe5d6b3a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_ar.yaml @@ -0,0 +1,17 @@ +task: xstorycloze_ar +dataset_path: juletxara/xstory_cloze +dataset_name: ar +output_type: multiple_choice +training_split: train +validation_split: eval +doc_to_text: "{{[input_sentence_1, input_sentence_2, input_sentence_3, input_sentence_4]|join(' ')}}" +doc_to_target: "{{answer_right_ending-1}}" +doc_to_choice: "{{[sentence_quiz1, sentence_quiz2]}}" +should_decontaminate: true +doc_to_decontamination_query: "{{[input_sentence_1, input_sentence_2, input_sentence_3, input_sentence_4]|join(' ')}}" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_en.yaml 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b3127cdfa5dfd4249566b12dc9b1451018a88581 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_en.yaml @@ -0,0 +1,3 @@ +include: default_ar.yaml +task: xstorycloze_en +dataset_name: en diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_es.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_es.yaml new file mode 100644 index 0000000000000000000000000000000000000000..60af1f8c0a7b8b0917060d592c663fe6212e0210 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_es.yaml @@ -0,0 +1,3 @@ +include: default_ar.yaml +task: xstorycloze_es +dataset_name: es diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_eu.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_eu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..849caccf2425ec1483baddb83d8c98b8d1eb79e3 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_eu.yaml @@ -0,0 +1,3 @@ +include: default_ar.yaml +task: xstorycloze_eu +dataset_name: eu diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_hi.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_hi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8c00c75f0e3cba53c17174723d714fde8dc8c351 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_hi.yaml @@ -0,0 +1,3 @@ +include: default_ar.yaml +task: xstorycloze_hi +dataset_name: hi diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_id.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_id.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..c044d7532c4539e287aaa429d4042feff7c6d733 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_id.yaml @@ -0,0 +1,3 @@ +include: default_ar.yaml +task: xstorycloze_id +dataset_name: id diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_my.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_my.yaml new file mode 100644 index 0000000000000000000000000000000000000000..47c3ae187209901b1fd711e680c1c3d46fdff48e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_my.yaml @@ -0,0 +1,3 @@ +include: default_ar.yaml +task: xstorycloze_my +dataset_name: my diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_ru.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_ru.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8617ab08db68e066c4165b0480801b2e5e16d9a2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_ru.yaml @@ -0,0 +1,3 @@ +include: default_ar.yaml +task: xstorycloze_ru +dataset_name: ru diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_sw.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_sw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..22b7f3b461fb628102face370fb8b48d7d442241 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_sw.yaml @@ -0,0 +1,3 @@ +include: default_ar.yaml +task: xstorycloze_sw +dataset_name: sw diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_te.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_te.yaml new file mode 100644 index 0000000000000000000000000000000000000000..946861d4f090d25d0b221c1c8eeca4e59249a380 --- /dev/null +++ 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_te.yaml @@ -0,0 +1,3 @@ +include: default_ar.yaml +task: xstorycloze_te +dataset_name: te diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_zh.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_zh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a55989fe2f64e6cb0dcf5136c35a1d5bf1ee4ae6 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_zh.yaml @@ -0,0 +1,3 @@ +include: default_ar.yaml +task: xstorycloze_zh +dataset_name: zh