Add files using upload-large-folder tool
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/README.md +94 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/direct_yaml +35 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_bn.yaml +12 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_de.yaml +12 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_en.yaml +12 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_es.yaml +12 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_fr.yaml +12 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_ja.yaml +12 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_ru.yaml +12 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_sw.yaml +12 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_te.yaml +12 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_th.yaml +12 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_zh.yaml +12 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/cot_yaml +36 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_bn.yaml +12 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_de.yaml +12 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_en.yaml +12 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_es.yaml +12 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_fr.yaml +12 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_ja.yaml +12 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_ru.yaml +12 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_sw.yaml +12 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_te.yaml +12 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_th.yaml +12 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_zh.yaml +12 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/gen_yaml.sh +5 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/cot_yaml +31 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_bn.yaml +24 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_de.yaml +24 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_en.yaml +24 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_es.yaml +24 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_fr.yaml +24 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_ja.yaml +24 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_ru.yaml +24 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_sw.yaml +24 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_te.yaml +24 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_th.yaml +24 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_zh.yaml +24 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/utils.py +228 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/README.md +59 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/_default_template_yaml +33 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_biology.yaml +5 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_business.yaml +5 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_chemistry.yaml +5 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_computer_science.yaml +5 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_economics.yaml +5 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_engineering.yaml +5 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_health.yaml +5 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_history.yaml +5 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_law.yaml +5 -0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/README.md
ADDED
@@ -0,0 +1,94 @@
+# MGSM
+
+### Paper
+
+Title: `Language Models are Multilingual Chain-of-Thought Reasoners`
+
+Abstract: https://arxiv.org/abs/2210.03057
+
+The Multilingual Grade School Math Benchmark (MGSM) is a benchmark of grade-school math problems, proposed in the paper [Language models are multilingual chain-of-thought reasoners](http://arxiv.org/abs/2210.03057).
+
+The same 250 problems from [GSM8K](https://arxiv.org/abs/2110.14168) were each translated by human annotators into ten languages:
+- Spanish
+- French
+- German
+- Russian
+- Chinese
+- Japanese
+- Thai
+- Swahili
+- Bengali
+- Telugu
+
+GSM8K (Grade School Math 8K) is a dataset of 8.5K high-quality, linguistically diverse grade school math word problems, created to support question answering on basic mathematical problems that require multi-step reasoning.
+
+You can find the inputs and targets for each of the ten languages (and English) as `.tsv` files.
+We also include few-shot exemplars, manually translated into each language, in `exemplars.py`.
+
+Homepage: https://github.com/google-research/url-nlp/tree/main/mgsm
+
+
+### Citation
+
+```
+@misc{cobbe2021training,
+    title={Training Verifiers to Solve Math Word Problems},
+    author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},
+    year={2021},
+    eprint={2110.14168},
+    archivePrefix={arXiv},
+    primaryClass={cs.LG}
+}
+@misc{shi2022language,
+    title={Language Models are Multilingual Chain-of-Thought Reasoners},
+    author={Freda Shi and Mirac Suzgun and Markus Freitag and Xuezhi Wang and Suraj Srivats and Soroush Vosoughi and Hyung Won Chung and Yi Tay and Sebastian Ruder and Denny Zhou and Dipanjan Das and Jason Wei},
+    year={2022},
+    eprint={2210.03057},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+```
+
+### Groups and Tasks
+
+#### Groups
+
+* `mgsm_direct`: Direct question
+  * `mgsm_direct_bn`: Bengali
+  * `mgsm_direct_de`: German
+  * `mgsm_direct_en`: English
+  * `mgsm_direct_es`: Spanish
+  * `mgsm_direct_fr`: French
+  * `mgsm_direct_ja`: Japanese
+  * `mgsm_direct_ru`: Russian
+  * `mgsm_direct_sw`: Swahili
+  * `mgsm_direct_te`: Telugu
+  * `mgsm_direct_th`: Thai
+  * `mgsm_direct_zh`: Chinese
+* `mgsm_cot_native`: Question with answer, followed by a CoT prompt in the same language as the dataset.
+  * `mgsm_cot_native_bn`: Bengali
+  * `mgsm_cot_native_de`: German
+  * `mgsm_cot_native_en`: English
+  * `mgsm_cot_native_es`: Spanish
+  * `mgsm_cot_native_fr`: French
+  * `mgsm_cot_native_ja`: Japanese
+  * `mgsm_cot_native_ru`: Russian
+  * `mgsm_cot_native_sw`: Swahili
+  * `mgsm_cot_native_te`: Telugu
+  * `mgsm_cot_native_th`: Thai
+  * `mgsm_cot_native_zh`: Chinese
+
+Exemplar samples: https://github.com/google-research/url-nlp/blob/main/mgsm/exemplars.py
+
+### Checklist
+
+For adding novel benchmarks/datasets to the library:
+* [ ] Is the task an existing benchmark in the literature?
+* [ ] Have you referenced the original paper that introduced the task?
+* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+
+
+If other tasks on this dataset are already supported:
+* [ ] Is the "Main" variant of this task clearly denoted?
+* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
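All three task families above score with `exact_match` under `ignore_case: true` and `ignore_punctuation: true`. As a rough stdlib sketch of what that normalization amounts to (the real metric lives in the harness; `exact_match` here is an illustrative approximation, not the harness API):

```python
import string

def exact_match(pred: str, target: str) -> bool:
    """Approximate exact_match with ignore_case + ignore_punctuation:
    strip punctuation, lowercase, trim, then compare."""
    strip = str.maketrans("", "", string.punctuation)
    def norm(s: str) -> str:
        return s.translate(strip).lower().strip()
    return norm(pred) == norm(target)

print(exact_match("The answer is 72.", "the answer is 72"))  # True
```

This is why a filtered prediction like `"72."` can still match a gold target of `"72"`.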
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/direct_yaml
ADDED
@@ -0,0 +1,35 @@
+# This file will be included in the generated language-specific task configs.
+# It doesn't have a yaml file extension as it is not meant to be imported directly
+# by the harness.
+group: mgsm_direct
+dataset_path: juletxara/mgsm
+dataset_name: null # Overridden by language-specific config.
+output_type: generate_until
+training_split: train
+test_split: test
+target_delimiter: ""
+generation_kwargs:
+  until:
+    - "\n\n"
+    - "\n"
+  do_sample: false
+  temperature: 0.0
+filter_list:
+  - name: remove_whitespace
+    filter:
+      - function: remove_whitespace
+      - function: take_first
+  - filter:
+      - function: regex
+        group_select: -1
+        regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
+      - function: take_first
+    name: flexible-extract
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+metadata:
+  version: 2.0
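The `flexible-extract` filter above pairs the two-alternative regex with `group_select: -1`, i.e. take the last match in the completion. A minimal Python sketch of that behavior (the pattern is copied verbatim from the config; `flexible_extract` and the `"[invalid]"` fallback are illustrative names, not harness API):

```python
import re

# Pattern from direct_yaml's flexible-extract filter, verbatim.
PATTERN = re.compile(r"(-?[$0-9.,]{2,})|(-?[0-9]+)")

def flexible_extract(completion: str) -> str:
    """Take the LAST regex match (group_select: -1) and return
    whichever alternation group actually fired."""
    matches = PATTERN.findall(completion)
    if not matches:
        return "[invalid]"  # illustrative fallback for a failed extraction
    first, second = matches[-1]
    return first if first else second

print(flexible_extract("She pays 1,200 dollars in total"))  # 1,200
```

Taking the last numeric span is a pragmatic choice: models usually state the final answer at the end of the completion.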
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_bn.yaml
ADDED
@@ -0,0 +1,12 @@
+# Generated by utils.py
+dataset_name: bn
+doc_to_target: '{% if answer is not none %}{{answer[17:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"প্রশ্ন: "+question+"\nAnswer:"}}{% endif %}'
+generation_kwargs:
+  do_sample: false
+  until:
+    - 'প্রশ্ন:'
+    - </s>
+    - <|im_end|>
+include: direct_yaml
+task: mgsm_direct_bn
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_de.yaml
ADDED
@@ -0,0 +1,12 @@
+# Generated by utils.py
+dataset_name: de
+doc_to_target: '{% if answer is not none %}{{answer[29:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nAntwort:"}}{% else %}{{"Frage: "+question+"\nAntwort:"}}{% endif %}'
+generation_kwargs:
+  do_sample: false
+  until:
+    - 'Frage:'
+    - </s>
+    - <|im_end|>
+include: direct_yaml
+task: mgsm_direct_de
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_en.yaml
ADDED
@@ -0,0 +1,12 @@
+# Generated by utils.py
+dataset_name: en
+doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}'
+generation_kwargs:
+  do_sample: false
+  until:
+    - 'Question:'
+    - </s>
+    - <|im_end|>
+include: direct_yaml
+task: mgsm_direct_en
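The per-language `answer[N:]` slices in `doc_to_target` strip the localized answer prefix from the dataset's `answer` field; for English, `answer[21:]` corresponds to a 21-character `"Step-by-Step Answer: "` prefix. A quick sketch, assuming the field really starts with that literal prefix (the sample row below is hypothetical):

```python
# Hypothetical English MGSM `answer` field; the 21-char prefix
# "Step-by-Step Answer: " is what answer[21:] slices away.
answer = "Step-by-Step Answer: She pays 8 dollars. The answer is 8."
target = answer[21:]
print(target)  # She pays 8 dollars. The answer is 8.
```

This explains why each language config carries a different slice offset (17 for Bengali, 29 for German, and so on): the offset is the byte-length-in-characters of that language's answer prefix.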
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_es.yaml
ADDED
@@ -0,0 +1,12 @@
+# Generated by utils.py
+dataset_name: es
+doc_to_target: '{% if answer is not none %}{{answer[23:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nRespuesta:"}}{% else %}{{"Pregunta: "+question+"\nRespuesta:"}}{% endif %}'
+generation_kwargs:
+  do_sample: false
+  until:
+    - 'Pregunta:'
+    - </s>
+    - <|im_end|>
+include: direct_yaml
+task: mgsm_direct_es
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_fr.yaml
ADDED
@@ -0,0 +1,12 @@
+# Generated by utils.py
+dataset_name: fr
+doc_to_target: '{% if answer is not none %}{{answer[26:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nRéponse :"}}{% else %}{{"Question : "+question+"\nRéponse :"}}{% endif %}'
+generation_kwargs:
+  do_sample: false
+  until:
+    - 'Question :'
+    - </s>
+    - <|im_end|>
+include: direct_yaml
+task: mgsm_direct_fr
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_ja.yaml
ADDED
@@ -0,0 +1,12 @@
+# Generated by utils.py
+dataset_name: ja
+doc_to_target: '{% if answer is not none %}{{answer[11:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"問題: "+question+"\nAnswer:"}}{% endif %}'
+generation_kwargs:
+  do_sample: false
+  until:
+    - '問題:'
+    - </s>
+    - <|im_end|>
+include: direct_yaml
+task: mgsm_direct_ja
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_ru.yaml
ADDED
@@ -0,0 +1,12 @@
+# Generated by utils.py
+dataset_name: ru
+doc_to_target: '{% if answer is not none %}{{answer[18:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Задача: "+question+"\nAnswer:"}}{% endif %}'
+generation_kwargs:
+  do_sample: false
+  until:
+    - 'Задача:'
+    - </s>
+    - <|im_end|>
+include: direct_yaml
+task: mgsm_direct_ru
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_sw.yaml
ADDED
@@ -0,0 +1,12 @@
+# Generated by utils.py
+dataset_name: sw
+doc_to_target: '{% if answer is not none %}{{answer[25:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Swali: "+question+"\nAnswer:"}}{% endif %}'
+generation_kwargs:
+  do_sample: false
+  until:
+    - 'Swali:'
+    - </s>
+    - <|im_end|>
+include: direct_yaml
+task: mgsm_direct_sw
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_te.yaml
ADDED
@@ -0,0 +1,12 @@
+# Generated by utils.py
+dataset_name: te
+doc_to_target: '{% if answer is not none %}{{answer[19:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"ప్రశ్న: "+question+"\nAnswer:"}}{% endif %}'
+generation_kwargs:
+  do_sample: false
+  until:
+    - 'ప్రశ్న:'
+    - </s>
+    - <|im_end|>
+include: direct_yaml
+task: mgsm_direct_te
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_th.yaml
ADDED
@@ -0,0 +1,12 @@
+# Generated by utils.py
+dataset_name: th
+doc_to_target: '{% if answer is not none %}{{answer[18:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"โจทย์: "+question+"\nAnswer:"}}{% endif %}'
+generation_kwargs:
+  do_sample: false
+  until:
+    - 'โจทย์:'
+    - </s>
+    - <|im_end|>
+include: direct_yaml
+task: mgsm_direct_th
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_zh.yaml
ADDED
@@ -0,0 +1,12 @@
+# Generated by utils.py
+dataset_name: zh
+doc_to_target: '{% if answer is not none %}{{answer[6:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"问题: "+question+"\nAnswer:"}}{% endif %}'
+generation_kwargs:
+  do_sample: false
+  until:
+    - '问题:'
+    - </s>
+    - <|im_end|>
+include: direct_yaml
+task: mgsm_direct_zh
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/cot_yaml
ADDED
@@ -0,0 +1,36 @@
+# This file will be included in the generated language-specific task configs.
+# It doesn't have a yaml file extension as it is not meant to be imported directly
+# by the harness.
+group: mgsm_en_cot
+dataset_path: juletxara/mgsm
+dataset_name: null # Overridden by language-specific config.
+output_type: generate_until
+training_split: train
+test_split: test
+generation_kwargs:
+  until:
+    - "\n\n"
+    - "\n"
+  do_sample: false
+  temperature: 0.0
+target_delimiter: " "
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)"
+      - function: "take_first"
+  - filter:
+      - function: regex
+        group_select: -1
+        regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
+      - function: take_first
+    name: flexible-extract
+metadata:
+  version: 2.0
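Unlike `flexible-extract`, the `strict-match` filter only accepts answers introduced by the literal phrase `The answer is`. A minimal Python sketch (pattern copied verbatim from the config; `strict_match` and the `"[invalid]"` fallback are illustrative names, not harness API):

```python
import re

# Pattern from cot_yaml's strict-match filter, verbatim.
STRICT = re.compile(r"The answer is (\-?[0-9\.\,]+)")

def strict_match(completion: str) -> str:
    """Return the first number following 'The answer is',
    or a fallback marker when the phrase never appears."""
    m = STRICT.search(completion)
    return m.group(1) if m else "[invalid]"

print(strict_match("So in total she pays 18. The answer is 18"))  # 18
```

A completion that reasons correctly but never utters the exact phrase scores zero under strict-match, which is why the config also ships the more permissive flexible-extract variant.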
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_bn.yaml
ADDED
@@ -0,0 +1,12 @@
+# Generated by utils.py
+dataset_name: bn
+doc_to_target: '{% if answer is not none %}{{answer[17:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"প্রশ্ন: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
+generation_kwargs:
+  do_sample: false
+  until:
+    - 'প্রশ্ন:'
+    - </s>
+    - <|im_end|>
+include: cot_yaml
+task: mgsm_en_cot_bn
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_de.yaml
ADDED
@@ -0,0 +1,12 @@
+# Generated by utils.py
+dataset_name: de
+doc_to_target: '{% if answer is not none %}{{answer[29:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Frage: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
+generation_kwargs:
+  do_sample: false
+  until:
+    - 'Frage:'
+    - </s>
+    - <|im_end|>
+include: cot_yaml
+task: mgsm_en_cot_de
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_en.yaml
ADDED
@@ -0,0 +1,12 @@
+# Generated by utils.py
+dataset_name: en
+doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
+generation_kwargs:
+  do_sample: false
+  until:
+    - 'Question:'
+    - </s>
+    - <|im_end|>
+include: cot_yaml
+task: mgsm_en_cot_en
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_es.yaml
ADDED
@@ -0,0 +1,12 @@
+# Generated by utils.py
+dataset_name: es
+doc_to_target: '{% if answer is not none %}{{answer[23:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Pregunta: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
+generation_kwargs:
+  do_sample: false
+  until:
+    - 'Pregunta:'
+    - </s>
+    - <|im_end|>
+include: cot_yaml
+task: mgsm_en_cot_es
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_fr.yaml
ADDED
@@ -0,0 +1,12 @@
+# Generated by utils.py
+dataset_name: fr
+doc_to_target: '{% if answer is not none %}{{answer[26:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question : "+question+"\nStep-by-Step Answer:"}}{% endif %}'
+generation_kwargs:
+  do_sample: false
+  until:
+    - 'Question :'
+    - </s>
+    - <|im_end|>
+include: cot_yaml
+task: mgsm_en_cot_fr
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_ja.yaml
ADDED
@@ -0,0 +1,12 @@
+# Generated by utils.py
+dataset_name: ja
+doc_to_target: '{% if answer is not none %}{{answer[11:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"問題: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
+generation_kwargs:
+  do_sample: false
+  until:
+    - '問題:'
+    - </s>
+    - <|im_end|>
+include: cot_yaml
+task: mgsm_en_cot_ja
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_ru.yaml
ADDED
@@ -0,0 +1,12 @@
+# Generated by utils.py
+dataset_name: ru
+doc_to_target: '{% if answer is not none %}{{answer[18:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Задача: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
+generation_kwargs:
+  do_sample: false
+  until:
+    - 'Задача:'
+    - </s>
+    - <|im_end|>
+include: cot_yaml
+task: mgsm_en_cot_ru
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_sw.yaml
ADDED
@@ -0,0 +1,12 @@
+# Generated by utils.py
+dataset_name: sw
+doc_to_target: '{% if answer is not none %}{{answer[25:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Swali: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
+generation_kwargs:
+  do_sample: false
+  until:
+    - 'Swali:'
+    - </s>
+    - <|im_end|>
+include: cot_yaml
+task: mgsm_en_cot_sw
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_te.yaml
ADDED
@@ -0,0 +1,12 @@
+# Generated by utils.py
+dataset_name: te
+doc_to_target: '{% if answer is not none %}{{answer[19:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"ప్రశ్న: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
+generation_kwargs:
+  do_sample: false
+  until:
+    - 'ప్రశ్న:'
+    - </s>
+    - <|im_end|>
+include: cot_yaml
+task: mgsm_en_cot_te
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_th.yaml
ADDED
@@ -0,0 +1,12 @@
+# Generated by utils.py
+dataset_name: th
+doc_to_target: '{% if answer is not none %}{{answer[18:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"โจทย์: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
+generation_kwargs:
+  do_sample: false
+  until:
+    - 'โจทย์:'
+    - </s>
+    - <|im_end|>
+include: cot_yaml
+task: mgsm_en_cot_th
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_zh.yaml
ADDED
@@ -0,0 +1,12 @@
+# Generated by utils.py
+dataset_name: zh
+doc_to_target: '{% if answer is not none %}{{answer[6:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"问题: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
+generation_kwargs:
+  do_sample: false
+  until:
+    - '问题:'
+    - </s>
+    - <|im_end|>
+include: cot_yaml
+task: mgsm_en_cot_zh
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/gen_yaml.sh
ADDED
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+python utils.py --overwrite --output-dir direct --mode direct
+python utils.py --overwrite --output-dir en_cot --mode en-cot
+python utils.py --overwrite --output-dir native_cot --mode native-cot
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/cot_yaml
ADDED
@@ -0,0 +1,31 @@
+# This file will be included in the generated language-specific task configs.
+# It doesn't have a yaml file extension as it is not meant to be imported directly
+# by the harness.
+group: mgsm_cot_native
+dataset_path: juletxara/mgsm
+dataset_name: null # Overridden by language-specific config.
+output_type: generate_until
+training_split: train
+test_split: test
+# target_delimiter: ""
+generation_kwargs:
+  until:
+    - "\n\n"
+    - "\n"
+  do_sample: false
+  temperature: 0.0
+target_delimiter: " "
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+filter_list:
+  - name: "get-answer"
+    filter:
+      - function: "regex"
+        regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)"
+      - function: "take_first"
+metadata:
+  version: 3.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_bn.yaml
ADDED
@@ -0,0 +1,24 @@
+# Generated by utils.py
+dataset_name: bn
+doc_to_target: '{% if answer is not none %}{{answer[17:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nধাপে ধাপে উত্তর:"}}{% else %}{{"প্রশ্ন: "+question+"\nধাপে ধাপে উত্তর:"}}{% endif %}'
+filter_list:
+  - filter:
+      - function: regex
+        regex_pattern: The answer is (\-?[0-9\.\,]+)
+      - function: take_first
+    name: strict-match
+  - filter:
+      - function: regex
+        group_select: -1
+        regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
+      - function: take_first
+    name: flexible-extract
+generation_kwargs:
+  do_sample: false
+  until:
+    - 'প্রশ্ন:'
+    - </s>
+    - <|im_end|>
+include: cot_yaml
+task: mgsm_native_cot_bn
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_de.yaml
ADDED
@@ -0,0 +1,24 @@
+# Generated by utils.py
+dataset_name: de
+doc_to_target: '{% if answer is not none %}{{answer[29:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nSchritt-für-Schritt-Antwort:"}}{% else %}{{"Frage: "+question+"\nSchritt-für-Schritt-Antwort:"}}{% endif %}'
+filter_list:
+  - filter:
+      - function: regex
+        regex_pattern: Die Antwort lautet (\-?[0-9\.\,]+)
+      - function: take_first
+    name: strict-match
+  - filter:
+      - function: regex
+        group_select: -1
+        regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
+      - function: take_first
+    name: flexible-extract
+generation_kwargs:
+  do_sample: false
+  until:
+    - 'Frage:'
+    - </s>
+    - <|im_end|>
+include: cot_yaml
+task: mgsm_native_cot_de
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_en.yaml ADDED
@@ -0,0 +1,24 @@
+# Generated by utils.py
+dataset_name: en
+doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
+filter_list:
+- filter:
+  - function: regex
+    regex_pattern: The answer is (\-?[0-9\.\,]+)
+  - function: take_first
+  name: strict-match
+- filter:
+  - function: regex
+    group_select: -1
+    regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
+  - function: take_first
+  name: flexible-extract
+generation_kwargs:
+  do_sample: false
+  until:
+  - 'Question:'
+  - </s>
+  - <|im_end|>
+include: cot_yaml
+task: mgsm_native_cot_en
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_es.yaml ADDED
@@ -0,0 +1,24 @@
+# Generated by utils.py
+dataset_name: es
+doc_to_target: '{% if answer is not none %}{{answer[23:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nRespuesta paso a paso:"}}{% else %}{{"Pregunta: "+question+"\nRespuesta paso a paso:"}}{% endif %}'
+filter_list:
+- filter:
+  - function: regex
+    regex_pattern: La respuesta es (\-?[0-9\.\,]+)
+  - function: take_first
+  name: strict-match
+- filter:
+  - function: regex
+    group_select: -1
+    regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
+  - function: take_first
+  name: flexible-extract
+generation_kwargs:
+  do_sample: false
+  until:
+  - 'Pregunta:'
+  - </s>
+  - <|im_end|>
+include: cot_yaml
+task: mgsm_native_cot_es
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_fr.yaml ADDED
@@ -0,0 +1,24 @@
+# Generated by utils.py
+dataset_name: fr
+doc_to_target: '{% if answer is not none %}{{answer[26:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nRéponse étape par étape :"}}{% else %}{{"Question : "+question+"\nRéponse étape par étape :"}}{% endif %}'
+filter_list:
+- filter:
+  - function: regex
+    regex_pattern: La réponse est (\-?[0-9\.\,]+)
+  - function: take_first
+  name: strict-match
+- filter:
+  - function: regex
+    group_select: -1
+    regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
+  - function: take_first
+  name: flexible-extract
+generation_kwargs:
+  do_sample: false
+  until:
+  - 'Question :'
+  - </s>
+  - <|im_end|>
+include: cot_yaml
+task: mgsm_native_cot_fr
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_ja.yaml ADDED
@@ -0,0 +1,24 @@
+# Generated by utils.py
+dataset_name: ja
+doc_to_target: '{% if answer is not none %}{{answer[11:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nステップごとの答え:"}}{% else %}{{"問題: "+question+"\nステップごとの答え:"}}{% endif %}'
+filter_list:
+- filter:
+  - function: regex
+    regex_pattern: 答えは(\-?[0-9\.\,]+)です。
+  - function: take_first
+  name: strict-match
+- filter:
+  - function: regex
+    group_select: -1
+    regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
+  - function: take_first
+  name: flexible-extract
+generation_kwargs:
+  do_sample: false
+  until:
+  - '問題:'
+  - </s>
+  - <|im_end|>
+include: cot_yaml
+task: mgsm_native_cot_ja
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_ru.yaml ADDED
@@ -0,0 +1,24 @@
+# Generated by utils.py
+dataset_name: ru
+doc_to_target: '{% if answer is not none %}{{answer[18:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nПошаговоерешение:"}}{% else %}{{"Задача: "+question+"\nПошаговоерешение:"}}{% endif %}'
+filter_list:
+- filter:
+  - function: regex
+    regex_pattern: Ответ — (\-?[0-9\.\,]+)
+  - function: take_first
+  name: strict-match
+- filter:
+  - function: regex
+    group_select: -1
+    regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
+  - function: take_first
+  name: flexible-extract
+generation_kwargs:
+  do_sample: false
+  until:
+  - 'Задача:'
+  - </s>
+  - <|im_end|>
+include: cot_yaml
+task: mgsm_native_cot_ru
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_sw.yaml ADDED
@@ -0,0 +1,24 @@
+# Generated by utils.py
+dataset_name: sw
+doc_to_target: '{% if answer is not none %}{{answer[25:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nJibu la Hatua kwa Hatua:"}}{% else %}{{"Swali: "+question+"\nJibu la Hatua kwa Hatua:"}}{% endif %}'
+filter_list:
+- filter:
+  - function: regex
+    regex_pattern: Jibu ni (\-?[0-9\.\,]+)
+  - function: take_first
+  name: strict-match
+- filter:
+  - function: regex
+    group_select: -1
+    regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
+  - function: take_first
+  name: flexible-extract
+generation_kwargs:
+  do_sample: false
+  until:
+  - 'Swali:'
+  - </s>
+  - <|im_end|>
+include: cot_yaml
+task: mgsm_native_cot_sw
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_te.yaml ADDED
@@ -0,0 +1,24 @@
+# Generated by utils.py
+dataset_name: te
+doc_to_target: '{% if answer is not none %}{{answer[19:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nదశలవారీగా సమాధానం:"}}{% else %}{{"ప్రశ్న: "+question+"\nదశలవారీగా సమాధానం:"}}{% endif %}'
+filter_list:
+- filter:
+  - function: regex
+    regex_pattern: సమాధానం (\-?[0-9\.\,]+)
+  - function: take_first
+  name: strict-match
+- filter:
+  - function: regex
+    group_select: -1
+    regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
+  - function: take_first
+  name: flexible-extract
+generation_kwargs:
+  do_sample: false
+  until:
+  - 'ప్రశ్న:'
+  - </s>
+  - <|im_end|>
+include: cot_yaml
+task: mgsm_native_cot_te
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_th.yaml ADDED
@@ -0,0 +1,24 @@
+# Generated by utils.py
+dataset_name: th
+doc_to_target: '{% if answer is not none %}{{answer[18:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nคำตอบทีละขั้นตอน:"}}{% else %}{{"โจทย์: "+question+"\nคำตอบทีละขั้นตอน:"}}{% endif %}'
+filter_list:
+- filter:
+  - function: regex
+    regex_pattern: คำตอบคือ (\-?[0-9\.\,]+)
+  - function: take_first
+  name: strict-match
+- filter:
+  - function: regex
+    group_select: -1
+    regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
+  - function: take_first
+  name: flexible-extract
+generation_kwargs:
+  do_sample: false
+  until:
+  - 'โจทย์:'
+  - </s>
+  - <|im_end|>
+include: cot_yaml
+task: mgsm_native_cot_th
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_zh.yaml ADDED
@@ -0,0 +1,24 @@
+# Generated by utils.py
+dataset_name: zh
+doc_to_target: '{% if answer is not none %}{{answer[6:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\n逐步解答:"}}{% else %}{{"问题: "+question+"\n逐步解答:"}}{% endif %}'
+filter_list:
+- filter:
+  - function: regex
+    regex_pattern: 答案是 (\-?[0-9\.\,]+)。
+  - function: take_first
+  name: strict-match
+- filter:
+  - function: regex
+    group_select: -1
+    regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
+  - function: take_first
+  name: flexible-extract
+generation_kwargs:
+  do_sample: false
+  until:
+  - '问题:'
+  - </s>
+  - <|im_end|>
+include: cot_yaml
+task: mgsm_native_cot_zh
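Every generated config above pairs a strict, prompt-specific pattern with the same language-agnostic fallback. As a minimal sketch of how the two filters differ, the following uses the standard `re` module with the patterns copied from the English config above and a hypothetical model completion:

```python
import re

# strict-match: only fires on the exact English answer phrase (en config).
STRICT = r"The answer is (\-?[0-9\.\,]+)"
# flexible-extract: any number-like token; group_select: -1 keeps the last match.
FLEXIBLE = r"(-?[$0-9.,]{2,})|(-?[0-9]+)"

completion = "There are 3 boxes with 12 eggs each, so 3 * 12 = 36. The answer is 36"

strict = re.search(STRICT, completion)
strict_answer = strict.group(1) if strict else "[invalid]"

# findall returns one (group1, group2) tuple per match; take the last match
# and whichever group is non-empty, mirroring the fallback filter's intent.
last = re.findall(FLEXIBLE, completion)[-1]
flexible_answer = last[0] or last[1]

print(strict_answer, flexible_answer)  # 36 36
```

The strict filter scores `[invalid]` whenever the model skips the canonical phrase, which is why the flexible variant is reported alongside it.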
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/utils.py ADDED
@@ -0,0 +1,228 @@
+import argparse
+
+import yaml
+
+
+LANGUAGES = {
+    "bn": {  # Bengali
+        # "QUESTION": "প্রশ্ন:",
+        "QUESTION": "\u09aa\u09cd\u09b0\u09b6\u09cd\u09a8:",
+        # "ANSWER": "ধাপে ধাপে উত্তর:",
+        "ANSWER": "\u09a7\u09be\u09aa\u09c7 \u09a7\u09be\u09aa\u09c7 \u0989\u09a4\u09cd\u09a4\u09b0:",
+        "DIRECT": "Answer:",
+        "REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
+    },
+    "de": {  # German
+        "QUESTION": "Frage:",
+        # "ANSWER": "Schritt-für-Schritt-Antwort:",
+        "ANSWER": "Schritt-f\u00fcr-Schritt-Antwort:",
+        "DIRECT": "Antwort:",
+        "REGEX": "Die Antwort lautet (\\-?[0-9\\.\\,]+)",
+    },
+    "en": {  # English
+        "QUESTION": "Question:",
+        "ANSWER": "Step-by-Step Answer:",
+        "DIRECT": "Answer:",
+        "REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
+    },
+    "es": {  # Spanish
+        "QUESTION": "Pregunta:",
+        "ANSWER": "Respuesta paso a paso:",
+        "DIRECT": "Respuesta:",
+        "REGEX": "La respuesta es (\\-?[0-9\\.\\,]+)",
+    },
+    "fr": {  # French
+        "QUESTION": "Question :",
+        # "ANSWER": "Réponse étape par étape :"
+        "ANSWER": "R\u00e9ponse \u00e9tape par \u00e9tape :",
+        # "DIRECT": "Réponse :",
+        "DIRECT": "R\u00e9ponse :",
+        # "REGEX": "La réponse est (\\-?[0-9\\.\\,]+)",
+        "REGEX": "La r\u00e9ponse est (\\-?[0-9\\.\\,]+)",
+    },
+    "ru": {  # Russian
+        # "QUESTION": "Задача:",
+        "QUESTION": "\u0417\u0430\u0434\u0430\u0447\u0430:",
+        # "ANSWER": "Пошаговоерешение:",
+        "ANSWER": "\u041f\u043e\u0448\u0430\u0433\u043e\u0432\u043e\u0435\u0440\u0435\u0448\u0435\u043d\u0438\u0435:",
+        "DIRECT": "Answer:",
+        # "REGEX": "Ответ — (\\-?[0-9\\.\\,]+)",
+        "REGEX": "\u041e\u0442\u0432\u0435\u0442 \u2014 (\\-?[0-9\\.\\,]+)",
+    },
+    "sw": {  # Swahili
+        "QUESTION": "Swali:",
+        "ANSWER": "Jibu la Hatua kwa Hatua:",
+        "DIRECT": "Answer:",
+        "REGEX": "Jibu ni (\\-?[0-9\\.\\,]+)",
+    },
+    "te": {  # Telugu
+        # "QUESTION": "ప్రశ్న:",
+        "QUESTION": "\u0c2a\u0c4d\u0c30\u0c36\u0c4d\u0c28:",
+        # "ANSWER": "దశలవారీగా సమాధానం:",
+        "ANSWER": "\u0c26\u0c36\u0c32\u0c35\u0c3e\u0c30\u0c40\u0c17\u0c3e \u0c38\u0c2e\u0c3e\u0c27\u0c3e\u0c28\u0c02:",
+        "DIRECT": "Answer:",
+        # "REGEX": "సమాధానం (\\-?[0-9\\.\\,]+)",
+        "REGEX": "\u0c38\u0c2e\u0c3e\u0c27\u0c3e\u0c28\u0c02 (\\-?[0-9\\.\\,]+)",
+    },
+    "th": {  # Thai
+        # "QUESTION": "โจทย์:",
+        "QUESTION": "\u0e42\u0e08\u0e17\u0e22\u0e4c:",
+        # "ANSWER": "คำตอบทีละขั้นตอน:",
+        "ANSWER": "\u0e04\u0e33\u0e15\u0e2d\u0e1a\u0e17\u0e35\u0e25\u0e30\u0e02\u0e31\u0e49\u0e19\u0e15\u0e2d\u0e19:",
+        "DIRECT": "Answer:",
+        # "REGEX": "คำตอบคือ (\\-?[0-9\\.\\,]+)",
+        "REGEX": "\u0e04\u0e33\u0e15\u0e2d\u0e1a\u0e04\u0e37\u0e2d (\\-?[0-9\\.\\,]+)",
+    },
+    "ja": {  # Japanese
+        # "QUESTION": "問題:",
+        "QUESTION": "\u554f\u984c:",
+        # "ANSWER": "ステップごとの答え:",
+        "ANSWER": "\u30b9\u30c6\u30c3\u30d7\u3054\u3068\u306e\u7b54\u3048:",
+        "DIRECT": "Answer:",
+        # "REGEX": "答えは(\\-?[0-9\\.\\,]+)です。",
+        "REGEX": "\u7b54\u3048\u306f(\\-?[0-9\\.\\,]+)\u3067\u3059\u3002",
+    },
+    "zh": {  # Chinese
+        # "QUESTION": "问题:",
+        "QUESTION": "\u95ee\u9898:",
+        # "ANSWER": "逐步解答:",
+        "ANSWER": "\u9010\u6b65\u89e3\u7b54:",
+        "DIRECT": "Answer:",
+        # "REGEX": "答案是 (\\-?[0-9\\.\\,]+)。",
+        "REGEX": "\u7b54\u6848\u662f (\\-?[0-9\\.\\,]+)\u3002",
+    },
+}
+
+
+def add_regex_pattern(regex_pattern):
+    if regex_pattern is None:
+        return {}
+    return {
+        "filter_list": [
+            {
+                "name": "strict-match",
+                "filter": [
+                    {
+                        "function": "regex",
+                        "regex_pattern": f"""{regex_pattern}""",
+                    },
+                    {
+                        "function": "take_first",
+                    },
+                ],
+            },
+            {
+                "name": "flexible-extract",
+                "filter": [
+                    {
+                        "function": "regex",
+                        "regex_pattern": """(-?[$0-9.,]{2,})|(-?[0-9]+)""",
+                        "group_select": -1,
+                    },
+                    {
+                        "function": "take_first",
+                    },
+                ],
+            },
+        ],
+    }
+
+
+def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
+    """
+    Generate a yaml file for each language.
+
+    :param output_dir: The directory to output the files to.
+    :param overwrite: Whether to overwrite files if they already exist.
+    """
+    err = []
+    for lang in LANGUAGES.keys():
+        try:
+            QUESTION = LANGUAGES[lang]["QUESTION"]
+
+            yaml_template = "cot_yaml"
+            filter_list = {}
+            DELIMITER = None
+            if mode == "direct":
+                ANSWER = LANGUAGES[lang]["DIRECT"]
+                REGEX = None
+                task_name = f"mgsm_direct_{lang}"
+                yaml_template = "direct_yaml"
+            elif mode == "native-cot":
+                ANSWER = LANGUAGES[lang]["ANSWER"]
+                REGEX = LANGUAGES[lang]["REGEX"]
+                task_name = f"mgsm_native_cot_{lang}"
+                filter_list = add_regex_pattern(REGEX)
+                DELIMITER = "" if lang in ["zh", "ja"] else None
+            elif mode == "en-cot":
+                ANSWER = LANGUAGES["en"]["ANSWER"]
+                REGEX = LANGUAGES["en"]["REGEX"]
+                task_name = f"mgsm_en_cot_{lang}"
+
+            file_name = f"{task_name}.yaml"
+            ANSWER_TO_SKIP = len(LANGUAGES[lang]["ANSWER"]) + 1
+            with open(
+                f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf8"
+            ) as f:
+                f.write("# Generated by utils.py\n")
+                yaml.dump(
+                    {
+                        "include": yaml_template,
+                        "dataset_name": lang,
+                        "task": f"{task_name}",
+                        "doc_to_text": f"""{{% if answer is not none %}}"""
+                        f"""{{{{question+"\\n{ANSWER}"}}}}"""
+                        f"""{{% else %}}"""
+                        f"""{{{{"{QUESTION} "+question+"\\n{ANSWER}"}}}}"""
+                        f"""{{% endif %}}""",
+                        "doc_to_target": f"""{{% if answer is not none %}}"""
+                        f"""{{{{answer[{ANSWER_TO_SKIP}:]}}}}"""
+                        f"""{{% else %}}"""
+                        f"""{{{{answer_number|string}}}}"""
+                        f"""{{% endif %}}""",
+                        **filter_list,
+                        "generation_kwargs": {
+                            "until": [QUESTION, "</s>", "<|im_end|>"],
+                            "do_sample": False,
+                        },
+                        **({"target_delimiter": DELIMITER} if DELIMITER else {}),
+                    },
+                    f,
+                    allow_unicode=True,
+                    width=float("inf"),
+                )
+        except FileExistsError:
+            err.append(file_name)
+
+    if len(err) > 0:
+        raise FileExistsError(
+            "Files were not created because they already exist (use --overwrite flag):"
+            f" {', '.join(err)}"
+        )
+
+
+def main() -> None:
+    """Parse CLI args and generate language-specific yaml files."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--overwrite",
+        default=False,
+        action="store_true",
+        help="Overwrite files if they already exist",
+    )
+    parser.add_argument(
+        "--output-dir", default=".", help="Directory to write yaml files to"
+    )
+    parser.add_argument(
+        "--mode",
+        default="native-cot",
+        choices=["direct", "native-cot", "en-cot"],
+        help="Mode of chain-of-thought",
+    )
+    args = parser.parse_args()
+
+    gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite, mode=args.mode)
+
+
+if __name__ == "__main__":
+    main()
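The `answer[N:]` offsets in the generated YAMLs are not magic numbers: `gen_lang_yamls` computes `ANSWER_TO_SKIP = len(ANSWER) + 1`, i.e. the per-language answer prefix plus the space that follows it. A quick check of that relationship for two of the prefixes defined in `LANGUAGES` above:

```python
# ANSWER prefixes copied from LANGUAGES in utils.py.
EN_ANSWER = "Step-by-Step Answer:"
DE_ANSWER = "Schritt-für-Schritt-Antwort:"

# ANSWER_TO_SKIP = len(ANSWER) + 1, matching the answer[21:] / answer[29:]
# slices in mgsm_native_cot_en.yaml and mgsm_native_cot_de.yaml.
print(len(EN_ANSWER) + 1)  # 21
print(len(DE_ANSWER) + 1)  # 29
```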
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/README.md ADDED
@@ -0,0 +1,59 @@
+# mmlu_pro
+
+### Paper
+
+Title: `MMLU-Pro: A More Robust and Challenging Multi-Task Language Understanding Benchmark`
+
+Abstract: `In the age of large-scale language models, benchmarks like the Massive Multitask Language Understanding (MMLU) have been pivotal in pushing the boundaries of what AI can achieve in language comprehension and reasoning across diverse domains. However, as models continue to improve, their performance on these benchmarks has begun to plateau, making it increasingly difficult to discern differences in model capabilities. This paper introduces MMLU-Pro, an enhanced dataset designed to extend the mostly knowledge-driven MMLU benchmark by integrating more challenging, reasoning-focused questions and expanding the choice set from four to ten options. Additionally, MMLU-Pro eliminates the trivial and noisy questions in MMLU. Our experimental results show that MMLU-Pro not only raises the challenge, causing a significant drop in accuracy by 16% to 33% compared to MMLU but also demonstrates greater stability under varying prompts. With 24 different prompt styles tested, the sensitivity of model scores to prompt variations decreased from 4-5% in MMLU to just 2% in MMLU-Pro. Additionally, we found that models utilizing Chain of Thought (CoT) reasoning achieved better performance on MMLU-Pro compared to direct answering, which is in stark contrast to the findings on the original MMLU, indicating that MMLU-Pro includes more complex reasoning questions. Our assessments confirm that MMLU-Pro is a more discriminative benchmark to better track progress in the field.`
+
+Homepage: https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro
+
+### Citation
+
+```bibtex
+@misc{wang2024mmlupro,
+      title={MMLU-Pro: A More Robust and Challenging Multi-Task Language Understanding Benchmark},
+      author={Yubo Wang and Xueguang Ma and Ge Zhang and Yuansheng Ni and Abhranil Chandra and Shiguang Guo and Weiming Ren and Aaran Arulraj and Xuan He and Ziyan Jiang and Tianle Li and Max Ku and Kai Wang and Alex Zhuang and Rongqi Fan and Xiang Yue and Wenhu Chen},
+      year={2024},
+      eprint={2406.01574},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+```
+
+### Groups and Tasks
+
+#### Groups
+
+* `mmlu_pro`: 'All 14 subjects of the mmlu_pro dataset, evaluated following the methodology in mmlu's original implementation'
+
+#### Tasks
+
+The following tasks evaluate subjects in the mmlu_pro dataset
+- `mmlu_pro_biology`
+- `mmlu_pro_business`
+- `mmlu_pro_chemistry`
+- `mmlu_pro_computer_science`
+- `mmlu_pro_economics`
+- `mmlu_pro_engineering`
+- `mmlu_pro_health`
+- `mmlu_pro_history`
+- `mmlu_pro_law`
+- `mmlu_pro_math`
+- `mmlu_pro_other`
+- `mmlu_pro_philosophy`
+- `mmlu_pro_physics`
+- `mmlu_pro_psychology`
+
+### Checklist
+
+For adding novel benchmarks/datasets to the library:
+* [x] Is the task an existing benchmark in the literature?
+  * [x] Have you referenced the original paper that introduced the task?
+  * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+
+
+If other tasks on this dataset are already supported:
+* [ ] Is the "Main" variant of this task clearly denoted?
+* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/_default_template_yaml ADDED
@@ -0,0 +1,33 @@
+dataset_path: TIGER-Lab/MMLU-Pro
+test_split: test
+fewshot_split: validation
+fewshot_config:
+  sampler: first_n
+  doc_to_text: !function utils.fewshot_to_text
+  doc_to_target: ""
+output_type: generate_until
+doc_to_text: !function utils.doc_to_text
+doc_to_target: answer
+filter_list:
+  - name: "custom-extract"
+    filter:
+      - function: "regex"
+        regex_pattern: r"answer is \(?([ABCDEFGHIJ])\)?"
+        # regex_pattern: r".*[aA]nswer:\s*([A-J])",
+      - function: "take_first"
+generation_kwargs:
+  until:
+    - "</s>"
+    - "Q:"
+    - "<|im_end|>"
+  do_sample: false
+  temperature: 0.0
+num_fewshot: 5
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+metadata:
+  version: 0.0
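The `custom-extract` filter keys off the instruction that each subject task's `description` appends to the prompt ("finish your answer with \"the answer is (X)\""). A small sketch of that extraction against a hypothetical chain-of-thought completion, using the core pattern from the template above:

```python
import re

# Core pattern from _default_template_yaml's custom-extract filter.
PATTERN = r"answer is \(?([ABCDEFGHIJ])\)?"

completion = (
    "Let's think step by step. Option B matches the definition, "
    "so the answer is (B)."
)

match = re.search(PATTERN, completion)
extracted = match.group(1) if match else "[invalid]"
print(extracted)  # B
```

The optional `\(?` / `\)?` mean bare "the answer is B" is accepted as well as the parenthesized form; anything else scores as non-matching.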
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_biology.yaml ADDED
@@ -0,0 +1,5 @@
+description: "The following are multiple choice questions (with answers) about biology. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
+include: "_default_template_yaml"
+task: "mmlu_pro_biology"
+task_alias: "biology"
+process_docs: !function utils.process_biology
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_business.yaml ADDED
@@ -0,0 +1,5 @@
+description: "The following are multiple choice questions (with answers) about business. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
+include: "_default_template_yaml"
+task: "mmlu_pro_business"
+task_alias: "business"
+process_docs: !function utils.process_business
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_chemistry.yaml ADDED
@@ -0,0 +1,5 @@
+description: "The following are multiple choice questions (with answers) about chemistry. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
+include: "_default_template_yaml"
+task: "mmlu_pro_chemistry"
+task_alias: "chemistry"
+process_docs: !function utils.process_chemistry
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_computer_science.yaml ADDED
@@ -0,0 +1,5 @@
+description: "The following are multiple choice questions (with answers) about computer science. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
+include: "_default_template_yaml"
+task: "mmlu_pro_computer_science"
+task_alias: "computer_science"
+process_docs: !function utils.process_computer_science
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_economics.yaml ADDED
@@ -0,0 +1,5 @@
+description: "The following are multiple choice questions (with answers) about economics. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
+include: "_default_template_yaml"
+task: "mmlu_pro_economics"
+task_alias: "economics"
+process_docs: !function utils.process_economics
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_engineering.yaml ADDED
@@ -0,0 +1,5 @@
+description: "The following are multiple choice questions (with answers) about engineering. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
+include: "_default_template_yaml"
+task: "mmlu_pro_engineering"
+task_alias: "engineering"
+process_docs: !function utils.process_engineering
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_health.yaml ADDED
@@ -0,0 +1,5 @@
+description: "The following are multiple choice questions (with answers) about health. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
+include: "_default_template_yaml"
+task: "mmlu_pro_health"
+task_alias: "health"
+process_docs: !function utils.process_health
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_history.yaml ADDED
@@ -0,0 +1,5 @@
+description: "The following are multiple choice questions (with answers) about history. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
+include: "_default_template_yaml"
+task: "mmlu_pro_history"
+task_alias: "history"
+process_docs: !function utils.process_history
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_law.yaml ADDED
@@ -0,0 +1,5 @@
+description: "The following are multiple choice questions (with answers) about law. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
+include: "_default_template_yaml"
+task: "mmlu_pro_law"
+task_alias: "law"
+process_docs: !function utils.process_law