koichi12 commited on
Commit
e1a3cf8
·
verified ·
1 Parent(s): 3ebb242

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/README.md +94 -0
  2. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/direct_yaml +35 -0
  3. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_bn.yaml +12 -0
  4. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_de.yaml +12 -0
  5. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_en.yaml +12 -0
  6. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_es.yaml +12 -0
  7. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_fr.yaml +12 -0
  8. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_ja.yaml +12 -0
  9. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_ru.yaml +12 -0
  10. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_sw.yaml +12 -0
  11. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_te.yaml +12 -0
  12. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_th.yaml +12 -0
  13. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_zh.yaml +12 -0
  14. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/cot_yaml +36 -0
  15. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_bn.yaml +12 -0
  16. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_de.yaml +12 -0
  17. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_en.yaml +12 -0
  18. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_es.yaml +12 -0
  19. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_fr.yaml +12 -0
  20. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_ja.yaml +12 -0
  21. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_ru.yaml +12 -0
  22. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_sw.yaml +12 -0
  23. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_te.yaml +12 -0
  24. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_th.yaml +12 -0
  25. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_zh.yaml +12 -0
  26. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/gen_yaml.sh +5 -0
  27. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/cot_yaml +31 -0
  28. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_bn.yaml +24 -0
  29. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_de.yaml +24 -0
  30. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_en.yaml +24 -0
  31. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_es.yaml +24 -0
  32. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_fr.yaml +24 -0
  33. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_ja.yaml +24 -0
  34. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_ru.yaml +24 -0
  35. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_sw.yaml +24 -0
  36. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_te.yaml +24 -0
  37. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_th.yaml +24 -0
  38. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_zh.yaml +24 -0
  39. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/utils.py +228 -0
  40. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/README.md +59 -0
  41. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/_default_template_yaml +33 -0
  42. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_biology.yaml +5 -0
  43. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_business.yaml +5 -0
  44. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_chemistry.yaml +5 -0
  45. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_computer_science.yaml +5 -0
  46. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_economics.yaml +5 -0
  47. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_engineering.yaml +5 -0
  48. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_health.yaml +5 -0
  49. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_history.yaml +5 -0
  50. scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_law.yaml +5 -0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/README.md ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MGSM
2
+
3
+ ### Paper
4
+
5
+ Title: `Language Models are Multilingual Chain-of-Thought Reasoners`
6
+
7
+ Abstract: https://arxiv.org/abs/2210.03057
8
+
9
+ Multilingual Grade School Math Benchmark (MGSM) is a benchmark of grade-school math problems, proposed in the paper [Language models are multilingual chain-of-thought reasoners](http://arxiv.org/abs/2210.03057).
10
+
11
+ The same 250 problems from [GSM8K](https://arxiv.org/abs/2110.14168) are each translated via human annotators in 10 languages. The 10 languages are:
12
+ - Spanish
13
+ - French
14
+ - German
15
+ - Russian
16
+ - Chinese
17
+ - Japanese
18
+ - Thai
19
+ - Swahili
20
+ - Bengali
21
+ - Telugu
22
+
23
+ GSM8K (Grade School Math 8K) is a dataset of 8.5K high quality linguistically diverse grade school math word problems. The dataset was created to support the task of question answering on basic mathematical problems that require multi-step reasoning.
24
+
25
+ You can find the input and targets for each of the ten languages (and English) as `.tsv` files.
26
+ We also include few-shot exemplars that are also manually translated from each language in `exemplars.py`.
27
+
28
+ Homepage: https://github.com/google-research/url-nlp/tree/main/mgsm
29
+
30
+
31
+ ### Citation
32
+
33
+ ```
34
+ @misc{cobbe2021training,
35
+ title={Training Verifiers to Solve Math Word Problems},
36
+ author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},
37
+ year={2021},
38
+ eprint={2110.14168},
39
+ archivePrefix={arXiv},
40
+ primaryClass={cs.LG}
41
+ }
42
+ @misc{shi2022language,
43
+ title={Language Models are Multilingual Chain-of-Thought Reasoners},
44
+ author={Freda Shi and Mirac Suzgun and Markus Freitag and Xuezhi Wang and Suraj Srivats and Soroush Vosoughi and Hyung Won Chung and Yi Tay and Sebastian Ruder and Denny Zhou and Dipanjan Das and Jason Wei},
45
+ year={2022},
46
+ eprint={2210.03057},
47
+ archivePrefix={arXiv},
48
+ primaryClass={cs.CL}
49
+ }
50
+ ```
51
+
52
+ ### Groups and Tasks
53
+
54
+ #### Groups
55
+
56
+ * `mgsm_direct`: Direct question
57
+ * `mgsm_direct_bn`: Bengali
58
+ * `mgsm_direct_de`: German
59
+ * `mgsm_direct_en`: English
60
+ * `mgsm_direct_es`: Spanish
61
+ * `mgsm_direct_fr`: French
62
+ * `mgsm_direct_ja`: Japanese
63
+ * `mgsm_direct_ru`: Russian
64
+ * `mgsm_direct_sw`: Swahili
65
+ * `mgsm_direct_te`: Telugu
66
+ * `mgsm_direct_th`: Thai
67
+ * `mgsm_direct_zh`: Chinese
68
+ * `mgsm_cot_native`: Question followed by a chain-of-thought (CoT) answer prompt written in the same language as the dataset.
69
+ * `mgsm_cot_native_bn`: Bengali
70
+ * `mgsm_cot_native_de`: German
71
+ * `mgsm_cot_native_en`: English
72
+ * `mgsm_cot_native_es`: Spanish
73
+ * `mgsm_cot_native_fr`: French
74
+ * `mgsm_cot_native_ja`: Japanese
75
+ * `mgsm_cot_native_ru`: Russian
76
+ * `mgsm_cot_native_sw`: Swahili
77
+ * `mgsm_cot_native_te`: Telugu
78
+ * `mgsm_cot_native_th`: Thai
79
+ * `mgsm_cot_native_zh`: Chinese
80
+
81
+ Exemplar Samples: https://github.com/google-research/url-nlp/blob/main/mgsm/exemplars.py
82
+
83
+ ### Checklist
84
+
85
+ For adding novel benchmarks/datasets to the library:
86
+ * [ ] Is the task an existing benchmark in the literature?
87
+ * [ ] Have you referenced the original paper that introduced the task?
88
+ * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
89
+
90
+
91
+ If other tasks on this dataset are already supported:
92
+ * [ ] Is the "Main" variant of this task clearly denoted?
93
+ * [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
94
+ * [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/direct_yaml ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file will be included in the generated language-specific task configs.
2
+ # It doesn't have a yaml file extension as it is not meant to be imported directly
3
+ # by the harness.
4
+ group: mgsm_direct
5
+ dataset_path: juletxara/mgsm
6
+ dataset_name: null # Overridden by language-specific config.
7
+ output_type: generate_until
8
+ training_split: train
9
+ test_split: test
10
+ target_delimiter: ""
11
+ generation_kwargs:
12
+ until:
13
+ - "\n\n"
14
+ - "\n"
15
+ do_sample: false
16
+ temperature: 0.0
17
+ filter_list:
18
+ - name: remove_whitespace
19
+ filter:
20
+ - function: remove_whitespace
21
+ - function: take_first
22
+ - filter:
23
+ - function: regex
24
+ group_select: -1
25
+ regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
26
+ - function: take_first
27
+ name: flexible-extract
28
+ metric_list:
29
+ - metric: exact_match
30
+ aggregation: mean
31
+ higher_is_better: true
32
+ ignore_case: true
33
+ ignore_punctuation: true
34
+ metadata:
35
+ version: 2.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_bn.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: bn
3
+ doc_to_target: '{% if answer is not none %}{{answer[17:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"প্রশ্ন: "+question+"\nAnswer:"}}{% endif %}'
5
+ generation_kwargs:
6
+ do_sample: false
7
+ until:
8
+ - 'প্রশ্ন:'
9
+ - </s>
10
+ - <|im_end|>
11
+ include: direct_yaml
12
+ task: mgsm_direct_bn
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_de.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: de
3
+ doc_to_target: '{% if answer is not none %}{{answer[29:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nAntwort:"}}{% else %}{{"Frage: "+question+"\nAntwort:"}}{% endif %}'
5
+ generation_kwargs:
6
+ do_sample: false
7
+ until:
8
+ - 'Frage:'
9
+ - </s>
10
+ - <|im_end|>
11
+ include: direct_yaml
12
+ task: mgsm_direct_de
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_en.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: en
3
+ doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}'
5
+ generation_kwargs:
6
+ do_sample: false
7
+ until:
8
+ - 'Question:'
9
+ - </s>
10
+ - <|im_end|>
11
+ include: direct_yaml
12
+ task: mgsm_direct_en
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_es.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: es
3
+ doc_to_target: '{% if answer is not none %}{{answer[23:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nRespuesta:"}}{% else %}{{"Pregunta: "+question+"\nRespuesta:"}}{% endif %}'
5
+ generation_kwargs:
6
+ do_sample: false
7
+ until:
8
+ - 'Pregunta:'
9
+ - </s>
10
+ - <|im_end|>
11
+ include: direct_yaml
12
+ task: mgsm_direct_es
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_fr.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: fr
3
+ doc_to_target: '{% if answer is not none %}{{answer[26:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nRéponse :"}}{% else %}{{"Question : "+question+"\nRéponse :"}}{% endif %}'
5
+ generation_kwargs:
6
+ do_sample: false
7
+ until:
8
+ - 'Question :'
9
+ - </s>
10
+ - <|im_end|>
11
+ include: direct_yaml
12
+ task: mgsm_direct_fr
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_ja.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: ja
3
+ doc_to_target: '{% if answer is not none %}{{answer[11:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"問題: "+question+"\nAnswer:"}}{% endif %}'
5
+ generation_kwargs:
6
+ do_sample: false
7
+ until:
8
+ - '問題:'
9
+ - </s>
10
+ - <|im_end|>
11
+ include: direct_yaml
12
+ task: mgsm_direct_ja
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_ru.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: ru
3
+ doc_to_target: '{% if answer is not none %}{{answer[18:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Задача: "+question+"\nAnswer:"}}{% endif %}'
5
+ generation_kwargs:
6
+ do_sample: false
7
+ until:
8
+ - 'Задача:'
9
+ - </s>
10
+ - <|im_end|>
11
+ include: direct_yaml
12
+ task: mgsm_direct_ru
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_sw.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: sw
3
+ doc_to_target: '{% if answer is not none %}{{answer[25:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Swali: "+question+"\nAnswer:"}}{% endif %}'
5
+ generation_kwargs:
6
+ do_sample: false
7
+ until:
8
+ - 'Swali:'
9
+ - </s>
10
+ - <|im_end|>
11
+ include: direct_yaml
12
+ task: mgsm_direct_sw
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_te.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: te
3
+ doc_to_target: '{% if answer is not none %}{{answer[19:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"ప్రశ్న: "+question+"\nAnswer:"}}{% endif %}'
5
+ generation_kwargs:
6
+ do_sample: false
7
+ until:
8
+ - 'ప్రశ్న:'
9
+ - </s>
10
+ - <|im_end|>
11
+ include: direct_yaml
12
+ task: mgsm_direct_te
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_th.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: th
3
+ doc_to_target: '{% if answer is not none %}{{answer[18:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"โจทย์: "+question+"\nAnswer:"}}{% endif %}'
5
+ generation_kwargs:
6
+ do_sample: false
7
+ until:
8
+ - 'โจทย์:'
9
+ - </s>
10
+ - <|im_end|>
11
+ include: direct_yaml
12
+ task: mgsm_direct_th
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/direct/mgsm_direct_zh.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: zh
3
+ doc_to_target: '{% if answer is not none %}{{answer[6:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"问题: "+question+"\nAnswer:"}}{% endif %}'
5
+ generation_kwargs:
6
+ do_sample: false
7
+ until:
8
+ - '问题:'
9
+ - </s>
10
+ - <|im_end|>
11
+ include: direct_yaml
12
+ task: mgsm_direct_zh
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/cot_yaml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file will be included in the generated language-specific task configs.
2
+ # It doesn't have a yaml file extension as it is not meant to be imported directly
3
+ # by the harness.
4
+ group: mgsm_en_cot
5
+ dataset_path: juletxara/mgsm
6
+ dataset_name: null # Overridden by language-specific config.
7
+ output_type: generate_until
8
+ training_split: train
9
+ test_split: test
10
+ generation_kwargs:
11
+ until:
12
+ - "\n\n"
13
+ - "\n"
14
+ do_sample: false
15
+ temperature: 0.0
16
+ target_delimiter: " "
17
+ metric_list:
18
+ - metric: exact_match
19
+ aggregation: mean
20
+ higher_is_better: true
21
+ ignore_case: true
22
+ ignore_punctuation: true
23
+ filter_list:
24
+ - name: "strict-match"
25
+ filter:
26
+ - function: "regex"
27
+ regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)"
28
+ - function: "take_first"
29
+ - filter:
30
+ - function: regex
31
+ group_select: -1
32
+ regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
33
+ - function: take_first
34
+ name: flexible-extract
35
+ metadata:
36
+ version: 2.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_bn.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: bn
3
+ doc_to_target: '{% if answer is not none %}{{answer[17:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"প্রশ্ন: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
5
+ generation_kwargs:
6
+ do_sample: false
7
+ until:
8
+ - 'প্রশ্ন:'
9
+ - </s>
10
+ - <|im_end|>
11
+ include: cot_yaml
12
+ task: mgsm_en_cot_bn
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_de.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: de
3
+ doc_to_target: '{% if answer is not none %}{{answer[29:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Frage: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
5
+ generation_kwargs:
6
+ do_sample: false
7
+ until:
8
+ - 'Frage:'
9
+ - </s>
10
+ - <|im_end|>
11
+ include: cot_yaml
12
+ task: mgsm_en_cot_de
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_en.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: en
3
+ doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
5
+ generation_kwargs:
6
+ do_sample: false
7
+ until:
8
+ - 'Question:'
9
+ - </s>
10
+ - <|im_end|>
11
+ include: cot_yaml
12
+ task: mgsm_en_cot_en
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_es.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: es
3
+ doc_to_target: '{% if answer is not none %}{{answer[23:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Pregunta: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
5
+ generation_kwargs:
6
+ do_sample: false
7
+ until:
8
+ - 'Pregunta:'
9
+ - </s>
10
+ - <|im_end|>
11
+ include: cot_yaml
12
+ task: mgsm_en_cot_es
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_fr.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: fr
3
+ doc_to_target: '{% if answer is not none %}{{answer[26:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question : "+question+"\nStep-by-Step Answer:"}}{% endif %}'
5
+ generation_kwargs:
6
+ do_sample: false
7
+ until:
8
+ - 'Question :'
9
+ - </s>
10
+ - <|im_end|>
11
+ include: cot_yaml
12
+ task: mgsm_en_cot_fr
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_ja.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: ja
3
+ doc_to_target: '{% if answer is not none %}{{answer[11:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"問題: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
5
+ generation_kwargs:
6
+ do_sample: false
7
+ until:
8
+ - '問題:'
9
+ - </s>
10
+ - <|im_end|>
11
+ include: cot_yaml
12
+ task: mgsm_en_cot_ja
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_ru.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: ru
3
+ doc_to_target: '{% if answer is not none %}{{answer[18:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Задача: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
5
+ generation_kwargs:
6
+ do_sample: false
7
+ until:
8
+ - 'Задача:'
9
+ - </s>
10
+ - <|im_end|>
11
+ include: cot_yaml
12
+ task: mgsm_en_cot_ru
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_sw.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: sw
3
+ doc_to_target: '{% if answer is not none %}{{answer[25:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Swali: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
5
+ generation_kwargs:
6
+ do_sample: false
7
+ until:
8
+ - 'Swali:'
9
+ - </s>
10
+ - <|im_end|>
11
+ include: cot_yaml
12
+ task: mgsm_en_cot_sw
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_te.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: te
3
+ doc_to_target: '{% if answer is not none %}{{answer[19:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"ప్రశ్న: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
5
+ generation_kwargs:
6
+ do_sample: false
7
+ until:
8
+ - 'ప్రశ్న:'
9
+ - </s>
10
+ - <|im_end|>
11
+ include: cot_yaml
12
+ task: mgsm_en_cot_te
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_th.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: th
3
+ doc_to_target: '{% if answer is not none %}{{answer[18:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"โจทย์: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
5
+ generation_kwargs:
6
+ do_sample: false
7
+ until:
8
+ - 'โจทย์:'
9
+ - </s>
10
+ - <|im_end|>
11
+ include: cot_yaml
12
+ task: mgsm_en_cot_th
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_zh.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: zh
3
+ doc_to_target: '{% if answer is not none %}{{answer[6:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"问题: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
5
+ generation_kwargs:
6
+ do_sample: false
7
+ until:
8
+ - '问题:'
9
+ - </s>
10
+ - <|im_end|>
11
+ include: cot_yaml
12
+ task: mgsm_en_cot_zh
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/gen_yaml.sh ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ python utils.py --overwrite --output-dir direct --mode direct
4
+ python utils.py --overwrite --output-dir en_cot --mode en-cot
5
+ python utils.py --overwrite --output-dir native_cot --mode native-cot
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/cot_yaml ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file will be included in the generated language-specific task configs.
2
+ # It doesn't have a yaml file extension as it is not meant to be imported directly
3
+ # by the harness.
4
+ group: mgsm_cot_native
5
+ dataset_path: juletxara/mgsm
6
+ dataset_name: null # Overridden by language-specific config.
7
+ output_type: generate_until
8
+ training_split: train
9
+ test_split: test
10
+ # target_delimiter: ""
11
+ generation_kwargs:
12
+ until:
13
+ - "\n\n"
14
+ - "\n"
15
+ do_sample: false
16
+ temperature: 0.0
17
+ target_delimiter: " "
18
+ metric_list:
19
+ - metric: exact_match
20
+ aggregation: mean
21
+ higher_is_better: true
22
+ ignore_case: true
23
+ ignore_punctuation: true
24
+ filter_list:
25
+ - name: "get-answer"
26
+ filter:
27
+ - function: "regex"
28
+ regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)"
29
+ - function: "take_first"
30
+ metadata:
31
+ version: 3.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_bn.yaml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: bn
3
+ doc_to_target: '{% if answer is not none %}{{answer[17:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nধাপে ধাপে উত্তর:"}}{% else %}{{"প্রশ্ন: "+question+"\nধাপে ধাপে উত্তর:"}}{% endif %}'
5
+ filter_list:
6
+ - filter:
7
+ - function: regex
8
+ regex_pattern: The answer is (\-?[0-9\.\,]+)
9
+ - function: take_first
10
+ name: strict-match
11
+ - filter:
12
+ - function: regex
13
+ group_select: -1
14
+ regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
15
+ - function: take_first
16
+ name: flexible-extract
17
+ generation_kwargs:
18
+ do_sample: false
19
+ until:
20
+ - 'প্রশ্ন:'
21
+ - </s>
22
+ - <|im_end|>
23
+ include: cot_yaml
24
+ task: mgsm_native_cot_bn
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_de.yaml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: de
3
+ doc_to_target: '{% if answer is not none %}{{answer[29:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nSchritt-für-Schritt-Antwort:"}}{% else %}{{"Frage: "+question+"\nSchritt-für-Schritt-Antwort:"}}{% endif %}'
5
+ filter_list:
6
+ - filter:
7
+ - function: regex
8
+ regex_pattern: Die Antwort lautet (\-?[0-9\.\,]+)
9
+ - function: take_first
10
+ name: strict-match
11
+ - filter:
12
+ - function: regex
13
+ group_select: -1
14
+ regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
15
+ - function: take_first
16
+ name: flexible-extract
17
+ generation_kwargs:
18
+ do_sample: false
19
+ until:
20
+ - 'Frage:'
21
+ - </s>
22
+ - <|im_end|>
23
+ include: cot_yaml
24
+ task: mgsm_native_cot_de
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_en.yaml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: en
3
+ doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
5
+ filter_list:
6
+ - filter:
7
+ - function: regex
8
+ regex_pattern: The answer is (\-?[0-9\.\,]+)
9
+ - function: take_first
10
+ name: strict-match
11
+ - filter:
12
+ - function: regex
13
+ group_select: -1
14
+ regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
15
+ - function: take_first
16
+ name: flexible-extract
17
+ generation_kwargs:
18
+ do_sample: false
19
+ until:
20
+ - 'Question:'
21
+ - </s>
22
+ - <|im_end|>
23
+ include: cot_yaml
24
+ task: mgsm_native_cot_en
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_es.yaml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: es
3
+ doc_to_target: '{% if answer is not none %}{{answer[23:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nRespuesta paso a paso:"}}{% else %}{{"Pregunta: "+question+"\nRespuesta paso a paso:"}}{% endif %}'
5
+ filter_list:
6
+ - filter:
7
+ - function: regex
8
+ regex_pattern: La respuesta es (\-?[0-9\.\,]+)
9
+ - function: take_first
10
+ name: strict-match
11
+ - filter:
12
+ - function: regex
13
+ group_select: -1
14
+ regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
15
+ - function: take_first
16
+ name: flexible-extract
17
+ generation_kwargs:
18
+ do_sample: false
19
+ until:
20
+ - 'Pregunta:'
21
+ - </s>
22
+ - <|im_end|>
23
+ include: cot_yaml
24
+ task: mgsm_native_cot_es
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_fr.yaml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: fr
3
+ doc_to_target: '{% if answer is not none %}{{answer[26:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nRéponse étape par étape :"}}{% else %}{{"Question : "+question+"\nRéponse étape par étape :"}}{% endif %}'
5
+ filter_list:
6
+ - filter:
7
+ - function: regex
8
+ regex_pattern: La réponse est (\-?[0-9\.\,]+)
9
+ - function: take_first
10
+ name: strict-match
11
+ - filter:
12
+ - function: regex
13
+ group_select: -1
14
+ regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
15
+ - function: take_first
16
+ name: flexible-extract
17
+ generation_kwargs:
18
+ do_sample: false
19
+ until:
20
+ - 'Question :'
21
+ - </s>
22
+ - <|im_end|>
23
+ include: cot_yaml
24
+ task: mgsm_native_cot_fr
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_ja.yaml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: ja
3
+ doc_to_target: '{% if answer is not none %}{{answer[11:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nステップごとの答え:"}}{% else %}{{"問題: "+question+"\nステップごとの答え:"}}{% endif %}'
5
+ filter_list:
6
+ - filter:
7
+ - function: regex
8
+ regex_pattern: 答えは(\-?[0-9\.\,]+)です。
9
+ - function: take_first
10
+ name: strict-match
11
+ - filter:
12
+ - function: regex
13
+ group_select: -1
14
+ regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
15
+ - function: take_first
16
+ name: flexible-extract
17
+ generation_kwargs:
18
+ do_sample: false
19
+ until:
20
+ - '問題:'
21
+ - </s>
22
+ - <|im_end|>
23
+ include: cot_yaml
24
+ task: mgsm_native_cot_ja
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_ru.yaml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: ru
3
+ doc_to_target: '{% if answer is not none %}{{answer[18:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nПошаговоерешение:"}}{% else %}{{"Задача: "+question+"\nПошаговоерешение:"}}{% endif %}'
5
+ filter_list:
6
+ - filter:
7
+ - function: regex
8
+ regex_pattern: Ответ — (\-?[0-9\.\,]+)
9
+ - function: take_first
10
+ name: strict-match
11
+ - filter:
12
+ - function: regex
13
+ group_select: -1
14
+ regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
15
+ - function: take_first
16
+ name: flexible-extract
17
+ generation_kwargs:
18
+ do_sample: false
19
+ until:
20
+ - 'Задача:'
21
+ - </s>
22
+ - <|im_end|>
23
+ include: cot_yaml
24
+ task: mgsm_native_cot_ru
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_sw.yaml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: sw
3
+ doc_to_target: '{% if answer is not none %}{{answer[25:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nJibu la Hatua kwa Hatua:"}}{% else %}{{"Swali: "+question+"\nJibu la Hatua kwa Hatua:"}}{% endif %}'
5
+ filter_list:
6
+ - filter:
7
+ - function: regex
8
+ regex_pattern: Jibu ni (\-?[0-9\.\,]+)
9
+ - function: take_first
10
+ name: strict-match
11
+ - filter:
12
+ - function: regex
13
+ group_select: -1
14
+ regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
15
+ - function: take_first
16
+ name: flexible-extract
17
+ generation_kwargs:
18
+ do_sample: false
19
+ until:
20
+ - 'Swali:'
21
+ - </s>
22
+ - <|im_end|>
23
+ include: cot_yaml
24
+ task: mgsm_native_cot_sw
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_te.yaml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: te
3
+ doc_to_target: '{% if answer is not none %}{{answer[19:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nదశలవారీగా సమాధానం:"}}{% else %}{{"ప్రశ్న: "+question+"\nదశలవారీగా సమాధానం:"}}{% endif %}'
5
+ filter_list:
6
+ - filter:
7
+ - function: regex
8
+ regex_pattern: సమాధానం (\-?[0-9\.\,]+)
9
+ - function: take_first
10
+ name: strict-match
11
+ - filter:
12
+ - function: regex
13
+ group_select: -1
14
+ regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
15
+ - function: take_first
16
+ name: flexible-extract
17
+ generation_kwargs:
18
+ do_sample: false
19
+ until:
20
+ - 'ప్రశ్న:'
21
+ - </s>
22
+ - <|im_end|>
23
+ include: cot_yaml
24
+ task: mgsm_native_cot_te
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_th.yaml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by utils.py
2
+ dataset_name: th
3
+ doc_to_target: '{% if answer is not none %}{{answer[18:]}}{% else %}{{answer_number|string}}{% endif %}'
4
+ doc_to_text: '{% if answer is not none %}{{question+"\nคำตอบทีละขั้นตอน:"}}{% else %}{{"โจทย์: "+question+"\nคำตอบทีละขั้นตอน:"}}{% endif %}'
5
+ filter_list:
6
+ - filter:
7
+ - function: regex
8
+ regex_pattern: คำตอบคือ (\-?[0-9\.\,]+)
9
+ - function: take_first
10
+ name: strict-match
11
+ - filter:
12
+ - function: regex
13
+ group_select: -1
14
+ regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
15
+ - function: take_first
16
+ name: flexible-extract
17
+ generation_kwargs:
18
+ do_sample: false
19
+ until:
20
+ - 'โจทย์:'
21
+ - </s>
22
+ - <|im_end|>
23
+ include: cot_yaml
24
+ task: mgsm_native_cot_th
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_zh.yaml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Generated by utils.py
dataset_name: zh
doc_to_target: '{% if answer is not none %}{{answer[6:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\n逐步解答:"}}{% else %}{{"问题: "+question+"\n逐步解答:"}}{% endif %}'
filter_list:
- filter:
  - function: regex
    regex_pattern: 答案是 (\-?[0-9\.\,]+)。
  - function: take_first
  name: strict-match
- filter:
  - function: regex
    group_select: -1
    regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
  - function: take_first
  name: flexible-extract
generation_kwargs:
  do_sample: false
  until:
  - '问题:'
  - </s>
  - <|im_end|>
include: cot_yaml
# Chinese text takes no whitespace between the prompt and the target.
target_delimiter: ''
task: mgsm_native_cot_zh
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mgsm/utils.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+ import yaml
4
+
5
+
6
+ LANGUAGES = {
7
+ "bn": { # Bengali
8
+ # "QUESTION": "প্রশ্ন:",
9
+ "QUESTION": "\u09aa\u09cd\u09b0\u09b6\u09cd\u09a8:",
10
+ # "ANSWER": "ধাপে ধাপে উত্তর:",
11
+ "ANSWER": "\u09a7\u09be\u09aa\u09c7 \u09a7\u09be\u09aa\u09c7 \u0989\u09a4\u09cd\u09a4\u09b0:",
12
+ "DIRECT": "Answer:",
13
+ "REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
14
+ },
15
+ "de": { # German
16
+ "QUESTION": "Frage:",
17
+ # "ANSWER": "Schritt-für-Schritt-Antwort:",
18
+ "ANSWER": "Schritt-f\u00fcr-Schritt-Antwort:",
19
+ "DIRECT": "Antwort:",
20
+ "REGEX": "Die Antwort lautet (\\-?[0-9\\.\\,]+)",
21
+ },
22
+ "en": { # English
23
+ "QUESTION": "Question:",
24
+ "ANSWER": "Step-by-Step Answer:",
25
+ "DIRECT": "Answer:",
26
+ "REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
27
+ },
28
+ "es": { # Spanish
29
+ "QUESTION": "Pregunta:",
30
+ "ANSWER": "Respuesta paso a paso:",
31
+ "DIRECT": "Respuesta:",
32
+ "REGEX": "La respuesta es (\\-?[0-9\\.\\,]+)",
33
+ },
34
+ "fr": { # French
35
+ "QUESTION": "Question :",
36
+ # "ANSWER": "Réponse étape par étape :"
37
+ "ANSWER": "R\u00e9ponse \u00e9tape par \u00e9tape :",
38
+ # "DIRECT": "Réponse :",
39
+ "DIRECT": "R\u00e9ponse :",
40
+ # "REGEX": "La réponse est (\\-?[0-9\\.\\,]+)",
41
+ "REGEX": "La r\u00e9ponse est (\\-?[0-9\\.\\,]+)",
42
+ },
43
+ "ru": { # Russian
44
+ # "QUESTION": "Задача:",
45
+ "QUESTION": "\u0417\u0430\u0434\u0430\u0447\u0430:",
46
+ # "ANSWER": "Пошаговоерешение:",
47
+ "ANSWER": "\u041f\u043e\u0448\u0430\u0433\u043e\u0432\u043e\u0435\u0440\u0435\u0448\u0435\u043d\u0438\u0435:",
48
+ "DIRECT": "Answer:",
49
+ # "REGEX": "Ответ — (\\-?[0-9\\.\\,]+)",
50
+ "REGEX": "\u041e\u0442\u0432\u0435\u0442 \u2014 (\\-?[0-9\\.\\,]+)",
51
+ },
52
+ "sw": { # Swahili
53
+ "QUESTION": "Swali:",
54
+ "ANSWER": "Jibu la Hatua kwa Hatua:",
55
+ "DIRECT": "Answer:",
56
+ "REGEX": "Jibu ni (\\-?[0-9\\.\\,]+)",
57
+ },
58
+ "te": { # Telugu
59
+ # "QUESTION": "ప్రశ్న:",
60
+ "QUESTION": "\u0c2a\u0c4d\u0c30\u0c36\u0c4d\u0c28:",
61
+ # "ANSWER": "దశలవారీగా సమాధానం:",
62
+ "ANSWER": "\u0c26\u0c36\u0c32\u0c35\u0c3e\u0c30\u0c40\u0c17\u0c3e \u0c38\u0c2e\u0c3e\u0c27\u0c3e\u0c28\u0c02:",
63
+ "DIRECT": "Answer:",
64
+ # "REGEX": "సమాధానం (\\-?[0-9\\.\\,]+)",
65
+ "REGEX": "\u0c38\u0c2e\u0c3e\u0c27\u0c3e\u0c28\u0c02 (\\-?[0-9\\.\\,]+)",
66
+ },
67
+ "th": { # Thai
68
+ # "QUESTION": "โจทย์:",
69
+ "QUESTION": "\u0e42\u0e08\u0e17\u0e22\u0e4c:",
70
+ # "ANSWER": "คำตอบทีละขั้นตอน:",
71
+ "ANSWER": "\u0e04\u0e33\u0e15\u0e2d\u0e1a\u0e17\u0e35\u0e25\u0e30\u0e02\u0e31\u0e49\u0e19\u0e15\u0e2d\u0e19:",
72
+ "DIRECT": "Answer:",
73
+ # "REGEX": "คำตอบคือ (\\-?[0-9\\.\\,]+)",
74
+ "REGEX": "\u0e04\u0e33\u0e15\u0e2d\u0e1a\u0e04\u0e37\u0e2d (\\-?[0-9\\.\\,]+)",
75
+ },
76
+ "ja": { # Japanese
77
+ # "QUESTION": "問題:",
78
+ "QUESTION": "\u554f\u984c:",
79
+ # "ANSWER": "ステップごとの答え:",
80
+ "ANSWER": "\u30b9\u30c6\u30c3\u30d7\u3054\u3068\u306e\u7b54\u3048:",
81
+ "DIRECT": "Answer:",
82
+ # "REGEX": "答えは(\\-?[0-9\\.\\,]+)です。",
83
+ "REGEX": "\u7b54\u3048\u306f(\\-?[0-9\\.\\,]+)\u3067\u3059\u3002",
84
+ },
85
+ "zh": { # Chinese
86
+ # "QUESTION": "问题:",
87
+ "QUESTION": "\u95ee\u9898:",
88
+ # "ANSWER": "逐步解答:",
89
+ "ANSWER": "\u9010\u6b65\u89e3\u7b54:",
90
+ "DIRECT": "Answer:",
91
+ # "REGEX": "答案是 (\\-?[0-9\\.\\,]+)。",
92
+ "REGEX": "\u7b54\u6848\u662f (\\-?[0-9\\.\\,]+)\u3002",
93
+ },
94
+ }
95
+
96
+
97
def add_regex_pattern(regex_pattern):
    """Build the ``filter_list`` config entry for answer extraction.

    :param regex_pattern: Language-specific pattern for the "strict-match"
        filter, or ``None`` to request no filters at all.
    :return: ``{}`` when ``regex_pattern`` is ``None``; otherwise a dict with a
        ``filter_list`` containing a strict-match filter built from
        ``regex_pattern`` plus a flexible-extract fallback that selects the
        last number-like token in the output.
    """
    if regex_pattern is None:
        return {}

    # Exact, language-specific answer sentence (e.g. "The answer is 42").
    strict_match = {
        "name": "strict-match",
        "filter": [
            {"function": "regex", "regex_pattern": f"""{regex_pattern}"""},
            {"function": "take_first"},
        ],
    }
    # Fallback: grab the last numeric-looking token anywhere in the output.
    flexible_extract = {
        "name": "flexible-extract",
        "filter": [
            {
                "function": "regex",
                "regex_pattern": """(-?[$0-9.,]{2,})|(-?[0-9]+)""",
                "group_select": -1,
            },
            {"function": "take_first"},
        ],
    }
    return {"filter_list": [strict_match, flexible_extract]}
129
+
130
+
131
def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
    """
    Generate a yaml file for each language.

    :param output_dir: The directory to output the files to.
    :param overwrite: Whether to overwrite files if they already exist.
    :param mode: Prompting style — "direct" (answer only), "native-cot"
        (chain-of-thought in the task language), or "en-cot"
        (chain-of-thought in English).
    :raises ValueError: If ``mode`` is not one of the supported modes.
    :raises FileExistsError: After attempting all languages, if any target
        file already existed and ``overwrite`` was False.
    """
    err = []
    for lang in LANGUAGES.keys():
        try:
            QUESTION = LANGUAGES[lang]["QUESTION"]

            yaml_template = "cot_yaml"
            filter_list = {}
            DELIMITER = None
            if mode == "direct":
                ANSWER = LANGUAGES[lang]["DIRECT"]
                REGEX = None
                task_name = f"mgsm_direct_{lang}"
                yaml_template = "direct_yaml"
            elif mode == "native-cot":
                ANSWER = LANGUAGES[lang]["ANSWER"]
                REGEX = LANGUAGES[lang]["REGEX"]
                task_name = f"mgsm_native_cot_{lang}"
                filter_list = add_regex_pattern(REGEX)
                # zh/ja prompts use no whitespace between context and target.
                DELIMITER = "" if lang in ["zh", "ja"] else None
            elif mode == "en-cot":
                ANSWER = LANGUAGES["en"]["ANSWER"]
                REGEX = LANGUAGES["en"]["REGEX"]
                task_name = f"mgsm_en_cot_{lang}"
            else:
                # Previously an unknown mode crashed later with NameError;
                # fail fast with a clear message instead.
                raise ValueError(f"Unsupported mode: {mode!r}")

            file_name = f"{task_name}.yaml"
            # The dataset's "answer" column repeats the native ANSWER prefix;
            # skip it (plus the following space) when extracting the target.
            ANSWER_TO_SKIP = len(LANGUAGES[lang]["ANSWER"]) + 1
            with open(
                f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf8"
            ) as f:
                f.write("# Generated by utils.py\n")
                yaml.dump(
                    {
                        "include": yaml_template,
                        "dataset_name": lang,
                        "task": f"{task_name}",
                        "doc_to_text": f"""{{% if answer is not none %}}"""
                        f"""{{{{question+"\\n{ANSWER}"}}}}"""
                        f"""{{% else %}}"""
                        f"""{{{{"{QUESTION} "+question+"\\n{ANSWER}"}}}}"""
                        f"""{{% endif %}}""",
                        "doc_to_target": f"""{{% if answer is not none %}}"""
                        f"""{{{{answer[{ANSWER_TO_SKIP}:]}}}}"""
                        f"""{{% else %}}"""
                        f"""{{{{answer_number|string}}}}"""
                        f"""{{% endif %}}""",
                        **filter_list,
                        "generation_kwargs": {
                            "until": [QUESTION, "</s>", "<|im_end|>"],
                            "do_sample": False,
                        },
                        # BUG FIX: the empty string is falsy, so
                        # ``if DELIMITER`` silently dropped the intended
                        # ``target_delimiter: ''`` for zh/ja. Compare against
                        # None so the empty delimiter is emitted.
                        **({"target_delimiter": DELIMITER} if DELIMITER is not None else {}),
                    },
                    f,
                    allow_unicode=True,
                    width=float("inf"),
                )
        except FileExistsError:
            err.append(file_name)

    if len(err) > 0:
        raise FileExistsError(
            "Files were not created because they already exist (use --overwrite flag):"
            f" {', '.join(err)}"
        )
202
+
203
+
204
def main() -> None:
    """Parse CLI args and generate language-specific yaml files."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--overwrite",
        default=False,
        action="store_true",
        help="Overwrite files if they already exist",
    )
    arg_parser.add_argument(
        "--output-dir", default=".", help="Directory to write yaml files to"
    )
    arg_parser.add_argument(
        "--mode",
        default="native-cot",
        choices=["direct", "native-cot", "en-cot"],
        help="Mode of chain-of-thought",
    )
    parsed = arg_parser.parse_args()

    gen_lang_yamls(
        output_dir=parsed.output_dir,
        overwrite=parsed.overwrite,
        mode=parsed.mode,
    )
225
+
226
+
227
# Script entry point: python utils.py [--mode ...] [--output-dir ...] [--overwrite]
if __name__ == "__main__":
    main()
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/README.md ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mmlu_pro
2
+
3
+ ### Paper
4
+
5
+ Title: `MMLU-Pro: A More Robust and Challenging Multi-Task Language Understanding Benchmark`
6
+
7
+ Abstract: `In the age of large-scale language models, benchmarks like the Massive Multitask Language Understanding (MMLU) have been pivotal in pushing the boundaries of what AI can achieve in language comprehension and reasoning across diverse domains. However, as models continue to improve, their performance on these benchmarks has begun to plateau, making it increasingly difficult to discern differences in model capabilities. This paper introduces MMLU-Pro, an enhanced dataset designed to extend the mostly knowledge-driven MMLU benchmark by integrating more challenging, reasoning-focused questions and expanding the choice set from four to ten options. Additionally, MMLU-Pro eliminates the trivial and noisy questions in MMLU. Our experimental results show that MMLU-Pro not only raises the challenge, causing a significant drop in accuracy by 16% to 33% compared to MMLU but also demonstrates greater stability under varying prompts. With 24 different prompt styles tested, the sensitivity of model scores to prompt variations decreased from 4-5% in MMLU to just 2% in MMLU-Pro. Additionally, we found that models utilizing Chain of Thought (CoT) reasoning achieved better performance on MMLU-Pro compared to direct answering, which is in stark contrast to the findings on the original MMLU, indicating that MMLU-Pro includes more complex reasoning questions. Our assessments confirm that MMLU-Pro is a more discriminative benchmark to better track progress in the field.`
8
+
9
+ Homepage: https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro
10
+
11
+ ### Citation
12
+
13
+ ```bibtex
14
+ @misc{wang2024mmlupro,
15
+ title={MMLU-Pro: A More Robust and Challenging Multi-Task Language Understanding Benchmark},
16
+ author={Yubo Wang and Xueguang Ma and Ge Zhang and Yuansheng Ni and Abhranil Chandra and Shiguang Guo and Weiming Ren and Aaran Arulraj and Xuan He and Ziyan Jiang and Tianle Li and Max Ku and Kai Wang and Alex Zhuang and Rongqi Fan and Xiang Yue and Wenhu Chen},
17
+ year={2024},
18
+ eprint={2406.01574},
19
+ archivePrefix={arXiv},
20
+ primaryClass={cs.CL}
21
+ }
22
+ ```
23
+
24
+ ### Groups and Tasks
25
+
26
+ #### Groups
27
+
28
+ * `mmlu_pro`: 'All 14 subjects of the mmlu_pro dataset, evaluated following the methodology in mmlu's original implementation'
29
+
30
+ #### Tasks
31
+
32
+ The following tasks evaluate subjects in the mmlu_pro dataset
33
+ - `mmlu_pro_biology`
34
+ - `mmlu_pro_business`
35
+ - `mmlu_pro_chemistry`
36
+ - `mmlu_pro_computer_science`
37
+ - `mmlu_pro_economics`
38
+ - `mmlu_pro_engineering`
39
+ - `mmlu_pro_health`
40
+ - `mmlu_pro_history`
41
+ - `mmlu_pro_law`
42
+ - `mmlu_pro_math`
43
+ - `mmlu_pro_other`
44
+ - `mmlu_pro_philosophy`
45
+ - `mmlu_pro_physics`
46
+ - `mmlu_pro_psychology`
47
+
48
+ ### Checklist
49
+
50
+ For adding novel benchmarks/datasets to the library:
51
+ * [x] Is the task an existing benchmark in the literature?
52
+ * [x] Have you referenced the original paper that introduced the task?
53
+ * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
54
+
55
+
56
+ If other tasks on this dataset are already supported:
57
+ * [ ] Is the "Main" variant of this task clearly denoted?
58
+ * [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
59
+ * [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/_default_template_yaml ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
dataset_path: TIGER-Lab/MMLU-Pro
test_split: test
fewshot_split: validation
fewshot_config:
  sampler: first_n
  doc_to_text: !function utils.fewshot_to_text
  doc_to_target: ""
output_type: generate_until
doc_to_text: !function utils.doc_to_text
doc_to_target: answer
filter_list:
  - name: "custom-extract"
    filter:
      - function: "regex"
        # BUG FIX: Python raw-string syntax (r"...") had leaked into this YAML
        # value, so the literal characters r" were part of the pattern and the
        # filter could never match a model answer.
        regex_pattern: 'answer is \(?([ABCDEFGHIJ])\)?'
        # regex_pattern: '.*[aA]nswer:\s*([A-J])'
      - function: "take_first"
generation_kwargs:
  until:
    - "</s>"
    - "Q:"
    - "<|im_end|>"
  do_sample: false
  temperature: 0.0
num_fewshot: 5
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
metadata:
  version: 0.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_biology.yaml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ description: "The following are multiple choice questions (with answers) about biology. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
2
+ include: "_default_template_yaml"
3
+ task: "mmlu_pro_biology"
4
+ task_alias: "biology"
5
+ process_docs: !function utils.process_biology
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_business.yaml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ description: "The following are multiple choice questions (with answers) about business. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
2
+ include: "_default_template_yaml"
3
+ task: "mmlu_pro_business"
4
+ task_alias: "business"
5
+ process_docs: !function utils.process_business
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_chemistry.yaml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ description: "The following are multiple choice questions (with answers) about chemistry. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
2
+ include: "_default_template_yaml"
3
+ task: "mmlu_pro_chemistry"
4
+ task_alias: "chemistry"
5
+ process_docs: !function utils.process_chemistry
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_computer_science.yaml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ description: "The following are multiple choice questions (with answers) about computer science. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
2
+ include: "_default_template_yaml"
3
+ task: "mmlu_pro_computer_science"
4
+ task_alias: "computer_science"
5
+ process_docs: !function utils.process_computer_science
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_economics.yaml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ description: "The following are multiple choice questions (with answers) about economics. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
2
+ include: "_default_template_yaml"
3
+ task: "mmlu_pro_economics"
4
+ task_alias: "economics"
5
+ process_docs: !function utils.process_economics
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_engineering.yaml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ description: "The following are multiple choice questions (with answers) about engineering. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
2
+ include: "_default_template_yaml"
3
+ task: "mmlu_pro_engineering"
4
+ task_alias: "engineering"
5
+ process_docs: !function utils.process_engineering
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_health.yaml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ description: "The following are multiple choice questions (with answers) about health. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
2
+ include: "_default_template_yaml"
3
+ task: "mmlu_pro_health"
4
+ task_alias: "health"
5
+ process_docs: !function utils.process_health
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_history.yaml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ description: "The following are multiple choice questions (with answers) about history. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
2
+ include: "_default_template_yaml"
3
+ task: "mmlu_pro_history"
4
+ task_alias: "history"
5
+ process_docs: !function utils.process_history
scripts/yans/lm-evaluation-harness/lm_eval/tasks/mmlu_pro/mmlu_pro_law.yaml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ description: "The following are multiple choice questions (with answers) about law. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
2
+ include: "_default_template_yaml"
3
+ task: "mmlu_pro_law"
4
+ task_alias: "law"
5
+ process_docs: !function utils.process_law