diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..cca14d968d2d87312d48fdb031e4a3518c9f915a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/README.md @@ -0,0 +1,52 @@ +# AfriMGSM + +### Paper + +IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models +https://arxiv.org/pdf/2406.03368 + +IrokoBench is a human-translated benchmark dataset for 16 typologically diverse +low-resource African languages covering three tasks: natural language inference (AfriXNLI), +mathematical reasoning (AfriMGSM), and multi-choice knowledge-based QA (AfriMMLU). + + +### Citation + +``` +@misc{adelani2024irokobenchnewbenchmarkafrican, + title={IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models}, + author={David Ifeoluwa Adelani and Jessica Ojo and Israel Abebe Azime and Jian Yun Zhuang and Jesujoba O. 
Alabi and Xuanli He and Millicent Ochieng and Sara Hooker and Andiswa Bukula and En-Shiun Annie Lee and Chiamaka Chukwuneke and Happy Buzaaba and Blessing Sibanda and Godson Kalipe and Jonathan Mukiibi and Salomon Kabongo and Foutse Yuehgoh and Mmasibidi Setaka and Lolwethu Ndolela and Nkiruka Odu and Rooweither Mabuya and Shamsuddeen Hassan Muhammad and Salomey Osei and Sokhar Samb and Tadesse Kebede Guge and Pontus Stenetorp}, + year={2024}, + eprint={2406.03368}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2406.03368}, +} +``` + +### Groups and Tasks + +#### Groups + +* `afrimgsm`: All afrimgsm tasks +* `afrimgsm_direct`: afrimgsm_direct evaluates models performance on the curated dataset +* `afrimgsm_en_cot`: afrimgsm_en_cot includes 5-shot of exemplars for chain-of-thought approach +* `afrimgsm_translate`: afrimgsm_translate evaluates models in translate-test setting + +#### Tasks +* `afrimgsm_direct_{language_code}`: each task evaluates for one language +* `afrimgsm_en_cot_{language_code}`: each task evaluates for one language +* `afrimgsm_translate_{language_code}`: each task evaluates for one language + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + +If other tasks on this dataset are already supported: +* [x] Is the "Main" variant of this task clearly denoted? +* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [x] Have you noted which, if any, published evaluation setups are matched by this variant? 
+ * [x] Checked for equivalence with v0.3.0 LM Evaluation Harness diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_amh.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..04d0bdd67114f3c0887979fdce210f0fa94616e7 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_amh.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: amh +doc_to_target: '{% if answer is not none %}{{answer[15:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: direct_yaml +task: afrimgsm_direct_amh diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_eng.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5804270d4d0072764ca3d1190a75d7629bc251e9 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_eng.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: eng +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: direct_yaml +task: afrimgsm_direct_eng diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_ewe.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_ewe.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..4eae6fc4c790968040080aee824c345bd786db44 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_ewe.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: ewe +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: direct_yaml +task: afrimgsm_direct_ewe diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_fra.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..16aeacf2c54706a18165bd1230ee812bb080ceb8 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_fra.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: fra +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: direct_yaml +task: afrimgsm_direct_fra diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_hau.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3a6668e989af297b60b1aafd53a3cb44e3936a60 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_hau.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: hau +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% 
else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: direct_yaml +task: afrimgsm_direct_hau diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_ibo.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ab79986a5dec2af92711a675b3a4d79b31b044a9 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_ibo.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: direct_yaml +task: afrimgsm_direct_ibo diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_kin.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d4c9c75af0ccfc6d2b0b18138dec074e10b6047e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_kin.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: kin +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: direct_yaml +task: afrimgsm_direct_kin diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_lin.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7136d7370cfd8f9e35b4ebc5e0615330b84edddc --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_lin.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: lin +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: direct_yaml +task: afrimgsm_direct_lin diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_lug.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..03fc0c2884cf9d14cadcf583cce1e81c47938963 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_lug.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: lug +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: direct_yaml +task: afrimgsm_direct_lug diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_orm.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..49d7e93390dc5c63ce83364ea1ec8ede77537ea8 --- /dev/null +++ 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_orm.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: orm +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: direct_yaml +task: afrimgsm_direct_orm diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_sna.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a61de85a3ffbbd5c2f3e91d5f26eb63a6241d78c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_sna.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: sna +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: direct_yaml +task: afrimgsm_direct_sna diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_sot.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..455c1adcc5b896ce2c2140c9f30e8fa1857e60a2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_sot.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: sot +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none 
%}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: direct_yaml +task: afrimgsm_direct_sot diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_swa.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..462ddfd378f8c02a872780a8013f0f74378551e0 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_swa.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: swa +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: direct_yaml +task: afrimgsm_direct_swa diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_twi.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8c4673b7ba00668d5d3bdcacfd2e00f342362194 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_twi.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: twi +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: direct_yaml +task: afrimgsm_direct_twi diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_wol.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..08a8e030a4c0c0d444ac464b974d9886e434ff43 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_wol.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: wol +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: direct_yaml +task: afrimgsm_direct_wol diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_xho.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2103d182f3ca1703c43e03279a6d1aa9bcc9532d --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_xho.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: xho +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: direct_yaml +task: afrimgsm_direct_xho diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_yor.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aa084c32a645cab532b002565f3c8a324708d6ba --- /dev/null +++ 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_yor.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: yor +doc_to_target: '{% if answer is not none %}{{answer[16:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: direct_yaml +task: afrimgsm_direct_yor diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_zul.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dcffb6944658282d620f7dbcec9d6513bcaf36c5 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_zul.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: zul +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: direct_yaml +task: afrimgsm_direct_zul diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/direct_yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/direct_yaml new file mode 100644 index 0000000000000000000000000000000000000000..be97482c9c08c309f511689a03e8e9635e1d583c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/direct_yaml @@ -0,0 +1,37 @@ +# This file will be included in the generated language-specific task configs. +# It doesn't have a yaml file extension as it is not meant to be imported directly +# by the harness. 
+group: + - afrimgsm + - afrimgsm_direct +dataset_path: masakhane/afrimgsm +dataset_name: null # Overridden by language-specific config. +output_type: generate_until +# training_split: train +test_split: test +target_delimiter: "" +generation_kwargs: + until: + - "\n\n" + - "\n" + do_sample: false + temperature: 0.0 +filter_list: + - name: remove_whitespace + filter: + - function: remove_whitespace + - function: take_first + - filter: + - function: regex + group_select: -1 + regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+) + - function: take_first + name: flexible-extract +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 2.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_amh.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f00400d96d15547bb73acd53c84ad5d4ce6f024f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_amh.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: amh +doc_to_target: '{% if answer is not none %}{{answer[15:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: cot_yaml +task: afrimgsm_en_cot_amh diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_ewe.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ea246f7c16cec59da6562b0e17b43da0268caa0e --- /dev/null +++ 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_ewe.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: ewe +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: cot_yaml +task: afrimgsm_en_cot_ewe diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_fra.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..16bf57b76e4d48384ee909854ce7ac4050215894 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_fra.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: fra +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: cot_yaml +task: afrimgsm_en_cot_fra diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_kin.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..841913b7c689a30833282cd40fdbc6a6db4a3dac --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_kin.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: kin +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: 
'{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: cot_yaml +task: afrimgsm_en_cot_kin diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_lin.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..76d7fdb91fb8dd39b23d4c8c5a0513eaa6538a6d --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_lin.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: lin +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: cot_yaml +task: afrimgsm_en_cot_lin diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_orm.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e9e5600e99104054e169ef1d29da528ef5a9be39 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_orm.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: orm +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: cot_yaml +task: afrimgsm_en_cot_orm diff 
--git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_sna.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..058689623d3fa6147743052f840ab25f8ef0bb4f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_sna.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: sna +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: cot_yaml +task: afrimgsm_en_cot_sna diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_wol.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6ecf4c44eff8d04d081a15062272ba168bab7ded --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_wol.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: wol +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: cot_yaml +task: afrimgsm_en_cot_wol diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_xho.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_xho.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..9dc6691bdee31264bcba551b0288980de24b6e7f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_xho.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: xho +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: cot_yaml +task: afrimgsm_en_cot_xho diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_yor.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8ef29830fa23b3fa561276bf6472a453c7e80384 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_yor.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: yor +doc_to_target: '{% if answer is not none %}{{answer[16:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: cot_yaml +task: afrimgsm_en_cot_yor diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_zul.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..24f486e0af03eda4a290eee0881da5a3b07dd96c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_zul.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: zul +doc_to_target: '{% 
if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: cot_yaml +task: afrimgsm_en_cot_zul diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/gen_yaml.sh b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/gen_yaml.sh new file mode 100644 index 0000000000000000000000000000000000000000..5c0132822a7f3ba68230762e0342838583c29bd9 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/gen_yaml.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +# python utils.py --overwrite --output-dir direct --mode direct +# python utils.py --overwrite --output-dir direct_native --mode direct-native +# python utils.py --overwrite --output-dir en_cot --mode en-cot +# python utils.py --overwrite --output-dir native_cot --mode native-cot +python utils.py --overwrite --output-dir translate_direct --mode translate-direct diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/run.sh b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..075500be33775dc49288ce7f7180604c7c6f99ce --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/run.sh @@ -0,0 +1,6 @@ +lm_eval --model hf \ + --model_args pretrained="google/gemma-7b" --tasks afrimgsm_en_cot_eng,mgsm_en_cot_en,afrimgsm_native_cot_eng,mgsm_native_cot_en,afrimgsm_direct_eng,mgsm_direct_en,afrimgsm_direct_native_eng \ + --device cuda:0 \ + --batch_size 1 \ + --verbosity DEBUG \ + --limit 5 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_amh.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_amh.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..55fbe4bfdb590b6d352b71c16eebefef3cbb3399 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_amh.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: amh +doc_to_target: '{% if answer is not none %}{{answer[15:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: translate_direct_yaml +task: afrimgsm_translate_direct_amh diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_eng.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1d729a5cab74ddeb5b3e03f97eadef54a5be3a3c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_eng.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: eng +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: translate_direct_yaml +task: afrimgsm_translate_direct_eng diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_ewe.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..26191dc815bc0747c05af177e38662e4c4581bfb --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_ewe.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py 
+dataset_name: ewe +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: translate_direct_yaml +task: afrimgsm_translate_direct_ewe diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_fra.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9f0331ee8f3f730372c3eaecb0defe0887bd6502 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_fra.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: fra +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: translate_direct_yaml +task: afrimgsm_translate_direct_fra diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_hau.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..850dad6351a693c2a738a0a570e15da8b412a63a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_hau.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: hau +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% 
endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: translate_direct_yaml +task: afrimgsm_translate_direct_hau diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_ibo.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8b81178cc719c44419e24b5e14fc5c3e61b73a7a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_ibo.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: translate_direct_yaml +task: afrimgsm_translate_direct_ibo diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_kin.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5a8f53e2e7e7449b1db465062bfb8524b94d3c85 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_kin.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: kin +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: translate_direct_yaml +task: afrimgsm_translate_direct_kin diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_lin.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..58044ee2b887d3a83f9004e303da6c2bc048703f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_lin.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: lin +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: translate_direct_yaml +task: afrimgsm_translate_direct_lin diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_lug.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..87013c146f2ef8bddee0a82c2c21949bcac549b0 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_lug.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: lug +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: translate_direct_yaml +task: afrimgsm_translate_direct_lug diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_orm.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_orm.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..1dd19325a57022df444f04eba5eb1b3ced117b61 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_orm.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: orm +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: translate_direct_yaml +task: afrimgsm_translate_direct_orm diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_sna.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d710b1da339ca0012239993417f83c946a7c3e09 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_sna.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: sna +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: translate_direct_yaml +task: afrimgsm_translate_direct_sna diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_sot.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..643eaaeef10a1f70b3b7f13b58cb606dd6ae3f73 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_sot.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py 
+dataset_name: sot +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: translate_direct_yaml +task: afrimgsm_translate_direct_sot diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_swa.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b882e89c24a75ce06a1790791a084e1c087acc1b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_swa.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: swa +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: translate_direct_yaml +task: afrimgsm_translate_direct_swa diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_twi.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ac946eb7f413d227dfe0fc5b770e0c6c7bc2d159 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_twi.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: twi +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% 
endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: translate_direct_yaml +task: afrimgsm_translate_direct_twi diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_wol.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dbcc6b2e0e553ebe5353abaebbf6030d68c5b024 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_wol.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: wol +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: translate_direct_yaml +task: afrimgsm_translate_direct_wol diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_xho.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dfb3d74f40fac640988e1ffba3caf007d56b66ec --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_xho.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: xho +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: translate_direct_yaml +task: afrimgsm_translate_direct_xho diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_yor.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6b4c346ffeeacd42de58efab206db84af0168670 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_yor.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: yor +doc_to_target: '{% if answer is not none %}{{answer[16:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: translate_direct_yaml +task: afrimgsm_translate_direct_yor diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_zul.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5e79edffadafebb8e31c710e854157046d15b10e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_zul.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: zul +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +include: translate_direct_yaml +task: afrimgsm_translate_direct_zul diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/translate_direct_yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimgsm/translate/translate_direct_yaml new file mode 100644 index 
import argparse


# ISO 639-3 codes of the AfriMGSM languages (African languages plus the
# eng/fra high-resource pivots).
languages = [
    "eng",
    "amh",
    "ibo",
    "fra",
    "sna",
    "lin",
    "wol",
    "ewe",
    "lug",
    "xho",
    "kin",
    "twi",
    "zul",
    "orm",
    "yor",
    "hau",
    "sot",
    "swa",
]

# Localized "The answer is <number>" extraction patterns used as the
# strict-match filter on chain-of-thought generations.
# NOTE(review): "fra" has no space after "est" — confirm against the phrasing
# used in the French exemplars before changing it.
languages_REGEX = {
    "eng": "The answer is (\\-?[0-9\\.\\,]+)",
    "amh": "መልሱ (\\-?[0-9\\.\\,]+)",
    "ibo": "Azịza ya bụ (\\-?[0-9\\.\\,]+)",
    "fra": "La réponse est(\\-?[0-9\\.\\,]+)",
    "sna": "Mhinduro kumubvunzo ndi (\\-?[0-9\\.\\,]+)",
    "lin": "Eyano ezali (\\-?[0-9\\.\\,]+)",
    "wol": "Tontu li (\\-?[0-9\\.\\,]+)",
    "ewe": "ŋuɖoɖoae nye (\\-?[0-9\\.\\,]+)",
    "lug": "Ansa eri (\\-?[0-9\\.\\,]+)",
    "xho": "Impendulo ngu (\\-?[0-9\\.\\,]+)",
    "kin": "Igisubizo ni (\\-?[0-9\\.\\,]+)",
    "twi": "Ne nnyiano yɛ (\\-?[0-9\\.\\,]+)",
    "zul": "Impendulo ithi (\\-?[0-9\\.\\,]+)",
    "orm": "Deebiin isaa (\\-?[0-9\\.\\,]+)",
    "yor": "Ìdáhùn náà ni (\\-?[0-9\\.\\,]+)",
    "hau": "Amsar ita ce (\\-?[0-9\\.\\,]+)",
    "sot": "Karabo ke (\\-?[0-9\\.\\,]+)",
    "swa": "Jibu ni (\\-?[0-9\\.\\,]+)",
}

# Prompt building blocks per language:
#   QUESTION - prefix placed before the question text,
#   ANSWER   - chain-of-thought answer prefix,
#   DIRECT   - direct-answer prefix (English "Answer:" for every language),
#   REGEX    - strict answer-extraction pattern from languages_REGEX.
LANGUAGES = {}

for lang in languages:
    if lang == "amh":
        LANGUAGES[lang] = {  # Amharic prompt scaffolding
            "QUESTION": "ጥያቄ:",
            "ANSWER": "በቅደም ተከተል መልስ:",
            "DIRECT": "Answer:",
            "REGEX": languages_REGEX[lang],
        }
    elif lang == "yor":
        LANGUAGES[lang] = {  # Yoruba prompt scaffolding
            "QUESTION": "Ìbéèrè:",
            "ANSWER": "Ìdáhùn lẹ́sẹsẹ:",
            "DIRECT": "Answer:",
            "REGEX": languages_REGEX[lang],
        }
    else:
        LANGUAGES[lang] = {  # English prompt scaffolding for all other languages
            "QUESTION": "Question:",
            "ANSWER": "Step-by-Step Answer:",
            "DIRECT": "Answer:",
            "REGEX": languages_REGEX[lang],
        }


def add_regex_pattern(regex_pattern):
    """Build the ``filter_list`` config entry for a task.

    ``strict-match`` applies the language-specific answer regex;
    ``flexible-extract`` is a generic fallback that grabs the last
    number-looking token. Returns an empty dict when ``regex_pattern`` is
    None so callers can splat the result unconditionally.
    """
    if regex_pattern is None:
        return {}
    return {
        "filter_list": [
            {
                "name": "strict-match",
                "filter": [
                    {
                        "function": "regex",
                        "regex_pattern": f"""{regex_pattern}""",
                    },
                    {
                        "function": "take_first",
                    },
                ],
            },
            {
                "name": "flexible-extract",
                "filter": [
                    {
                        "function": "regex",
                        "regex_pattern": """(-?[$0-9.,]{2,})|(-?[0-9]+)""",
                        "group_select": -1,
                    },
                    {
                        "function": "take_first",
                    },
                ],
            },
        ],
    }


def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
    """
    Generate a yaml file for each language.

    :param output_dir: The directory to output the files to.
    :param overwrite: Whether to overwrite files if they already exist.
    :param mode: Prompting variant: "direct", "direct-native", "native-cot",
        "en-cot" or "translate-direct".
    :raises ValueError: If ``mode`` is not one of the supported variants.
    :raises FileExistsError: If any target file already exists and
        ``overwrite`` is False (raised once, after all languages are tried).
    """
    # PyYAML is only needed when actually generating files; import it lazily
    # so the language tables above stay importable without it.
    import yaml

    err = []
    for lang in LANGUAGES.keys():
        try:
            yaml_template = "cot_yaml"
            filter_list = {}
            DELIMITER = None
            if mode == "direct":
                ANSWER = LANGUAGES["eng"]["DIRECT"]
                QUESTION = LANGUAGES["eng"]["QUESTION"]
                REGEX = None
                task_name = f"afrimgsm_direct_{lang}"
                yaml_template = "direct_yaml"
            elif mode == "direct-native":
                # Fix: this was a stray `if` after the "direct" branch; the
                # modes are mutually exclusive, so keep one elif chain.
                ANSWER = LANGUAGES[lang]["DIRECT"]
                QUESTION = LANGUAGES[lang]["QUESTION"]
                REGEX = None
                task_name = f"afrimgsm_direct_native_{lang}"
                yaml_template = "direct_native_yaml"
            elif mode == "native-cot":
                ANSWER = LANGUAGES[lang]["ANSWER"]
                REGEX = LANGUAGES[lang]["REGEX"]
                QUESTION = LANGUAGES[lang]["QUESTION"]
                task_name = f"afrimgsm_native_cot_{lang}"
                filter_list = add_regex_pattern(REGEX)
                # zh/ja are not in the AfriMGSM language list; kept from the
                # upstream MGSM generator for languages without word spacing.
                DELIMITER = "" if lang in ["zh", "ja"] else None
            elif mode == "en-cot":
                ANSWER = LANGUAGES["eng"]["ANSWER"]
                REGEX = LANGUAGES["eng"]["REGEX"]
                QUESTION = LANGUAGES["eng"]["QUESTION"]
                task_name = f"afrimgsm_en_cot_{lang}"
            elif mode == "translate-direct":
                ANSWER = LANGUAGES["eng"]["DIRECT"]
                QUESTION = LANGUAGES["eng"]["QUESTION"]
                REGEX = None
                task_name = f"afrimgsm_translate_direct_{lang}"
                yaml_template = "translate_direct_yaml"
            else:
                # Previously an unknown mode crashed later with a NameError;
                # fail fast with a clear message instead.
                raise ValueError(f"Unsupported mode: {mode}")

            file_name = f"{task_name}.yaml"
            # NOTE(review): the target slice always skips the *CoT* answer
            # prefix, even for the direct/translate variants whose prompt uses
            # "Answer:" — confirm the dataset's `answer` field always carries
            # the step-by-step prefix.
            ANSWER_TO_SKIP = len(LANGUAGES[lang]["ANSWER"]) + 1
            with open(
                f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf8"
            ) as f:
                f.write("# Generated by utils.py\n")
                yaml.dump(
                    {
                        "include": yaml_template,
                        "dataset_name": lang,
                        "task": f"{task_name}",
                        "doc_to_text": f"""{{% if answer is not none %}}"""
                        f"""{{{{question+"\\n{ANSWER}"}}}}"""
                        f"""{{% else %}}"""
                        f"""{{{{"{QUESTION} "+question+"\\n{ANSWER}"}}}}"""
                        f"""{{% endif %}}""",
                        "doc_to_target": f"""{{% if answer is not none %}}"""
                        f"""{{{{answer[{ANSWER_TO_SKIP}:]}}}}"""
                        f"""{{% else %}}"""
                        f"""{{{{answer_number|string}}}}"""
                        f"""{{% endif %}}""",
                        **filter_list,
                        "generation_kwargs": {
                            "until": [QUESTION, "", "<|im_end|>"],
                            "do_sample": False,
                        },
                        # Fix: `if DELIMITER` dropped the legitimate empty-string
                        # delimiter (falsy); only None means "not set".
                        **(
                            {"target_delimiter": DELIMITER}
                            if DELIMITER is not None
                            else {}
                        ),
                    },
                    f,
                    allow_unicode=True,
                    width=float("inf"),
                )
        except FileExistsError:
            err.append(file_name)

    if len(err) > 0:
        raise FileExistsError(
            "Files were not created because they already exist (use --overwrite flag):"
            f" {', '.join(err)}"
        )


def main() -> None:
    """Parse CLI args and generate language-specific yaml files."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--overwrite",
        default=False,
        action="store_true",
        help="Overwrite files if they already exist",
    )
    parser.add_argument(
        "--output-dir", default=".", help="Directory to write yaml files to"
    )
    parser.add_argument(
        "--mode",
        default="native-cot",
        choices=["direct", "direct-native", "native-cot", "en-cot", "translate-direct"],
        help="Mode of chain-of-thought",
    )
    args = parser.parse_args()

    gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite, mode=args.mode)


if __name__ == "__main__":
    main()
+ +Homepage: https://stanfordnlp.github.io/coqa/ + +### Citation + +``` +BibTeX-formatted citation goes here +``` + +### Groups and Tasks + +#### Groups + +* Not part of a group yet + +#### Tasks + +* `coqa` + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [ ] Is the task an existing benchmark in the literature? + * [ ] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/coqa/default.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/coqa/default.yaml new file mode 100644 index 0000000000000000000000000000000000000000..de398c242d04dfd823c32c5fbbb3c3796355d3f6 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/coqa/default.yaml @@ -0,0 +1,24 @@ +task: coqa +dataset_path: EleutherAI/coqa +output_type: generate_until +training_split: train +validation_split: validation +doc_to_text: !function utils.doc_to_text +doc_to_target: !function utils.doc_to_target +process_results: !function utils.process_results +should_decontaminate: true +doc_to_decontamination_query: "{{story}} {{question.input_text|join('\n')}}" +generation_kwargs: + until: + - "\nQ:" +metric_list: + - metric: em + aggregation: mean + higher_is_better: true + - metric: f1 + aggregation: mean + higher_is_better: true +metadata: + version: 3.0 +dataset_kwargs: + trust_remote_code: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/coqa/utils.py 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/coqa/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..29911cfec5cd345b41c631064a7e281b9d15000e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/coqa/utils.py @@ -0,0 +1,77 @@ +from itertools import zip_longest + +import transformers.data.metrics.squad_metrics as squad_metrics + + +def doc_to_text(doc): + # Given a passage p, the conversation history {q1, a1, . . . qi−1, ai−1} + # and a question qi, the task is to predict the answer ai + doc_text = doc["story"] + "\n\n" + for q, a in zip_longest( + doc["questions"]["input_text"], doc["answers"]["input_text"][:-1] + ): # omit target answer ai + question = f"Q: {q}\n\n" + answer = f"A: {a}\n\n" if a is not None else "A:" + doc_text += question + answer + return doc_text + + +def doc_to_target(doc): + turn_id = len(doc["questions"]["input_text"]) + # Returns unique answers and valid alternatives (Some questions in CoQA have multiple valid answers). 
+ answers = [] + answer_forturn = doc["answers"]["input_text"][turn_id - 1] + answers.append(answer_forturn) + + additional_answers = doc.get("additional_answers") + if additional_answers: + for key in additional_answers: + additional_answer_for_turn = additional_answers[key]["input_text"][ + turn_id - 1 + ] + if additional_answer_for_turn.lower() not in map(str.lower, answers): + answers.append(additional_answer_for_turn) + return answers + + +def em(gold_list, pred): + # tests for exact match and on the normalised answer (compute_exact) + em_sum = 0.0 + if len(gold_list) > 1: + for i in range(len(gold_list)): + gold_answers = gold_list[0:i] + gold_list[i + 1 :] + # predictions compared against (n) golds and take maximum + em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_answers) + else: + em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_list) + + return em_sum / max(1, len(gold_list)) + + +def compute_scores(gold_list, pred): + # tests for exact match and on the normalised answer (compute_exact) + # test for overlap (compute_f1) + f1_sum = 0.0 + em_sum = 0.0 + if len(gold_list) > 1: + for i in range(len(gold_list)): + gold_answers = gold_list[0:i] + gold_list[i + 1 :] + # predictions compared against (n) golds and take maximum + em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_answers) + f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_answers) + else: + em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_list) + f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_list) + + return { + "em": em_sum / max(1, len(gold_list)), + "f1": f1_sum / max(1, len(gold_list)), + } + + +def process_results(doc, results): + gold_list = doc_to_target(doc) + pred = results[0].strip().split("\n")[0] + + scores = compute_scores(gold_list, pred) + return scores diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/README.md 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/README.md new file mode 100644 index 0000000000000000000000000000000000000000..666d6446b108afaa2991ec8b3d6921ecc2b704d7 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/README.md @@ -0,0 +1,49 @@ +# MedConceptsQA + +### Paper + +Title: `MedConceptsQA: Open Source Medical Concepts QA Benchmark` + +Abstract: https://arxiv.org/abs/2405.07348 + +MedConceptsQA is a dedicated open source benchmark for medical concepts question answering. The benchmark comprises of questions of various medical concepts across different vocabularies: diagnoses, procedures, and drugs. + +The questions are categorized into three levels of difficulty: easy, medium, and hard. + +Our benchmark serves as a valuable resource for evaluating the +abilities of Large Language Models to interpret medical codes and distinguish +between medical concepts. + +### Citation + +``` +@article{shoham2024medconceptsqa, + title={MedConceptsQA--Open Source Medical Concepts QA Benchmark}, + author={Shoham, Ofir Ben and Rappoport, Nadav}, + journal={arXiv preprint arXiv:2405.07348}, + year={2024} +} +``` + +### Groups and Tasks + +#### Groups + +* `med_concepts_qa`: Contains all the QA tasks (diagnosis, procedures ,and drugs). + +#### Tasks + + +* `med_concepts_qa_icd9cm` - ICD9-CM (diagnosis codes, ICD9 format) question-answering. This involves providing information, clarifications, and answering questions related to ICD-9-CM (International Classification of Diseases, 9th Revision, Clinical Modification) diagnosis codes. + + +* `med_concepts_qa_icd10cm` - ICD10-CM (diagnosis codes, ICD10 format) question-answering. This involves providing information, clarifications, and answering questions related to ICD-10-CM (International Classification of Diseases, 10th Revision, Clinical Modification) diagnosis codes. + + +* `med_concepts_qa_icd9proc` - ICD9-Proc (procedure codes, ICD9 format) question-answering. 
This involves providing information, clarifications, and answering questions related to ICD-9-PCS (International Classification of Diseases, 9th Revision, Procedure Coding System) procedure codes. + + +* `med_concepts_qa_icd10proc` - ICD10-Proc (procedure codes, ICD10 format) question-answering. This involves providing information, clarifications, and answering questions related to ICD-10-PCS (International Classification of Diseases, 10th Revision, Procedure Coding System) procedure codes. + + +* `med_concepts_qa_atc` - ATC (Anatomical Therapeutic Chemical Classification System) question-answering. This involves providing information, clarifications, and answering questions related to the ATC classification system, which is used for the classification of drugs and other medical products according to the organ or system on which they act and their therapeutic, pharmacological, and chemical properties. diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/_default_template_yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/_default_template_yaml new file mode 100644 index 0000000000000000000000000000000000000000..0fb527557dd3906b1ec51a83d0ddbfa48da815e2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/_default_template_yaml @@ -0,0 +1,17 @@ +dataset_path: ofir408/MedConceptsQA +output_type: multiple_choice +description: "Answer A,B,C,D according to the answer to this multiple choice question.\n" +fewshot_split: dev +fewshot_config: + sampler: first_n +num_fewshot: 4 +test_split: test +doc_to_text: "{{question}}\nAnswer:" +doc_to_target: answer_id +doc_to_choice: ['A', 'B', 'C', 'D'] +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/_med_concepts_qa_atc.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/_med_concepts_qa_atc.yaml new file 
mode 100644 index 0000000000000000000000000000000000000000..2e34ad36c296e9b1e39b5445dc9da8bc640245a4 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/_med_concepts_qa_atc.yaml @@ -0,0 +1,6 @@ +group: med_concepts_qa_atc +task: + - med_concepts_qa_atc_tasks +aggregate_metric_list: + - metric: acc + aggregation: mean diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/_med_concepts_qa_icd9cm.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/_med_concepts_qa_icd9cm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b12ea811ff26631e65c9dd9cb42b56b1c0c7dba1 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/_med_concepts_qa_icd9cm.yaml @@ -0,0 +1,6 @@ +group: med_concepts_qa_icd9cm +task: + - med_concepts_qa_icd9cm_tasks +aggregate_metric_list: + - metric: acc + aggregation: mean diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/_med_concepts_qa_icd9proc.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/_med_concepts_qa_icd9proc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..94fc034eb20a8db04d2caf4d027e81be947ebf46 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/_med_concepts_qa_icd9proc.yaml @@ -0,0 +1,6 @@ +group: med_concepts_qa_icd9proc +task: + - med_concepts_qa_icd9proc_tasks +aggregate_metric_list: + - metric: acc + aggregation: mean diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_atc_easy.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_atc_easy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..be2f62a1dc4e3106b1ea6f33a1286b1358f5df5e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_atc_easy.yaml @@ -0,0 +1,5 @@ +dataset_name: atc_easy +include: 
_default_template_yaml +tag: med_concepts_qa_atc_tasks +task: med_concepts_qa_atc_easy +task_alias: atc_easy diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_atc_medium.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_atc_medium.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e0b64ac85ba3435b9c7f3b4912225a2703358e6b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_atc_medium.yaml @@ -0,0 +1,5 @@ +dataset_name: atc_medium +include: _default_template_yaml +tag: med_concepts_qa_atc_tasks +task: med_concepts_qa_atc_medium +task_alias: atc_medium diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd10cm_medium.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd10cm_medium.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b1678fa311b550bc04430235059ad54327ec0ecf --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd10cm_medium.yaml @@ -0,0 +1,5 @@ +dataset_name: icd10cm_medium +include: _default_template_yaml +tag: med_concepts_qa_icd10cm_tasks +task: med_concepts_qa_icd10cm_medium +task_alias: icd10cm_medium diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd10proc_hard.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd10proc_hard.yaml new file mode 100644 index 0000000000000000000000000000000000000000..15efafba09f82bd8e987356511443329538ac594 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd10proc_hard.yaml @@ -0,0 +1,5 @@ +dataset_name: icd10proc_hard +include: _default_template_yaml +tag: med_concepts_qa_icd10proc_tasks +task: med_concepts_qa_icd10proc_hard +task_alias: icd10proc_hard diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd10proc_medium.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd10proc_medium.yaml new file mode 100644 index 0000000000000000000000000000000000000000..648bd46075de41e4ccf7b34882990772928123f7 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd10proc_medium.yaml @@ -0,0 +1,5 @@ +dataset_name: icd10proc_medium +include: _default_template_yaml +tag: med_concepts_qa_icd10proc_tasks +task: med_concepts_qa_icd10proc_medium +task_alias: icd10proc_medium diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd9cm_easy.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd9cm_easy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..514a9e258614aaa29e32f2c5bc3d54318175d837 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd9cm_easy.yaml @@ -0,0 +1,5 @@ +dataset_name: icd9cm_easy +include: _default_template_yaml +tag: med_concepts_qa_icd9cm_tasks +task: med_concepts_qa_icd9cm_easy +task_alias: icd9cm_easy diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd9cm_hard.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd9cm_hard.yaml new file mode 100644 index 0000000000000000000000000000000000000000..41d0fd5534743ba4a8919c0622e119c383b05588 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd9cm_hard.yaml @@ -0,0 +1,5 @@ +dataset_name: icd9cm_hard +include: _default_template_yaml +tag: med_concepts_qa_icd9cm_tasks +task: med_concepts_qa_icd9cm_hard +task_alias: icd9cm_hard diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd9cm_medium.yaml 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd9cm_medium.yaml new file mode 100644 index 0000000000000000000000000000000000000000..90f93f9bc2b520186a98c923dd7cf49d58a5293b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd9cm_medium.yaml @@ -0,0 +1,5 @@ +dataset_name: icd9cm_medium +include: _default_template_yaml +tag: med_concepts_qa_icd9cm_tasks +task: med_concepts_qa_icd9cm_medium +task_alias: icd9cm_medium diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd9proc_medium.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd9proc_medium.yaml new file mode 100644 index 0000000000000000000000000000000000000000..843029209916822a306e11e2795568a757cc4b2a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd9proc_medium.yaml @@ -0,0 +1,5 @@ +dataset_name: icd9proc_medium +include: _default_template_yaml +tag: med_concepts_qa_icd9proc_tasks +task: med_concepts_qa_icd9proc_medium +task_alias: icd9proc_medium diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/race/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/race/README.md new file mode 100644 index 0000000000000000000000000000000000000000..dfe6c5e8a50da470e22be690e9e10612d830f957 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/race/README.md @@ -0,0 +1,62 @@ +# RACE + +### Paper + +Title: `RACE: Large-scale ReAding Comprehension Dataset From Examinations` + +Abstract: https://arxiv.org/abs/1704.04683 + +RACE is a large-scale reading comprehension dataset with more than 28,000 passages +and nearly 100,000 questions. The dataset is collected from English examinations +in China, which are designed for middle school and high school students. The dataset +can be served as the training and test sets for machine comprehension. 
+ +Homepage: https://www.cs.cmu.edu/~glai1/data/race/ + + +### Citation + +``` +@inproceedings{lai-etal-2017-race, + title = "{RACE}: Large-scale {R}e{A}ding Comprehension Dataset From Examinations", + author = "Lai, Guokun and + Xie, Qizhe and + Liu, Hanxiao and + Yang, Yiming and + Hovy, Eduard", + editor = "Palmer, Martha and + Hwa, Rebecca and + Riedel, Sebastian", + booktitle = "Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing", + month = sep, + year = "2017", + address = "Copenhagen, Denmark", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/D17-1082", + doi = "10.18653/v1/D17-1082", + pages = "785--794" +} +``` + +### Groups and Tasks + +#### Groups + +* Not part of a group yet. + +#### Tasks + +* `race` + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [ ] Is the task an existing benchmark in the literature? + * [ ] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? 
diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/race/preprocess_race.py b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/race/preprocess_race.py new file mode 100644 index 0000000000000000000000000000000000000000..03a214e5747876325d118bf4660b0e5c7e9d5142 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/race/preprocess_race.py @@ -0,0 +1,40 @@ +import ast + + +def process_ast(string): + return ast.literal_eval(string) + + +def last_problem(doc): + return process_ast(doc["problems"])[-1] + + +def get_answer_option(problem): + letter_to_num = {"A": 0, "B": 1, "C": 2, "D": 3} + answer = letter_to_num[problem["answer"]] + return problem["options"][answer] + + +def doc_to_choice(doc): + problem = last_problem(doc) + choices = [problem["options"][i] for i in range(4)] + return choices + + +def doc_to_text(doc): + text = "Article: " + doc["article"] + "\n\n" + for problem in process_ast(doc["problems"])[:-1]: + if problem["question"][-6:] == " _ .": + text += problem["question"][-5:] + get_answer_option(problem) + "\n" + else: + question = "Question: " + problem["question"] + "\n" + answer = "Answer: " + get_answer_option(problem) + "\n" + text += question + answer + text += last_problem(doc)["question"] + return text + + +def doc_to_target(doc): + letter_to_num = {"A": 0, "B": 1, "C": 2, "D": 3} + answer = letter_to_num[last_problem(doc)["answer"]] + return answer diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/race/race.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/race/race.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b90b809f6120924f398372a454ce4ba74220bbe9 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/race/race.yaml @@ -0,0 +1,16 @@ +task: race +dataset_path: EleutherAI/race +dataset_name: high +output_type: multiple_choice +test_split: test +doc_to_text: !function preprocess_race.doc_to_text +doc_to_target: !function preprocess_race.doc_to_target 
+doc_to_choice: !function preprocess_race.doc_to_choice +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 2.0 +dataset_kwargs: + trust_remote_code: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/triviaqa/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/triviaqa/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1722b709886b938ded164ad0eee260a2e0f6b78e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/triviaqa/README.md @@ -0,0 +1,51 @@ +# Trivia QA + +### Paper + +Title: `TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension` +Abstract: https://arxiv.org/abs/1705.03551 + +TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence +triples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts +and independently gathered evidence documents, six per question on average, that provide +high quality distant supervision for answering the questions. + +Homepage: https://nlp.cs.washington.edu/triviaqa/ + + +### Citation + +``` +@InProceedings{JoshiTriviaQA2017, + author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke}, + title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension}, + booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics}, + month = {July}, + year = {2017}, + address = {Vancouver, Canada}, + publisher = {Association for Computational Linguistics}, +} +``` + +### Groups and Tasks + +#### Groups + +* Not part of a group yet. + +#### Tasks + +* `triviaqa`: `Generate and answer based on the question.` + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [ ] Is the task an existing benchmark in the literature? + * [ ] Have you referenced the original paper that introduced the task? 
+ * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/triviaqa/default.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/triviaqa/default.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a895fe7eb48f1fdef578606ebc95bbc7ab0f75ca --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/triviaqa/default.yaml @@ -0,0 +1,31 @@ +task: triviaqa +dataset_path: trivia_qa +dataset_name: rc.nocontext +output_type: generate_until +training_split: train +validation_split: validation +doc_to_text: "Question: {{question}}?\nAnswer:" +doc_to_target: "{{answer.aliases}}" +should_decontaminate: true +doc_to_decontamination_query: question +generation_kwargs: + until: + - "\n" + - "." + - "," + do_sample: false + temperature: 0.0 +filter_list: + - name: remove_whitespace + filter: + - function: remove_whitespace + - function: take_first +target_delimiter: " " +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 3.0