diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/__pycache__/__init__.cpython-310.pyc b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f9f50e6754ef3d760e34a7d65260bc5a0668fba Binary files /dev/null and b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/__pycache__/__init__.cpython-310.pyc differ diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9532179d8b0977573b6ee35e304c31f6c8867165 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/README.md @@ -0,0 +1,101 @@ +# CrowS-Pairs + +### Paper + +CrowS-Pairs: A Challenge Dataset for Measuring Social Biases in Masked Language Models +https://aclanthology.org/2020.emnlp-main.154/ +French CrowS-Pairs: Extending a challenge dataset for measuring social bias in masked +language models to a language other than English +https://aclanthology.org/2022.acl-long.583/ + +CrowS-Pairs is a challenge set for evaluating language models (LMs) on their tendency +to generate biased outputs. CrowS-Pairs comes in 2 languages and the English subset has +a newer version which fixes some of the issues with the original version. 
+ +Homepage: https://github.com/nyu-mll/crows-pairs, https://gitlab.inria.fr/french-crows-pairs + +### Citation + +```bibtex +@inproceedings{nangia-etal-2020-crows, + title = "{C}row{S}-Pairs: A Challenge Dataset for Measuring Social Biases in Masked Language Models", + author = "Nangia, Nikita and + Vania, Clara and + Bhalerao, Rasika and + Bowman, Samuel R.", + booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)", + month = nov, + year = "2020", + address = "Online", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2020.emnlp-main.154", + doi = "10.18653/v1/2020.emnlp-main.154", + pages = "1953--1967", + abstract = "Pretrained language models, especially masked language models (MLMs) have seen success across many NLP tasks. However, there is ample evidence that they use the cultural biases that are undoubtedly present in the corpora they are trained on, implicitly creating harm with biased representations. To measure some forms of social bias in language models against protected demographic groups in the US, we introduce the Crowdsourced Stereotype Pairs benchmark (CrowS-Pairs). CrowS-Pairs has 1508 examples that cover stereotypes dealing with nine types of bias, like race, religion, and age. In CrowS-Pairs a model is presented with two sentences: one that is more stereotyping and another that is less stereotyping. The data focuses on stereotypes about historically disadvantaged groups and contrasts them with advantaged groups. We find that all three of the widely-used MLMs we evaluate substantially favor sentences that express stereotypes in every category in CrowS-Pairs. 
As work on building less biased models advances, this dataset can be used as a benchmark to evaluate progress.", +} + +@inproceedings{neveol-etal-2022-french, + title = "{F}rench {C}row{S}-Pairs: Extending a challenge dataset for measuring social bias in masked language models to a language other than {E}nglish", + author = {N{\'e}v{\'e}ol, Aur{\'e}lie and + Dupont, Yoann and + Bezan{\c{c}}on, Julien and + Fort, Kar{\"e}n}, + booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)", + month = may, + year = "2022", + address = "Dublin, Ireland", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.acl-long.583", + doi = "10.18653/v1/2022.acl-long.583", + pages = "8521--8531", + abstract = "Warning: This paper contains explicit statements of offensive stereotypes which may be upsetting.Much work on biases in natural language processing has addressed biases linked to the social and cultural experience of English speaking individuals in the United States. We seek to widen the scope of bias studies by creating material to measure social bias in language models (LMs) against specific demographic groups in France. We build on the US-centered CrowS-pairs dataset to create a multilingual stereotypes dataset that allows for comparability across languages while also characterizing biases that are specific to each country and language. We introduce 1,679 sentence pairs in French that cover stereotypes in ten types of bias like gender and age. 1,467 sentence pairs are translated from CrowS-pairs and 212 are newly crowdsourced. The sentence pairs contrast stereotypes concerning underadvantaged groups with the same sentence concerning advantaged groups. We find that four widely used language models (three French, one multilingual) favor sentences that express stereotypes in most bias categories. 
We report on the translation process from English into French, which led to a characterization of stereotypes in CrowS-pairs including the identification of US-centric cultural traits. We offer guidelines to further extend the dataset to other languages and cultural environments.", +} +``` + +### Groups and Tasks + +#### Groups + +- `crows_pairs_english`: The entire English subset of the CrowS-Pairs dataset. +- `crows_pairs_french`: The entire French subset of the CrowS-Pairs dataset. + +#### Tasks + + +The following tasks evaluate sub-areas of bias in the English CrowS-Pairs dataset: +- `crows_pairs_english_age` +- `crows_pairs_english_autre` +- `crows_pairs_english_disability` +- `crows_pairs_english_gender` +- `crows_pairs_english_nationality` +- `crows_pairs_english_physical_appearance` +- `crows_pairs_english_race_color` +- `crows_pairs_english_religion` +- `crows_pairs_english_sexual_orientation` +- `crows_pairs_english_socioeconomic` + +The following tasks evaluate sub-areas of bias in the French CrowS-Pairs dataset: +- `crows_pairs_french_age` +- `crows_pairs_french_autre` +- `crows_pairs_french_disability` +- `crows_pairs_french_gender` +- `crows_pairs_french_nationality` +- `crows_pairs_french_physical_appearance` +- `crows_pairs_french_race_color` +- `crows_pairs_french_religion` +- `crows_pairs_french_sexual_orientation` +- `crows_pairs_french_socioeconomic` + +All tasks evaluate the percentage of more-stereotypical sentences that are rated as more likely by a model than the non-stereotypical sentences (`pct_stereotype`), as well as the average absolute difference of loglikelihoods between the sentences in the pairs. + +### Checklist + +* [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? + * [x] If yes, does the original paper provide a reference implementation? 
+ * [x] The original paper does not for causal language models, so this is a novel formulation of the task for autoregressive LMs. + +If other tasks on this dataset are already supported: +* [x] Is the "Main" variant of this task clearly denoted? +* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [x] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3daf5f75fc3fe8090336a43e3617fe79ceb22bdd --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english.yaml @@ -0,0 +1,21 @@ +tag: + - crows_pairs +task: crows_pairs_english +dataset_path: BigScienceBiasEval/crows_pairs_multilingual +dataset_name: english +test_split: test +output_type: multiple_choice +doc_to_text: "" +doc_to_target: 0 +doc_to_choice: !function utils.doc_to_choice +target_delimiter: "" +process_results: !function utils.process_results +metric_list: + - metric: likelihood_diff + aggregation: mean + higher_is_better: false + - metric: pct_stereotype + aggregation: mean + higher_is_better: false +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_age.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_age.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bf986547722edcae6fb8c7954e26c8321e146b8d --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_age.yaml @@ -0,0 +1,4 @@ +include: crows_pairs_english.yaml +task: crows_pairs_english_age +dataset_name: english +process_docs: !function utils.filter_age diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_autre.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_autre.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5b456206f774c49d2d32a92bfb6733f22bce609c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_autre.yaml @@ -0,0 +1,4 @@ +include: crows_pairs_english.yaml +task: crows_pairs_english_autre +dataset_name: english +process_docs: !function utils.filter_autre diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_disability.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_disability.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9830d8140e68e5fb9b48d16e61ed3904e1d5ff06 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_disability.yaml @@ -0,0 +1,4 @@ +include: crows_pairs_english.yaml +task: crows_pairs_english_disability +dataset_name: english +process_docs: !function utils.filter_disability diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_gender.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_gender.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d6e185c109163bc9e9919853b789267ae8a87ae6 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_gender.yaml @@ -0,0 +1,4 @@ +include: crows_pairs_english.yaml +task: crows_pairs_english_gender +dataset_name: english +process_docs: !function utils.filter_gender diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_nationality.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_nationality.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..96ac97baedbaa8bb93b1cf7c8f396976d9b00897 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_nationality.yaml @@ -0,0 +1,4 @@ +include: crows_pairs_english.yaml +task: crows_pairs_english_nationality +dataset_name: english +process_docs: !function utils.filter_nationality diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_physical_appearance.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_physical_appearance.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d6c199799f0385884ddeb37db9dd6de3490ec41a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_physical_appearance.yaml @@ -0,0 +1,4 @@ +include: crows_pairs_english.yaml +task: crows_pairs_english_physical_appearance +dataset_name: english +process_docs: !function utils.filter_appearance diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_race_color.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_race_color.yaml new file mode 100644 index 0000000000000000000000000000000000000000..69e22c53712169f9a12016ece922bb7bf81c7d24 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_race_color.yaml @@ -0,0 +1,4 @@ +include: crows_pairs_english.yaml +task: crows_pairs_english_race_color +dataset_name: english +process_docs: !function utils.filter_race_color diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_religion.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_religion.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7c62882a0a96f60578d681a0dcd174e24317ecee --- /dev/null +++ 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_religion.yaml @@ -0,0 +1,4 @@ +include: crows_pairs_english.yaml +task: crows_pairs_english_religion +dataset_name: english +process_docs: !function utils.filter_religion diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_sexual_orientation.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_sexual_orientation.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d678e75ca401570b9e80602282af0fb53200df90 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_sexual_orientation.yaml @@ -0,0 +1,4 @@ +include: crows_pairs_english.yaml +task: crows_pairs_english_sexual_orientation +dataset_name: english +process_docs: !function utils.filter_orientation diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_socioeconomic.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_socioeconomic.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dc98fed59b5800600b30975d36e794d8b55be2f8 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_socioeconomic.yaml @@ -0,0 +1,4 @@ +include: crows_pairs_english.yaml +task: crows_pairs_english_socioeconomic +dataset_name: english +process_docs: !function utils.filter_socio diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4eb7f0034149f08f30249758c2baff4a8f0164e9 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french.yaml @@ -0,0 +1,3 @@ +include: crows_pairs_english.yaml +task: crows_pairs_french +dataset_name: french diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_age.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_age.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e862b5bab4d5e966bcbec5b26b162cb882088b43 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_age.yaml @@ -0,0 +1,4 @@ +include: crows_pairs_english.yaml +task: crows_pairs_french_age +dataset_name: french +process_docs: !function utils.filter_age diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_autre.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_autre.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5f47f99254edff8aecb5ebf9979edb92360e1e81 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_autre.yaml @@ -0,0 +1,4 @@ +include: crows_pairs_english.yaml +task: crows_pairs_french_autre +dataset_name: french +process_docs: !function utils.filter_autre diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_disability.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_disability.yaml new file mode 100644 index 0000000000000000000000000000000000000000..643b16fd25e67c90f376b646bccd074e062a57f6 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_disability.yaml @@ -0,0 +1,4 @@ +include: crows_pairs_english.yaml +task: crows_pairs_french_disability +dataset_name: french +process_docs: !function utils.filter_disability diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_gender.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_gender.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..abf645178d698c199997a51eb4c140b1179ef423 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_gender.yaml @@ -0,0 +1,4 @@ +include: crows_pairs_english.yaml +task: crows_pairs_french_gender +dataset_name: french +process_docs: !function utils.filter_gender diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_nationality.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_nationality.yaml new file mode 100644 index 0000000000000000000000000000000000000000..876b20877c199bf577580e2cf7edafa412aa3f6d --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_nationality.yaml @@ -0,0 +1,4 @@ +include: crows_pairs_english.yaml +task: crows_pairs_french_nationality +dataset_name: french +process_docs: !function utils.filter_nationality diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_physical_appearance.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_physical_appearance.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c95f36940180c930baf17c86709b54d46408290f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_physical_appearance.yaml @@ -0,0 +1,4 @@ +include: crows_pairs_english.yaml +task: crows_pairs_french_physical_appearance +dataset_name: french +process_docs: !function utils.filter_appearance diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_race_color.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_race_color.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8eaf82149351ac9fac41b749fd20b7aaccfb6f13 --- /dev/null +++ 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_race_color.yaml @@ -0,0 +1,4 @@ +include: crows_pairs_english.yaml +task: crows_pairs_french_race_color +dataset_name: french +process_docs: !function utils.filter_race_color diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_religion.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_religion.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7930c3243f25a73274beb36750ae72e05ee27d76 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_religion.yaml @@ -0,0 +1,4 @@ +include: crows_pairs_english.yaml +task: crows_pairs_french_religion +dataset_name: french +process_docs: !function utils.filter_religion diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_sexual_orientation.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_sexual_orientation.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9ecf47a3846671c793f88c74728605f3909d14d7 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_sexual_orientation.yaml @@ -0,0 +1,4 @@ +include: crows_pairs_english.yaml +task: crows_pairs_french_sexual_orientation +dataset_name: french +process_docs: !function utils.filter_orientation diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_socioeconomic.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_socioeconomic.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b2a782cc3abd9f46794c2e15ec8c5a838cf10d46 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_socioeconomic.yaml @@ -0,0 +1,4 @@ +include: crows_pairs_english.yaml +task: crows_pairs_french_socioeconomic 
+dataset_name: french +process_docs: !function utils.filter_socio diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/utils.py b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8cb38bdb4923f10f4fb237f7332bdfc785cd521f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/utils.py @@ -0,0 +1,64 @@ +import datasets + + +def process_results(doc, results): + lls, _ = zip(*results) + + likelihood1, likelihood2 = lls + + # Calculate the absolute difference in loglikelihoods + diff = abs(likelihood1 - likelihood2) + + # if stereotypical sentence more likely (loglikelihood higher) + # then treat this as predicting stereotyped sentence + acc = 1.0 if likelihood1 > likelihood2 else 0.0 + + return {"likelihood_diff": diff, "pct_stereotype": acc} + + +def doc_to_choice(doc): + return [doc["sent_more"], doc["sent_less"]] + + +def filter_dataset(dataset: datasets.Dataset, bias_type: str) -> datasets.Dataset: + return dataset.filter(lambda example: example["bias_type"].startswith(bias_type)) + + +def filter_race_color(dataset: datasets.Dataset) -> datasets.Dataset: + return filter_dataset(dataset, "race-color") + + +def filter_socio(dataset: datasets.Dataset) -> datasets.Dataset: + return filter_dataset(dataset, "socioeconomic") + + +def filter_gender(dataset: datasets.Dataset) -> datasets.Dataset: + return filter_dataset(dataset, "gender") + + +def filter_age(dataset: datasets.Dataset) -> datasets.Dataset: + return filter_dataset(dataset, "age") + + +def filter_religion(dataset: datasets.Dataset) -> datasets.Dataset: + return filter_dataset(dataset, "religion") + + +def filter_disability(dataset: datasets.Dataset) -> datasets.Dataset: + return filter_dataset(dataset, "disability") + + +def filter_orientation(dataset: datasets.Dataset) -> datasets.Dataset: + return filter_dataset(dataset, "sexual-orientation") + + +def 
filter_nationality(dataset: datasets.Dataset) -> datasets.Dataset: + return filter_dataset(dataset, "nationality") + + +def filter_appearance(dataset: datasets.Dataset) -> datasets.Dataset: + return filter_dataset(dataset, "physical-appearance") + + +def filter_autre(dataset: datasets.Dataset) -> datasets.Dataset: + return filter_dataset(dataset, "autre") diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/eus_proficiency/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/eus_proficiency/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6671bda477e4533204c8ba154323e40d3df23f79 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/eus_proficiency/README.md @@ -0,0 +1,48 @@ +# EusProficiency + +### Paper + +Title: Latxa: An Open Language Model and Evaluation Suite for Basque + +Abstract: https://arxiv.org/abs/2403.20266 + +EusProficiency comprises 5,169 exercises on different topics from past EGA exams, the official C1-level certificate of proficiency in Basque. We collected the atarikoa exercises from EGA exams through the years 1998 to 2008. Atarikoa is the first qualifying test of EGA, which measures different aspects of language competency, such as reading comprehension, grammar, vocabulary, spelling, and writing. Each test generally has 85 multiple-choice questions, with 4 choices and a single correct answer. + +Homepage: https://github.com/hitz-zentroa/latxa + + +### Citation + +``` +@misc{etxaniz2024latxa, + title={Latxa: An Open Language Model and Evaluation Suite for Basque}, + author={Julen Etxaniz and Oscar Sainz and Naiara Perez and Itziar Aldabe and German Rigau and Eneko Agirre and Aitor Ormazabal and Mikel Artetxe and Aitor Soroa}, + year={2024}, + eprint={2403.20266}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` + +### Groups and Tasks + +#### Groups + +There are no groups. 
+ +#### Tasks + +* `eus_proficiency`: EusProficiency comprises 5,169 exercises on different topics from past EGA exams, the official C1-level certificate of proficiency in Basque. + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [ ] Is the task an existing benchmark in the literature? + * [ ] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/eus_proficiency/eus_proficiency.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/eus_proficiency/eus_proficiency.yaml new file mode 100644 index 0000000000000000000000000000000000000000..18cf5d2ab313a2ac907738185b5e39036402c7e2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/eus_proficiency/eus_proficiency.yaml @@ -0,0 +1,16 @@ +dataset_path: HiTZ/EusProficiency +dataset_name: default +task: eus_proficiency +doc_to_text: "Galdera: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nD: {{candidates[3]}}\nErantzuna:" +doc_to_choice: ["A", "B", "C", "D"] +validation_split: null +test_split: test +fewshot_split: test +output_type: multiple_choice +doc_to_target: answer +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 0.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_boolqa.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_boolqa.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..bdd60e5d0e78fc519ae08c97e4cdcb6986b04d8c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_boolqa.yaml @@ -0,0 +1,23 @@ +include: "_default_template_yaml" +tag: + - french_bench + - french_bench_extra +description: "D'après l'information dans le contexte donné, quelle est la réponse à la question ?" +task: french_bench_boolqa +dataset_path: manu/french_boolq +output_type: multiple_choice +validation_split: valid +doc_to_text: "\nContexte: {{passage}}\n\nQuestion: {{question}}\n" +doc_to_choice: ["Oui", "Non"] +# doc_to_text: "\nContexte: {{passage}}\n\nQuestion: {{question}}\n\nD'après l'information dans le contexte, la réponse est:\nA. Oui \nB. Non\n\nRéponse:" +# doc_to_choice: ["A", "B"] +doc_to_target: "{{[1, 0].index(label)}}" +should_decontaminate: true +doc_to_decontamination_query: passage +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_fquadv2.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_fquadv2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2e49ec43c185951786dc8a2ca60f80c71ed6ac25 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_fquadv2.yaml @@ -0,0 +1,29 @@ +include: "_default_template_yaml" +tag: + - french_bench + - french_bench_extra +description: "D'après l'information dans le contexte donné, donne la réponse à la question en citant quelques mots du contexte. Si il est impossible de répondre avec les informations du contexte, répond 'Impossible'." 
+task: french_bench_fquadv2 +dataset_path: manu/fquad2_test +output_type: generate_until +validation_split: valid +doc_to_text: "\nContexte: {{context}}\n\nQuestion: {{question}}\n\nRéponse:" +doc_to_target: "{% if answers.text| length > 0 %}{{answers.text[0]}}{% else %}{{['Impossible']}}{% endif %}" +target_delimiter: " " +should_decontaminate: true +doc_to_decontamination_query: context +generation_kwargs: + until: + - "\n" +# filter_list: +# - name: remove_whitespace +# filter: +# - function: remove_whitespace +# - function: take_first +metric_list: + - metric: !function utils.exact + aggregation: mean + higher_is_better: true + - metric: !function utils.f1 + aggregation: mean + higher_is_better: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_grammar.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_grammar.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6828c3a9fd7a9c73c7c7ff368952ca22805b4b7a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_grammar.yaml @@ -0,0 +1,20 @@ +include: "_default_template_yaml" +tag: + - french_bench + - french_bench_mc +description: "Répond au mieux en complétant la question avec une des réponses proposées." 
+dataset_path: manu/french-bench-grammar-vocab-reading +output_type: multiple_choice +validation_split: Grammar +fewshot_split: Grammar +test_split: Grammar +#doc_to_text: "Question: {{question.strip()}}\nA: {{answerA}}\nB: {{answerB}}\nC: {{answerC}}\nD: {{answerD}}\nRéponse:" +#doc_to_choice: ["A", "B", "C", "D"] +doc_to_text: "La phrase suivante est correcte grammaticalement:\n" +doc_to_choice: "{{[question.replace('<...>', answerA), question.replace('<...>', answerB), question.replace('<...>', answerC), question.replace('<...>', answerD)]}}" +doc_to_target: '{{["answerA", "answerB", "answerC", "answerD"].index("answer" + answer)}}' +task: french_bench_grammar +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_hellaswag.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_hellaswag.yaml new file mode 100644 index 0000000000000000000000000000000000000000..293a76c27a9bfbf7beec22805d06f44310cf143c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_hellaswag.yaml @@ -0,0 +1,20 @@ +tag: + - french_bench + - french_bench_mc +task: french_bench_hellaswag +dataset_path: manu/french_bench_hellaswag +output_type: multiple_choice +training_split: validation +validation_split: validation +test_split: null +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{label}}" +doc_to_choice: "{{choices}}" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_opus_perplexity.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_opus_perplexity.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dbe714a9c03a9f099d7438f4458d086caa7cd4a1 --- 
/dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_opus_perplexity.yaml @@ -0,0 +1,23 @@ +tag: + - french_bench_perplexity +task: french_bench_opus_perplexity +dataset_path: manu/opus100-en-fr +output_type: loglikelihood_rolling +test_split: test +fewshot_split: validation +validation_split: validation +num_fewshot: 0 +doc_to_text: "" +doc_to_target: "{{text}}" +should_decontaminate: true +doc_to_decontamination_query: "{{text}}" +metric_list: + - metric: word_perplexity + aggregation: weighted_perplexity + higher_is_better: false + - metric: byte_perplexity + aggregation: weighted_perplexity + higher_is_better: false + - metric: bits_per_byte + aggregation: bits_per_byte + higher_is_better: false diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_orangesum_abstract.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_orangesum_abstract.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1d4a3b4acb9e5304eb191e345edc191309245729 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_orangesum_abstract.yaml @@ -0,0 +1,28 @@ +include: "_default_template_yaml" +tag: + - french_bench + - french_bench_gen +description: "Résume l'article en une phrase." 
+task: french_bench_orangesum_abstract +dataset_path: orange_sum +dataset_name: abstract +output_type: generate_until +validation_split: validation +fewshot_split: validation +doc_to_text: "\nArticle: {{text}}\n\nRésumé:" +doc_to_target: "{{summary}}" +target_delimiter: " " +should_decontaminate: true +doc_to_decontamination_query: summary +generation_kwargs: + until: + - "\n" +# filter_list: +# - name: remove_whitespace +# filter: +# - function: remove_whitespace +# - function: take_first +metric_list: + - metric: !function utils.rouge1 + higher_is_better: true + aggregation: !function utils.rouge1_agg diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_topic_based_nli.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_topic_based_nli.yaml new file mode 100644 index 0000000000000000000000000000000000000000..28dd6af64ecd6344146a790a57cfc43ccf2eb3a2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_topic_based_nli.yaml @@ -0,0 +1,23 @@ +include: "_default_template_yaml" +tag: + - french_bench + - french_bench_extra +description: "A propos du thème spécifié, l'avis client est il positif, négatif, ou neutre ?" +task: french_bench_topic_based_nli +dataset_path: manu/topic_based_nli_test +output_type: multiple_choice +validation_split: valid +# doc_to_text: "\nAvis Client: {{text}}\n\nEn considèrant uniquement le thème \"{{topic}}\", l'avis client est plutot:\nA. Positif \nB. Négatif\nC. Mitigé \nD. Neutre\nE. 
Absent\n\nRéponse:" +# doc_to_choice: ["A", "B", "C", "D", "E"] +doc_to_text: "\nAvis Client: {{text}}\n\nA propos du thème \"{{topic}}\", l'avis client est" +doc_to_choice: ['positif', 'négatif', 'neutre'] +doc_to_target: "{{['positif', 'negatif', 'neutre'].index(polarity)}}" +should_decontaminate: true +doc_to_decontamination_query: texte +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_vocab.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_vocab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a5d5cadcd4ab2e879909fc51c94699ff45e4f6b3 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_vocab.yaml @@ -0,0 +1,20 @@ +include: "_default_template_yaml" +tag: + - french_bench + - french_bench_mc +# description: "Répond au mieux en complétant la question avec une des réponses proposées." 
+dataset_path: manu/french-bench-grammar-vocab-reading +output_type: multiple_choice +validation_split: Vocabulary +fewshot_split: Vocabulary +test_split: Vocabulary +# doc_to_text: "Question: {{question.strip()}}\nA: {{answerA}}\nB: {{answerB}}\nC: {{answerC}}\nD: {{answerD}}\nRéponse:" +# doc_to_choice: ["A", "B", "C", "D"] +doc_to_text: "La phrase suivante est logique sémantiquement:\n" +doc_to_choice: "{{[question.replace('<...>', answerA), question.replace('<...>', answerB), question.replace('<...>', answerC), question.replace('<...>', answerD)]}}" +doc_to_target: '{{["answerA", "answerB", "answerC", "answerD"].index("answer" + answer)}}' +task: french_bench_vocab +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_wikitext_fr.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_wikitext_fr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d7ae23ff9246e09e31a0c2e77f19935b7d03432d --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_wikitext_fr.yaml @@ -0,0 +1,25 @@ +tag: + - french_bench_perplexity +task: french_bench_wikitext_fr +dataset_path: asi/wikitext_fr +dataset_name: wikitext-35 +output_type: loglikelihood_rolling +training_split: train +validation_split: validation +test_split: test +num_fewshot: 0 +doc_to_text: "" +doc_to_target: !function preprocess_wikitext.wikitext_detokenizer +process_results: !function preprocess_wikitext.process_results +should_decontaminate: true +doc_to_decontamination_query: "{{paragraph}}" +metric_list: + - metric: word_perplexity + aggregation: weighted_perplexity + higher_is_better: false + - metric: byte_perplexity + aggregation: weighted_perplexity + higher_is_better: false + - metric: bits_per_byte + aggregation: bits_per_byte + higher_is_better: false diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_xnli.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_xnli.yaml new file mode 100644 index 0000000000000000000000000000000000000000..272b5652e81fdc3d42a6ca6cc39220d715251323 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_xnli.yaml @@ -0,0 +1,21 @@ +include: "_default_template_yaml" +tag: + - french_bench + - french_bench_extra +description: "La prémisse et l'hypothèse sont elles en accord, neutres en elles, ou en contradiction ?" +dataset_path: xnli +dataset_name: fr +output_type: multiple_choice +validation_split: validation +fewshot_split: validation +test_split: test +# doc_to_text: "\nPrémisse: {{premise}}\n\nHypothèse: {{hypothesis}}\n\nLa prémisse et l'hypothèse sont:\nA. En accord\nB. Neutre\nC. En contradiction\nRéponse:" +# doc_to_choice: "{{['A: En accord', 'B: Neutre', 'C: En contradiction']}}" +doc_to_text: "\nPrémisse: {{premise}}\n\nHypothèse: {{hypothesis}}\n\nLa prémisse et l'hypothèse sont" +doc_to_choice: "{{['en accord', 'neutres entre elles', 'en contradiction']}}" +doc_to_target: label +task: french_bench_xnli +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/glianorex/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/glianorex/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3efc925665fb2fcb21c15777f91fda06ae9822b3 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/glianorex/README.md @@ -0,0 +1,20 @@ +# Glianorex + +The goal of this benchmark is to isolate the test answering capabilities from the content knowledge. 
+ +### Paper + +Title: Multiple Choice Questions and Large Languages Models: A Case Study with Fictional Medical Data + +Abstract: https://arxiv.org/abs/2406.02394 + +To test the relevance of MCQs to assess LLM performance without prior data exposure, we created a fictional medical benchmark and knowledge base on a non-existent gland, the Glianorex. Using GPT-4 we generated a comprehensive textbook on the Glianorex in both English and French, and created multiple-choice questions in both English and French. + +### Tasks + +All tasks are multiple choice questions with 4 options, only one correct option. + +- `glianorex`: Evaluates all tasks listed below. + +- `glianorex_en`: Evaluates the accuracy on 264 questions in English. +- `glianorex_fr`: Evaluates the accuracy on 264 questions in French. diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/glianorex/glianorex.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/glianorex/glianorex.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a7ba4366564ab4d81b08708f67f42253682a9639 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/glianorex/glianorex.yaml @@ -0,0 +1,14 @@ +task: glianorex +dataset_path: maximegmd/glianorex +output_type: multiple_choice +test_split: train +doc_to_text: !function preprocess_glianorex.doc_to_text +doc_to_target: !function preprocess_glianorex.doc_to_target +doc_to_choice: [ 'A', 'B', 'C', 'D' ] +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/glianorex/glianorex_en.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/glianorex/glianorex_en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b08c6f811425a7be4911a0f47f8994dfc630a3b8 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/glianorex/glianorex_en.yaml @@ -0,0 +1,15 @@ +task: 
import datasets


def doc_to_text(doc) -> str:
    """Render a question and its lettered options as an MCQ prompt.

    Produces: "Question: <q>\n<A. opt>\n<B. opt>\n...\nAnswer:"
    """
    parts = [f"Question: {doc['question']}"]
    parts.extend(f"{label}. {text}" for label, text in doc["options"].items())
    parts.append("Answer:")
    return "\n".join(parts)


def doc_to_target(doc):
    """Return the gold answer index stored on the document."""
    return doc["answer_idx"]


def filter_dataset(dataset: datasets.Dataset, lang: str) -> datasets.Dataset:
    """Keep only examples whose language code starts with `lang`."""

    def _keep(example):
        return example["language"].startswith(lang)

    return dataset.filter(_keep)


def filter_french(dataset: datasets.Dataset) -> datasets.Dataset:
    """Restrict the dataset to French-language examples."""
    return filter_dataset(dataset, "fr")


def filter_english(dataset: datasets.Dataset) -> datasets.Dataset:
    """Restrict the dataset to English-language examples."""
    return filter_dataset(dataset, "en")
+ +Homepage: https://gluebenchmark.com/ + +### Citation + +``` +@inproceedings{wang-etal-2018-glue, + title = "{GLUE}: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding", + author = "Wang, Alex and + Singh, Amanpreet and + Michael, Julian and + Hill, Felix and + Levy, Omer and + Bowman, Samuel", + booktitle = "Proceedings of the 2018 {EMNLP} Workshop {B}lackbox{NLP}: Analyzing and Interpreting Neural Networks for {NLP}", + month = nov, + year = "2018", + address = "Brussels, Belgium", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/W18-5446", + doi = "10.18653/v1/W18-5446", + pages = "353--355", + abstract = "Human ability to understand language is \textit{general, flexible, and robust}. In contrast, most NLU models above the word level are designed for a specific task and struggle with out-of-domain data. If we aspire to develop models with understanding beyond the detection of superficial correspondences between inputs and outputs, then it is critical to develop a unified model that can execute a range of linguistic tasks across different domains. To facilitate research in this direction, we present the General Language Understanding Evaluation (GLUE, gluebenchmark.com): a benchmark of nine diverse NLU tasks, an auxiliary dataset for probing models for understanding of specific linguistic phenomena, and an online platform for evaluating and comparing models. For some benchmark tasks, training data is plentiful, but for others it is limited or does not match the genre of the test set. GLUE thus favors models that can represent linguistic knowledge in a way that facilitates sample-efficient learning and effective knowledge-transfer across tasks. While none of the datasets in GLUE were created from scratch for the benchmark, four of them feature privately-held test data, which is used to ensure that the benchmark is used fairly. 
We evaluate baselines that use ELMo (Peters et al., 2018), a powerful transfer learning technique, as well as state-of-the-art sentence representation models. The best models still achieve fairly low absolute scores. Analysis with our diagnostic dataset yields similarly weak performance over all phenomena tested, with some exceptions.", +} +``` + +### Groups, Tags, and Tasks + +#### Groups + +None. + +#### Tags + +* `glue`: Run all Glue subtasks. + +#### Tasks + +* `cola` +* `mnli` +* `mrpc` +* `qnli` +* `qqp` +* `rte` +* `sst` +* `wnli` + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [ ] Is the task an existing benchmark in the literature? + * [ ] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? 
diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/cola/default.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/cola/default.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7f79e5e8f6403014e790726d8d66eac86629ec90 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/cola/default.yaml @@ -0,0 +1,16 @@ +tag: glue +task: cola +dataset_path: glue +dataset_name: cola +output_type: multiple_choice +training_split: train +validation_split: validation +doc_to_text: "{{sentence}}\nQuestion: Does this sentence make sense?\nAnswer:" +doc_to_target: label +doc_to_choice: ["no", "yes"] +should_decontaminate: true +doc_to_decontamination_query: sentence +metric_list: + - metric: mcc +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/mnli/default.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/mnli/default.yaml new file mode 100644 index 0000000000000000000000000000000000000000..654f61231fa51f7688daf8063514656aa7e29283 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/mnli/default.yaml @@ -0,0 +1,14 @@ +tag: glue +task: mnli +dataset_path: glue +dataset_name: mnli +output_type: multiple_choice +training_split: train +validation_split: validation_matched +doc_to_text: !function utils.doc_to_text +doc_to_target: label +doc_to_choice: ["True", "Neither", "False"] +metric_list: + - metric: acc +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/mnli/mismatch.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/mnli/mismatch.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1e9b49bcd423ce43bf87f044c75a01e75f44d3d0 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/mnli/mismatch.yaml @@ -0,0 +1,3 @@ +include: default.yaml +task: mnli_mismatch +validation_split: validation_mismatched diff --git 
def doc_to_text(doc) -> str:
    """Format an MNLI premise/hypothesis pair as a True/False/Neither prompt.

    The hypothesis is stripped of surrounding whitespace and, if it does
    not already end with a period, one is appended so prompts read
    uniformly. The premise is used verbatim.

    Fix over the original: the hypothesis was stripped twice; the stripped
    value is now computed once and reused.
    """
    hypothesis = doc["hypothesis"].strip()
    if not hypothesis.endswith("."):
        hypothesis += "."
    return f"{doc['premise']}\nQuestion: {hypothesis} True, False or Neither?\nAnswer:"
+doc_to_choice: ["yes", "no"] +metric_list: + - metric: acc +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/qqp/default.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/qqp/default.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f76da063e635c8aaa31e9f2d365655f2ad1df091 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/qqp/default.yaml @@ -0,0 +1,15 @@ +tag: glue +task: qqp +dataset_path: glue +dataset_name: qqp +output_type: multiple_choice +training_split: train +validation_split: validation +doc_to_text: "Question 1: {{question1}}\nQuestion 2: {{question2}}\nQuestion: Do both questions ask the same thing?\nAnswer:" +doc_to_target: label +doc_to_choice: ["no", "yes"] +metric_list: + - metric: acc + - metric: f1 +metadata: + version: 2.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/rte/default.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/rte/default.yaml new file mode 100644 index 0000000000000000000000000000000000000000..216c5210622e997b5623418271993934dbfa6e09 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/rte/default.yaml @@ -0,0 +1,14 @@ +tag: glue +task: rte +dataset_path: glue +dataset_name: rte +output_type: multiple_choice +training_split: train +validation_split: validation +doc_to_text: "{{sentence1}}\nQuestion: {{sentence2}} True or False?\nAnswer:" +doc_to_target: label +doc_to_choice: ["True", "False"] +metric_list: + - metric: acc +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/sst2/default.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/sst2/default.yaml new file mode 100644 index 0000000000000000000000000000000000000000..160e3e08a205bc8c369852b54d926c59a34d0f6b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/sst2/default.yaml @@ -0,0 +1,14 @@ +tag: glue +task: sst2 +dataset_path: glue 
+dataset_name: sst2 +output_type: multiple_choice +training_split: train +validation_split: validation +doc_to_text: "{{sentence}}\nQuestion: Is this sentence positive or negative?\nAnswer:" +doc_to_target: label +doc_to_choice: ["negative", "positive"] +metric_list: + - metric: acc +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/wnli/default.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/wnli/default.yaml new file mode 100644 index 0000000000000000000000000000000000000000..63966e4c8be81b78eca1b4b1540583199fd42ee2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/wnli/default.yaml @@ -0,0 +1,14 @@ +tag: glue +task: wnli +dataset_path: glue +dataset_name: wnli +output_type: multiple_choice +training_split: train +validation_split: validation +doc_to_text: "{{sentence1}}\nQuestion: {{sentence2}} True or False?\nAnswer:" +doc_to_target: label +doc_to_choice: ["False", "True"] +metric_list: + - metric: acc +metadata: + version: 2.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_chemical_engineering.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_chemical_engineering.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e5875bb7e8be076e5f7a1076b01b21bf308b5acd --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_chemical_engineering.yaml @@ -0,0 +1,3 @@ +dataset_name: Chemical-Engineering +include: _direct_kmmlu_yaml +task: kmmlu_direct_chemical_engineering diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/20_newsgroups.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/20_newsgroups.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f2444bd24f9133737df0e9dfaa31b8755ffbd94f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/20_newsgroups.yaml @@ -0,0 +1,3 @@ +task: 
20_newsgroups +include: unitxt +recipe: card=cards.20_newsgroups,template=templates.classification.multi_class.title diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1cfc850834c2255ea9bc1b8f08113b3e26fa42d2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/README.md @@ -0,0 +1,77 @@ +# Unitxt + +### Paper + +Title: `Unitxt: Flexible, Shareable and Reusable Data Preparation and Evaluation for Generative AI` +Abstract: `https://arxiv.org/abs/2401.14019` + +Unitxt is a library for customizable textual data preparation and evaluation tailored to generative language models. Unitxt natively integrates with common libraries like HuggingFace and LM-eval-harness and deconstructs processing flows into modular components, enabling easy customization and sharing between practitioners. These components encompass model-specific formats, task prompts, and many other comprehensive dataset processing definitions. These components are centralized in the Unitxt-Catalog, thus fostering collaboration and exploration in modern textual data workflows. + +The full Unitxt catalog can be viewed in an online explorer. 
`https://unitxt.readthedocs.io/en/latest/docs/demo.html` + +Homepage: https://unitxt.readthedocs.io/en/latest/index.html + +### Citation + +``` +@misc{unitxt, + title={Unitxt: Flexible, Shareable and Reusable Data Preparation and Evaluation for Generative AI}, + author={Elron Bandel and Yotam Perlitz and Elad Venezian and Roni Friedman-Melamed and Ofir Arviv and Matan Orbach and Shachar Don-Yehyia and Dafna Sheinwald and Ariel Gera and Leshem Choshen and Michal Shmueli-Scheuer and Yoav Katz}, + year={2024}, + eprint={2401.14019}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` + +### Groups and Tasks + +#### Groups + +* `unitxt`: Subset of Unitxt tasks that were not in LM-Eval Harness task catalog, including new types of tasks like multi-label classification, grammatical error correction, named entity extraction. + +#### Tasks + +The full list of Unitxt tasks currently supported can be seen under `tasks/unitxt` directory. + +### Adding tasks + +You can add additional tasks from the Unitxt catalog by generating new LM-Eval yaml files for these datasets. + +The Unitxt task yaml files are generated via the `generate_yamls.py` script in the `tasks/unitxt` directory. + +To add a yaml file for an existing dataset Unitxt which is not yet in LM-Eval: +1. Add the card name to the `unitxt_datasets` file in the `tasks/unitxt` directory. +2. The generate_yaml.py contains the default Unitxt [template](https://unitxt.readthedocs.io/en/latest/docs/adding_template.html) used for each kind of NLP task in the `default_template_per_task` dictionary. If the dataset is of a Unitxt task type, previously not used in LM-Eval, you will need to add a default template for it in the dictionary. 
+ +``` +default_template_per_task = { + "tasks.classification.multi_label" : "templates.classification.multi_label.title" , + "tasks.classification.multi_class" : "templates.classification.multi_class.title" , + "tasks.summarization.abstractive" : "templates.summarization.abstractive.full", + "tasks.regression.two_texts" : "templates.regression.two_texts.simple", + "tasks.qa.with_context.extractive" : "templates.qa.with_context.simple", + "tasks.grammatical_error_correction" : "templates.grammatical_error_correction.simple", + "tasks.span_labeling.extraction" : "templates.span_labeling.extraction.title" +} +``` +3. Run `python generate_yaml.py` (this will generate all the datasets listed in the `unitxt_datasets`) + +If you want to add a new dataset to the Unitxt catalog, see the Unitxt documentation: + +https://unitxt.readthedocs.io/en/latest/docs/adding_dataset.html + + + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [ ] Is the task an existing benchmark in the literature? + * [ ] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? 
diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/ag_news.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/ag_news.yaml new file mode 100644 index 0000000000000000000000000000000000000000..792ce0b4b48ee8f986ac5207b2b5821cc0e34800 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/ag_news.yaml @@ -0,0 +1,3 @@ +task: ag_news +include: unitxt +recipe: card=cards.ag_news,template=templates.classification.multi_class.title diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/argument_topic.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/argument_topic.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d04810cd49f1a7bf2f344a2d30e1a1f4faa2deba --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/argument_topic.yaml @@ -0,0 +1,3 @@ +task: argument_topic +include: unitxt +recipe: card=cards.argument_topic,template=templates.classification.multi_class.title diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/atis.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/atis.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e9a26697accf1c623ac1cfbea228dda00167dc02 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/atis.yaml @@ -0,0 +1,3 @@ +task: atis +include: unitxt +recipe: card=cards.atis,template=templates.span_labeling.extraction.title diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/banking77.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/banking77.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6475575dd82439d4180ffa7a7b93d54cf9d8006c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/banking77.yaml @@ -0,0 +1,3 @@ +task: banking77 +include: unitxt +recipe: card=cards.banking77,template=templates.classification.multi_class.title diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/claim_stance_topic.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/claim_stance_topic.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2a2469d5ff78a6b5b4bc72ff6e867d94cf1ecee3 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/claim_stance_topic.yaml @@ -0,0 +1,3 @@ +task: claim_stance_topic +include: unitxt +recipe: card=cards.claim_stance_topic,template=templates.classification.multi_class.title diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/cnn_dailymail.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/cnn_dailymail.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aa3748c806824bbca8ae8db40f7112db3bd877f3 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/cnn_dailymail.yaml @@ -0,0 +1,3 @@ +task: cnn_dailymail +include: unitxt +recipe: card=cards.cnn_dailymail,template=templates.summarization.abstractive.full diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/coedit_gec.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/coedit_gec.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4959064696816a22e7c084a45497b0670f796950 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/coedit_gec.yaml @@ -0,0 +1,3 @@ +task: coedit_gec +include: unitxt +recipe: card=cards.coedit_gec,template=templates.grammatical_error_correction.simple diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/dbpedia_14.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/dbpedia_14.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b26d65a72be22c2faae0080d4d5c223062e67d9a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/dbpedia_14.yaml @@ -0,0 +1,3 @@ +task: dbpedia_14 +include: unitxt +recipe: 
card=cards.dbpedia_14,template=templates.classification.multi_class.title diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/ethos_binary.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/ethos_binary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3976de43ace0048784a0c802777fd815976571ba --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/ethos_binary.yaml @@ -0,0 +1,3 @@ +task: ethos_binary +include: unitxt +recipe: card=cards.ethos_binary,template=templates.classification.multi_class.title diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/financial_tweets.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/financial_tweets.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7b4bb9e538238b2bd4fe7d11c31389a11fadbe7a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/financial_tweets.yaml @@ -0,0 +1,3 @@ +task: financial_tweets +include: unitxt +recipe: card=cards.financial_tweets,template=templates.classification.multi_class.title diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/law_stack_exchange.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/law_stack_exchange.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d0c589a3d69da65256799d9c6f15cd4a48a7fadd --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/law_stack_exchange.yaml @@ -0,0 +1,3 @@ +task: law_stack_exchange +include: unitxt +recipe: card=cards.law_stack_exchange,template=templates.classification.multi_class.title diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/ledgar.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/ledgar.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1c31589764197998f0cc4bd89b256a9e7e83cd22 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/ledgar.yaml @@ -0,0 
+1,3 @@ +task: ledgar +include: unitxt +recipe: card=cards.ledgar,template=templates.classification.multi_class.title diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/medical_abstracts.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/medical_abstracts.yaml new file mode 100644 index 0000000000000000000000000000000000000000..74cfef0b685d5c4f583e379df107ed404ae81aed --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/medical_abstracts.yaml @@ -0,0 +1,3 @@ +task: medical_abstracts +include: unitxt +recipe: card=cards.medical_abstracts,template=templates.classification.multi_class.title diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/stsb.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/stsb.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8d91b0e13c6a7327efd3c9efd36183ae87ef242c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/stsb.yaml @@ -0,0 +1,3 @@ +task: stsb +include: unitxt +recipe: card=cards.stsb,template=templates.regression.two_texts.simple diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/task.py b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/task.py new file mode 100644 index 0000000000000000000000000000000000000000..339a3076c5a70930ca4d409faba978f036ae773b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/task.py @@ -0,0 +1,142 @@ +""" +In the dynamic landscape of generative NLP, traditional text processing pipelines limit research flexibility and reproducibility, as they are tailored to specific dataset, task, and model combinations. The escalating complexity, involving system prompts, model-specific formats, instructions, and more, calls for a shift to a structured, modular, and customizable solution. 
+ +Addressing this need, we present Unitxt, an innovative library for customizable textual data preparation and evaluation tailored to generative language models. Unitxt natively integrates with common libraries like HuggingFace and LM-eval-harness and deconstructs processing flows into modular components, enabling easy customization and sharing between practitioners. These components encompass model-specific formats, task prompts, and many other comprehensive dataset processing definitions. The Unitxt-Catalog centralizes these components, fostering collaboration and exploration in modern textual data workflows. Beyond being a tool, Unitxt is a community-driven platform, empowering users to build, share, and advance their pipelines collaboratively. +""" + +from functools import partial +from typing import Optional + +import evaluate + +from lm_eval.api.instance import Instance +from lm_eval.api.task import ConfigurableTask + + +_CITATION = """ +@misc{bandel2024unitxt, + title={Unitxt: Flexible, Shareable and Reusable Data Preparation and Evaluation for Generative AI}, + author={Elron Bandel and Yotam Perlitz and Elad Venezian and Roni Friedman-Melamed and Ofir Arviv and Matan Orbach and Shachar Don-Yehyia and Dafna Sheinwald and Ariel Gera and Leshem Choshen and Michal Shmueli-Scheuer and Yoav Katz}, + year={2024}, + eprint={2401.14019}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +""" + + +def score(items, metric): + predictions, references = zip(*items) + evaluator = evaluate.load("unitxt/metric") + for reference in references: + reference["metrics"] = [metric] + results = evaluator.compute(predictions=predictions, references=references) + return results[0]["score"]["global"]["score"] + + +class Unitxt(ConfigurableTask): + VERSION = 0 + + def __init__( + self, + config: Optional[dict] = None, + ) -> None: + assert "recipe" in config, "Unitxt task must have a 'recipe' string." 
+ super().__init__( + config={ + "metadata": {"version": self.VERSION}, + "dataset_kwargs": {"trust_remote_code": True}, + "dataset_name": config["recipe"], + "dataset_path": "unitxt/data", + } + ) + self.metrics = self.dataset["test"][0]["metrics"] + + def has_training_docs(self): + return "train" in self.dataset + + def has_validation_docs(self): + return "validation" in self.dataset + + def has_test_docs(self): + return "test" in self.dataset + + def training_docs(self): + return self.dataset["train"] + + def validation_docs(self): + return self.dataset["validation"] + + def test_docs(self): + return self.dataset["test"] + + def doc_to_text(self, doc): + return doc["source"] + + def should_decontaminate(self): + return False + + def doc_to_target(self, doc): + doc["target"] + + def construct_requests(self, doc, ctx, **kwargs): + """Uses RequestFactory to construct Requests and returns an iterable of + Requests which will be sent to the LM. + + :param doc: + The document as returned from training_docs, validation_docs, or test_docs. + :param ctx: str + The context string, generated by fewshot_context. This includes the natural + language description, as well as the few shot examples, and the question + part of the document for `doc`. + """ + + return [ + Instance( + request_type="generate_until", + doc=doc, + arguments=(ctx, {"until": ["\n"]}), + idx=0, + **kwargs, + ) + ] + + def process_results(self, doc, results): + """Take a single document and the LM results and evaluates, returning a + dict where keys are the names of submetrics and values are the values of + the metric for that one document + + :param doc: + The document as returned from training_docs, validation_docs, or test_docs. + :param results: + The results of the requests created in construct_requests. 
+ """ + + continuation = results[0] + + predictions = continuation + + references = doc + return { + metric.replace("metrics.", ""): (predictions, references) + for metric in self.metrics + } + + def aggregation(self): + """ + :returns: {str: [float] -> float} + A dictionary where keys are the names of submetrics and values are + functions that aggregate a list of metrics + """ + return { + metric.replace("metrics.", ""): partial(score, metric=metric) + for metric in self.metrics + } + + def higher_is_better(self): + """ + :returns: {str: bool} + A dictionary where keys are the names of submetrics and values are + whether a higher value of the submetric is better + """ + return {metric.replace("metrics.", ""): True for metric in self.metrics} diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/unfair_tos.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/unfair_tos.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b401dfeff4d06d3b3c96b18f00ad211b4607b46e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/unfair_tos.yaml @@ -0,0 +1,3 @@ +task: unfair_tos +include: unitxt +recipe: card=cards.unfair_tos,template=templates.classification.multi_label.title diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/unitxt b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/unitxt new file mode 100644 index 0000000000000000000000000000000000000000..e6902c46d4a0342e10360715be125178ecd58aad --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/unitxt @@ -0,0 +1 @@ +class: !function task.Unitxt diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/xsum.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/xsum.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6fe2999dca43cb86ca91078c869f6622d7e01733 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/xsum.yaml @@ -0,0 +1,3 @@ +task: xsum 
+include: unitxt +recipe: card=cards.xsum,template=templates.summarization.abstractive.full diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/yahoo_answers_topics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/yahoo_answers_topics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6bf12faedbd58109333426b99ada5d14aa3e9f06 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unitxt/yahoo_answers_topics.yaml @@ -0,0 +1,3 @@ +task: yahoo_answers_topics +include: unitxt +recipe: card=cards.yahoo_answers_topics,template=templates.classification.multi_class.title