koichi12 commited on
Commit
f69a342
·
verified ·
1 Parent(s): b3ca754

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. scripts/yans/lm-evaluation-harness/lm_eval/tasks/__pycache__/__init__.cpython-310.pyc +0 -0
  2. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/README.md +101 -0
  3. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english.yaml +21 -0
  4. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_age.yaml +4 -0
  5. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_autre.yaml +4 -0
  6. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_disability.yaml +4 -0
  7. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_gender.yaml +4 -0
  8. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_nationality.yaml +4 -0
  9. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_physical_appearance.yaml +4 -0
  10. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_race_color.yaml +4 -0
  11. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_religion.yaml +4 -0
  12. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_sexual_orientation.yaml +4 -0
  13. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_socioeconomic.yaml +4 -0
  14. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french.yaml +3 -0
  15. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_age.yaml +4 -0
  16. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_autre.yaml +4 -0
  17. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_disability.yaml +4 -0
  18. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_gender.yaml +4 -0
  19. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_nationality.yaml +4 -0
  20. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_physical_appearance.yaml +4 -0
  21. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_race_color.yaml +4 -0
  22. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_religion.yaml +4 -0
  23. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_sexual_orientation.yaml +4 -0
  24. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_socioeconomic.yaml +4 -0
  25. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/utils.py +64 -0
  26. scripts/yans/lm-evaluation-harness/lm_eval/tasks/eus_proficiency/README.md +48 -0
  27. scripts/yans/lm-evaluation-harness/lm_eval/tasks/eus_proficiency/eus_proficiency.yaml +16 -0
  28. scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_boolqa.yaml +23 -0
  29. scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_fquadv2.yaml +29 -0
  30. scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_grammar.yaml +20 -0
  31. scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_hellaswag.yaml +20 -0
  32. scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_opus_perplexity.yaml +23 -0
  33. scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_orangesum_abstract.yaml +28 -0
  34. scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_topic_based_nli.yaml +23 -0
  35. scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_vocab.yaml +20 -0
  36. scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_wikitext_fr.yaml +25 -0
  37. scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_xnli.yaml +21 -0
  38. scripts/yans/lm-evaluation-harness/lm_eval/tasks/glianorex/README.md +20 -0
  39. scripts/yans/lm-evaluation-harness/lm_eval/tasks/glianorex/glianorex.yaml +14 -0
  40. scripts/yans/lm-evaluation-harness/lm_eval/tasks/glianorex/glianorex_en.yaml +15 -0
  41. scripts/yans/lm-evaluation-harness/lm_eval/tasks/glianorex/glianorex_fr.yaml +15 -0
  42. scripts/yans/lm-evaluation-harness/lm_eval/tasks/glianorex/preprocess_glianorex.py +23 -0
  43. scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/README.md +76 -0
  44. scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/cola/default.yaml +16 -0
  45. scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/mnli/default.yaml +14 -0
  46. scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/mnli/mismatch.yaml +3 -0
  47. scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/mnli/utils.py +6 -0
  48. scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/mrpc/default.yaml +15 -0
  49. scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/qnli/default.yaml +14 -0
  50. scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/qqp/default.yaml +15 -0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (17.7 kB). View file
 
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/README.md ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CrowS-Pairs
2
+
3
+ ### Paper
4
+
5
+ CrowS-Pairs: A Challenge Dataset for Measuring Social Biases in Masked Language Models
6
+ https://aclanthology.org/2020.emnlp-main.154/
7
+ French CrowS-Pairs: Extending a challenge dataset for measuring social bias in masked
8
+ language models to a language other than English
9
+ https://aclanthology.org/2022.acl-long.583/
10
+
11
+ CrowS-Pairs is a challenge set for evaluating language models (LMs) on their tendency
12
+ to generate biased outputs. CrowS-Pairs comes in 2 languages and the English subset has
13
+ a newer version which fixes some of the issues with the original version.
14
+
15
+ Homepage: https://github.com/nyu-mll/crows-pairs, https://gitlab.inria.fr/french-crows-pairs
16
+
17
+ ### Citation
18
+
19
+ ```bibtex
20
+ @inproceedings{nangia-etal-2020-crows,
21
+ title = "{C}row{S}-Pairs: A Challenge Dataset for Measuring Social Biases in Masked Language Models",
22
+ author = "Nangia, Nikita and
23
+ Vania, Clara and
24
+ Bhalerao, Rasika and
25
+ Bowman, Samuel R.",
26
+ booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
27
+ month = nov,
28
+ year = "2020",
29
+ address = "Online",
30
+ publisher = "Association for Computational Linguistics",
31
+ url = "https://aclanthology.org/2020.emnlp-main.154",
32
+ doi = "10.18653/v1/2020.emnlp-main.154",
33
+ pages = "1953--1967",
34
+ abstract = "Pretrained language models, especially masked language models (MLMs) have seen success across many NLP tasks. However, there is ample evidence that they use the cultural biases that are undoubtedly present in the corpora they are trained on, implicitly creating harm with biased representations. To measure some forms of social bias in language models against protected demographic groups in the US, we introduce the Crowdsourced Stereotype Pairs benchmark (CrowS-Pairs). CrowS-Pairs has 1508 examples that cover stereotypes dealing with nine types of bias, like race, religion, and age. In CrowS-Pairs a model is presented with two sentences: one that is more stereotyping and another that is less stereotyping. The data focuses on stereotypes about historically disadvantaged groups and contrasts them with advantaged groups. We find that all three of the widely-used MLMs we evaluate substantially favor sentences that express stereotypes in every category in CrowS-Pairs. As work on building less biased models advances, this dataset can be used as a benchmark to evaluate progress.",
35
+ }
36
+
37
+ @inproceedings{neveol-etal-2022-french,
38
+ title = "{F}rench {C}row{S}-Pairs: Extending a challenge dataset for measuring social bias in masked language models to a language other than {E}nglish",
39
+ author = {N{\'e}v{\'e}ol, Aur{\'e}lie and
40
+ Dupont, Yoann and
41
+ Bezan{\c{c}}on, Julien and
42
+ Fort, Kar{\"e}n},
43
+ booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
44
+ month = may,
45
+ year = "2022",
46
+ address = "Dublin, Ireland",
47
+ publisher = "Association for Computational Linguistics",
48
+ url = "https://aclanthology.org/2022.acl-long.583",
49
+ doi = "10.18653/v1/2022.acl-long.583",
50
+ pages = "8521--8531",
51
+ abstract = "Warning: This paper contains explicit statements of offensive stereotypes which may be upsetting.Much work on biases in natural language processing has addressed biases linked to the social and cultural experience of English speaking individuals in the United States. We seek to widen the scope of bias studies by creating material to measure social bias in language models (LMs) against specific demographic groups in France. We build on the US-centered CrowS-pairs dataset to create a multilingual stereotypes dataset that allows for comparability across languages while also characterizing biases that are specific to each country and language. We introduce 1,679 sentence pairs in French that cover stereotypes in ten types of bias like gender and age. 1,467 sentence pairs are translated from CrowS-pairs and 212 are newly crowdsourced. The sentence pairs contrast stereotypes concerning underadvantaged groups with the same sentence concerning advantaged groups. We find that four widely used language models (three French, one multilingual) favor sentences that express stereotypes in most bias categories. We report on the translation process from English into French, which led to a characterization of stereotypes in CrowS-pairs including the identification of US-centric cultural traits. We offer guidelines to further extend the dataset to other languages and cultural environments.",
52
+ }
53
+ ```
54
+
55
+ ### Groups and Tasks
56
+
57
+ #### Groups
58
+
59
+ - `crows_pairs_english`: The entire English subset of the CrowS-Pairs dataset.
60
+ - `crows_pairs_french`: The entire French subset of the CrowS-Pairs dataset.
61
+
62
+ #### Tasks
63
+
64
+
65
+ The following tasks evaluate sub-areas of bias in the English CrowS-Pairs dataset:
66
+ - `crows_pairs_english_age`
67
+ - `crows_pairs_english_autre`
68
+ - `crows_pairs_english_disability`
69
+ - `crows_pairs_english_gender`
70
+ - `crows_pairs_english_nationality`
71
+ - `crows_pairs_english_physical_appearance`
72
+ - `crows_pairs_english_race_color`
73
+ - `crows_pairs_english_religion`
74
+ - `crows_pairs_english_sexual_orientation`
75
+ - `crows_pairs_english_socioeconomic`
76
+
77
+ The following tasks evaluate sub-areas of bias in the French CrowS-Pairs dataset:
78
+ - `crows_pairs_french_age`
79
+ - `crows_pairs_french_autre`
80
+ - `crows_pairs_french_disability`
81
+ - `crows_pairs_french_gender`
82
+ - `crows_pairs_french_nationality`
83
+ - `crows_pairs_french_physical_appearance`
84
+ - `crows_pairs_french_race_color`
85
+ - `crows_pairs_french_religion`
86
+ - `crows_pairs_french_sexual_orientation`
87
+ - `crows_pairs_french_socioeconomic`
88
+
89
+ All tasks evaluate the percentage of more-stereotypical sentences that are rated as more likely by a model than the non-stereotypical sentences (`pct_stereotype`), as well as the average absolute difference of loglikelihoods between the sentences in the pairs.
90
+
91
+ ### Checklist
92
+
93
+ * [x] Is the task an existing benchmark in the literature?
94
+ * [x] Have you referenced the original paper that introduced the task?
95
+ * [x] If yes, does the original paper provide a reference implementation?
96
+ * [x] The original paper does not provide one for causal language models, so this is a novel formulation of the task for autoregressive LMs.
97
+
98
+ If other tasks on this dataset are already supported:
99
+ * [x] Is the "Main" variant of this task clearly denoted?
100
+ * [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
101
+ * [x] Have you noted which, if any, published evaluation setups are matched by this variant?
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tag:
2
+ - crows_pairs
3
+ task: crows_pairs_english
4
+ dataset_path: BigScienceBiasEval/crows_pairs_multilingual
5
+ dataset_name: english
6
+ test_split: test
7
+ output_type: multiple_choice
8
+ doc_to_text: ""
9
+ doc_to_target: 0
10
+ doc_to_choice: !function utils.doc_to_choice
11
+ target_delimiter: ""
12
+ process_results: !function utils.process_results
13
+ metric_list:
14
+ - metric: likelihood_diff
15
+ aggregation: mean
16
+ higher_is_better: false
17
+ - metric: pct_stereotype
18
+ aggregation: mean
19
+ higher_is_better: false
20
+ metadata:
21
+ version: 1.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_age.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_english_age
3
+ dataset_name: english
4
+ process_docs: !function utils.filter_age
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_autre.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_english_autre
3
+ dataset_name: english
4
+ process_docs: !function utils.filter_autre
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_disability.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_english_disability
3
+ dataset_name: english
4
+ process_docs: !function utils.filter_disability
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_gender.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_english_gender
3
+ dataset_name: english
4
+ process_docs: !function utils.filter_gender
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_nationality.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_english_nationality
3
+ dataset_name: english
4
+ process_docs: !function utils.filter_nationality
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_physical_appearance.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_english_physical_appearance
3
+ dataset_name: english
4
+ process_docs: !function utils.filter_appearance
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_race_color.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_english_race_color
3
+ dataset_name: english
4
+ process_docs: !function utils.filter_race_color
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_religion.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_english_religion
3
+ dataset_name: english
4
+ process_docs: !function utils.filter_religion
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_sexual_orientation.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_english_sexual_orientation
3
+ dataset_name: english
4
+ process_docs: !function utils.filter_orientation
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_socioeconomic.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_english_socioeconomic
3
+ dataset_name: english
4
+ process_docs: !function utils.filter_socio
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french.yaml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_french
3
+ dataset_name: french
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_age.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_french_age
3
+ dataset_name: french
4
+ process_docs: !function utils.filter_age
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_autre.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_french_autre
3
+ dataset_name: french
4
+ process_docs: !function utils.filter_autre
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_disability.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_french_disability
3
+ dataset_name: french
4
+ process_docs: !function utils.filter_disability
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_gender.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_french_gender
3
+ dataset_name: french
4
+ process_docs: !function utils.filter_gender
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_nationality.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_french_nationality
3
+ dataset_name: french
4
+ process_docs: !function utils.filter_nationality
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_physical_appearance.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_french_physical_appearance
3
+ dataset_name: french
4
+ process_docs: !function utils.filter_appearance
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_race_color.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_french_race_color
3
+ dataset_name: french
4
+ process_docs: !function utils.filter_race_color
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_religion.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_french_religion
3
+ dataset_name: french
4
+ process_docs: !function utils.filter_religion
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_sexual_orientation.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_french_sexual_orientation
3
+ dataset_name: french
4
+ process_docs: !function utils.filter_orientation
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_socioeconomic.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_french_socioeconomic
3
+ dataset_name: french
4
+ process_docs: !function utils.filter_socio
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/utils.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datasets
2
+
3
+
4
def process_results(doc, results):
    """Score one CrowS-Pairs sentence pair from its loglikelihood results.

    ``results`` holds ``(loglikelihood, is_greedy)`` tuples for the
    more-stereotypical and less-stereotypical sentences, in that order
    (the order produced by ``doc_to_choice``).

    Returns a dict with:
      - ``likelihood_diff``: absolute gap between the two loglikelihoods.
      - ``pct_stereotype``: 1.0 when the stereotypical sentence is scored
        strictly higher (the model "prefers" the stereotype), else 0.0.
    """
    loglikelihoods = [score for score, _ in results]
    ll_stereo, ll_nonstereo = loglikelihoods

    # Magnitude of the model's preference, regardless of direction.
    likelihood_diff = abs(ll_stereo - ll_nonstereo)

    # Strictly-higher loglikelihood for the stereotypical sentence counts
    # as predicting the stereotype; ties count as non-stereotypical.
    stereotype_preferred = 1.0 if ll_stereo > ll_nonstereo else 0.0

    return {"likelihood_diff": likelihood_diff, "pct_stereotype": stereotype_preferred}
17
+
18
+
19
def doc_to_choice(doc):
    """Return the two candidate sentences, stereotypical one first.

    The ordering contract matters: ``process_results`` assumes index 0 is
    the more-stereotypical sentence.
    """
    stereo = doc["sent_more"]
    nonstereo = doc["sent_less"]
    return [stereo, nonstereo]
21
+
22
+
23
def filter_dataset(dataset: datasets.Dataset, bias_type: str) -> datasets.Dataset:
    """Keep only examples whose ``bias_type`` field starts with *bias_type*.

    Prefix matching (rather than equality) lets one filter cover dataset
    variants that append sub-labels to the category name.
    """

    def _matches(example):
        return example["bias_type"].startswith(bias_type)

    return dataset.filter(_matches)
25
+
26
+
27
# Per-category subsets of CrowS-Pairs, each a thin wrapper over
# ``filter_dataset`` so YAML configs can reference them via
# ``!function utils.filter_<category>``.


def filter_race_color(dataset: datasets.Dataset) -> datasets.Dataset:
    """Subset: race / skin-color bias examples."""
    return filter_dataset(dataset, "race-color")


def filter_socio(dataset: datasets.Dataset) -> datasets.Dataset:
    """Subset: socioeconomic-status bias examples."""
    return filter_dataset(dataset, "socioeconomic")


def filter_gender(dataset: datasets.Dataset) -> datasets.Dataset:
    """Subset: gender bias examples."""
    return filter_dataset(dataset, "gender")


def filter_age(dataset: datasets.Dataset) -> datasets.Dataset:
    """Subset: age bias examples."""
    return filter_dataset(dataset, "age")


def filter_religion(dataset: datasets.Dataset) -> datasets.Dataset:
    """Subset: religion bias examples."""
    return filter_dataset(dataset, "religion")


def filter_disability(dataset: datasets.Dataset) -> datasets.Dataset:
    """Subset: disability bias examples."""
    return filter_dataset(dataset, "disability")


def filter_orientation(dataset: datasets.Dataset) -> datasets.Dataset:
    """Subset: sexual-orientation bias examples."""
    return filter_dataset(dataset, "sexual-orientation")


def filter_nationality(dataset: datasets.Dataset) -> datasets.Dataset:
    """Subset: nationality bias examples."""
    return filter_dataset(dataset, "nationality")


def filter_appearance(dataset: datasets.Dataset) -> datasets.Dataset:
    """Subset: physical-appearance bias examples."""
    return filter_dataset(dataset, "physical-appearance")


def filter_autre(dataset: datasets.Dataset) -> datasets.Dataset:
    """Subset: 'autre' (other) bias examples, from the French extension."""
    return filter_dataset(dataset, "autre")
scripts/yans/lm-evaluation-harness/lm_eval/tasks/eus_proficiency/README.md ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # EusProficiency
2
+
3
+ ### Paper
4
+
5
+ Title: Latxa: An Open Language Model and Evaluation Suite for Basque
6
+
7
+ Abstract: https://arxiv.org/abs/2403.20266
8
+
9
+ EusProficiency comprises 5,169 exercises on different topics from past EGA exams, the official C1-level certificate of proficiency in Basque. We collected the atarikoa exercises from EGA exams through the years 1998 to 2008. Atarikoa is the first qualifying test of EGA, which measures different aspects of language competency, such as reading comprehension, grammar, vocabulary, spelling, and writing. Each test generally has 85 multiple-choice questions, with 4 choices and a single correct answer.
10
+
11
+ Homepage: https://github.com/hitz-zentroa/latxa
12
+
13
+
14
+ ### Citation
15
+
16
+ ```
17
+ @misc{etxaniz2024latxa,
18
+ title={Latxa: An Open Language Model and Evaluation Suite for Basque},
19
+ author={Julen Etxaniz and Oscar Sainz and Naiara Perez and Itziar Aldabe and German Rigau and Eneko Agirre and Aitor Ormazabal and Mikel Artetxe and Aitor Soroa},
20
+ year={2024},
21
+ eprint={2403.20266},
22
+ archivePrefix={arXiv},
23
+ primaryClass={cs.CL}
24
+ }
25
+ ```
26
+
27
+ ### Groups and Tasks
28
+
29
+ #### Groups
30
+
31
+ There are no groups.
32
+
33
+ #### Tasks
34
+
35
+ * `eus_proficiency`: EusProficiency comprises 5,169 exercises on different topics from past EGA exams, the official C1-level certificate of proficiency in Basque.
36
+
37
+ ### Checklist
38
+
39
+ For adding novel benchmarks/datasets to the library:
40
+ * [ ] Is the task an existing benchmark in the literature?
41
+ * [ ] Have you referenced the original paper that introduced the task?
42
+ * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
43
+
44
+
45
+ If other tasks on this dataset are already supported:
46
+ * [ ] Is the "Main" variant of this task clearly denoted?
47
+ * [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
48
+ * [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
scripts/yans/lm-evaluation-harness/lm_eval/tasks/eus_proficiency/eus_proficiency.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_path: HiTZ/EusProficiency
2
+ dataset_name: default
3
+ task: eus_proficiency
4
+ doc_to_text: "Galdera: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nD: {{candidates[3]}}\nErantzuna:"
5
+ doc_to_choice: ["A", "B", "C", "D"]
6
+ validation_split: null
7
+ test_split: test
8
+ fewshot_split: test
9
+ output_type: multiple_choice
10
+ doc_to_target: answer
11
+ metric_list:
12
+ - metric: acc
13
+ aggregation: mean
14
+ higher_is_better: true
15
+ metadata:
16
+ version: 0.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_boolqa.yaml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ include: "_default_template_yaml"
2
+ tag:
3
+ - french_bench
4
+ - french_bench_extra
5
+ description: "D'après l'information dans le contexte donné, quelle est la réponse à la question ?"
6
+ task: french_bench_boolqa
7
+ dataset_path: manu/french_boolq
8
+ output_type: multiple_choice
9
+ validation_split: valid
10
+ doc_to_text: "\nContexte: {{passage}}\n\nQuestion: {{question}}\n"
11
+ doc_to_choice: ["Oui", "Non"]
12
+ # doc_to_text: "\nContexte: {{passage}}\n\nQuestion: {{question}}\n\nD'après l'information dans le contexte, la réponse est:\nA. Oui \nB. Non\n\nRéponse:"
13
+ # doc_to_choice: ["A", "B"]
14
+ doc_to_target: "{{[1, 0].index(label)}}"
15
+ should_decontaminate: true
16
+ doc_to_decontamination_query: passage
17
+ metric_list:
18
+ - metric: acc
19
+ aggregation: mean
20
+ higher_is_better: true
21
+ - metric: acc_norm
22
+ aggregation: mean
23
+ higher_is_better: true
scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_fquadv2.yaml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ include: "_default_template_yaml"
2
+ tag:
3
+ - french_bench
4
+ - french_bench_extra
5
+ description: "D'après l'information dans le contexte donné, donne la réponse à la question en citant quelques mots du contexte. Si il est impossible de répondre avec les informations du contexte, répond 'Impossible'."
6
+ task: french_bench_fquadv2
7
+ dataset_path: manu/fquad2_test
8
+ output_type: generate_until
9
+ validation_split: valid
10
+ doc_to_text: "\nContexte: {{context}}\n\nQuestion: {{question}}\n\nRéponse:"
11
+ doc_to_target: "{% if answers.text| length > 0 %}{{answers.text[0]}}{% else %}{{['Impossible']}}{% endif %}"
12
+ target_delimiter: " "
13
+ should_decontaminate: true
14
+ doc_to_decontamination_query: context
15
+ generation_kwargs:
16
+ until:
17
+ - "\n"
18
+ # filter_list:
19
+ # - name: remove_whitespace
20
+ # filter:
21
+ # - function: remove_whitespace
22
+ # - function: take_first
23
+ metric_list:
24
+ - metric: !function utils.exact
25
+ aggregation: mean
26
+ higher_is_better: true
27
+ - metric: !function utils.f1
28
+ aggregation: mean
29
+ higher_is_better: true
scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_grammar.yaml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ include: "_default_template_yaml"
2
+ tag:
3
+ - french_bench
4
+ - french_bench_mc
5
+ description: "Répond au mieux en complétant la question avec une des réponses proposées."
6
+ dataset_path: manu/french-bench-grammar-vocab-reading
7
+ output_type: multiple_choice
8
+ validation_split: Grammar
9
+ fewshot_split: Grammar
10
+ test_split: Grammar
11
+ #doc_to_text: "Question: {{question.strip()}}\nA: {{answerA}}\nB: {{answerB}}\nC: {{answerC}}\nD: {{answerD}}\nRéponse:"
12
+ #doc_to_choice: ["A", "B", "C", "D"]
13
+ doc_to_text: "La phrase suivante est correcte grammaticalement:\n"
14
+ doc_to_choice: "{{[question.replace('<...>', answerA), question.replace('<...>', answerB), question.replace('<...>', answerC), question.replace('<...>', answerD)]}}"
15
+ doc_to_target: '{{["answerA", "answerB", "answerC", "answerD"].index("answer" + answer)}}'
16
+ task: french_bench_grammar
17
+ metric_list:
18
+ - metric: acc
19
+ aggregation: mean
20
+ higher_is_better: true
scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_hellaswag.yaml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tag:
2
+ - french_bench
3
+ - french_bench_mc
4
+ task: french_bench_hellaswag
5
+ dataset_path: manu/french_bench_hellaswag
6
+ output_type: multiple_choice
7
+ training_split: validation
8
+ validation_split: validation
9
+ test_split: null
10
+ process_docs: !function utils.process_docs
11
+ doc_to_text: "{{query}}"
12
+ doc_to_target: "{{label}}"
13
+ doc_to_choice: "{{choices}}"
14
+ metric_list:
15
+ - metric: acc
16
+ aggregation: mean
17
+ higher_is_better: true
18
+ - metric: acc_norm
19
+ aggregation: mean
20
+ higher_is_better: true
scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_opus_perplexity.yaml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tag:
2
+ - french_bench_perplexity
3
+ task: french_bench_opus_perplexity
4
+ dataset_path: manu/opus100-en-fr
5
+ output_type: loglikelihood_rolling
6
+ test_split: test
7
+ fewshot_split: validation
8
+ validation_split: validation
9
+ num_fewshot: 0
10
+ doc_to_text: ""
11
+ doc_to_target: "{{text}}"
12
+ should_decontaminate: true
13
+ doc_to_decontamination_query: "{{text}}"
14
+ metric_list:
15
+ - metric: word_perplexity
16
+ aggregation: weighted_perplexity
17
+ higher_is_better: false
18
+ - metric: byte_perplexity
19
+ aggregation: weighted_perplexity
20
+ higher_is_better: false
21
+ - metric: bits_per_byte
22
+ aggregation: bits_per_byte
23
+ higher_is_better: false
scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_orangesum_abstract.yaml ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ include: "_default_template_yaml"
2
+ tag:
3
+ - french_bench
4
+ - french_bench_gen
5
+ description: "Résume l'article en une phrase."
6
+ task: french_bench_orangesum_abstract
7
+ dataset_path: orange_sum
8
+ dataset_name: abstract
9
+ output_type: generate_until
10
+ validation_split: validation
11
+ fewshot_split: validation
12
+ doc_to_text: "\nArticle: {{text}}\n\nRésumé:"
13
+ doc_to_target: "{{summary}}"
14
+ target_delimiter: " "
15
+ should_decontaminate: true
16
+ doc_to_decontamination_query: summary
17
+ generation_kwargs:
18
+ until:
19
+ - "\n"
20
+ # filter_list:
21
+ # - name: remove_whitespace
22
+ # filter:
23
+ # - function: remove_whitespace
24
+ # - function: take_first
25
+ metric_list:
26
+ - metric: !function utils.rouge1
27
+ higher_is_better: true
28
+ aggregation: !function utils.rouge1_agg
scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_topic_based_nli.yaml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ include: "_default_template_yaml"
2
+ tag:
3
+ - french_bench
4
+ - french_bench_extra
5
+ description: "A propos du thème spécifié, l'avis client est il positif, négatif, ou neutre ?"
6
+ task: french_bench_topic_based_nli
7
+ dataset_path: manu/topic_based_nli_test
8
+ output_type: multiple_choice
9
+ validation_split: valid
10
+ # doc_to_text: "\nAvis Client: {{text}}\n\nEn considèrant uniquement le thème \"{{topic}}\", l'avis client est plutot:\nA. Positif \nB. Négatif\nC. Mitigé \nD. Neutre\nE. Absent\n\nRéponse:"
11
+ # doc_to_choice: ["A", "B", "C", "D", "E"]
12
+ doc_to_text: "\nAvis Client: {{text}}\n\nA propos du thème \"{{topic}}\", l'avis client est"
13
+ doc_to_choice: ['positif', 'négatif', 'neutre']
14
+ doc_to_target: "{{['positif', 'negatif', 'neutre'].index(polarity)}}"
15
+ should_decontaminate: true
16
+ doc_to_decontamination_query: texte
17
+ metric_list:
18
+ - metric: acc
19
+ aggregation: mean
20
+ higher_is_better: true
21
+ - metric: acc_norm
22
+ aggregation: mean
23
+ higher_is_better: true
scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_vocab.yaml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ include: "_default_template_yaml"
2
+ tag:
3
+ - french_bench
4
+ - french_bench_mc
5
+ # description: "Répond au mieux en complétant la question avec une des réponses proposées."
6
+ dataset_path: manu/french-bench-grammar-vocab-reading
7
+ output_type: multiple_choice
8
+ validation_split: Vocabulary
9
+ fewshot_split: Vocabulary
10
+ test_split: Vocabulary
11
+ # doc_to_text: "Question: {{question.strip()}}\nA: {{answerA}}\nB: {{answerB}}\nC: {{answerC}}\nD: {{answerD}}\nRéponse:"
12
+ # doc_to_choice: ["A", "B", "C", "D"]
13
+ doc_to_text: "La phrase suivante est logique sémantiquement:\n"
14
+ doc_to_choice: "{{[question.replace('<...>', answerA), question.replace('<...>', answerB), question.replace('<...>', answerC), question.replace('<...>', answerD)]}}"
15
+ doc_to_target: '{{["answerA", "answerB", "answerC", "answerD"].index("answer" + answer)}}'
16
+ task: french_bench_vocab
17
+ metric_list:
18
+ - metric: acc
19
+ aggregation: mean
20
+ higher_is_better: true
scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_wikitext_fr.yaml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tag:
2
+ - french_bench_perplexity
3
+ task: french_bench_wikitext_fr
4
+ dataset_path: asi/wikitext_fr
5
+ dataset_name: wikitext-35
6
+ output_type: loglikelihood_rolling
7
+ training_split: train
8
+ validation_split: validation
9
+ test_split: test
10
+ num_fewshot: 0
11
+ doc_to_text: ""
12
+ doc_to_target: !function preprocess_wikitext.wikitext_detokenizer
13
+ process_results: !function preprocess_wikitext.process_results
14
+ should_decontaminate: true
15
+ doc_to_decontamination_query: "{{paragraph}}"
16
+ metric_list:
17
+ - metric: word_perplexity
18
+ aggregation: weighted_perplexity
19
+ higher_is_better: false
20
+ - metric: byte_perplexity
21
+ aggregation: weighted_perplexity
22
+ higher_is_better: false
23
+ - metric: bits_per_byte
24
+ aggregation: bits_per_byte
25
+ higher_is_better: false
scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_xnli.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ include: "_default_template_yaml"
2
+ tag:
3
+ - french_bench
4
+ - french_bench_extra
5
+ description: "La prémisse et l'hypothèse sont elles en accord, neutres en elles, ou en contradiction ?"
6
+ dataset_path: xnli
7
+ dataset_name: fr
8
+ output_type: multiple_choice
9
+ validation_split: validation
10
+ fewshot_split: validation
11
+ test_split: test
12
+ # doc_to_text: "\nPrémisse: {{premise}}\n\nHypothèse: {{hypothesis}}\n\nLa prémisse et l'hypothèse sont:\nA. En accord\nB. Neutre\nC. En contradiction\nRéponse:"
13
+ # doc_to_choice: "{{['A: En accord', 'B: Neutre', 'C: En contradiction']}}"
14
+ doc_to_text: "\nPrémisse: {{premise}}\n\nHypothèse: {{hypothesis}}\n\nLa prémisse et l'hypothèse sont"
15
+ doc_to_choice: "{{['en accord', 'neutres entre elles', 'en contradiction']}}"
16
+ doc_to_target: label
17
+ task: french_bench_xnli
18
+ metric_list:
19
+ - metric: acc
20
+ aggregation: mean
21
+ higher_is_better: true
scripts/yans/lm-evaluation-harness/lm_eval/tasks/glianorex/README.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Glianorex
2
+
3
+ The goal of this benchmark is to isolate a model's test-answering capabilities from its content knowledge.
4
+
5
+ ### Paper
6
+
7
+ Title: Multiple Choice Questions and Large Language Models: A Case Study with Fictional Medical Data
8
+
9
+ Abstract: https://arxiv.org/abs/2406.02394
10
+
11
+ To test the relevance of MCQs to assess LLM performance without prior data exposure, we created a fictional medical benchmark and knowledge base on a non-existent gland, the Glianorex. Using GPT-4 we generated a comprehensive textbook on the Glianorex in both English and French, and created multiple-choice questions in both English and French.
12
+
13
+ ### Tasks
14
+
15
+ All tasks are multiple-choice questions with 4 options, of which only one is correct.
16
+
17
+ - `glianorex`: Evaluates all tasks listed below.
18
+
19
+ - `glianorex_en`: Evaluates the accuracy on 264 questions in English.
20
+ - `glianorex_fr`: Evaluates the accuracy on 264 questions in French.
scripts/yans/lm-evaluation-harness/lm_eval/tasks/glianorex/glianorex.yaml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: glianorex
2
+ dataset_path: maximegmd/glianorex
3
+ output_type: multiple_choice
4
+ test_split: train
5
+ doc_to_text: !function preprocess_glianorex.doc_to_text
6
+ doc_to_target: !function preprocess_glianorex.doc_to_target
7
+ doc_to_choice: [ 'A', 'B', 'C', 'D' ]
8
+ metric_list:
9
+ - metric: acc
10
+ aggregation: mean
11
+ higher_is_better: true
12
+ - metric: acc_norm
13
+ aggregation: mean
14
+ higher_is_better: true
scripts/yans/lm-evaluation-harness/lm_eval/tasks/glianorex/glianorex_en.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: glianorex_en
2
+ dataset_path: maximegmd/glianorex
3
+ output_type: multiple_choice
4
+ test_split: train
5
+ doc_to_text: !function preprocess_glianorex.doc_to_text
6
+ doc_to_target: !function preprocess_glianorex.doc_to_target
7
+ process_docs: !function preprocess_glianorex.filter_english
8
+ doc_to_choice: [ 'A', 'B', 'C', 'D' ]
9
+ metric_list:
10
+ - metric: acc
11
+ aggregation: mean
12
+ higher_is_better: true
13
+ - metric: acc_norm
14
+ aggregation: mean
15
+ higher_is_better: true
scripts/yans/lm-evaluation-harness/lm_eval/tasks/glianorex/glianorex_fr.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: glianorex_fr
2
+ dataset_path: maximegmd/glianorex
3
+ output_type: multiple_choice
4
+ test_split: train
5
+ doc_to_text: !function preprocess_glianorex.doc_to_text
6
+ doc_to_target: !function preprocess_glianorex.doc_to_target
7
+ process_docs: !function preprocess_glianorex.filter_french
8
+ doc_to_choice: [ 'A', 'B', 'C', 'D' ]
9
+ metric_list:
10
+ - metric: acc
11
+ aggregation: mean
12
+ higher_is_better: true
13
+ - metric: acc_norm
14
+ aggregation: mean
15
+ higher_is_better: true
scripts/yans/lm-evaluation-harness/lm_eval/tasks/glianorex/preprocess_glianorex.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datasets
2
+
3
+
4
+ def doc_to_text(doc) -> str:
5
+ option_choices = doc["options"]
6
+ answers = "".join((f"{k}. {v}\n") for k, v in option_choices.items())
7
+ return f"Question: {doc['question']}\n{answers}Answer:"
8
+
9
+
10
+ def doc_to_target(doc) -> int:
11
+ return doc["answer_idx"]
12
+
13
+
14
+ def filter_dataset(dataset: datasets.Dataset, lang: str) -> datasets.Dataset:
15
+ return dataset.filter(lambda example: example["language"].startswith(lang))
16
+
17
+
18
+ def filter_french(dataset: datasets.Dataset) -> datasets.Dataset:
19
+ return filter_dataset(dataset, "fr")
20
+
21
+
22
+ def filter_english(dataset: datasets.Dataset) -> datasets.Dataset:
23
+ return filter_dataset(dataset, "en")
scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/README.md ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GLUE
2
+ **NOTE**: GLUE benchmark tasks do not provide publicly accessible labels for their test sets, so we default to the validation sets for all sub-tasks.
3
+
4
+ ### Paper
5
+
6
+ Title: `GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding`
7
+
8
+ Abstract: https://openreview.net/pdf?id=rJ4km2R5t7
9
+
10
+ The General Language Understanding Evaluation (GLUE) benchmark is a collection of
11
+ resources for training, evaluating, and analyzing natural language understanding
12
+ systems. GLUE consists of:
13
+ - A benchmark of nine sentence- or sentence-pair language understanding tasks built
14
+ on established existing datasets and selected to cover a diverse range of dataset
15
+ sizes, text genres, and degrees of difficulty, and
16
+ - A diagnostic dataset designed to evaluate and analyze model performance with
17
+ respect to a wide range of linguistic phenomena found in natural language.
18
+
19
+ Homepage: https://gluebenchmark.com/
20
+
21
+ ### Citation
22
+
23
+ ```
24
+ @inproceedings{wang-etal-2018-glue,
25
+ title = "{GLUE}: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding",
26
+ author = "Wang, Alex and
27
+ Singh, Amanpreet and
28
+ Michael, Julian and
29
+ Hill, Felix and
30
+ Levy, Omer and
31
+ Bowman, Samuel",
32
+ booktitle = "Proceedings of the 2018 {EMNLP} Workshop {B}lackbox{NLP}: Analyzing and Interpreting Neural Networks for {NLP}",
33
+ month = nov,
34
+ year = "2018",
35
+ address = "Brussels, Belgium",
36
+ publisher = "Association for Computational Linguistics",
37
+ url = "https://aclanthology.org/W18-5446",
38
+ doi = "10.18653/v1/W18-5446",
39
+ pages = "353--355",
40
+ abstract = "Human ability to understand language is \textit{general, flexible, and robust}. In contrast, most NLU models above the word level are designed for a specific task and struggle with out-of-domain data. If we aspire to develop models with understanding beyond the detection of superficial correspondences between inputs and outputs, then it is critical to develop a unified model that can execute a range of linguistic tasks across different domains. To facilitate research in this direction, we present the General Language Understanding Evaluation (GLUE, gluebenchmark.com): a benchmark of nine diverse NLU tasks, an auxiliary dataset for probing models for understanding of specific linguistic phenomena, and an online platform for evaluating and comparing models. For some benchmark tasks, training data is plentiful, but for others it is limited or does not match the genre of the test set. GLUE thus favors models that can represent linguistic knowledge in a way that facilitates sample-efficient learning and effective knowledge-transfer across tasks. While none of the datasets in GLUE were created from scratch for the benchmark, four of them feature privately-held test data, which is used to ensure that the benchmark is used fairly. We evaluate baselines that use ELMo (Peters et al., 2018), a powerful transfer learning technique, as well as state-of-the-art sentence representation models. The best models still achieve fairly low absolute scores. Analysis with our diagnostic dataset yields similarly weak performance over all phenomena tested, with some exceptions.",
41
+ }
42
+ ```
43
+
44
+ ### Groups, Tags, and Tasks
45
+
46
+ #### Groups
47
+
48
+ None.
49
+
50
+ #### Tags
51
+
52
+ * `glue`: Run all Glue subtasks.
53
+
54
+ #### Tasks
55
+
56
+ * `cola`
57
+ * `mnli`
58
+ * `mrpc`
59
+ * `qnli`
60
+ * `qqp`
61
+ * `rte`
62
+ * `sst`
63
+ * `wnli`
64
+
65
+ ### Checklist
66
+
67
+ For adding novel benchmarks/datasets to the library:
68
+ * [ ] Is the task an existing benchmark in the literature?
69
+ * [ ] Have you referenced the original paper that introduced the task?
70
+ * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
71
+
72
+
73
+ If other tasks on this dataset are already supported:
74
+ * [ ] Is the "Main" variant of this task clearly denoted?
75
+ * [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
76
+ * [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/cola/default.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tag: glue
2
+ task: cola
3
+ dataset_path: glue
4
+ dataset_name: cola
5
+ output_type: multiple_choice
6
+ training_split: train
7
+ validation_split: validation
8
+ doc_to_text: "{{sentence}}\nQuestion: Does this sentence make sense?\nAnswer:"
9
+ doc_to_target: label
10
+ doc_to_choice: ["no", "yes"]
11
+ should_decontaminate: true
12
+ doc_to_decontamination_query: sentence
13
+ metric_list:
14
+ - metric: mcc
15
+ metadata:
16
+ version: 1.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/mnli/default.yaml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tag: glue
2
+ task: mnli
3
+ dataset_path: glue
4
+ dataset_name: mnli
5
+ output_type: multiple_choice
6
+ training_split: train
7
+ validation_split: validation_matched
8
+ doc_to_text: !function utils.doc_to_text
9
+ doc_to_target: label
10
+ doc_to_choice: ["True", "Neither", "False"]
11
+ metric_list:
12
+ - metric: acc
13
+ metadata:
14
+ version: 1.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/mnli/mismatch.yaml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ include: default.yaml
2
+ task: mnli_mismatch
3
+ validation_split: validation_mismatched
scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/mnli/utils.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ def doc_to_text(doc) -> str:
2
+ return "{}\nQuestion: {} True, False or Neither?\nAnswer:".format(
3
+ doc["premise"],
4
+ doc["hypothesis"].strip()
5
+ + ("" if doc["hypothesis"].strip().endswith(".") else "."),
6
+ )
scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/mrpc/default.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tag: glue
2
+ task: mrpc
3
+ dataset_path: glue
4
+ dataset_name: mrpc
5
+ output_type: multiple_choice
6
+ training_split: train
7
+ validation_split: validation
8
+ doc_to_text: "Sentence 1: {{sentence1}}\nSentence 2: {{sentence2}}\nQuestion: Do both sentences mean the same thing?\nAnswer:"
9
+ doc_to_target: label
10
+ doc_to_choice: ["no", "yes"]
11
+ metric_list:
12
+ - metric: acc
13
+ - metric: f1
14
+ metadata:
15
+ version: 1.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/qnli/default.yaml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tag: glue
2
+ task: qnli
3
+ dataset_path: glue
4
+ dataset_name: qnli
5
+ output_type: multiple_choice
6
+ training_split: train
7
+ validation_split: validation
8
+ doc_to_text: "{{question}}\n{{sentence}}\nQuestion: Does this response answer the question?\nAnswer:"
9
+ doc_to_target: label
10
+ doc_to_choice: ["yes", "no"]
11
+ metric_list:
12
+ - metric: acc
13
+ metadata:
14
+ version: 1.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/qqp/default.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tag: glue
2
+ task: qqp
3
+ dataset_path: glue
4
+ dataset_name: qqp
5
+ output_type: multiple_choice
6
+ training_split: train
7
+ validation_split: validation
8
+ doc_to_text: "Question 1: {{question1}}\nQuestion 2: {{question2}}\nQuestion: Do both questions ask the same thing?\nAnswer:"
9
+ doc_to_target: label
10
+ doc_to_choice: ["no", "yes"]
11
+ metric_list:
12
+ - metric: acc
13
+ - metric: f1
14
+ metadata:
15
+ version: 2.0