koichi12 commited on
Commit
f69a342
·
verified ·
1 Parent(s): b3ca754

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. scripts/yans/lm-evaluation-harness/lm_eval/tasks/__pycache__/__init__.cpython-310.pyc +0 -0
  2. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/README.md +101 -0
  3. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english.yaml +21 -0
  4. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_age.yaml +4 -0
  5. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_autre.yaml +4 -0
  6. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_disability.yaml +4 -0
  7. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_gender.yaml +4 -0
  8. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_nationality.yaml +4 -0
  9. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_physical_appearance.yaml +4 -0
  10. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_race_color.yaml +4 -0
  11. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_religion.yaml +4 -0
  12. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_sexual_orientation.yaml +4 -0
  13. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_socioeconomic.yaml +4 -0
  14. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french.yaml +3 -0
  15. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_age.yaml +4 -0
  16. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_autre.yaml +4 -0
  17. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_disability.yaml +4 -0
  18. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_gender.yaml +4 -0
  19. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_nationality.yaml +4 -0
  20. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_physical_appearance.yaml +4 -0
  21. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_race_color.yaml +4 -0
  22. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_religion.yaml +4 -0
  23. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_sexual_orientation.yaml +4 -0
  24. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_socioeconomic.yaml +4 -0
  25. scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/utils.py +64 -0
  26. scripts/yans/lm-evaluation-harness/lm_eval/tasks/eus_proficiency/README.md +48 -0
  27. scripts/yans/lm-evaluation-harness/lm_eval/tasks/eus_proficiency/eus_proficiency.yaml +16 -0
  28. scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_boolqa.yaml +23 -0
  29. scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_fquadv2.yaml +29 -0
  30. scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_grammar.yaml +20 -0
  31. scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_hellaswag.yaml +20 -0
  32. scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_opus_perplexity.yaml +23 -0
  33. scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_orangesum_abstract.yaml +28 -0
  34. scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_topic_based_nli.yaml +23 -0
  35. scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_vocab.yaml +20 -0
  36. scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_wikitext_fr.yaml +25 -0
  37. scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_xnli.yaml +21 -0
  38. scripts/yans/lm-evaluation-harness/lm_eval/tasks/glianorex/README.md +20 -0
  39. scripts/yans/lm-evaluation-harness/lm_eval/tasks/glianorex/glianorex.yaml +14 -0
  40. scripts/yans/lm-evaluation-harness/lm_eval/tasks/glianorex/glianorex_en.yaml +15 -0
  41. scripts/yans/lm-evaluation-harness/lm_eval/tasks/glianorex/glianorex_fr.yaml +15 -0
  42. scripts/yans/lm-evaluation-harness/lm_eval/tasks/glianorex/preprocess_glianorex.py +23 -0
  43. scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/README.md +76 -0
  44. scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/cola/default.yaml +16 -0
  45. scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/mnli/default.yaml +14 -0
  46. scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/mnli/mismatch.yaml +3 -0
  47. scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/mnli/utils.py +6 -0
  48. scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/mrpc/default.yaml +15 -0
  49. scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/qnli/default.yaml +14 -0
  50. scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/qqp/default.yaml +15 -0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (17.7 kB). View file
 
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/README.md ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CrowS-Pairs
2
+
3
+ ### Paper
4
+
5
+ CrowS-Pairs: A Challenge Dataset for Measuring Social Biases in Masked Language Models
6
+ https://aclanthology.org/2020.emnlp-main.154/
7
+ French CrowS-Pairs: Extending a challenge dataset for measuring social bias in masked
8
+ language models to a language other than English
9
+ https://aclanthology.org/2022.acl-long.583/
10
+
11
+ CrowS-Pairs is a challenge set for evaluating language models (LMs) on their tendency
12
+ to generate biased outputs. CrowS-Pairs comes in 2 languages and the English subset has
13
+ a newer version which fixes some of the issues with the original version.
14
+
15
+ Homepage: https://github.com/nyu-mll/crows-pairs, https://gitlab.inria.fr/french-crows-pairs
16
+
17
+ ### Citation
18
+
19
+ ```bibtex
20
+ @inproceedings{nangia-etal-2020-crows,
21
+ title = "{C}row{S}-Pairs: A Challenge Dataset for Measuring Social Biases in Masked Language Models",
22
+ author = "Nangia, Nikita and
23
+ Vania, Clara and
24
+ Bhalerao, Rasika and
25
+ Bowman, Samuel R.",
26
+ booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
27
+ month = nov,
28
+ year = "2020",
29
+ address = "Online",
30
+ publisher = "Association for Computational Linguistics",
31
+ url = "https://aclanthology.org/2020.emnlp-main.154",
32
+ doi = "10.18653/v1/2020.emnlp-main.154",
33
+ pages = "1953--1967",
34
+ abstract = "Pretrained language models, especially masked language models (MLMs) have seen success across many NLP tasks. However, there is ample evidence that they use the cultural biases that are undoubtedly present in the corpora they are trained on, implicitly creating harm with biased representations. To measure some forms of social bias in language models against protected demographic groups in the US, we introduce the Crowdsourced Stereotype Pairs benchmark (CrowS-Pairs). CrowS-Pairs has 1508 examples that cover stereotypes dealing with nine types of bias, like race, religion, and age. In CrowS-Pairs a model is presented with two sentences: one that is more stereotyping and another that is less stereotyping. The data focuses on stereotypes about historically disadvantaged groups and contrasts them with advantaged groups. We find that all three of the widely-used MLMs we evaluate substantially favor sentences that express stereotypes in every category in CrowS-Pairs. As work on building less biased models advances, this dataset can be used as a benchmark to evaluate progress.",
35
+ }
36
+
37
+ @inproceedings{neveol-etal-2022-french,
38
+ title = "{F}rench {C}row{S}-Pairs: Extending a challenge dataset for measuring social bias in masked language models to a language other than {E}nglish",
39
+ author = {N{\'e}v{\'e}ol, Aur{\'e}lie and
40
+ Dupont, Yoann and
41
+ Bezan{\c{c}}on, Julien and
42
+ Fort, Kar{\"e}n},
43
+ booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
44
+ month = may,
45
+ year = "2022",
46
+ address = "Dublin, Ireland",
47
+ publisher = "Association for Computational Linguistics",
48
+ url = "https://aclanthology.org/2022.acl-long.583",
49
+ doi = "10.18653/v1/2022.acl-long.583",
50
+ pages = "8521--8531",
51
+ abstract = "Warning: This paper contains explicit statements of offensive stereotypes which may be upsetting.Much work on biases in natural language processing has addressed biases linked to the social and cultural experience of English speaking individuals in the United States. We seek to widen the scope of bias studies by creating material to measure social bias in language models (LMs) against specific demographic groups in France. We build on the US-centered CrowS-pairs dataset to create a multilingual stereotypes dataset that allows for comparability across languages while also characterizing biases that are specific to each country and language. We introduce 1,679 sentence pairs in French that cover stereotypes in ten types of bias like gender and age. 1,467 sentence pairs are translated from CrowS-pairs and 212 are newly crowdsourced. The sentence pairs contrast stereotypes concerning underadvantaged groups with the same sentence concerning advantaged groups. We find that four widely used language models (three French, one multilingual) favor sentences that express stereotypes in most bias categories. We report on the translation process from English into French, which led to a characterization of stereotypes in CrowS-pairs including the identification of US-centric cultural traits. We offer guidelines to further extend the dataset to other languages and cultural environments.",
52
+ }
53
+ ```
54
+
55
+ ### Groups and Tasks
56
+
57
+ #### Groups
58
+
59
+ - `crows_pairs_english`: The entire English subset of the CrowS-Pairs dataset.
60
+ - `crows_pairs_french`: The entire French subset of the CrowS-Pairs dataset.
61
+
62
+ #### Tasks
63
+
64
+
65
+ The following tasks evaluate sub-areas of bias in the English CrowS-Pairs dataset:
66
+ - `crows_pairs_english_age`
67
+ - `crows_pairs_english_autre`
68
+ - `crows_pairs_english_disability`
69
+ - `crows_pairs_english_gender`
70
+ - `crows_pairs_english_nationality`
71
+ - `crows_pairs_english_physical_appearance`
72
+ - `crows_pairs_english_race_color`
73
+ - `crows_pairs_english_religion`
74
+ - `crows_pairs_english_sexual_orientation`
75
+ - `crows_pairs_english_socioeconomic`
76
+
77
+ The following tasks evaluate sub-areas of bias in the French CrowS-Pairs dataset:
78
+ - `crows_pairs_french_age`
79
+ - `crows_pairs_french_autre`
80
+ - `crows_pairs_french_disability`
81
+ - `crows_pairs_french_gender`
82
+ - `crows_pairs_french_nationality`
83
+ - `crows_pairs_french_physical_appearance`
84
+ - `crows_pairs_french_race_color`
85
+ - `crows_pairs_french_religion`
86
+ - `crows_pairs_french_sexual_orientation`
87
+ - `crows_pairs_french_socioeconomic`
88
+
89
+ All tasks evaluate the percentage of more-stereotypical sentences that are rated as more likely by a model than the non-stereotypical sentences (`pct_stereotype`), as well as the average absolute difference of loglikelihoods between the sentences in the pairs.
90
+
91
+ ### Checklist
92
+
93
+ * [x] Is the task an existing benchmark in the literature?
94
+ * [x] Have you referenced the original paper that introduced the task?
95
+ * [x] If yes, does the original paper provide a reference implementation?
96
+ * [x] The original paper does not provide one for causal language models, so this is a novel formulation of the task for autoregressive LMs.
97
+
98
+ If other tasks on this dataset are already supported:
99
+ * [x] Is the "Main" variant of this task clearly denoted?
100
+ * [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
101
+ * [x] Have you noted which, if any, published evaluation setups are matched by this variant?
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tag:
2
+ - crows_pairs
3
+ task: crows_pairs_english
4
+ dataset_path: BigScienceBiasEval/crows_pairs_multilingual
5
+ dataset_name: english
6
+ test_split: test
7
+ output_type: multiple_choice
8
+ doc_to_text: ""
9
+ doc_to_target: 0
10
+ doc_to_choice: !function utils.doc_to_choice
11
+ target_delimiter: ""
12
+ process_results: !function utils.process_results
13
+ metric_list:
14
+ - metric: likelihood_diff
15
+ aggregation: mean
16
+ higher_is_better: false
17
+ - metric: pct_stereotype
18
+ aggregation: mean
19
+ higher_is_better: false
20
+ metadata:
21
+ version: 1.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_age.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_english_age
3
+ dataset_name: english
4
+ process_docs: !function utils.filter_age
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_autre.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_english_autre
3
+ dataset_name: english
4
+ process_docs: !function utils.filter_autre
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_disability.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_english_disability
3
+ dataset_name: english
4
+ process_docs: !function utils.filter_disability
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_gender.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_english_gender
3
+ dataset_name: english
4
+ process_docs: !function utils.filter_gender
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_nationality.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_english_nationality
3
+ dataset_name: english
4
+ process_docs: !function utils.filter_nationality
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_physical_appearance.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_english_physical_appearance
3
+ dataset_name: english
4
+ process_docs: !function utils.filter_appearance
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_race_color.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_english_race_color
3
+ dataset_name: english
4
+ process_docs: !function utils.filter_race_color
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_religion.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_english_religion
3
+ dataset_name: english
4
+ process_docs: !function utils.filter_religion
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_sexual_orientation.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_english_sexual_orientation
3
+ dataset_name: english
4
+ process_docs: !function utils.filter_orientation
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_english_socioeconomic.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_english_socioeconomic
3
+ dataset_name: english
4
+ process_docs: !function utils.filter_socio
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french.yaml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_french
3
+ dataset_name: french
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_age.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_french_age
3
+ dataset_name: french
4
+ process_docs: !function utils.filter_age
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_autre.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_french_autre
3
+ dataset_name: french
4
+ process_docs: !function utils.filter_autre
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_disability.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_french_disability
3
+ dataset_name: french
4
+ process_docs: !function utils.filter_disability
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_gender.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_french_gender
3
+ dataset_name: french
4
+ process_docs: !function utils.filter_gender
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_nationality.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_french_nationality
3
+ dataset_name: french
4
+ process_docs: !function utils.filter_nationality
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_physical_appearance.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_french_physical_appearance
3
+ dataset_name: french
4
+ process_docs: !function utils.filter_appearance
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_race_color.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_french_race_color
3
+ dataset_name: french
4
+ process_docs: !function utils.filter_race_color
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_religion.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_french_religion
3
+ dataset_name: french
4
+ process_docs: !function utils.filter_religion
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_sexual_orientation.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_french_sexual_orientation
3
+ dataset_name: french
4
+ process_docs: !function utils.filter_orientation
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_socioeconomic.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include: crows_pairs_english.yaml
2
+ task: crows_pairs_french_socioeconomic
3
+ dataset_name: french
4
+ process_docs: !function utils.filter_socio
scripts/yans/lm-evaluation-harness/lm_eval/tasks/crows_pairs/utils.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datasets
2
+
3
+
4
def process_results(doc, results):
    """Score one CrowS-Pairs sentence pair from its loglikelihood results.

    ``results`` holds ``(loglikelihood, is_greedy)`` tuples for the
    more-stereotypical and less-stereotypical sentences, in that order
    (the order produced by ``doc_to_choice``).

    Returns a dict with:
      - ``likelihood_diff``: absolute gap between the two loglikelihoods.
      - ``pct_stereotype``: 1.0 when the stereotypical sentence is scored
        strictly higher (the model "prefers" the stereotype), else 0.0.
    """
    loglikelihoods = [score for score, _ in results]
    ll_stereo, ll_nonstereo = loglikelihoods

    # Magnitude of the model's preference, regardless of direction.
    likelihood_diff = abs(ll_stereo - ll_nonstereo)

    # Strictly-higher loglikelihood for the stereotypical sentence counts
    # as predicting the stereotype; ties count as non-stereotypical.
    stereotype_preferred = 1.0 if ll_stereo > ll_nonstereo else 0.0

    return {"likelihood_diff": likelihood_diff, "pct_stereotype": stereotype_preferred}
17
+
18
+
19
def doc_to_choice(doc):
    """Return the two candidate sentences, stereotypical one first.

    The ordering contract matters: ``process_results`` assumes index 0 is
    the more-stereotypical sentence.
    """
    stereo = doc["sent_more"]
    nonstereo = doc["sent_less"]
    return [stereo, nonstereo]
21
+
22
+
23
def filter_dataset(dataset: datasets.Dataset, bias_type: str) -> datasets.Dataset:
    """Keep only examples whose ``bias_type`` field starts with *bias_type*.

    Prefix matching (rather than equality) lets one filter cover dataset
    variants that append sub-labels to the category name.
    """

    def _matches(example):
        return example["bias_type"].startswith(bias_type)

    return dataset.filter(_matches)
25
+
26
+
27
# Per-category subsets of CrowS-Pairs, each a thin wrapper over
# ``filter_dataset`` so YAML configs can reference them via
# ``!function utils.filter_<category>``.


def filter_race_color(dataset: datasets.Dataset) -> datasets.Dataset:
    """Subset: race / skin-color bias examples."""
    return filter_dataset(dataset, "race-color")


def filter_socio(dataset: datasets.Dataset) -> datasets.Dataset:
    """Subset: socioeconomic-status bias examples."""
    return filter_dataset(dataset, "socioeconomic")


def filter_gender(dataset: datasets.Dataset) -> datasets.Dataset:
    """Subset: gender bias examples."""
    return filter_dataset(dataset, "gender")


def filter_age(dataset: datasets.Dataset) -> datasets.Dataset:
    """Subset: age bias examples."""
    return filter_dataset(dataset, "age")


def filter_religion(dataset: datasets.Dataset) -> datasets.Dataset:
    """Subset: religion bias examples."""
    return filter_dataset(dataset, "religion")


def filter_disability(dataset: datasets.Dataset) -> datasets.Dataset:
    """Subset: disability bias examples."""
    return filter_dataset(dataset, "disability")


def filter_orientation(dataset: datasets.Dataset) -> datasets.Dataset:
    """Subset: sexual-orientation bias examples."""
    return filter_dataset(dataset, "sexual-orientation")


def filter_nationality(dataset: datasets.Dataset) -> datasets.Dataset:
    """Subset: nationality bias examples."""
    return filter_dataset(dataset, "nationality")


def filter_appearance(dataset: datasets.Dataset) -> datasets.Dataset:
    """Subset: physical-appearance bias examples."""
    return filter_dataset(dataset, "physical-appearance")


def filter_autre(dataset: datasets.Dataset) -> datasets.Dataset:
    """Subset: 'autre' (other) bias examples, from the French extension."""
    return filter_dataset(dataset, "autre")
scripts/yans/lm-evaluation-harness/lm_eval/tasks/eus_proficiency/README.md ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # EusProficiency
2
+
3
+ ### Paper
4
+
5
+ Title: Latxa: An Open Language Model and Evaluation Suite for Basque
6
+
7
+ Abstract: https://arxiv.org/abs/2403.20266
8
+
9
+ EusProficiency comprises 5,169 exercises on different topics from past EGA exams, the official C1-level certificate of proficiency in Basque. We collected the atarikoa exercises from EGA exams through the years 1998 to 2008. Atarikoa is the first qualifying test of EGA, which measures different aspects of language competency, such as reading comprehension, grammar, vocabulary, spelling, and writing. Each test generally has 85 multiple-choice questions, with 4 choices and a single correct answer.
10
+
11
+ Homepage: https://github.com/hitz-zentroa/latxa
12
+
13
+
14
+ ### Citation
15
+
16
+ ```
17
+ @misc{etxaniz2024latxa,
18
+ title={Latxa: An Open Language Model and Evaluation Suite for Basque},
19
+ author={Julen Etxaniz and Oscar Sainz and Naiara Perez and Itziar Aldabe and German Rigau and Eneko Agirre and Aitor Ormazabal and Mikel Artetxe and Aitor Soroa},
20
+ year={2024},
21
+ eprint={2403.20266},
22
+ archivePrefix={arXiv},
23
+ primaryClass={cs.CL}
24
+ }
25
+ ```
26
+
27
+ ### Groups and Tasks
28
+
29
+ #### Groups
30
+
31
+ There are no groups.
32
+
33
+ #### Tasks
34
+
35
+ * `eus_proficiency`: EusProficiency comprises 5,169 exercises on different topics from past EGA exams, the official C1-level certificate of proficiency in Basque.
36
+
37
+ ### Checklist
38
+
39
+ For adding novel benchmarks/datasets to the library:
40
+ * [ ] Is the task an existing benchmark in the literature?
41
+ * [ ] Have you referenced the original paper that introduced the task?
42
+ * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
43
+
44
+
45
+ If other tasks on this dataset are already supported:
46
+ * [ ] Is the "Main" variant of this task clearly denoted?
47
+ * [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
48
+ * [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
scripts/yans/lm-evaluation-harness/lm_eval/tasks/eus_proficiency/eus_proficiency.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_path: HiTZ/EusProficiency
2
+ dataset_name: default
3
+ task: eus_proficiency
4
+ doc_to_text: "Galdera: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nD: {{candidates[3]}}\nErantzuna:"
5
+ doc_to_choice: ["A", "B", "C", "D"]
6
+ validation_split: null
7
+ test_split: test
8
+ fewshot_split: test
9
+ output_type: multiple_choice
10
+ doc_to_target: answer
11
+ metric_list:
12
+ - metric: acc
13
+ aggregation: mean
14
+ higher_is_better: true
15
+ metadata:
16
+ version: 0.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_boolqa.yaml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ include: "_default_template_yaml"
2
+ tag:
3
+ - french_bench
4
+ - french_bench_extra
5
+ description: "D'après l'information dans le contexte donné, quelle est la réponse à la question ?"
6
+ task: french_bench_boolqa
7
+ dataset_path: manu/french_boolq
8
+ output_type: multiple_choice
9
+ validation_split: valid
10
+ doc_to_text: "\nContexte: {{passage}}\n\nQuestion: {{question}}\n"
11
+ doc_to_choice: ["Oui", "Non"]
12
+ # doc_to_text: "\nContexte: {{passage}}\n\nQuestion: {{question}}\n\nD'après l'information dans le contexte, la réponse est:\nA. Oui \nB. Non\n\nRéponse:"
13
+ # doc_to_choice: ["A", "B"]
14
+ doc_to_target: "{{[1, 0].index(label)}}"
15
+ should_decontaminate: true
16
+ doc_to_decontamination_query: passage
17
+ metric_list:
18
+ - metric: acc
19
+ aggregation: mean
20
+ higher_is_better: true
21
+ - metric: acc_norm
22
+ aggregation: mean
23
+ higher_is_better: true
scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_fquadv2.yaml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ include: "_default_template_yaml"
2
+ tag:
3
+ - french_bench
4
+ - french_bench_extra
5
+ description: "D'après l'information dans le contexte donné, donne la réponse à la question en citant quelques mots du contexte. Si il est impossible de répondre avec les informations du contexte, répond 'Impossible'."
6
+ task: french_bench_fquadv2
7
+ dataset_path: manu/fquad2_test
8
+ output_type: generate_until
9
+ validation_split: valid
10
+ doc_to_text: "\nContexte: {{context}}\n\nQuestion: {{question}}\n\nRéponse:"
11
+ doc_to_target: "{% if answers.text| length > 0 %}{{answers.text[0]}}{% else %}{{['Impossible']}}{% endif %}"
12
+ target_delimiter: " "
13
+ should_decontaminate: true
14
+ doc_to_decontamination_query: context
15
+ generation_kwargs:
16
+ until:
17
+ - "\n"
18
+ # filter_list:
19
+ # - name: remove_whitespace
20
+ # filter:
21
+ # - function: remove_whitespace
22
+ # - function: take_first
23
+ metric_list:
24
+ - metric: !function utils.exact
25
+ aggregation: mean
26
+ higher_is_better: true
27
+ - metric: !function utils.f1
28
+ aggregation: mean
29
+ higher_is_better: true
scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_grammar.yaml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ include: "_default_template_yaml"
2
+ tag:
3
+ - french_bench
4
+ - french_bench_mc
5
+ description: "Répond au mieux en complétant la question avec une des réponses proposées."
6
+ dataset_path: manu/french-bench-grammar-vocab-reading
7
+ output_type: multiple_choice
8
+ validation_split: Grammar
9
+ fewshot_split: Grammar
10
+ test_split: Grammar
11
+ #doc_to_text: "Question: {{question.strip()}}\nA: {{answerA}}\nB: {{answerB}}\nC: {{answerC}}\nD: {{answerD}}\nRéponse:"
12
+ #doc_to_choice: ["A", "B", "C", "D"]
13
+ doc_to_text: "La phrase suivante est correcte grammaticalement:\n"
14
+ doc_to_choice: "{{[question.replace('<...>', answerA), question.replace('<...>', answerB), question.replace('<...>', answerC), question.replace('<...>', answerD)]}}"
15
+ doc_to_target: '{{["answerA", "answerB", "answerC", "answerD"].index("answer" + answer)}}'
16
+ task: french_bench_grammar
17
+ metric_list:
18
+ - metric: acc
19
+ aggregation: mean
20
+ higher_is_better: true
scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_hellaswag.yaml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tag:
2
+ - french_bench
3
+ - french_bench_mc
4
+ task: french_bench_hellaswag
5
+ dataset_path: manu/french_bench_hellaswag
6
+ output_type: multiple_choice
7
+ training_split: validation
8
+ validation_split: validation
9
+ test_split: null
10
+ process_docs: !function utils.process_docs
11
+ doc_to_text: "{{query}}"
12
+ doc_to_target: "{{label}}"
13
+ doc_to_choice: "{{choices}}"
14
+ metric_list:
15
+ - metric: acc
16
+ aggregation: mean
17
+ higher_is_better: true
18
+ - metric: acc_norm
19
+ aggregation: mean
20
+ higher_is_better: true
scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_opus_perplexity.yaml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tag:
2
+ - french_bench_perplexity
3
+ task: french_bench_opus_perplexity
4
+ dataset_path: manu/opus100-en-fr
5
+ output_type: loglikelihood_rolling
6
+ test_split: test
7
+ fewshot_split: validation
8
+ validation_split: validation
9
+ num_fewshot: 0
10
+ doc_to_text: ""
11
+ doc_to_target: "{{text}}"
12
+ should_decontaminate: true
13
+ doc_to_decontamination_query: "{{text}}"
14
+ metric_list:
15
+ - metric: word_perplexity
16
+ aggregation: weighted_perplexity
17
+ higher_is_better: false
18
+ - metric: byte_perplexity
19
+ aggregation: weighted_perplexity
20
+ higher_is_better: false
21
+ - metric: bits_per_byte
22
+ aggregation: bits_per_byte
23
+ higher_is_better: false
scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_orangesum_abstract.yaml ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ include: "_default_template_yaml"
2
+ tag:
3
+ - french_bench
4
+ - french_bench_gen
5
+ description: "Résume l'article en une phrase."
6
+ task: french_bench_orangesum_abstract
7
+ dataset_path: orange_sum
8
+ dataset_name: abstract
9
+ output_type: generate_until
10
+ validation_split: validation
11
+ fewshot_split: validation
12
+ doc_to_text: "\nArticle: {{text}}\n\nRésumé:"
13
+ doc_to_target: "{{summary}}"
14
+ target_delimiter: " "
15
+ should_decontaminate: true
16
+ doc_to_decontamination_query: summary
17
+ generation_kwargs:
18
+ until:
19
+ - "\n"
20
+ # filter_list:
21
+ # - name: remove_whitespace
22
+ # filter:
23
+ # - function: remove_whitespace
24
+ # - function: take_first
25
+ metric_list:
26
+ - metric: !function utils.rouge1
27
+ higher_is_better: true
28
+ aggregation: !function utils.rouge1_agg
scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_topic_based_nli.yaml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ include: "_default_template_yaml"
2
+ tag:
3
+ - french_bench
4
+ - french_bench_extra
5
+ description: "A propos du thème spécifié, l'avis client est il positif, négatif, ou neutre ?"
6
+ task: french_bench_topic_based_nli
7
+ dataset_path: manu/topic_based_nli_test
8
+ output_type: multiple_choice
9
+ validation_split: valid
10
+ # doc_to_text: "\nAvis Client: {{text}}\n\nEn considèrant uniquement le thème \"{{topic}}\", l'avis client est plutot:\nA. Positif \nB. Négatif\nC. Mitigé \nD. Neutre\nE. Absent\n\nRéponse:"
11
+ # doc_to_choice: ["A", "B", "C", "D", "E"]
12
+ doc_to_text: "\nAvis Client: {{text}}\n\nA propos du thème \"{{topic}}\", l'avis client est"
13
+ doc_to_choice: ['positif', 'négatif', 'neutre']
14
+ doc_to_target: "{{['positif', 'negatif', 'neutre'].index(polarity)}}"
15
+ should_decontaminate: true
16
+ doc_to_decontamination_query: texte
17
+ metric_list:
18
+ - metric: acc
19
+ aggregation: mean
20
+ higher_is_better: true
21
+ - metric: acc_norm
22
+ aggregation: mean
23
+ higher_is_better: true
scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_vocab.yaml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ include: "_default_template_yaml"
2
+ tag:
3
+ - french_bench
4
+ - french_bench_mc
5
+ # description: "Répond au mieux en complétant la question avec une des réponses proposées."
6
+ dataset_path: manu/french-bench-grammar-vocab-reading
7
+ output_type: multiple_choice
8
+ validation_split: Vocabulary
9
+ fewshot_split: Vocabulary
10
+ test_split: Vocabulary
11
+ # doc_to_text: "Question: {{question.strip()}}\nA: {{answerA}}\nB: {{answerB}}\nC: {{answerC}}\nD: {{answerD}}\nRéponse:"
12
+ # doc_to_choice: ["A", "B", "C", "D"]
13
+ doc_to_text: "La phrase suivante est logique sémantiquement:\n"
14
+ doc_to_choice: "{{[question.replace('<...>', answerA), question.replace('<...>', answerB), question.replace('<...>', answerC), question.replace('<...>', answerD)]}}"
15
+ doc_to_target: '{{["answerA", "answerB", "answerC", "answerD"].index("answer" + answer)}}'
16
+ task: french_bench_vocab
17
+ metric_list:
18
+ - metric: acc
19
+ aggregation: mean
20
+ higher_is_better: true
scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_wikitext_fr.yaml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tag:
2
+ - french_bench_perplexity
3
+ task: french_bench_wikitext_fr
4
+ dataset_path: asi/wikitext_fr
5
+ dataset_name: wikitext-35
6
+ output_type: loglikelihood_rolling
7
+ training_split: train
8
+ validation_split: validation
9
+ test_split: test
10
+ num_fewshot: 0
11
+ doc_to_text: ""
12
+ doc_to_target: !function preprocess_wikitext.wikitext_detokenizer
13
+ process_results: !function preprocess_wikitext.process_results
14
+ should_decontaminate: true
15
+ doc_to_decontamination_query: "{{paragraph}}"
16
+ metric_list:
17
+ - metric: word_perplexity
18
+ aggregation: weighted_perplexity
19
+ higher_is_better: false
20
+ - metric: byte_perplexity
21
+ aggregation: weighted_perplexity
22
+ higher_is_better: false
23
+ - metric: bits_per_byte
24
+ aggregation: bits_per_byte
25
+ higher_is_better: false
scripts/yans/lm-evaluation-harness/lm_eval/tasks/french_bench/french_bench_xnli.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ include: "_default_template_yaml"
2
+ tag:
3
+ - french_bench
4
+ - french_bench_extra
5
+ description: "La prémisse et l'hypothèse sont elles en accord, neutres en elles, ou en contradiction ?"
6
+ dataset_path: xnli
7
+ dataset_name: fr
8
+ output_type: multiple_choice
9
+ validation_split: validation
10
+ fewshot_split: validation
11
+ test_split: test
12
+ # doc_to_text: "\nPrémisse: {{premise}}\n\nHypothèse: {{hypothesis}}\n\nLa prémisse et l'hypothèse sont:\nA. En accord\nB. Neutre\nC. En contradiction\nRéponse:"
13
+ # doc_to_choice: "{{['A: En accord', 'B: Neutre', 'C: En contradiction']}}"
14
+ doc_to_text: "\nPrémisse: {{premise}}\n\nHypothèse: {{hypothesis}}\n\nLa prémisse et l'hypothèse sont"
15
+ doc_to_choice: "{{['en accord', 'neutres entre elles', 'en contradiction']}}"
16
+ doc_to_target: label
17
+ task: french_bench_xnli
18
+ metric_list:
19
+ - metric: acc
20
+ aggregation: mean
21
+ higher_is_better: true
scripts/yans/lm-evaluation-harness/lm_eval/tasks/glianorex/README.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Glianorex
2
+
3
+ The goal of this benchmark is to isolate a model's test-answering capabilities from its content knowledge.
4
+
5
+ ### Paper
6
+
7
+ Title: Multiple Choice Questions and Large Language Models: A Case Study with Fictional Medical Data
8
+
9
+ Abstract: https://arxiv.org/abs/2406.02394
10
+
11
+ To test the relevance of MCQs to assess LLM performance without prior data exposure, we created a fictional medical benchmark and knowledge base on a non-existent gland, the Glianorex. Using GPT-4 we generated a comprehensive textbook on the Glianorex in both English and French, and created multiple-choice questions in both English and French.
12
+
13
+ ### Tasks
14
+
15
+ All tasks are multiple-choice questions with 4 options, of which only one is correct.
16
+
17
+ - `glianorex`: Evaluates all tasks listed below.
18
+
19
+ - `glianorex_en`: Evaluates the accuracy on 264 questions in English.
20
+ - `glianorex_fr`: Evaluates the accuracy on 264 questions in French.
scripts/yans/lm-evaluation-harness/lm_eval/tasks/glianorex/glianorex.yaml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: glianorex
2
+ dataset_path: maximegmd/glianorex
3
+ output_type: multiple_choice
4
+ test_split: train
5
+ doc_to_text: !function preprocess_glianorex.doc_to_text
6
+ doc_to_target: !function preprocess_glianorex.doc_to_target
7
+ doc_to_choice: [ 'A', 'B', 'C', 'D' ]
8
+ metric_list:
9
+ - metric: acc
10
+ aggregation: mean
11
+ higher_is_better: true
12
+ - metric: acc_norm
13
+ aggregation: mean
14
+ higher_is_better: true
scripts/yans/lm-evaluation-harness/lm_eval/tasks/glianorex/glianorex_en.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: glianorex_en
2
+ dataset_path: maximegmd/glianorex
3
+ output_type: multiple_choice
4
+ test_split: train
5
+ doc_to_text: !function preprocess_glianorex.doc_to_text
6
+ doc_to_target: !function preprocess_glianorex.doc_to_target
7
+ process_docs: !function preprocess_glianorex.filter_english
8
+ doc_to_choice: [ 'A', 'B', 'C', 'D' ]
9
+ metric_list:
10
+ - metric: acc
11
+ aggregation: mean
12
+ higher_is_better: true
13
+ - metric: acc_norm
14
+ aggregation: mean
15
+ higher_is_better: true
scripts/yans/lm-evaluation-harness/lm_eval/tasks/glianorex/glianorex_fr.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: glianorex_fr
2
+ dataset_path: maximegmd/glianorex
3
+ output_type: multiple_choice
4
+ test_split: train
5
+ doc_to_text: !function preprocess_glianorex.doc_to_text
6
+ doc_to_target: !function preprocess_glianorex.doc_to_target
7
+ process_docs: !function preprocess_glianorex.filter_french
8
+ doc_to_choice: [ 'A', 'B', 'C', 'D' ]
9
+ metric_list:
10
+ - metric: acc
11
+ aggregation: mean
12
+ higher_is_better: true
13
+ - metric: acc_norm
14
+ aggregation: mean
15
+ higher_is_better: true
scripts/yans/lm-evaluation-harness/lm_eval/tasks/glianorex/preprocess_glianorex.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datasets
2
+
3
+
4
+ def doc_to_text(doc) -> str:
5
+ option_choices = doc["options"]
6
+ answers = "".join((f"{k}. {v}\n") for k, v in option_choices.items())
7
+ return f"Question: {doc['question']}\n{answers}Answer:"
8
+
9
+
10
+ def doc_to_target(doc) -> int:
11
+ return doc["answer_idx"]
12
+
13
+
14
+ def filter_dataset(dataset: datasets.Dataset, lang: str) -> datasets.Dataset:
15
+ return dataset.filter(lambda example: example["language"].startswith(lang))
16
+
17
+
18
+ def filter_french(dataset: datasets.Dataset) -> datasets.Dataset:
19
+ return filter_dataset(dataset, "fr")
20
+
21
+
22
+ def filter_english(dataset: datasets.Dataset) -> datasets.Dataset:
23
+ return filter_dataset(dataset, "en")
scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/README.md ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GLUE
2
+ **NOTE**: GLUE benchmark tasks do not provide publicly accessible labels for their test sets, so we default to the validation sets for all sub-tasks.
3
+
4
+ ### Paper
5
+
6
+ Title: `GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding`
7
+
8
+ Abstract: https://openreview.net/pdf?id=rJ4km2R5t7
9
+
10
+ The General Language Understanding Evaluation (GLUE) benchmark is a collection of
11
+ resources for training, evaluating, and analyzing natural language understanding
12
+ systems. GLUE consists of:
13
+ - A benchmark of nine sentence- or sentence-pair language understanding tasks built
14
+ on established existing datasets and selected to cover a diverse range of dataset
15
+ sizes, text genres, and degrees of difficulty, and
16
+ - A diagnostic dataset designed to evaluate and analyze model performance with
17
+ respect to a wide range of linguistic phenomena found in natural language.
18
+
19
+ Homepage: https://gluebenchmark.com/
20
+
21
+ ### Citation
22
+
23
+ ```
24
+ @inproceedings{wang-etal-2018-glue,
25
+ title = "{GLUE}: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding",
26
+ author = "Wang, Alex and
27
+ Singh, Amanpreet and
28
+ Michael, Julian and
29
+ Hill, Felix and
30
+ Levy, Omer and
31
+ Bowman, Samuel",
32
+ booktitle = "Proceedings of the 2018 {EMNLP} Workshop {B}lackbox{NLP}: Analyzing and Interpreting Neural Networks for {NLP}",
33
+ month = nov,
34
+ year = "2018",
35
+ address = "Brussels, Belgium",
36
+ publisher = "Association for Computational Linguistics",
37
+ url = "https://aclanthology.org/W18-5446",
38
+ doi = "10.18653/v1/W18-5446",
39
+ pages = "353--355",
40
+ abstract = "Human ability to understand language is \textit{general, flexible, and robust}. In contrast, most NLU models above the word level are designed for a specific task and struggle with out-of-domain data. If we aspire to develop models with understanding beyond the detection of superficial correspondences between inputs and outputs, then it is critical to develop a unified model that can execute a range of linguistic tasks across different domains. To facilitate research in this direction, we present the General Language Understanding Evaluation (GLUE, gluebenchmark.com): a benchmark of nine diverse NLU tasks, an auxiliary dataset for probing models for understanding of specific linguistic phenomena, and an online platform for evaluating and comparing models. For some benchmark tasks, training data is plentiful, but for others it is limited or does not match the genre of the test set. GLUE thus favors models that can represent linguistic knowledge in a way that facilitates sample-efficient learning and effective knowledge-transfer across tasks. While none of the datasets in GLUE were created from scratch for the benchmark, four of them feature privately-held test data, which is used to ensure that the benchmark is used fairly. We evaluate baselines that use ELMo (Peters et al., 2018), a powerful transfer learning technique, as well as state-of-the-art sentence representation models. The best models still achieve fairly low absolute scores. Analysis with our diagnostic dataset yields similarly weak performance over all phenomena tested, with some exceptions.",
41
+ }
42
+ ```
43
+
44
+ ### Groups, Tags, and Tasks
45
+
46
+ #### Groups
47
+
48
+ None.
49
+
50
+ #### Tags
51
+
52
+ * `glue`: Run all Glue subtasks.
53
+
54
+ #### Tasks
55
+
56
+ * `cola`
57
+ * `mnli`
58
+ * `mrpc`
59
+ * `qnli`
60
+ * `qqp`
61
+ * `rte`
62
+ * `sst`
63
+ * `wnli`
64
+
65
+ ### Checklist
66
+
67
+ For adding novel benchmarks/datasets to the library:
68
+ * [ ] Is the task an existing benchmark in the literature?
69
+ * [ ] Have you referenced the original paper that introduced the task?
70
+ * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
71
+
72
+
73
+ If other tasks on this dataset are already supported:
74
+ * [ ] Is the "Main" variant of this task clearly denoted?
75
+ * [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
76
+ * [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/cola/default.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tag: glue
2
+ task: cola
3
+ dataset_path: glue
4
+ dataset_name: cola
5
+ output_type: multiple_choice
6
+ training_split: train
7
+ validation_split: validation
8
+ doc_to_text: "{{sentence}}\nQuestion: Does this sentence make sense?\nAnswer:"
9
+ doc_to_target: label
10
+ doc_to_choice: ["no", "yes"]
11
+ should_decontaminate: true
12
+ doc_to_decontamination_query: sentence
13
+ metric_list:
14
+ - metric: mcc
15
+ metadata:
16
+ version: 1.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/mnli/default.yaml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tag: glue
2
+ task: mnli
3
+ dataset_path: glue
4
+ dataset_name: mnli
5
+ output_type: multiple_choice
6
+ training_split: train
7
+ validation_split: validation_matched
8
+ doc_to_text: !function utils.doc_to_text
9
+ doc_to_target: label
10
+ doc_to_choice: ["True", "Neither", "False"]
11
+ metric_list:
12
+ - metric: acc
13
+ metadata:
14
+ version: 1.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/mnli/mismatch.yaml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ include: default.yaml
2
+ task: mnli_mismatch
3
+ validation_split: validation_mismatched
scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/mnli/utils.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ def doc_to_text(doc) -> str:
2
+ return "{}\nQuestion: {} True, False or Neither?\nAnswer:".format(
3
+ doc["premise"],
4
+ doc["hypothesis"].strip()
5
+ + ("" if doc["hypothesis"].strip().endswith(".") else "."),
6
+ )
scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/mrpc/default.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tag: glue
2
+ task: mrpc
3
+ dataset_path: glue
4
+ dataset_name: mrpc
5
+ output_type: multiple_choice
6
+ training_split: train
7
+ validation_split: validation
8
+ doc_to_text: "Sentence 1: {{sentence1}}\nSentence 2: {{sentence2}}\nQuestion: Do both sentences mean the same thing?\nAnswer:"
9
+ doc_to_target: label
10
+ doc_to_choice: ["no", "yes"]
11
+ metric_list:
12
+ - metric: acc
13
+ - metric: f1
14
+ metadata:
15
+ version: 1.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/qnli/default.yaml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tag: glue
2
+ task: qnli
3
+ dataset_path: glue
4
+ dataset_name: qnli
5
+ output_type: multiple_choice
6
+ training_split: train
7
+ validation_split: validation
8
+ doc_to_text: "{{question}}\n{{sentence}}\nQuestion: Does this response answer the question?\nAnswer:"
9
+ doc_to_target: label
10
+ doc_to_choice: ["yes", "no"]
11
+ metric_list:
12
+ - metric: acc
13
+ metadata:
14
+ version: 1.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/glue/qqp/default.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tag: glue
2
+ task: qqp
3
+ dataset_path: glue
4
+ dataset_name: qqp
5
+ output_type: multiple_choice
6
+ training_split: train
7
+ validation_split: validation
8
+ doc_to_text: "Question 1: {{question1}}\nQuestion 2: {{question2}}\nQuestion: Do both questions ask the same thing?\nAnswer:"
9
+ doc_to_target: label
10
+ doc_to_choice: ["no", "yes"]
11
+ metric_list:
12
+ - metric: acc
13
+ - metric: f1
14
+ metadata:
15
+ version: 2.0