diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/copa_ar/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/copa_ar/README.md new file mode 100644 index 0000000000000000000000000000000000000000..972acb9f7431d34c216bbd27fee35f7ca138dcf5 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/copa_ar/README.md @@ -0,0 +1,40 @@ +# Arabic COPA + +### Paper + +Original Title: `COPA` + + + +The Choice Of Plausible Alternatives (COPA) evaluation provides researchers with a tool for assessing progress in open-domain commonsense causal reasoning. + +[Homepage](https://people.ict.usc.edu/~gordon/copa.html) + +AlGhafa has translated this dataset to Arabic [AlGhafa](https://aclanthology.org/2023.arabicnlp-1.21.pdf) + +The link to the Arabic version of the dataset [COPA](https://gitlab.com/tiiuae/alghafa/-/tree/main/arabic-eval/copa_ar) + +### Citation + +### Groups and Tasks + +#### Groups + +* Not part of a group yet. + +#### Tasks + +* `copa_ar` + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? + * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [x] Is the "Main" variant of this task clearly denoted? +* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [x] Have you noted which, if any, published evaluation setups are matched by this variant? 
diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/copa_ar/copa_ar.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/copa_ar/copa_ar.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e35d1688babf0b5386f70f563fa923242540d0d5 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/copa_ar/copa_ar.yaml @@ -0,0 +1,21 @@ +task: copa_ar +dataset_path: Hennara/copa_ar +dataset_name: null +output_type: multiple_choice +training_split: null +validation_split: null +test_split: test +doc_to_text: "السؤال: {{query}}\nالجواب:" +doc_to_choice: "{{[sol1, sol2]}}" +doc_to_target: label +should_decontaminate: true +doc_to_decontamination_query: query +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/piqa_ar/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/piqa_ar/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e1b71e93da4c00104c38c03b9d4486966e8ad567 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/piqa_ar/README.md @@ -0,0 +1,43 @@ +# Arabic PIQA + +### Paper + +Original Title: `PIQA: Reasoning about Physical Commonsense in Natural Language` + +Original paper: [PIQA](https://arxiv.org/abs/1911.11641) + +Physical Interaction: Question Answering (PIQA) is a physical commonsense +reasoning and a corresponding benchmark dataset. PIQA was designed to investigate +the physical knowledge of existing models. To what extent are current approaches +actually learning about the world? 
+ +[Homepage](https://yonatanbisk.com/piqa) + +AlGhafa has translated this dataset to Arabic [AlGhafa](https://aclanthology.org/2023.arabicnlp-1.21.pdf) + +The link to the Arabic version of the dataset [PIQA](https://gitlab.com/tiiuae/alghafa/-/tree/main/arabic-eval/pica_ar) + +### Citation + +### Groups and Tasks + +#### Groups + +* Not part of a group yet. + +#### Tasks + +* `piqa_ar` + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? + * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [x] Is the "Main" variant of this task clearly denoted? +* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [x] Have you noted which, if any, published evaluation setups are matched by this variant? 
diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/piqa_ar/piqa_ar.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/piqa_ar/piqa_ar.yaml new file mode 100644 index 0000000000000000000000000000000000000000..19dfaee0c609f409d3bd6e37163054c2e80af37a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/piqa_ar/piqa_ar.yaml @@ -0,0 +1,21 @@ +task: piqa_ar +dataset_path: Hennara/pica_ar +dataset_name: null +output_type: multiple_choice +training_split: null +validation_split: null +test_split: test +doc_to_text: "السؤال: {{goal}}\nالجواب:" +doc_to_choice: "{{[sol1, sol2]}}" +doc_to_target: label +should_decontaminate: true +doc_to_decontamination_query: goal +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7690c205c45e0c425acb025940097f10ad181c73 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/README.md @@ -0,0 +1,48 @@ +# CMMLU + +### Paper + +CMMLU: Measuring massive multitask language understanding in Chinese +https://arxiv.org/abs/2306.09212 + +CMMLU is a comprehensive evaluation benchmark specifically designed to evaluate the knowledge and reasoning abilities of LLMs within the context of Chinese language and culture. +CMMLU covers a wide range of subjects, comprising 67 topics that span from elementary to advanced professional levels. 
+ +Homepage: https://github.com/haonan-li/CMMLU + +### Citation + +```bibtex +@misc{li2023cmmlu, + title={CMMLU: Measuring massive multitask language understanding in Chinese}, + author={Haonan Li and Yixuan Zhang and Fajri Koto and Yifei Yang and Hai Zhao and Yeyun Gong and Nan Duan and Timothy Baldwin}, + year={2023}, + eprint={2306.09212}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` + +### Groups and Tasks + +#### Groups + +- `cmmlu`: All 67 subjects of the CMMLU dataset, evaluated following the methodology in MMLU's original implementation. + +#### Tasks + + +The following tasks evaluate subjects in the CMMLU dataset using loglikelihood-based multiple-choice scoring: +- `cmmlu_{subject_english}` + +### Checklist + +* [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? + * [x] If yes, does the original paper provide a reference implementation? + * [x] Yes, original implementation contributed by author of the benchmark + +If other tasks on this dataset are already supported: +* [x] Is the "Main" variant of this task clearly denoted? +* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [x] Have you noted which, if any, published evaluation setups are matched by this variant? 
diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/_cmmlu.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/_cmmlu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4101b18ee4e65cddb3ee71f3e238894b8a667f81 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/_cmmlu.yaml @@ -0,0 +1,78 @@ +group: cmmlu +task: + - cmmlu_agronomy + - cmmlu_anatomy + - cmmlu_ancient_chinese + - cmmlu_arts + - cmmlu_astronomy + - cmmlu_business_ethics + - cmmlu_chinese_civil_service_exam + - cmmlu_chinese_driving_rule + - cmmlu_chinese_food_culture + - cmmlu_chinese_foreign_policy + - cmmlu_chinese_history + - cmmlu_chinese_literature + - cmmlu_chinese_teacher_qualification + - cmmlu_clinical_knowledge + - cmmlu_college_actuarial_science + - cmmlu_college_education + - cmmlu_college_engineering_hydrology + - cmmlu_college_law + - cmmlu_college_mathematics + - cmmlu_college_medical_statistics + - cmmlu_college_medicine + - cmmlu_computer_science + - cmmlu_computer_security + - cmmlu_conceptual_physics + - cmmlu_construction_project_management + - cmmlu_economics + - cmmlu_education + - cmmlu_electrical_engineering + - cmmlu_elementary_chinese + - cmmlu_elementary_commonsense + - cmmlu_elementary_information_and_technology + - cmmlu_elementary_mathematics + - cmmlu_ethnology + - cmmlu_food_science + - cmmlu_genetics + - cmmlu_global_facts + - cmmlu_high_school_biology + - cmmlu_high_school_chemistry + - cmmlu_high_school_geography + - cmmlu_high_school_mathematics + - cmmlu_high_school_physics + - cmmlu_high_school_politics + - cmmlu_human_sexuality + - cmmlu_international_law + - cmmlu_journalism + - cmmlu_jurisprudence + - cmmlu_legal_and_moral_basis + - cmmlu_logical + - cmmlu_machine_learning + - cmmlu_management + - cmmlu_marketing + - cmmlu_marxist_theory + - cmmlu_modern_chinese + - cmmlu_nutrition + - cmmlu_philosophy + - cmmlu_professional_accounting + - cmmlu_professional_law + - 
cmmlu_professional_medicine + - cmmlu_professional_psychology + - cmmlu_public_relations + - cmmlu_security_study + - cmmlu_sociology + - cmmlu_sports_science + - cmmlu_traditional_chinese_medicine + - cmmlu_virology + - cmmlu_world_history + - cmmlu_world_religions +aggregate_metric_list: + - aggregation: mean + metric: acc + weight_by_size: true + - aggregation: mean + metric: acc_norm + weight_by_size: true +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/_default_template_yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/_default_template_yaml new file mode 100644 index 0000000000000000000000000000000000000000..18bcd59c8ef7f8adf7139a70eee029517b44e257 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/_default_template_yaml @@ -0,0 +1,18 @@ +dataset_path: haonan-li/cmmlu +test_split: test +fewshot_split: dev +fewshot_config: + sampler: first_n +output_type: multiple_choice +doc_to_text: "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:" +doc_to_choice: ["A", "B", "C", "D"] +doc_to_target: "{{['A', 'B', 'C', 'D'].index(Answer)}}" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_arts.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_arts.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6007825cb9f3cd8c0af7e25c7de6d1c965f612a0 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_arts.yaml @@ -0,0 +1,4 @@ +"dataset_name": "arts" +"description": "以下是关于艺术学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_arts" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_actuarial_science.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_actuarial_science.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3543486b113bdc0a56ac96feadbbc1f3a8ed997b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_actuarial_science.yaml @@ -0,0 +1,4 @@ +"dataset_name": "college_actuarial_science" +"description": "以下是关于大学精算学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_college_actuarial_science" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_engineering_hydrology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_engineering_hydrology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d52288a4d96e3eee909a7f33c845ba2fa9590aba --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_engineering_hydrology.yaml @@ -0,0 +1,4 @@ +"dataset_name": "college_engineering_hydrology" +"description": "以下是关于大学工程水文学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": 
"cmmlu_college_engineering_hydrology" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_mathematics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_mathematics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7876a584e7e3c936d30c7e4ad81381ec7e535493 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_mathematics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "college_mathematics" +"description": "以下是关于大学数学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_college_mathematics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_medical_statistics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_medical_statistics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f4839bdcac6dc3ba2ee7b874a1700db1d760b49c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_medical_statistics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "college_medical_statistics" +"description": "以下是关于大学医学统计的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_college_medical_statistics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_computer_science.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_computer_science.yaml new file mode 100644 index 0000000000000000000000000000000000000000..86c874e539d21d55540e7e5adce32a624d4a706c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_computer_science.yaml @@ -0,0 +1,4 @@ +"dataset_name": "computer_science" +"description": "以下是关于计算机科学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_computer_science" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_computer_security.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_computer_security.yaml new 
file mode 100644 index 0000000000000000000000000000000000000000..9471546184de5dde5edeb8031a64e588c7594f8f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_computer_security.yaml @@ -0,0 +1,4 @@ +"dataset_name": "computer_security" +"description": "以下是关于计算机安全的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_computer_security" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_anatomy.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_anatomy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..84161ec30ee875253d988a395f892b7982631765 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_anatomy.yaml @@ -0,0 +1,4 @@ +"dataset_name": "anatomy" +"description": "以下是关于解剖学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_anatomy" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_arts.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_arts.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6007825cb9f3cd8c0af7e25c7de6d1c965f612a0 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_arts.yaml @@ -0,0 +1,4 @@ +"dataset_name": "arts" +"description": "以下是关于艺术学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_arts" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_astronomy.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_astronomy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5ad9a8f2c886e189c380b9f01104fca11a2ef529 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_astronomy.yaml @@ -0,0 +1,4 @@ +"dataset_name": "astronomy" +"description": "以下是关于天文学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" 
+"task": "cmmlu_astronomy" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_chinese_civil_service_exam.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_chinese_civil_service_exam.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dcf6c7e6eeb52f551442de521ed4cc4fdfd272f1 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_chinese_civil_service_exam.yaml @@ -0,0 +1,4 @@ +"dataset_name": "chinese_civil_service_exam" +"description": "以下是关于中国公务员考试的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_chinese_civil_service_exam" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_college_education.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_college_education.yaml new file mode 100644 index 0000000000000000000000000000000000000000..952f351cb005d300becc2f5e3b7d5b8579b979a5 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_college_education.yaml @@ -0,0 +1,4 @@ +"dataset_name": "college_education" +"description": "以下是关于大学教育学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_college_education" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_computer_science.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_computer_science.yaml new file mode 100644 index 0000000000000000000000000000000000000000..86c874e539d21d55540e7e5adce32a624d4a706c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_computer_science.yaml @@ -0,0 +1,4 @@ +"dataset_name": "computer_science" +"description": "以下是关于计算机科学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_computer_science" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_computer_security.yaml 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_computer_security.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9471546184de5dde5edeb8031a64e588c7594f8f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_computer_security.yaml @@ -0,0 +1,4 @@ +"dataset_name": "computer_security" +"description": "以下是关于计算机安全的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_computer_security" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_economics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_economics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4602efb430d49e3a876b7243c4cfffe506094b34 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_economics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "economics" +"description": "以下是关于经济学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_economics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_elementary_chinese.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_elementary_chinese.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6f67be3fc40f5c038b455edcc6076675a4451261 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_elementary_chinese.yaml @@ -0,0 +1,4 @@ +"dataset_name": "elementary_chinese" +"description": "以下是关于小学语文的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_elementary_chinese" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_elementary_information_and_technology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_elementary_information_and_technology.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..98c7d3c8f2d85f3c52a3314253d2d2151f7116ae --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_elementary_information_and_technology.yaml @@ -0,0 +1,4 @@ +"dataset_name": "elementary_information_and_technology" +"description": "以下是关于小学信息技术的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_elementary_information_and_technology" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_elementary_mathematics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_elementary_mathematics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f702312ca07c2b882d17c88d30dbe87a837ce5c6 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_elementary_mathematics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "elementary_mathematics" +"description": "以下是关于初等数学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_elementary_mathematics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_ethnology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_ethnology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..88a653a9ee5e5978113626a35acbe50bd2ea5437 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_ethnology.yaml @@ -0,0 +1,4 @@ +"dataset_name": "ethnology" +"description": "以下是关于民族学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_ethnology" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_genetics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_genetics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..be57628b6f0d3dd2bc6719e08f9aaddb45ac7fa2 --- /dev/null +++ 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_genetics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "genetics" +"description": "以下是关于遗传学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_genetics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_legal_and_moral_basis.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_legal_and_moral_basis.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a5e3ee13b6e9670f33068bc731acebf7489737ec --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_legal_and_moral_basis.yaml @@ -0,0 +1,4 @@ +"dataset_name": "legal_and_moral_basis" +"description": "以下是关于法律与道德基础的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_legal_and_moral_basis" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_marxist_theory.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_marxist_theory.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f99fa17514a10e8bf587b50ae9dd997b80c00225 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_marxist_theory.yaml @@ -0,0 +1,4 @@ +"dataset_name": "marxist_theory" +"description": "以下是关于马克思主义理论的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_marxist_theory" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_modern_chinese.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_modern_chinese.yaml new file mode 100644 index 0000000000000000000000000000000000000000..13b2ccc4f939876616ceeda42d211e96347ce060 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_modern_chinese.yaml @@ -0,0 +1,4 @@ +"dataset_name": "modern_chinese" +"description": "以下是关于现代汉语的单项选择题,请直接给出正确答案的选项。\n\n" +"include": 
"_default_template_yaml" +"task": "cmmlu_modern_chinese" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_philosophy.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_philosophy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..17340fa490f0350e6e532b2c67f8c81fa63bfb3a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_philosophy.yaml @@ -0,0 +1,4 @@ +"dataset_name": "philosophy" +"description": "以下是关于哲学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_philosophy" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_professional_medicine.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_professional_medicine.yaml new file mode 100644 index 0000000000000000000000000000000000000000..92fed45e74f9b69b2c7b595a4bb682318fe0b81c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_professional_medicine.yaml @@ -0,0 +1,4 @@ +"dataset_name": "professional_medicine" +"description": "以下是关于专业医学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_professional_medicine" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_professional_psychology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_professional_psychology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..83f0255591a17711d6ac99cf164a29ffe2a69866 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_professional_psychology.yaml @@ -0,0 +1,4 @@ +"dataset_name": "professional_psychology" +"description": "以下是关于专业心理学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_professional_psychology" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_security_study.yaml 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_security_study.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c9660f041fcb24ed83089c624f7ef6c6962c5d8b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_security_study.yaml @@ -0,0 +1,4 @@ +"dataset_name": "security_study" +"description": "以下是关于安全研究的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_security_study" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_sports_science.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_sports_science.yaml new file mode 100644 index 0000000000000000000000000000000000000000..35e5bb9cc4c40abcf271955f068788f85e44794a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_sports_science.yaml @@ -0,0 +1,4 @@ +"dataset_name": "sports_science" +"description": "以下是关于体育学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_sports_science" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_virology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_virology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1560b84f682493ef53a9c26ae1d36ac520ff46c7 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_virology.yaml @@ -0,0 +1,4 @@ +"dataset_name": "virology" +"description": "以下是关于病毒学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_virology" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_education.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_education.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f1dc8a8a4fbc9664da04e2288cf782a9cc1e1877 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_education.yaml @@ -0,0 
+1,4 @@ +"dataset_name": "education" +"description": "以下是关于教育学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_education" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_elementary_chinese.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_elementary_chinese.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6f67be3fc40f5c038b455edcc6076675a4451261 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_elementary_chinese.yaml @@ -0,0 +1,4 @@ +"dataset_name": "elementary_chinese" +"description": "以下是关于小学语文的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_elementary_chinese" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_elementary_commonsense.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_elementary_commonsense.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3017edd999a0ee04de4a5dd8c7dc4b1b6218f5e3 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_elementary_commonsense.yaml @@ -0,0 +1,4 @@ +"dataset_name": "elementary_commonsense" +"description": "以下是关于小学常识的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_elementary_commonsense" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_elementary_mathematics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_elementary_mathematics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f702312ca07c2b882d17c88d30dbe87a837ce5c6 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_elementary_mathematics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "elementary_mathematics" +"description": "以下是关于初等数学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_elementary_mathematics" diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_ethnology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_ethnology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..88a653a9ee5e5978113626a35acbe50bd2ea5437 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_ethnology.yaml @@ -0,0 +1,4 @@ +"dataset_name": "ethnology" +"description": "以下是关于民族学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_ethnology" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_genetics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_genetics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..be57628b6f0d3dd2bc6719e08f9aaddb45ac7fa2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_genetics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "genetics" +"description": "以下是关于遗传学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_genetics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_high_school_mathematics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_high_school_mathematics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3598501c1763d5f1c19444e1b18bb242149fdd34 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_high_school_mathematics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_mathematics" +"description": "以下是关于高中数学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_high_school_mathematics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_high_school_politics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_high_school_politics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5f689dff61a4ea55628b04f9bed5202e48c6eb70 --- /dev/null +++ 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_high_school_politics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "high_school_politics" +"description": "以下是关于高中政治的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_high_school_politics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_international_law.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_international_law.yaml new file mode 100644 index 0000000000000000000000000000000000000000..32112d3c8b6ee26ee786439053c2d1f1da5b04c2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_international_law.yaml @@ -0,0 +1,4 @@ +"dataset_name": "international_law" +"description": "以下是关于国际法学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_international_law" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_logical.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_logical.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4c4ac2e12abb2fa29dd2e194f5f1b9417f61142b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_logical.yaml @@ -0,0 +1,4 @@ +"dataset_name": "logical" +"description": "以下是关于逻辑学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_logical" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_machine_learning.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_machine_learning.yaml new file mode 100644 index 0000000000000000000000000000000000000000..062cd1cd73add5caf387f6b4717c5ed837e2c7f7 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_machine_learning.yaml @@ -0,0 +1,4 @@ +"dataset_name": "machine_learning" +"description": "以下是关于机器学习的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_machine_learning" diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_marxist_theory.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_marxist_theory.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f99fa17514a10e8bf587b50ae9dd997b80c00225 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_marxist_theory.yaml @@ -0,0 +1,4 @@ +"dataset_name": "marxist_theory" +"description": "以下是关于马克思主义理论的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_marxist_theory" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_nutrition.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_nutrition.yaml new file mode 100644 index 0000000000000000000000000000000000000000..23d52c45e07134b2ff4f7c1a8e55ba19acfbcfd9 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_nutrition.yaml @@ -0,0 +1,4 @@ +"dataset_name": "nutrition" +"description": "以下是关于营养学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_nutrition" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_philosophy.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_philosophy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..17340fa490f0350e6e532b2c67f8c81fa63bfb3a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_philosophy.yaml @@ -0,0 +1,4 @@ +"dataset_name": "philosophy" +"description": "以下是关于哲学的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_philosophy" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_professional_accounting.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_professional_accounting.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bed3485d787d921fb25bbbfbad7671118acfc42b --- /dev/null +++ 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_professional_accounting.yaml @@ -0,0 +1,4 @@ +"dataset_name": "professional_accounting" +"description": "以下是关于专业会计的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_professional_accounting" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_public_relations.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_public_relations.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a1c3711ef7734df27852065cf894f9c9cff9d776 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_public_relations.yaml @@ -0,0 +1,4 @@ +"dataset_name": "public_relations" +"description": "以下是关于公共关系的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_public_relations" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_security_study.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_security_study.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c9660f041fcb24ed83089c624f7ef6c6962c5d8b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_security_study.yaml @@ -0,0 +1,4 @@ +"dataset_name": "security_study" +"description": "以下是关于安全研究的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_security_study" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_world_history.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_world_history.yaml new file mode 100644 index 0000000000000000000000000000000000000000..993ce0ab6e390a81286df213e5d3ddd9fe3908bd --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_world_history.yaml @@ -0,0 +1,4 @@ +"dataset_name": "world_history" +"description": "以下是关于世界历史的单项选择题,请直接给出正确答案的选项。\n\n" +"include": "_default_template_yaml" +"task": "cmmlu_world_history" diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/super_glue/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/super_glue/README.md new file mode 100644 index 0000000000000000000000000000000000000000..868b3a931d7c1c1d5658baccfe7f9e77e8afaf4a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/super_glue/README.md @@ -0,0 +1,81 @@ +# SuperGLUE + +### Paper + +Title: `SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems` +Abstract: `https://w4ngatang.github.io/static/papers/superglue.pdf` + +SuperGLUE is a benchmark styled after GLUE with a new set of more difficult language +understanding tasks. + +Homepage: https://super.gluebenchmark.com/ + +### Citation + +``` +@inproceedings{NEURIPS2019_4496bf24, + author = {Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel}, + booktitle = {Advances in Neural Information Processing Systems}, + editor = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d\textquotesingle Alch\'{e}-Buc and E. Fox and R. Garnett}, + pages = {}, + publisher = {Curran Associates, Inc.}, + title = {SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems}, + url = {https://proceedings.neurips.cc/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf}, + volume = {32}, + year = {2019} +} +``` + +### Groups, Tags, and Tasks + +#### Groups + +None. + +#### Tags + +* `super-glue-lm-eval-v1`: SuperGLUE eval adapted from LM Eval V1 +* `super-glue-t5-prompt`: SuperGLUE prompt and evaluation that matches the T5 paper (if using accelerate, will error if record is included.) 
+ +#### Tasks + +Comparison between validation split score on T5x and LM-Eval (T5x models converted to HF) +| T5V1.1 Base | SGLUE | BoolQ | CB | Copa | MultiRC | ReCoRD | RTE | WiC | WSC | +| ----------- | ------| ----- | --------- | ---- | ------- | ------ | --- | --- | --- | +| T5x | 69.47 | 78.47(acc) | 83.93(f1) 87.5(acc) | 50(acc) | 73.81(f1) 33.26(em) | 70.09(em) 71.34(f1) | 78.7(acc) | 63.64(acc) | 75(acc) | +| LM-Eval | 71.35 | 79.36(acc) | 83.63(f1) 87.5(acc) | 63(acc) | 73.45(f1) 33.26(em) | 69.85(em) 68.86(f1) | 78.34(acc) | 65.83(acc) | 75.96(acc) | + + + +* `super-glue-lm-eval-v1` + - `boolq` + - `cb` + - `copa` + - `multirc` + - `record` + - `rte` + - `wic` + - `wsc` + +* `super-glue-t5-prompt` + - `super_glue-boolq-t5-prompt` + - `super_glue-cb-t5-prompt` + - `super_glue-copa-t5-prompt` + - `super_glue-multirc-t5-prompt` + - `super_glue-record-t5-prompt` + - `super_glue-rte-t5-prompt` + - `super_glue-wic-t5-prompt` + - `super_glue-wsc-t5-prompt` + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [ ] Is the task an existing benchmark in the literature? + * [ ] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? 
import numpy as np


def cb_multi_fi(items):
    """Unweighted (macro) average of one-vs-rest F1 over the 3 CB classes.

    ``items`` is an iterable of ``(prediction, gold)`` class-index pairs
    accumulated by the harness; returns a single float.
    """
    # sklearn is imported lazily so the dependency is only needed when the
    # metric is actually aggregated.
    from sklearn.metrics import f1_score

    preds, golds = zip(*items)
    preds = np.array(preds)
    golds = np.array(golds)
    # Binary F1 for each class treated one-vs-rest, then the unweighted mean.
    per_class = [
        f1_score(y_true=golds == label, y_pred=preds == label) for label in range(3)
    ]
    return np.mean(per_class)
def mean_3class_f1(predictions, references):  # This is a passthrough function
    """Map one (generation, gold) pair of label strings to class indices.

    Unrecognized generations fall back to class 0; the actual F1 is computed
    later by ``agg_mean_3class_f1`` over all collected pairs.
    """
    labels = ["entailment", "contradiction", "neutral"]
    generated = predictions[0]
    pred_idx = labels.index(generated) if generated in labels else 0
    gold_idx = labels.index(references[0])
    return (pred_idx, gold_idx)


def agg_mean_3class_f1(items):
    """Compute the unweighted (macro) average of the per-class F1 scores."""
    import sklearn.metrics

    predictions, references = zip(*items)
    # fbeta_score with beta=1 over labels 0..2, macro-averaged — identical to
    # averaging the three per-class F1 values.
    return sklearn.metrics.fbeta_score(
        references, predictions, beta=1, labels=range(3), average="macro"
    )
import collections

import numpy as np


def f1(predictions, references):  # This is a passthrough function
    """Turn one (generation, gold) pair into integer labels for F1 aggregation.

    The gold string looks like ``"<question_id>_<True|False>"``; an
    unparseable generation is scored as the opposite of the gold class.
    """
    labels = ["False", "True"]
    gold = labels.index(references[0].split("_")[-1])
    generated = predictions[0]
    pred = labels.index(generated) if generated in labels else not bool(gold)
    return (pred, gold)


def agg_f1(items):
    """Binary F1 over all accumulated (prediction, reference) pairs."""
    from sklearn.metrics import f1_score

    preds, golds = zip(*items)
    return f1_score(np.asarray(golds), np.asarray(preds))


def em(predictions, references):  # This is a passthrough function
    """Like ``f1`` but also keeps the question id so EM can group by question."""
    labels = ["False", "True"]
    group_id, gold_str = references[0].split("_")
    gold = labels.index(gold_str)
    generated = predictions[0]
    pred = labels.index(generated) if generated in labels else not bool(gold)
    return (group_id, pred, gold)


def agg_em(items):
    """Per-question exact match: 1.0 iff every answer in the group is right."""
    by_group = collections.defaultdict(list)
    for group_id, pred, gold in items:
        by_group[group_id].append((gold, pred))

    scores = [
        float(np.array_equal([g for g, _ in pairs], [p for _, p in pairs]))
        for pairs in by_group.values()
    ]
    return np.mean(scores)
# --- preprocess_wsc.py ---
from lm_eval.utils import general_detokenize


def default_doc_to_text(x):
    """Build the WSC prompt: the passage with the pronoun span starred, plus a
    yes/no question asking whether the pronoun refers to the candidate noun."""
    raw_passage = x["text"]
    # NOTE: HuggingFace span indices are word-based not character-based.
    pre = " ".join(raw_passage.split()[: x["span2_index"]])
    # +1 skips the space that followed the pronoun in the original passage.
    post = raw_passage[len(pre) + len(x["span2_text"]) + 1 :]
    passage = general_detokenize(pre + " *{}*".format(x["span2_text"]) + post)
    noun = x["span1_text"]
    pronoun = x["span2_text"]
    text = (
        f"Passage: {passage}\n"
        + f'Question: In the passage above, does the pronoun "*{pronoun}*" refer to "*{noun}*"?\n'
        + "Answer:"
    )
    return text


# --- t5_utils.py ---
import re
from typing import List


def doc_to_text(x):
    """T5-style WSC input: the passage with the pronoun highlighted as *<pronoun>*."""
    text = re.sub(r" X ", " *" + x["span2_text"] + "* ", _wsc_inputs(x))
    return "wsc: " + text


def _wsc_inputs(x):
    """Return the passage with the target pronoun replaced by the placeholder "X"."""
    words = x["text"].split(" ")

    # We would need some special logic to handle the case where the pronoun is the
    # first or last word in the text. None of the examples in WSC seem to have
    # this, so we are ignoring these cases.
    assert x["span2_index"] > 0
    assert x["span2_index"] < len(words)
    pronoun_index = x["span2_index"]

    def create_input():
        # Sanity check: the word at span2_index must be the pronoun itself.
        assert words[pronoun_index] == x["span2_text"]

        return " ".join(
            [
                " ".join(words[:pronoun_index]),
                "X",
                " ".join(words[pronoun_index + 1 :]),
            ]
        )

    # Handle some special cases.
    if (
        x["text"]
        == 'The boy continued to whip the pony , and eventually the pony threw him over. John laughed out quite loud. "Good for him," he said. '
    ):
        return (
            "The boy continued to whip the pony , and eventually the pony threw "
            'him over. John laughed out quite loud. "Good for X ," he said.'
        )

    # Using the span2_index, we get 'use' instead of 'it'.
    if (
        x["text"]
        == "When they had eventually calmed down a bit , and had gotten home, Mr. Farley put the magic pebble in an iron safe . Some day they might want to use it , but really for now, what more could they wish for?"
    ):
        return (
            "When they had eventually calmed down a bit , and had gotten home, "
            "Mr. Farley put the magic pebble in an iron safe . Some day they might "
            "want to use X , but really for now, what more could they wish for?"
        )

    return create_input()


# Words ignored when comparing a generated referent against the gold span.
DETERMINERS = {
    "a",
    "an",
    "few",
    "her",
    "his",
    "each",
    "every",
    "many",
    "much",
    "my",
    "our",
    "some",
    "that",
    "the",
    "their",
    "these",
    "this",
    "those",
    "which",
    "whose",
    "your",
}


def clean(s: str) -> str:
    """Ignore capitalization and determiners."""
    s = s.strip().lower()
    return " ".join([w for w in s.split(" ") if w not in DETERMINERS])


def process_results(docs: dict, resps: List):
    """Score one generated referent against the gold span (docs["span1_text"]).

    Returns {"accuracy": 1.0} when the fuzzy referent match agrees with the
    gold boolean label, else {"accuracy": 0.0}.
    """
    prediction = clean(resps[0])
    reference = clean(docs["span1_text"])

    if ("'" in prediction) != ("'" in reference):
        # Make sure we don't mark cases where the prediction is "Bob" and the
        # referent is "Bob's hat" as predicting the referent.
        predicted_referent = False
    else:
        prediction_words = set(prediction.split(" "))
        referent_words = set(reference.split(" "))

        # Handle cases where the prediction is "fuzzy bunny" and the referent is
        # "bunny".
        predicted_referent = prediction_words.issubset(
            referent_words
        ) or referent_words.issubset(prediction_words)

    acc = 1.0 if predicted_referent == docs["label"] else 0.0
    return {"accuracy": acc}
import re


def wikitext_detokenizer(doc):
    """Reverse WikiText tokenization artifacts (spacing around punctuation,
    "@"-style number separators, heading markers) so perplexity is measured
    on natural-looking text."""
    text = doc["page"]
    # contractions
    text = text.replace("s '", "s'")
    text = re.sub(r"/' [0-9]/", r"/'[0-9]/", text)
    # number separators: "5 @,@ 000" -> "5,000", etc.
    for token, repl in ((" @-@ ", "-"), (" @,@ ", ","), (" @.@ ", ".")):
        text = text.replace(token, repl)
    # punctuation: drop the space *before* the mark, keep the one after
    for mark in (":", ";", ".", "!", "?", ","):
        text = text.replace(f" {mark} ", f"{mark} ")
    # double brackets: trim padding just inside (), [], {} and quote pairs
    text = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", text)
    text = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", text)
    text = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", text)
    text = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', text)
    text = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", text)
    # miscellaneous: collapse heading markers, degrees sign, stray spaces
    text = text.replace("= = = =", "====")
    text = text.replace("= = =", "===")
    text = text.replace("= =", "==")
    text = text.replace(" " + chr(176) + " ", chr(176))
    text = text.replace(" \n", "\n")
    text = text.replace("\n ", "\n")
    text = text.replace(" N ", " 1 ")
    text = text.replace(" 's", "'s")

    return text


def process_results(doc, results):
    """Package the rolling loglikelihood with word/byte counts so the harness
    can aggregate word/byte perplexity and bits-per-byte."""
    (loglikelihood,) = results
    page = doc["page"]
    # IMPORTANT: wikitext counts number of words in *original doc before detokenization*
    word_count = len(re.split(r"\s+", page))
    byte_count = len(page.encode("utf-8"))
    return {
        "word_perplexity": (loglikelihood, word_count),
        "byte_perplexity": (loglikelihood, byte_count),
        "bits_per_byte": (loglikelihood, byte_count),
    }
- metric: bits_per_byte +metadata: + version: 2.0 +dataset_kwargs: + trust_remote_code: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/winogrande/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/winogrande/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d763dffc02ada2e9c619e3ab74423f81dd368d8a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/winogrande/README.md @@ -0,0 +1,54 @@ +# WinoGrande + +### Paper + +Title: `WinoGrande: An Adversarial Winograd Schema Challenge at Scale` + +Abstract: https://arxiv.org/abs/1907.10641 + +WinoGrande is a collection of 44k problems, inspired by Winograd Schema Challenge +(Levesque, Davis, and Morgenstern 2011), but adjusted to improve the scale and +robustness against the dataset-specific bias. Formulated as a fill-in-a-blank +task with binary options, the goal is to choose the right option for a given +sentence which requires commonsense reasoning. + +NOTE: This evaluation of Winogrande uses partial evaluation as described by +Trinh & Le in Simple Method for Commonsense Reasoning (2018). +See: https://arxiv.org/abs/1806.02847 + +Homepage: https://leaderboard.allenai.org/winogrande/submissions/public + + +### Citation + +``` +@article{sakaguchi2019winogrande, + title={WinoGrande: An Adversarial Winograd Schema Challenge at Scale}, + author={Sakaguchi, Keisuke and Bras, Ronan Le and Bhagavatula, Chandra and Choi, Yejin}, + journal={arXiv preprint arXiv:1907.10641}, + year={2019} +} +``` + +### Groups and Tasks + +#### Groups + +* Not part of a group yet. + +#### Tasks + +* `winogrande` + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [ ] Is the task an existing benchmark in the literature? + * [ ] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? 
If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/winogrande/__pycache__/preprocess_winogrande.cpython-310.pyc b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/winogrande/__pycache__/preprocess_winogrande.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..64b65c267ae3fe9b50fce09d4be56df9e772b21d Binary files /dev/null and b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/winogrande/__pycache__/preprocess_winogrande.cpython-310.pyc differ diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/winogrande/default.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/winogrande/default.yaml new file mode 100644 index 0000000000000000000000000000000000000000..12e0077a70f79a333c273b4be2feddc498f8fa31 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/winogrande/default.yaml @@ -0,0 +1,19 @@ +task: winogrande +dataset_path: winogrande +dataset_name: winogrande_xl +output_type: multiple_choice +training_split: train +validation_split: validation +doc_to_text: !function preprocess_winogrande.doc_to_text +doc_to_target: !function preprocess_winogrande.doc_to_target +doc_to_choice: !function preprocess_winogrande.doc_to_choice +should_decontaminate: true +doc_to_decontamination_query: sentence +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 +dataset_kwargs: + trust_remote_code: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/winogrande/preprocess_winogrande.py 
def doc_to_text(doc):
    """Return the 0-based index of the gold option ("1" -> 0, "2" -> 1).

    Winogrande uses partial evaluation: this index selects which choice
    prefix is correct, and the text after the blank is the shared target.
    """
    return {"1": 0, "2": 1}[doc["answer"]]


def doc_to_target(doc):
    """The continuation shared by both choices: everything after the blank."""
    sentence = doc["sentence"]
    cut = sentence.index("_") + 1
    return sentence[cut:].strip()


def doc_to_choice(doc):
    """Both options spliced onto the sentence prefix up to the blank."""
    sentence = doc["sentence"]
    prefix = sentence[: sentence.index("_")]
    return [prefix + option for option in (doc["option1"], doc["option2"])]