Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/copa_ar/README.md +40 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/copa_ar/copa_ar.yaml +21 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/piqa_ar/README.md +43 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/piqa_ar/piqa_ar.yaml +21 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/README.md +48 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/_cmmlu.yaml +78 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/_default_template_yaml +18 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_arts.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_actuarial_science.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_engineering_hydrology.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_mathematics.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_medical_statistics.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_computer_science.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_computer_security.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_anatomy.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_arts.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_astronomy.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_chinese_civil_service_exam.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_college_education.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_computer_science.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_computer_security.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_economics.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_elementary_chinese.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_elementary_information_and_technology.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_elementary_mathematics.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_ethnology.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_genetics.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_legal_and_moral_basis.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_marxist_theory.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_modern_chinese.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_philosophy.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_professional_medicine.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_professional_psychology.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_security_study.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_sports_science.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_virology.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_education.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_elementary_chinese.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_elementary_commonsense.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_elementary_mathematics.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_ethnology.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_genetics.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_high_school_mathematics.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_high_school_politics.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_international_law.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_logical.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_machine_learning.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_marxist_theory.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_nutrition.yaml +4 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_philosophy.yaml +4 -0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/copa_ar/README.md
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Arabic COPA
|
| 2 |
+
|
| 3 |
+
### Paper
|
| 4 |
+
|
| 5 |
+
Original Title: `COPA`
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
The Choice Of Plausible Alternatives (COPA) evaluation provides researchers with a tool for assessing progress in open-domain commonsense causal reasoning.
|
| 10 |
+
|
| 11 |
+
[Homepage](https://people.ict.usc.edu/~gordon/copa.html)
|
| 12 |
+
|
| 13 |
+
AlGhafa has translated this dataset to Arabic [AlGhafa](https://aclanthology.org/2023.arabicnlp-1.21.pdf)
|
| 14 |
+
|
| 15 |
+
The link to the Arabic version of the dataset [COPA](https://gitlab.com/tiiuae/alghafa/-/tree/main/arabic-eval/copa_ar)
|
| 16 |
+
|
| 17 |
+
### Citation
|
| 18 |
+
|
| 19 |
+
### Groups and Tasks
|
| 20 |
+
|
| 21 |
+
#### Groups
|
| 22 |
+
|
| 23 |
+
* Not part of a group yet.
|
| 24 |
+
|
| 25 |
+
#### Tasks
|
| 26 |
+
|
| 27 |
+
* `copa_ar`
|
| 28 |
+
|
| 29 |
+
### Checklist
|
| 30 |
+
|
| 31 |
+
For adding novel benchmarks/datasets to the library:
|
| 32 |
+
* [x] Is the task an existing benchmark in the literature?
|
| 33 |
+
* [x] Have you referenced the original paper that introduced the task?
|
| 34 |
+
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
If other tasks on this dataset are already supported:
|
| 38 |
+
* [x] Is the "Main" variant of this task clearly denoted?
|
| 39 |
+
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
|
| 40 |
+
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/copa_ar/copa_ar.yaml
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
task: copa_ar
|
| 2 |
+
dataset_path: Hennara/copa_ar
|
| 3 |
+
dataset_name: null
|
| 4 |
+
output_type: multiple_choice
|
| 5 |
+
training_split: null
|
| 6 |
+
validation_split: null
|
| 7 |
+
test_split: test
|
| 8 |
+
doc_to_text: "السؤال: {{query}}\nالجواب:"
|
| 9 |
+
doc_to_choice: "{{[sol1, sol2]}}"
|
| 10 |
+
doc_to_target: label
|
| 11 |
+
should_decontaminate: true
|
| 12 |
+
doc_to_decontamination_query: query
|
| 13 |
+
metric_list:
|
| 14 |
+
- metric: acc
|
| 15 |
+
aggregation: mean
|
| 16 |
+
higher_is_better: true
|
| 17 |
+
- metric: acc_norm
|
| 18 |
+
aggregation: mean
|
| 19 |
+
higher_is_better: true
|
| 20 |
+
metadata:
|
| 21 |
+
version: 1.0
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/piqa_ar/README.md
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Arabic PIQA
|
| 2 |
+
|
| 3 |
+
### Paper
|
| 4 |
+
|
| 5 |
+
Original Title: `PIQA: Reasoning about Physical Commonsense in Natural Language`
|
| 6 |
+
|
| 7 |
+
Original paper: [PIQA](https://arxiv.org/abs/1911.11641)
|
| 8 |
+
|
| 9 |
+
Physical Interaction: Question Answering (PIQA) is a physical commonsense
|
| 10 |
+
reasoning and a corresponding benchmark dataset. PIQA was designed to investigate
|
| 11 |
+
the physical knowledge of existing models. To what extent are current approaches
|
| 12 |
+
actually learning about the world?
|
| 13 |
+
|
| 14 |
+
[Homepage](https://yonatanbisk.com/piqa)
|
| 15 |
+
|
| 16 |
+
AlGhafa has translated this dataset to Arabic [AlGhafa](https://aclanthology.org/2023.arabicnlp-1.21.pdf)
|
| 17 |
+
|
| 18 |
+
The link to the Arabic version of the dataset [PIQA](https://gitlab.com/tiiuae/alghafa/-/tree/main/arabic-eval/pica_ar)
|
| 19 |
+
|
| 20 |
+
### Citation
|
| 21 |
+
|
| 22 |
+
### Groups and Tasks
|
| 23 |
+
|
| 24 |
+
#### Groups
|
| 25 |
+
|
| 26 |
+
* Not part of a group yet.
|
| 27 |
+
|
| 28 |
+
#### Tasks
|
| 29 |
+
|
| 30 |
+
* `piqa_ar`
|
| 31 |
+
|
| 32 |
+
### Checklist
|
| 33 |
+
|
| 34 |
+
For adding novel benchmarks/datasets to the library:
|
| 35 |
+
* [x] Is the task an existing benchmark in the literature?
|
| 36 |
+
* [x] Have you referenced the original paper that introduced the task?
|
| 37 |
+
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
If other tasks on this dataset are already supported:
|
| 41 |
+
* [x] Is the "Main" variant of this task clearly denoted?
|
| 42 |
+
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
|
| 43 |
+
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/alghafa/piqa_ar/piqa_ar.yaml
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
task: piqa_ar
|
| 2 |
+
dataset_path: Hennara/pica_ar
|
| 3 |
+
dataset_name: null
|
| 4 |
+
output_type: multiple_choice
|
| 5 |
+
training_split: null
|
| 6 |
+
validation_split: null
|
| 7 |
+
test_split: test
|
| 8 |
+
doc_to_text: "السؤال: {{goal}}\nالجواب:"
|
| 9 |
+
doc_to_choice: "{{[sol1, sol2]}}"
|
| 10 |
+
doc_to_target: label
|
| 11 |
+
should_decontaminate: true
|
| 12 |
+
doc_to_decontamination_query: goal
|
| 13 |
+
metric_list:
|
| 14 |
+
- metric: acc
|
| 15 |
+
aggregation: mean
|
| 16 |
+
higher_is_better: true
|
| 17 |
+
- metric: acc_norm
|
| 18 |
+
aggregation: mean
|
| 19 |
+
higher_is_better: true
|
| 20 |
+
metadata:
|
| 21 |
+
version: 1.0
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/README.md
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# CMMLU
|
| 2 |
+
|
| 3 |
+
### Paper
|
| 4 |
+
|
| 5 |
+
CMMLU: Measuring massive multitask language understanding in Chinese
|
| 6 |
+
https://arxiv.org/abs/2306.09212
|
| 7 |
+
|
| 8 |
+
CMMLU is a comprehensive evaluation benchmark specifically designed to evaluate the knowledge and reasoning abilities of LLMs within the context of Chinese language and culture.
|
| 9 |
+
CMMLU covers a wide range of subjects, comprising 67 topics that span from elementary to advanced professional levels.
|
| 10 |
+
|
| 11 |
+
Homepage: https://github.com/haonan-li/CMMLU
|
| 12 |
+
|
| 13 |
+
### Citation
|
| 14 |
+
|
| 15 |
+
```bibtex
|
| 16 |
+
@misc{li2023cmmlu,
|
| 17 |
+
title={CMMLU: Measuring massive multitask language understanding in Chinese},
|
| 18 |
+
author={Haonan Li and Yixuan Zhang and Fajri Koto and Yifei Yang and Hai Zhao and Yeyun Gong and Nan Duan and Timothy Baldwin},
|
| 19 |
+
year={2023},
|
| 20 |
+
eprint={2306.09212},
|
| 21 |
+
archivePrefix={arXiv},
|
| 22 |
+
primaryClass={cs.CL}
|
| 23 |
+
}
|
| 24 |
+
```
|
| 25 |
+
|
| 26 |
+
### Groups and Tasks
|
| 27 |
+
|
| 28 |
+
#### Groups
|
| 29 |
+
|
| 30 |
+
- `cmmlu`: All 67 subjects of the CMMLU dataset, evaluated following the methodology in MMLU's original implementation.
|
| 31 |
+
|
| 32 |
+
#### Tasks
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
The following tasks evaluate subjects in the CMMLU dataset using loglikelihood-based multiple-choice scoring:
|
| 36 |
+
- `cmmlu_{subject_english}`
|
| 37 |
+
|
| 38 |
+
### Checklist
|
| 39 |
+
|
| 40 |
+
* [x] Is the task an existing benchmark in the literature?
|
| 41 |
+
* [x] Have you referenced the original paper that introduced the task?
|
| 42 |
+
* [x] If yes, does the original paper provide a reference implementation?
|
| 43 |
+
* [x] Yes, original implementation contributed by author of the benchmark
|
| 44 |
+
|
| 45 |
+
If other tasks on this dataset are already supported:
|
| 46 |
+
* [x] Is the "Main" variant of this task clearly denoted?
|
| 47 |
+
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
|
| 48 |
+
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/_cmmlu.yaml
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
group: cmmlu
|
| 2 |
+
task:
|
| 3 |
+
- cmmlu_agronomy
|
| 4 |
+
- cmmlu_anatomy
|
| 5 |
+
- cmmlu_ancient_chinese
|
| 6 |
+
- cmmlu_arts
|
| 7 |
+
- cmmlu_astronomy
|
| 8 |
+
- cmmlu_business_ethics
|
| 9 |
+
- cmmlu_chinese_civil_service_exam
|
| 10 |
+
- cmmlu_chinese_driving_rule
|
| 11 |
+
- cmmlu_chinese_food_culture
|
| 12 |
+
- cmmlu_chinese_foreign_policy
|
| 13 |
+
- cmmlu_chinese_history
|
| 14 |
+
- cmmlu_chinese_literature
|
| 15 |
+
- cmmlu_chinese_teacher_qualification
|
| 16 |
+
- cmmlu_clinical_knowledge
|
| 17 |
+
- cmmlu_college_actuarial_science
|
| 18 |
+
- cmmlu_college_education
|
| 19 |
+
- cmmlu_college_engineering_hydrology
|
| 20 |
+
- cmmlu_college_law
|
| 21 |
+
- cmmlu_college_mathematics
|
| 22 |
+
- cmmlu_college_medical_statistics
|
| 23 |
+
- cmmlu_college_medicine
|
| 24 |
+
- cmmlu_computer_science
|
| 25 |
+
- cmmlu_computer_security
|
| 26 |
+
- cmmlu_conceptual_physics
|
| 27 |
+
- cmmlu_construction_project_management
|
| 28 |
+
- cmmlu_economics
|
| 29 |
+
- cmmlu_education
|
| 30 |
+
- cmmlu_electrical_engineering
|
| 31 |
+
- cmmlu_elementary_chinese
|
| 32 |
+
- cmmlu_elementary_commonsense
|
| 33 |
+
- cmmlu_elementary_information_and_technology
|
| 34 |
+
- cmmlu_elementary_mathematics
|
| 35 |
+
- cmmlu_ethnology
|
| 36 |
+
- cmmlu_food_science
|
| 37 |
+
- cmmlu_genetics
|
| 38 |
+
- cmmlu_global_facts
|
| 39 |
+
- cmmlu_high_school_biology
|
| 40 |
+
- cmmlu_high_school_chemistry
|
| 41 |
+
- cmmlu_high_school_geography
|
| 42 |
+
- cmmlu_high_school_mathematics
|
| 43 |
+
- cmmlu_high_school_physics
|
| 44 |
+
- cmmlu_high_school_politics
|
| 45 |
+
- cmmlu_human_sexuality
|
| 46 |
+
- cmmlu_international_law
|
| 47 |
+
- cmmlu_journalism
|
| 48 |
+
- cmmlu_jurisprudence
|
| 49 |
+
- cmmlu_legal_and_moral_basis
|
| 50 |
+
- cmmlu_logical
|
| 51 |
+
- cmmlu_machine_learning
|
| 52 |
+
- cmmlu_management
|
| 53 |
+
- cmmlu_marketing
|
| 54 |
+
- cmmlu_marxist_theory
|
| 55 |
+
- cmmlu_modern_chinese
|
| 56 |
+
- cmmlu_nutrition
|
| 57 |
+
- cmmlu_philosophy
|
| 58 |
+
- cmmlu_professional_accounting
|
| 59 |
+
- cmmlu_professional_law
|
| 60 |
+
- cmmlu_professional_medicine
|
| 61 |
+
- cmmlu_professional_psychology
|
| 62 |
+
- cmmlu_public_relations
|
| 63 |
+
- cmmlu_security_study
|
| 64 |
+
- cmmlu_sociology
|
| 65 |
+
- cmmlu_sports_science
|
| 66 |
+
- cmmlu_traditional_chinese_medicine
|
| 67 |
+
- cmmlu_virology
|
| 68 |
+
- cmmlu_world_history
|
| 69 |
+
- cmmlu_world_religions
|
| 70 |
+
aggregate_metric_list:
|
| 71 |
+
- aggregation: mean
|
| 72 |
+
metric: acc
|
| 73 |
+
weight_by_size: true
|
| 74 |
+
- aggregation: mean
|
| 75 |
+
metric: acc_norm
|
| 76 |
+
weight_by_size: true
|
| 77 |
+
metadata:
|
| 78 |
+
version: 1.0
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/_default_template_yaml
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset_path: haonan-li/cmmlu
|
| 2 |
+
test_split: test
|
| 3 |
+
fewshot_split: dev
|
| 4 |
+
fewshot_config:
|
| 5 |
+
sampler: first_n
|
| 6 |
+
output_type: multiple_choice
|
| 7 |
+
doc_to_text: "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:"
|
| 8 |
+
doc_to_choice: ["A", "B", "C", "D"]
|
| 9 |
+
doc_to_target: "{{['A', 'B', 'C', 'D'].index(Answer)}}"
|
| 10 |
+
metric_list:
|
| 11 |
+
- metric: acc
|
| 12 |
+
aggregation: mean
|
| 13 |
+
higher_is_better: true
|
| 14 |
+
- metric: acc_norm
|
| 15 |
+
aggregation: mean
|
| 16 |
+
higher_is_better: true
|
| 17 |
+
metadata:
|
| 18 |
+
version: 1.0
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_arts.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "arts"
|
| 2 |
+
"description": "以下是关于艺术学的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_arts"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_actuarial_science.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "college_actuarial_science"
|
| 2 |
+
"description": "以下是关于大学精算学的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_college_actuarial_science"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_engineering_hydrology.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "college_engineering_hydrology"
|
| 2 |
+
"description": "以下是关于大学工程水文学的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_college_engineering_hydrology"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_mathematics.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "college_mathematics"
|
| 2 |
+
"description": "以下是关于大学数学的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_college_mathematics"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_college_medical_statistics.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "college_medical_statistics"
|
| 2 |
+
"description": "以下是关于大学医学统计的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_college_medical_statistics"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_computer_science.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "computer_science"
|
| 2 |
+
"description": "以下是关于计算机科学的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_computer_science"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_computer_security.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "computer_security"
|
| 2 |
+
"description": "以下是关于计算机安全的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_computer_security"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_anatomy.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "anatomy"
|
| 2 |
+
"description": "以下是关于解剖学的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_anatomy"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_arts.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "arts"
|
| 2 |
+
"description": "以下是关于艺术学的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_arts"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_astronomy.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "astronomy"
|
| 2 |
+
"description": "以下是关于天文学的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_astronomy"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_chinese_civil_service_exam.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "chinese_civil_service_exam"
|
| 2 |
+
"description": "以下是关于中国公务员考试的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_chinese_civil_service_exam"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_college_education.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "college_education"
|
| 2 |
+
"description": "以下是关于大学教育学的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_college_education"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_computer_science.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "computer_science"
|
| 2 |
+
"description": "以下是关于计算机科学的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_computer_science"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_computer_security.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "computer_security"
|
| 2 |
+
"description": "以下是关于计算机安全的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_computer_security"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_economics.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "economics"
|
| 2 |
+
"description": "以下是关于经济学的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_economics"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_elementary_chinese.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "elementary_chinese"
|
| 2 |
+
"description": "以下是关于小学语文的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_elementary_chinese"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_elementary_information_and_technology.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "elementary_information_and_technology"
|
| 2 |
+
"description": "以下是关于小学信息技术的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_elementary_information_and_technology"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_elementary_mathematics.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "elementary_mathematics"
|
| 2 |
+
"description": "以下是关于初等数学的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_elementary_mathematics"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_ethnology.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "ethnology"
|
| 2 |
+
"description": "以下是关于民族学的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_ethnology"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_genetics.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "genetics"
|
| 2 |
+
"description": "以下是关于遗传学的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_genetics"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_legal_and_moral_basis.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "legal_and_moral_basis"
|
| 2 |
+
"description": "以下是关于法律与道德基础的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_legal_and_moral_basis"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_marxist_theory.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "marxist_theory"
|
| 2 |
+
"description": "以下是关于马克思主义理论的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_marxist_theory"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_modern_chinese.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "modern_chinese"
|
| 2 |
+
"description": "以下是关于现代汉语的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_modern_chinese"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_philosophy.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "philosophy"
|
| 2 |
+
"description": "以下是关于哲学的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_philosophy"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_professional_medicine.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "professional_medicine"
|
| 2 |
+
"description": "以下是关于专业医学的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_professional_medicine"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_professional_psychology.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "professional_psychology"
|
| 2 |
+
"description": "以下是关于专业心理学的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_professional_psychology"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_security_study.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "security_study"
|
| 2 |
+
"description": "以下是关于安全研究的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_security_study"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_sports_science.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "sports_science"
|
| 2 |
+
"description": "以下是关于体育学的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_sports_science"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_virology.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "virology"
|
| 2 |
+
"description": "以下是关于病毒学的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_virology"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_education.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "education"
|
| 2 |
+
"description": "以下是关于教育学的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_education"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_elementary_chinese.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "elementary_chinese"
|
| 2 |
+
"description": "以下是关于小学语文的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_elementary_chinese"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_elementary_commonsense.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "elementary_commonsense"
|
| 2 |
+
"description": "以下是关于小学常识的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_elementary_commonsense"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_elementary_mathematics.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "elementary_mathematics"
|
| 2 |
+
"description": "以下是关于初等数学的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_elementary_mathematics"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_ethnology.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "ethnology"
|
| 2 |
+
"description": "以下是关于民族学的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_ethnology"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_genetics.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "genetics"
|
| 2 |
+
"description": "以下是关于遗传学的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_genetics"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_high_school_mathematics.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "high_school_mathematics"
|
| 2 |
+
"description": "以下是关于高中数学的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_high_school_mathematics"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_high_school_politics.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "high_school_politics"
|
| 2 |
+
"description": "以下是关于高中政治的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_high_school_politics"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_international_law.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "international_law"
|
| 2 |
+
"description": "以下是关于国际法学的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_international_law"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_logical.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "logical"
|
| 2 |
+
"description": "以下是关于逻辑学的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_logical"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_machine_learning.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "machine_learning"
|
| 2 |
+
"description": "以下是关于机器学习的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_machine_learning"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_marxist_theory.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "marxist_theory"
|
| 2 |
+
"description": "以下是关于马克思主义理论的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_marxist_theory"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_nutrition.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "nutrition"
|
| 2 |
+
"description": "以下是关于营养学的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_nutrition"
|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_philosophy.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"dataset_name": "philosophy"
|
| 2 |
+
"description": "以下是关于哲学的单项选择题,请直接给出正确答案的选项。\n\n"
|
| 3 |
+
"include": "_default_template_yaml"
|
| 4 |
+
"task": "cmmlu_philosophy"
|