diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/_direct_kmmlu_yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/_direct_kmmlu_yaml new file mode 100644 index 0000000000000000000000000000000000000000..a0c8dfdc7e8ccce1f61fb55f140b4aca539a8d4e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/_direct_kmmlu_yaml @@ -0,0 +1,27 @@ +tag: + - kmmlu + - kmmlu_direct +dataset_path: HAERAE-HUB/KMMLU +output_type: generate_until +test_split: test +fewshot_split: dev +doc_to_text: "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:" +doc_to_target: "{{['A', 'B', 'C', 'D'][answer-1]}}" +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - " " +generation_kwargs: + until: + - "Q:" + - "\n\n" + - "</s>" + - "." + do_sample: false + temperature: 0.0 +metadata: + version: 2.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_accounting.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_accounting.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d7736e8d5b918f58ffc4dfa19e3e6bd6af898980 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_accounting.yaml @@ -0,0 +1,3 @@ +dataset_name: Accounting +include: _direct_kmmlu_yaml +task: kmmlu_direct_accounting diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_agricultural_sciences.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_agricultural_sciences.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5bf1fa4b56fdc58cd4219164cc90b11f50886bc1 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_agricultural_sciences.yaml @@ -0,0 +1,3 @@ +dataset_name: Agricultural-Sciences +include: 
_direct_kmmlu_yaml +task: kmmlu_direct_agricultural_sciences diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_aviation_engineering_and_maintenance.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_aviation_engineering_and_maintenance.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a9a621931a8f387085557e741fc5c22c9755cb7b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_aviation_engineering_and_maintenance.yaml @@ -0,0 +1,3 @@ +dataset_name: Aviation-Engineering-and-Maintenance +include: _direct_kmmlu_yaml +task: kmmlu_direct_aviation_engineering_and_maintenance diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_biology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_biology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ebe1765b34a3fe774d45869552d0f69e80285896 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_biology.yaml @@ -0,0 +1,3 @@ +dataset_name: Biology +include: _direct_kmmlu_yaml +task: kmmlu_direct_biology diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_chemistry.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_chemistry.yaml new file mode 100644 index 0000000000000000000000000000000000000000..edabfb67dd089798dcc001db737136e55eed0efe --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_chemistry.yaml @@ -0,0 +1,3 @@ +dataset_name: Chemistry +include: _direct_kmmlu_yaml +task: kmmlu_direct_chemistry diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_civil_engineering.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_civil_engineering.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..98ed98dd2cc5f90039d98b74ca0f711809232e14 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_civil_engineering.yaml @@ -0,0 +1,3 @@ +dataset_name: Civil-Engineering +include: _direct_kmmlu_yaml +task: kmmlu_direct_civil_engineering diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_computer_science.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_computer_science.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c546e738d68db7e281b5d70bbf9771bced6c1300 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_computer_science.yaml @@ -0,0 +1,3 @@ +dataset_name: Computer-Science +include: _direct_kmmlu_yaml +task: kmmlu_direct_computer_science diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_construction.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_construction.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a0af2a16cfc082d58903758234ed0e36de0333c9 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_construction.yaml @@ -0,0 +1,3 @@ +dataset_name: Construction +include: _direct_kmmlu_yaml +task: kmmlu_direct_construction diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_criminal_law.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_criminal_law.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9dfdfabc5971164a63fe651c66f4c0842598ef17 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_criminal_law.yaml @@ -0,0 +1,3 @@ +dataset_name: Criminal-Law +include: _direct_kmmlu_yaml +task: kmmlu_direct_criminal_law diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_ecology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_ecology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9d182903e2abe1f3c2b3f5d4cbe955bb1bcf58c9 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_ecology.yaml @@ -0,0 +1,3 @@ +dataset_name: Ecology +include: _direct_kmmlu_yaml +task: kmmlu_direct_ecology diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_economics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_economics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..db4d78405a6079273f8042350fd4f785c9fe4bed --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_economics.yaml @@ -0,0 +1,3 @@ +dataset_name: Economics +include: _direct_kmmlu_yaml +task: kmmlu_direct_economics diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_education.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_education.yaml new file mode 100644 index 0000000000000000000000000000000000000000..74887e76f395c2b8565cd7c716fd410f921f6f1d --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_education.yaml @@ -0,0 +1,3 @@ +dataset_name: Education +include: _direct_kmmlu_yaml +task: kmmlu_direct_education diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_electrical_engineering.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_electrical_engineering.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3455d50715d250762358c9db89f05a0c8eb521c3 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_electrical_engineering.yaml @@ -0,0 +1,3 @@ 
+dataset_name: Electrical-Engineering +include: _direct_kmmlu_yaml +task: kmmlu_direct_electrical_engineering diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_electronics_engineering.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_electronics_engineering.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b45aa3083cb269c964b4beff2c48a9d1cfcc973c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_electronics_engineering.yaml @@ -0,0 +1,3 @@ +dataset_name: Electronics-Engineering +include: _direct_kmmlu_yaml +task: kmmlu_direct_electronics_engineering diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_energy_management.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_energy_management.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b4fb806b3808d2cb47ea68534030b9432e998b74 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_energy_management.yaml @@ -0,0 +1,3 @@ +dataset_name: Energy-Management +include: _direct_kmmlu_yaml +task: kmmlu_direct_energy_management diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_environmental_science.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_environmental_science.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1670ff16bae6d41096f2b9c86f8361455f4c347e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_environmental_science.yaml @@ -0,0 +1,3 @@ +dataset_name: Environmental-Science +include: _direct_kmmlu_yaml +task: kmmlu_direct_environmental_science diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_fashion.yaml 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_fashion.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aef8043aa4605573b074b96b711b6f321d179f44 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_fashion.yaml @@ -0,0 +1,3 @@ +dataset_name: Fashion +include: _direct_kmmlu_yaml +task: kmmlu_direct_fashion diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_food_processing.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_food_processing.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f49b087fc288187a9a3363260a17bda1a68ce9bb --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_food_processing.yaml @@ -0,0 +1,3 @@ +dataset_name: Food-Processing +include: _direct_kmmlu_yaml +task: kmmlu_direct_food_processing diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_gas_technology_and_engineering.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_gas_technology_and_engineering.yaml new file mode 100644 index 0000000000000000000000000000000000000000..00b7021c5c11fb9a0cae1958e2079e41c5854d4c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_gas_technology_and_engineering.yaml @@ -0,0 +1,3 @@ +dataset_name: Gas-Technology-and-Engineering +include: _direct_kmmlu_yaml +task: kmmlu_direct_gas_technology_and_engineering diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_geomatics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_geomatics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5d8dc70db5eabc1af2e29d3c8588dfb04b8dedb1 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_geomatics.yaml @@ -0,0 +1,3 @@ 
+dataset_name: Geomatics +include: _direct_kmmlu_yaml +task: kmmlu_direct_geomatics diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_health.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_health.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3f0d77eb78a61cd2b7b00b80311b59b011abc47e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_health.yaml @@ -0,0 +1,3 @@ +dataset_name: Health +include: _direct_kmmlu_yaml +task: kmmlu_direct_health diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_industrial_engineer.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_industrial_engineer.yaml new file mode 100644 index 0000000000000000000000000000000000000000..39ea0bcf054c6dfef197beef942a16feffca338b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_industrial_engineer.yaml @@ -0,0 +1,3 @@ +dataset_name: Industrial-Engineer +include: _direct_kmmlu_yaml +task: kmmlu_direct_industrial_engineer diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_information_technology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_information_technology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c42e80eda1ad438d65d1d656671d5fb1542018da --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_information_technology.yaml @@ -0,0 +1,3 @@ +dataset_name: Information-Technology +include: _direct_kmmlu_yaml +task: kmmlu_direct_information_technology diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_interior_architecture_and_design.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_interior_architecture_and_design.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..842534aa0a4e87d6aa4bb43b0261b85b7e47676f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_interior_architecture_and_design.yaml @@ -0,0 +1,3 @@ +dataset_name: Interior-Architecture-and-Design +include: _direct_kmmlu_yaml +task: kmmlu_direct_interior_architecture_and_design diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_korean_history.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_korean_history.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f1aa277a70d03a617e673c27bba1cc2d7440d156 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_korean_history.yaml @@ -0,0 +1,3 @@ +dataset_name: Korean-History +include: _direct_kmmlu_yaml +task: kmmlu_direct_korean_history diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_law.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_law.yaml new file mode 100644 index 0000000000000000000000000000000000000000..602f8982f6ca939766cf0d87f0546eef5a4452de --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_law.yaml @@ -0,0 +1,3 @@ +dataset_name: Law +include: _direct_kmmlu_yaml +task: kmmlu_direct_law diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_machine_design_and_manufacturing.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_machine_design_and_manufacturing.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bfb923c2a9ac76515f3796a5a8c73770ed9fc586 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_machine_design_and_manufacturing.yaml @@ -0,0 +1,3 @@ +dataset_name: Machine-Design-and-Manufacturing +include: _direct_kmmlu_yaml +task: 
kmmlu_direct_machine_design_and_manufacturing diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_management.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_management.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7352a1360b2a0cb32a85e88351cccfad62c142d3 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_management.yaml @@ -0,0 +1,3 @@ +dataset_name: Management +include: _direct_kmmlu_yaml +task: kmmlu_direct_management diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_maritime_engineering.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_maritime_engineering.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fa0c8f319f35d3343ec4cd5b3be8247fa8fe3e61 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_maritime_engineering.yaml @@ -0,0 +1,3 @@ +dataset_name: Maritime-Engineering +include: _direct_kmmlu_yaml +task: kmmlu_direct_maritime_engineering diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_marketing.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_marketing.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c3b524d853c19b9943c0e50bf8842632e8971344 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_marketing.yaml @@ -0,0 +1,3 @@ +dataset_name: Marketing +include: _direct_kmmlu_yaml +task: kmmlu_direct_marketing diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_materials_engineering.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_materials_engineering.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f04e0975a0700c13d9e816c5d37981d22d8f1b6c --- /dev/null +++ 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_materials_engineering.yaml @@ -0,0 +1,3 @@ +dataset_name: Materials-Engineering +include: _direct_kmmlu_yaml +task: kmmlu_direct_materials_engineering diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_math.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_math.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6c5d28af05edd5bb5c3c9207930c1994068ce1fe --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_math.yaml @@ -0,0 +1,3 @@ +dataset_name: Math +include: _direct_kmmlu_yaml +task: kmmlu_direct_math diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_mechanical_engineering.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_mechanical_engineering.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a253535adb6c44a8fa8340b106539205cbe6c689 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_mechanical_engineering.yaml @@ -0,0 +1,3 @@ +dataset_name: Mechanical-Engineering +include: _direct_kmmlu_yaml +task: kmmlu_direct_mechanical_engineering diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_nondestructive_testing.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_nondestructive_testing.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3b8dc7e7845394754ede20b72534fe889c7c564f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_nondestructive_testing.yaml @@ -0,0 +1,3 @@ +dataset_name: Nondestructive-Testing +include: _direct_kmmlu_yaml +task: kmmlu_direct_nondestructive_testing diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_patent.yaml 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_patent.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2afff2c373a4e5a201a233de96d71baf6d980937 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_patent.yaml @@ -0,0 +1,3 @@ +dataset_name: Patent +include: _direct_kmmlu_yaml +task: kmmlu_direct_patent diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_political_science_and_sociology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_political_science_and_sociology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2209abbf05d8f78017fdcdc6b4178d5c48a2305a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_political_science_and_sociology.yaml @@ -0,0 +1,3 @@ +dataset_name: Political-Science-and-Sociology +include: _direct_kmmlu_yaml +task: kmmlu_direct_political_science_and_sociology diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_psychology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_psychology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..140302d01f32ab5d0e55cfe01748659536a2262c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_psychology.yaml @@ -0,0 +1,3 @@ +dataset_name: Psychology +include: _direct_kmmlu_yaml +task: kmmlu_direct_psychology diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_public_safety.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_public_safety.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5bb16a90d1f5303b919e8f348b3eb79a9f7cf296 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_public_safety.yaml @@ -0,0 +1,3 @@ 
+dataset_name: Public-Safety +include: _direct_kmmlu_yaml +task: kmmlu_direct_public_safety diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_railway_and_automotive_engineering.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_railway_and_automotive_engineering.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2a13204a23bbb4be1de93fceb697cb37d8319ae6 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_railway_and_automotive_engineering.yaml @@ -0,0 +1,3 @@ +dataset_name: Railway-and-Automotive-Engineering +include: _direct_kmmlu_yaml +task: kmmlu_direct_railway_and_automotive_engineering diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_real_estate.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_real_estate.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5a5202b65d8c9ba693f470e953b22ea3b721e84a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_real_estate.yaml @@ -0,0 +1,3 @@ +dataset_name: Real-Estate +include: _direct_kmmlu_yaml +task: kmmlu_direct_real_estate diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_refrigerating_machinery.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_refrigerating_machinery.yaml new file mode 100644 index 0000000000000000000000000000000000000000..44f9e428bbd8d8c7eb33617a6498d2856a6e1c1a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_refrigerating_machinery.yaml @@ -0,0 +1,3 @@ +dataset_name: Refrigerating-Machinery +include: _direct_kmmlu_yaml +task: kmmlu_direct_refrigerating_machinery diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_social_welfare.yaml 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_social_welfare.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fa13bdff6a4791c8e20fe905a84db0586af11afa --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_social_welfare.yaml @@ -0,0 +1,3 @@ +dataset_name: Social-Welfare +include: _direct_kmmlu_yaml +task: kmmlu_direct_social_welfare diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_taxation.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_taxation.yaml new file mode 100644 index 0000000000000000000000000000000000000000..69e71d6dfa6284cc701221c5c187969be5e92832 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_taxation.yaml @@ -0,0 +1,3 @@ +dataset_name: Taxation +include: _direct_kmmlu_yaml +task: kmmlu_direct_taxation diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_telecommunications_and_wireless_technology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_telecommunications_and_wireless_technology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f4d1fd05c876bf269c0aae1f3590f8801f7e9955 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct/kmmlu_direct_telecommunications_and_wireless_technology.yaml @@ -0,0 +1,3 @@ +dataset_name: Telecommunications-and-Wireless-Technology +include: _direct_kmmlu_yaml +task: kmmlu_direct_telecommunications_and_wireless_technology diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/_direct_hard_kmmlu_yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/_direct_hard_kmmlu_yaml new file mode 100644 index 0000000000000000000000000000000000000000..3cf6359206ba07951a7ac08781f8dd6d3fd1450a --- /dev/null +++ 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/_direct_hard_kmmlu_yaml @@ -0,0 +1,27 @@ +tag: + - kmmlu + - kmmlu_hard_direct +dataset_path: HAERAE-HUB/KMMLU-HARD +output_type: generate_until +test_split: test +fewshot_split: dev +doc_to_text: "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:" +doc_to_target: "{{['A', 'B', 'C', 'D'][answer-1]}}" +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - " " +generation_kwargs: + until: + - "Q:" + - "\n\n" + - "</s>" + - "." + do_sample: false + temperature: 0.0 +metadata: + version: 2.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_aviation_engineering_and_maintenance.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_aviation_engineering_and_maintenance.yaml new file mode 100644 index 0000000000000000000000000000000000000000..25c91cb6e5e55fcc578bd455086b994f1dd51d8c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_aviation_engineering_and_maintenance.yaml @@ -0,0 +1,3 @@ +dataset_name: aviation_engineering_and_maintenance +include: _direct_hard_kmmlu_yaml +task: kmmlu_hard_direct_aviation_engineering_and_maintenance diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_biology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_biology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a7bc8417b030a06bfd2308384525e6a5b4dcacc4 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_biology.yaml @@ -0,0 +1,3 @@ +dataset_name: biology +include: _direct_hard_kmmlu_yaml +task: kmmlu_hard_direct_biology diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_chemistry.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_chemistry.yaml new file mode 100644 index 0000000000000000000000000000000000000000..371db7bfbffb6dfee72baf0482be6d2acea883e4 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_chemistry.yaml @@ -0,0 +1,3 @@ +dataset_name: chemistry +include: _direct_hard_kmmlu_yaml +task: kmmlu_hard_direct_chemistry diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_civil_engineering.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_civil_engineering.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ba2c23b2d1866b4b0dfe71304758e26e94a42a89 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_civil_engineering.yaml @@ -0,0 +1,3 @@ +dataset_name: civil_engineering +include: _direct_hard_kmmlu_yaml +task: kmmlu_hard_direct_civil_engineering diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_computer_science.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_computer_science.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2a388ff474281c525b8e674f204376c16e522641 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_computer_science.yaml @@ -0,0 +1,3 @@ +dataset_name: computer_science +include: _direct_hard_kmmlu_yaml +task: kmmlu_hard_direct_computer_science diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_ecology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_ecology.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..adedf9d6e704a36368249260114aa8a80954a24a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_ecology.yaml @@ -0,0 +1,3 @@ +dataset_name: ecology +include: _direct_hard_kmmlu_yaml +task: kmmlu_hard_direct_ecology diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_economics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_economics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f42e5b8dad2a7f4481dbd7d5e476ccccef222ede --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_economics.yaml @@ -0,0 +1,3 @@ +dataset_name: economics +include: _direct_hard_kmmlu_yaml +task: kmmlu_hard_direct_economics diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_education.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_education.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9c90432fe26075d1c14f84f5765f8e3198deb2ed --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_education.yaml @@ -0,0 +1,3 @@ +dataset_name: education +include: _direct_hard_kmmlu_yaml +task: kmmlu_hard_direct_education diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_electronics_engineering.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_electronics_engineering.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e01781549fd0bf1982b895ba2041c3d6f9ec9644 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_electronics_engineering.yaml @@ -0,0 +1,3 @@ +dataset_name: electronics_engineering +include: _direct_hard_kmmlu_yaml +task: 
kmmlu_hard_direct_electronics_engineering diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_environmental_science.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_environmental_science.yaml new file mode 100644 index 0000000000000000000000000000000000000000..de511a09f02c411dedba2ac816a34c11b6805caa --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_environmental_science.yaml @@ -0,0 +1,3 @@ +dataset_name: environmental_science +include: _direct_hard_kmmlu_yaml +task: kmmlu_hard_direct_environmental_science diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_food_processing.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_food_processing.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e48143d2c3bc7a69db87ac5d68f4a8951c1d391d --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_food_processing.yaml @@ -0,0 +1,3 @@ +dataset_name: food_processing +include: _direct_hard_kmmlu_yaml +task: kmmlu_hard_direct_food_processing diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_gas_technology_and_engineering.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_gas_technology_and_engineering.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eb5211ad857bfe99cc41062f21b8c47d008c3c64 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_gas_technology_and_engineering.yaml @@ -0,0 +1,3 @@ +dataset_name: gas_technology_and_engineering +include: _direct_hard_kmmlu_yaml +task: kmmlu_hard_direct_gas_technology_and_engineering diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_geomatics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_geomatics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a25f3c1a7eefe75cd11ce6d45f62ab898f30922b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_geomatics.yaml @@ -0,0 +1,3 @@ +dataset_name: geomatics +include: _direct_hard_kmmlu_yaml +task: kmmlu_hard_direct_geomatics diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_industrial_engineer.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_industrial_engineer.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d7ca26e58ac90c69cb2bffcf7a4d95657b019019 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_industrial_engineer.yaml @@ -0,0 +1,3 @@ +dataset_name: industrial_engineer +include: _direct_hard_kmmlu_yaml +task: kmmlu_hard_direct_industrial_engineer diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_interior_architecture_and_design.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_interior_architecture_and_design.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3b1303810a9fbee6d966095fabbcc773dc489e71 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_interior_architecture_and_design.yaml @@ -0,0 +1,3 @@ +dataset_name: interior_architecture_and_design +include: _direct_hard_kmmlu_yaml +task: kmmlu_hard_direct_interior_architecture_and_design diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_law.yaml 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_law.yaml new file mode 100644 index 0000000000000000000000000000000000000000..168f0340590d9736548eaeb56335e734d756fdac --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_law.yaml @@ -0,0 +1,3 @@ +dataset_name: law +include: _direct_hard_kmmlu_yaml +task: kmmlu_hard_direct_law diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_management.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_management.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6eb945d27e69a636cea53c1c8ba9a35c569fe7f5 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_management.yaml @@ -0,0 +1,3 @@ +dataset_name: management +include: _direct_hard_kmmlu_yaml +task: kmmlu_hard_direct_management diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_maritime_engineering.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_maritime_engineering.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4078cf973b90f3e03ac88a7670b3344a159fef2e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_maritime_engineering.yaml @@ -0,0 +1,3 @@ +dataset_name: maritime_engineering +include: _direct_hard_kmmlu_yaml +task: kmmlu_hard_direct_maritime_engineering diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_math.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_math.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f5f3373a8aee37d793e49693b53a5c6bd514cb78 --- /dev/null +++ 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_math.yaml @@ -0,0 +1,3 @@ +dataset_name: math +include: _direct_hard_kmmlu_yaml +task: kmmlu_hard_direct_math diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_nondestructive_testing.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_nondestructive_testing.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3ff9583743953fde9d681a9d4c4655b72d7c7e3c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_nondestructive_testing.yaml @@ -0,0 +1,3 @@ +dataset_name: nondestructive_testing +include: _direct_hard_kmmlu_yaml +task: kmmlu_hard_direct_nondestructive_testing diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_patent.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_patent.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d913752b0bb3f9cfd0c47eb8919f4beb6e921adb --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_patent.yaml @@ -0,0 +1,3 @@ +dataset_name: patent +include: _direct_hard_kmmlu_yaml +task: kmmlu_hard_direct_patent diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_political_science_and_sociology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_political_science_and_sociology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8a5d96b6000a27ff3631fbf4c42b89ea3a41fc9a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_political_science_and_sociology.yaml @@ -0,0 +1,3 @@ +dataset_name: political_science_and_sociology +include: _direct_hard_kmmlu_yaml +task: 
kmmlu_hard_direct_political_science_and_sociology diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_psychology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_psychology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9fbf0d3191e885cd1486caf148d1c723ea142ee2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_psychology.yaml @@ -0,0 +1,3 @@ +dataset_name: psychology +include: _direct_hard_kmmlu_yaml +task: kmmlu_hard_direct_psychology diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_public_safety.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_public_safety.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b376c4ebae7574364b1157afd65938237eeca209 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_public_safety.yaml @@ -0,0 +1,3 @@ +dataset_name: public_safety +include: _direct_hard_kmmlu_yaml +task: kmmlu_hard_direct_public_safety diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_railway_and_automotive_engineering.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_railway_and_automotive_engineering.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0eb534e579c125e2e9951443649a5fbc084da47f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_railway_and_automotive_engineering.yaml @@ -0,0 +1,3 @@ +dataset_name: railway_and_automotive_engineering +include: _direct_hard_kmmlu_yaml +task: kmmlu_hard_direct_railway_and_automotive_engineering diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_real_estate.yaml 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_real_estate.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9c3df599ee0bae86ec979fabd1b3b118c3034c08 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_real_estate.yaml @@ -0,0 +1,3 @@ +dataset_name: real_estate +include: _direct_hard_kmmlu_yaml +task: kmmlu_hard_direct_real_estate diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_refrigerating_machinery.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_refrigerating_machinery.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f62e8e9559fb0f0cb8795afd7027093b65d822f1 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_refrigerating_machinery.yaml @@ -0,0 +1,3 @@ +dataset_name: refrigerating_machinery +include: _direct_hard_kmmlu_yaml +task: kmmlu_hard_direct_refrigerating_machinery diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_taxation.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_taxation.yaml new file mode 100644 index 0000000000000000000000000000000000000000..445ab693d6a3064ea35a169d2d7327f6f0942687 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_taxation.yaml @@ -0,0 +1,3 @@ +dataset_name: taxation +include: _direct_hard_kmmlu_yaml +task: kmmlu_hard_direct_taxation diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_telecommunications_and_wireless_technology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_telecommunications_and_wireless_technology.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..498b2fb2d661089325953ea8de407e08fb9d4934 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_telecommunications_and_wireless_technology.yaml @@ -0,0 +1,3 @@ +dataset_name: telecommunications_and_wireless_technology +include: _direct_hard_kmmlu_yaml +task: kmmlu_hard_direct_telecommunications_and_wireless_technology