Training in progress, step 1000

cd63278 verified 10 months ago

5.04 kB

	# Copyright 2025 the LlamaFactory team.
	# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import os

	import datasets
	import pandas as pd


	# BibTeX entry for the C-Eval paper; supplied to `datasets.DatasetInfo` as `citation`.
	_CITATION = """\
	@article{huang2023ceval,
	title={C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models},
	author={Huang, Yuzhen and Bai, Yuzhuo and Zhu, Zhihao and others},
	journal={arXiv preprint arXiv:2305.08322},
	year={2023}
	}
	"""

	# Human-readable summary; supplied to `datasets.DatasetInfo` as `description`.
	_DESCRIPTION = """\
	C-Eval is a comprehensive Chinese evaluation suite for foundation models.
	It consists of 13948 multi-choice questions spanning 52 diverse disciplines and four difficulty levels.
	"""

	# Project home page; supplied to `datasets.DatasetInfo` as `homepage`.
	_HOMEPAGE = "https://cevalbenchmark.com"

	# License name; supplied to `datasets.DatasetInfo` as `license`.
	_LICENSE = "Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License"

	# Archive handed to `dl_manager.download_and_extract` in `Ceval._split_generators`.
	# NOTE(review): the path is relative — presumably resolved against this script's
	# location by the download manager; confirm.
	_URL = "ceval.zip"

	# Subject identifiers. `Ceval.BUILDER_CONFIGS` creates one `CevalConfig` per entry,
	# and `Ceval._split_generators` interpolates the selected entry into the CSV file
	# names (`<name>_test.csv`, `<name>_val.csv`, `<name>_dev.csv`), so each string
	# here must match the file names inside the archive exactly.
	task_list = [
	"computer_network",
	"operating_system",
	"computer_architecture",
	"college_programming",
	"college_physics",
	"college_chemistry",
	"advanced_mathematics",
	"probability_and_statistics",
	"discrete_mathematics",
	"electrical_engineer",
	"metrology_engineer",
	"high_school_mathematics",
	"high_school_physics",
	"high_school_chemistry",
	"high_school_biology",
	"middle_school_mathematics",
	"middle_school_biology",
	"middle_school_physics",
	"middle_school_chemistry",
	"veterinary_medicine",
	"college_economics",
	"business_administration",
	"marxism",
	"mao_zedong_thought",
	"education_science",
	"teacher_qualification",
	"high_school_politics",
	"high_school_geography",
	"middle_school_politics",
	"middle_school_geography",
	"modern_chinese_history",
	"ideological_and_moral_cultivation",
	"logic",
	"law",
	"chinese_language_and_literature",
	"art_studies",
	"professional_tour_guide",
	"legal_professional",
	"high_school_chinese",
	"high_school_history",
	"middle_school_history",
	"civil_servant",
	"sports_science",
	"plant_protection",
	"basic_medicine",
	"clinical_medicine",
	"urban_and_rural_planner",
	"accountant",
	"fire_engineer",
	"environmental_impact_assessment_engineer",
	"tax_accountant",
	"physician",
	]


	class CevalConfig(datasets.BuilderConfig):
	    """Builder configuration for one C-Eval subject, pinned to version 1.0.0."""

	    def __init__(self, **kwargs):
	        """Forward ``kwargs`` to ``datasets.BuilderConfig`` with a fixed version.

	        Args:
	            **kwargs: Keyword arguments passed through to the parent
	                constructor (for example ``name``). Supplying ``version``
	                here raises ``TypeError`` because the version is already
	                passed explicitly.
	        """
	        pinned_version = datasets.Version("1.0.0")
	        super().__init__(version=pinned_version, **kwargs)


	class Ceval(datasets.GeneratorBasedBuilder):
	    """Dataset builder for C-Eval with one configuration per subject.

	    For the selected configuration, three CSV files are read from the
	    extracted archive: ``test/<name>_test.csv``, ``val/<name>_val.csv`` and
	    ``dev/<name>_dev.csv``.
	    """

	    # One configuration per subject listed in ``task_list``.
	    BUILDER_CONFIGS = [CevalConfig(name=subject) for subject in task_list]

	    def _info(self):
	        """Return the dataset metadata and column schema.

	        Returns:
	            A ``datasets.DatasetInfo`` whose features are an ``int32`` ``id``
	            followed by the string columns ``question``, ``A``-``D``,
	            ``answer`` and ``explanation``.
	        """
	        # Every column other than the integer ``id`` is declared as a string.
	        text_columns = ("question", "A", "B", "C", "D", "answer", "explanation")
	        schema = {"id": datasets.Value("int32")}
	        schema.update({column: datasets.Value("string") for column in text_columns})
	        return datasets.DatasetInfo(
	            description=_DESCRIPTION,
	            features=datasets.Features(schema),
	            homepage=_HOMEPAGE,
	            license=_LICENSE,
	            citation=_CITATION,
	        )

	    def _split_generators(self, dl_manager):
	        """Download the archive and declare the test, validation and train splits.

	        Args:
	            dl_manager: Download manager whose ``download_and_extract`` is
	                called with ``_URL``; its return value is used as the root
	                directory of the extracted files.

	        Returns:
	            A list of three ``datasets.SplitGenerator`` objects, in the order
	            TEST, VALIDATION, TRAIN, each carrying the CSV path for the
	            configured subject in ``gen_kwargs["filepath"]``.
	        """
	        extracted_root = dl_manager.download_and_extract(_URL)
	        task_name = self.config.name
	        # (split, sub-directory, file name); the TRAIN split is read from "dev".
	        layout = (
	            (datasets.Split.TEST, "test", f"{task_name}_test.csv"),
	            (datasets.Split.VALIDATION, "val", f"{task_name}_val.csv"),
	            (datasets.Split.TRAIN, "dev", f"{task_name}_dev.csv"),
	        )
	        return [
	            datasets.SplitGenerator(
	                name=split,
	                gen_kwargs={"filepath": os.path.join(extracted_root, folder, filename)},
	            )
	            for split, folder, filename in layout
	        ]

	    def _generate_examples(self, filepath):
	        """Yield ``(index, record)`` pairs for every row of the CSV at ``filepath``.

	        Args:
	            filepath: Path of the UTF-8 encoded CSV file to read.

	        Yields:
	            Tuples of the zero-based row position and the row as a dict keyed
	            by column name. ``answer`` and ``explanation`` are set to ``""``
	            when the file has no such column.
	        """
	        # NOTE(review): read_csv uses pandas' default NA parsing, so empty cells
	        # and tokens such as "NA" or "null" become NaN rather than strings —
	        # confirm this is acceptable for option/answer text.
	        frame = pd.read_csv(filepath, encoding="utf-8")
	        for index, record in enumerate(frame.to_dict(orient="records")):
	            # Presumably some splits ship without labels or explanations
	            # (TODO confirm); fill the gaps so every record has all columns.
	            record.setdefault("answer", "")
	            record.setdefault("explanation", "")
	            yield index, record