msj19
/

opencompass

Model card Files Files and versions

opencompass / tools /chatml_format_test.py

msj19's picture

Add files using upload-large-folder tool

a908f55 verified 19 days ago

history blame contribute delete

1.67 kB

	import argparse
	import json
	from pathlib import Path

	from opencompass.datasets.chatml.verification import VerifyDataset


	def parse_args():
	    """Parse the command-line arguments of the ChatML format checker.

	    Returns:
	        argparse.Namespace: namespace whose ``path`` attribute holds the
	        string given to ``--path``.

	    Raises:
	        SystemExit: raised by argparse (after printing a usage message)
	            when ``--path`` is missing or the arguments are malformed.
	    """
	    parser = argparse.ArgumentParser(
	        description='Utils to check the format of your ChatML dataset files')
	    # The option is mandatory: without it ``path`` would default to None
	    # and the failure would only surface later as an unrelated TypeError.
	    parser.add_argument('--path',
	                        type=str,
	                        required=True,
	                        help='your dataset file path or category path.')
	    return parser.parse_args()


	def collect_file_paths(path):
	    """Return the ``.jsonl`` files designated by ``path`` as strings.

	    Args:
	        path: a file path or a directory path. A directory is searched
	            recursively.

	    Returns:
	        list[str]: the path itself when it is a file ending in
	        ``.jsonl``, or every file ending in ``.jsonl`` found anywhere
	        under the directory. Empty when nothing matches or when the
	        path does not exist (a warning is printed in that case).
	    """
	    root = Path(path)

	    # Guard clause: a missing path is reported, not raised.
	    if not root.exists():
	        print(f"警告: 路径 '{path}' 不存在")
	        return []

	    if root.is_file():
	        candidates = [str(root)]
	    elif root.is_dir():
	        candidates = [
	            str(entry) for entry in root.rglob('*') if entry.is_file()
	        ]
	    else:
	        candidates = []

	    # Only JSON Lines files are of interest; the suffix test is
	    # case-sensitive.
	    return [name for name in candidates if name.endswith('.jsonl')]


	def main():
	    """Check every ``.jsonl`` file designated by ``--path``.

	    Each file is read line by line. Every line is decoded as JSON,
	    reduced to its ``question`` and ``answer`` keys, and passed to
	    ``VerifyDataset`` as keyword arguments. Checking of a file stops at
	    its first failing line; the remaining files are still checked.

	    A line that is not valid JSON, or that does not decode to a JSON
	    object, is reported as a check failure instead of aborting the whole
	    run with a traceback.
	    """
	    args = parse_args()
	    for path in collect_file_paths(args.path):
	        print(f': checking file {path} ...')
	        # 'utf-8-sig' transparently drops a leading byte-order mark, which
	        # would otherwise make the first line undecodable as JSON.
	        with open(path, 'r', encoding='utf-8-sig') as f:
	            for line_no, line in enumerate(f, start=1):
	                try:
	                    record = json.loads(line)
	                except json.JSONDecodeError as e:
	                    print(f': check failed. '
	                          f'invalid JSON on line {line_no}: {e}')
	                    break

	                # Only a JSON object can be expanded into keyword
	                # arguments below.
	                if not isinstance(record, dict):
	                    print(f': check failed. '
	                          f'line {line_no} is not a JSON object')
	                    break

	                # Every key other than these two is ignored by the check.
	                fields = {
	                    key: value
	                    for key, value in record.items()
	                    if key in ('question', 'answer')
	                }
	                try:
	                    VerifyDataset(**fields)
	                except Exception as e:
	                    # Broad on purpose: the exception types raised by
	                    # VerifyDataset are not visible from this file, and
	                    # any failure means the record is rejected.
	                    print(f': check failed. {e}')
	                    break

	    print('format check finished!')


	# Run the checker only when this file is executed as a script, so that
	# importing it has no side effects.
	if __name__ == '__main__':
	    main()