# Source: opencompass/tools/chatml_format_test.py
# Upload metadata: msj19 - "Add files using upload-large-folder tool" (a908f55, verified)
import argparse
import json
from pathlib import Path
from opencompass.datasets.chatml.verification import VerifyDataset
def parse_args():
    """Parse the command-line arguments of the ChatML format checker.

    Returns:
        argparse.Namespace: Namespace whose ``path`` attribute holds the
        dataset file or directory passed via ``--path``.

    Raises:
        SystemExit: Raised by argparse (exit status 2) when ``--path`` is
            missing or the arguments are otherwise invalid.
    """
    parser = argparse.ArgumentParser(
        description='Utils to check the format of your ChatML dataset files')
    # Required: without it ``path`` would be None and building a
    # ``pathlib.Path`` from it later fails with an unhelpful TypeError.
    parser.add_argument('--path',
                        type=str,
                        required=True,
                        help='your dataset file path or category path.')
    return parser.parse_args()
def collect_file_paths(path):
    """Gather the ``.jsonl`` files designated by ``path``.

    Args:
        path: A file path or a directory path. Directories are searched
            recursively.

    Returns:
        list[str]: String paths of every regular file whose name ends with
        ``.jsonl``. Empty when ``path`` does not exist (a warning is
        printed in that case) or when nothing matches.
    """
    root = Path(path)

    # Guard clause: a missing path only produces a warning, never an error.
    if not root.exists():
        print(f"警告: 路径 '{path}' 不存在")
        return []

    if root.is_file():
        candidates = [root]
    elif root.is_dir():
        # Walk the whole tree, keeping regular files only.
        candidates = (entry for entry in root.rglob('*') if entry.is_file())
    else:
        # Exists but is neither a regular file nor a directory.
        candidates = []

    # The suffix filter applies to single files and directory hits alike.
    return [name for name in map(str, candidates) if name.endswith('.jsonl')]
def main():
    """Check every ``.jsonl`` file under ``--path`` against ``VerifyDataset``.

    Each file is read line by line. Every line is parsed as JSON, reduced to
    its ``question`` and ``answer`` keys, and passed to ``VerifyDataset`` as
    keyword arguments. The first line of a file that fails any of these
    steps is reported and the rest of that file is skipped; checking then
    continues with the next file.
    """
    args = parse_args()
    # Only these keys are forwarded to the verifier; all others are dropped.
    kept_keys = ('question', 'answer')

    for path in collect_file_paths(args.path):
        print(f': checking file {path} ...')
        # 'utf-8-sig' also accepts files that start with a UTF-8 BOM.
        with open(path, 'r', encoding='utf-8-sig') as f:
            for line in f:
                try:
                    # Parsing and filtering live inside the try so that a
                    # malformed, blank or non-object line is reported as a
                    # failed check instead of aborting the run.
                    sample = json.loads(line)
                    fields = {
                        key: value
                        for key, value in sample.items()
                        if key in kept_keys
                    }
                    # Presumably raises when the sample is not valid ChatML;
                    # VerifyDataset is defined outside this file.
                    VerifyDataset(**fields)
                except Exception as e:
                    print(f': check failed. {e}')
                    # Stop at the first bad sample of this file.
                    break
    print('format check finished!')
# Run the checker only when executed as a script, not when imported.
if __name__ == '__main__':
    main()