File size: 1,674 Bytes
a908f55 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
import argparse
import json
from pathlib import Path
from opencompass.datasets.chatml.verification import VerifyDataset
def parse_args():
    """Parse the command-line arguments of the ChatML format checker.

    Returns:
        argparse.Namespace: namespace whose ``path`` attribute holds the
        value given to ``--path``.
    """
    parser = argparse.ArgumentParser(
        description='Utils to check the format of your ChatML dataset files')
    # Without a path there is nothing to check; requiring it lets argparse
    # print a usage error instead of handing ``None`` to the caller.
    parser.add_argument('--path',
                        type=str,
                        required=True,
                        help='your dataset file path or category path.')
    return parser.parse_args()
def collect_file_paths(path):
    """Collect the ``.jsonl`` files designated by *path*.

    Args:
        path: a single file, or a directory that is searched recursively.

    Returns:
        A sorted list of path strings ending in ``.jsonl``. The list is
        empty when *path* does not exist (a warning is printed in that
        case), when *path* is a file without the ``.jsonl`` suffix, or when
        the directory contains no such file.
    """
    path_obj = Path(path)

    if not path_obj.exists():
        print(f"警告: 路径 '{path}' 不存在")
        return []

    if path_obj.is_file():
        candidates = [path_obj]
    elif path_obj.is_dir():
        candidates = (p for p in path_obj.rglob('*') if p.is_file())
    else:
        candidates = []

    # rglob yields entries in filesystem order, which differs between runs
    # and platforms; sorting keeps the check order reproducible.
    return sorted(str(p) for p in candidates if str(p).endswith('.jsonl'))
def _load_records(path):
    """Parse a JSONL file into a list of decoded records.

    Blank lines are skipped so that a trailing newline or an empty separator
    line does not abort the check.

    Raises:
        ValueError: if a non-blank line is not valid JSON; the message names
            the offending line number within the file.
    """
    records = []
    with open(path, 'r', encoding='utf-8-sig') as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                raise ValueError(f'line {line_no}: invalid JSON ({e})') from e
    return records


def main():
    """Check every ``.jsonl`` file found under ``--path``.

    Each record is reduced to its ``question`` and ``answer`` keys and passed
    to ``VerifyDataset``. The first failure in a file is printed and the
    remaining records of that file are skipped; checking then continues with
    the next file.
    """
    args = parse_args()
    all_check_files = collect_file_paths(args.path)

    for path in all_check_files:
        print(f': checking file {path} ...')

        try:
            records = _load_records(path)
        except ValueError as e:
            # A malformed file is a failed check, not a reason to abort the
            # whole run.
            print(f': check failed. {e}')
            continue

        for record in records:
            if not isinstance(record, dict):
                print(': check failed. expected a JSON object, '
                      f'got {type(record).__name__}')
                break
            # Only these two keys are handed to the verifier; any other key
            # in the record is ignored.
            fields = {
                key: value
                for key, value in record.items()
                if key in ('question', 'answer')
            }
            try:
                VerifyDataset(**fields)
            except Exception as e:
                print(f': check failed. {e}')
                break

    print('format check finished!')
# Run the format check only when this file is executed as a script.
if __name__ == '__main__':
    main()
|