"""Check that ChatML dataset files (``.jsonl``) follow the expected format.

Every record of every ``.jsonl`` file found under ``--path`` is reduced to
its ``question`` / ``answer`` keys and handed to ``VerifyDataset``; the
first failing record of a file is reported and the tool moves on to the
next file.
"""
import argparse
import json
from pathlib import Path

from opencompass.datasets.chatml.verification import VerifyDataset

# Only these top-level keys are passed on to ``VerifyDataset``.
_CHECKED_KEYS = ('question', 'answer')


def parse_args():
    """Parse the command-line arguments.

    Returns:
        argparse.Namespace: namespace with a single ``path`` attribute.
    """
    parser = argparse.ArgumentParser(
        description='Utils to check the format of your ChatML dataset files')
    # ``required=True``: without a path the tool used to crash later with
    # a TypeError from ``Path(None)`` instead of a usage message.
    parser.add_argument('--path',
                        type=str,
                        required=True,
                        help='your dataset file path or category path.')
    return parser.parse_args()


def collect_file_paths(path):
    """Collect the ``.jsonl`` files designated by ``path``.

    Args:
        path: a single file, or a directory that is searched recursively.

    Returns:
        list: string paths ending in ``.jsonl``. Empty when ``path`` does
        not exist (a warning is printed) or nothing matches.
    """
    path_obj = Path(path)
    if not path_obj.exists():
        print(f"警告: 路径 '{path}' 不存在")
        return []

    if path_obj.is_file():
        candidates = [path_obj]
    elif path_obj.is_dir():
        candidates = [p for p in path_obj.rglob('*') if p.is_file()]
    else:
        candidates = []

    return [str(p) for p in candidates if str(p).endswith('.jsonl')]


def _load_records(path):
    """Read one JSONL file and return its records restricted to the checked keys.

    Blank lines are skipped. The file is opened with ``utf-8-sig`` so a
    leading byte-order mark is tolerated.

    Raises:
        ValueError: if a line is not valid JSON or is not a JSON object;
            the message carries the 1-based line number.
    """
    records = []
    with open(path, 'r', encoding='utf-8-sig') as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                # Blank / trailing empty lines are not records.
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                raise ValueError(f'line {line_no}: invalid JSON ({e})') from e
            if not isinstance(record, dict):
                raise ValueError(
                    f'line {line_no}: expected a JSON object, '
                    f'got {type(record).__name__}')
            records.append(
                {k: v for k, v in record.items() if k in _CHECKED_KEYS})
    return records


def main():
    """Run the format check on every ``.jsonl`` file under ``--path``."""
    args = parse_args()

    for path in collect_file_paths(args.path):
        print(f': checking file {path} ...')

        try:
            records = _load_records(path)
        except ValueError as e:
            # Unparseable content is a format failure, not a tool crash.
            print(f': check failed. {e}')
            continue

        for record in records:
            try:
                VerifyDataset(**record)
            except Exception as e:
                # Report only the first failing record of this file.
                print(f': check failed. {e}')
                break

    print('format check finished!')


if __name__ == '__main__':
    main()