File size: 1,674 Bytes
a908f55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import argparse
import json
from pathlib import Path

from opencompass.datasets.chatml.verification import VerifyDataset


def parse_args():
    """Parse the command-line arguments of the ChatML format checker.

    Returns:
        argparse.Namespace: Namespace whose ``path`` attribute holds the
        value given to ``--path``.

    Raises:
        SystemExit: Raised by argparse (exit status 2) when ``--path`` is
            missing or the arguments are otherwise invalid.
    """
    parser = argparse.ArgumentParser(
        description='Utils to check the format of your ChatML dataset files')
    # ``--path`` is mandatory: without it ``path`` would be None and the
    # later ``Path(path)`` call would fail with an unhelpful TypeError
    # instead of a usage message.
    parser.add_argument('--path',
                        type=str,
                        required=True,
                        help='your dataset file path or category path.')
    return parser.parse_args()


def collect_file_paths(path):
    """Collect the ``.jsonl`` files designated by ``path``.

    Args:
        path: A file path or a directory path. A directory is searched
            recursively.

    Returns:
        list[str]: Paths (as strings) of the regular files whose name ends
        with ``.jsonl``. Empty when ``path`` does not exist (a warning is
        printed in that case) or when nothing matches.
    """
    root = Path(path)

    # Missing path: warn ("Warning: path ... does not exist") and give up.
    if not root.exists():
        print(f"警告: 路径 '{path}' 不存在")
        return []

    if root.is_file():
        candidates = [str(root)]
    elif root.is_dir():
        candidates = [
            str(entry) for entry in root.rglob('*') if entry.is_file()
        ]
    else:
        # Exists but is neither a regular file nor a directory.
        candidates = []

    # Keep only JSON Lines files.
    return [name for name in candidates if name.endswith('.jsonl')]


def main():
    """Check every ``.jsonl`` file under ``--path`` record by record.

    Each non-blank line is parsed as JSON, reduced to its ``question`` and
    ``answer`` keys, and passed to ``VerifyDataset``. Checking of a file
    stops at its first failing line; the remaining files are still checked.
    Failures are printed rather than raised.
    """
    args = parse_args()
    for path in collect_file_paths(args.path):
        print(f': checking file {path} ...')
        with open(path, 'r', encoding='utf-8-sig') as f:
            for line_no, line in enumerate(f, start=1):
                # Blank lines (e.g. a trailing newline) are not records and
                # must not be fed to the JSON parser.
                if not line.strip():
                    continue

                # A line that is not valid JSON is a format failure of the
                # file, so report it instead of aborting the whole run.
                try:
                    record = json.loads(line)
                except json.JSONDecodeError as e:
                    print(': check failed. '
                          f'invalid JSON on line {line_no}: {e}')
                    break

                if not isinstance(record, dict):
                    print(': check failed. '
                          f'line {line_no} is not a JSON object')
                    break

                # Only these two fields are validated; other keys are dropped.
                fields = {
                    key: value
                    for key, value in record.items()
                    if key in ('question', 'answer')
                }
                try:
                    VerifyDataset(**fields)
                except Exception as e:
                    print(f': check failed. {e}')
                    break

    print('format check finished!')


# Run the checker only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()