File size: 1,251 Bytes
e791fa3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import json
import sys
import os
from tqdm import tqdm

def load_jsonl_set(path):
    """Load a JSONL file into a dict keyed by each record's ``audios`` field.

    Every non-blank line is parsed as JSON. The key is ``tuple(audios)`` so it
    is hashable; a record without an ``audios`` field gets the empty tuple
    ``()`` as its key. When several records share a key, the last one read
    wins.

    Lines that cannot be used are skipped rather than aborting the load:
    invalid JSON, JSON values that are not objects, and ``audios`` values that
    cannot form a hashable tuple (e.g. ``null``, a number, or a list
    containing lists/objects). The number of skipped lines is reported on
    stderr so dropped records do not go unnoticed.

    Args:
        path: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        dict mapping ``tuple`` keys built from ``audios`` to the parsed record.
    """
    data_dict = {}
    skipped = 0
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines are padding, not malformed records.
                continue
            try:
                item = json.loads(line)
            except (ValueError, RecursionError):
                # json.JSONDecodeError is a ValueError subclass.
                skipped += 1
                continue
            if not isinstance(item, dict):
                skipped += 1
                continue
            audios = item.get('audios', [])
            if isinstance(audios, str):
                # tuple("a.wav") would split the path into single characters;
                # treat a bare string as one audio path instead.
                audios = [audios]
            try:
                data_dict[tuple(audios)] = item
            except TypeError:
                # audios is not iterable, or holds unhashable elements.
                skipped += 1
    if skipped:
        print(f"警告: {path} 中有 {skipped} 行无法解析或audios字段无效,已跳过",
              file=sys.stderr)
    return data_dict

def main(file1, file2, output_path):
    """Write the records of ``file1`` whose ``audios`` key also occurs in ``file2``.

    Both inputs are loaded with ``load_jsonl_set``. For every key present in
    both mappings, the record from ``file1`` is written to ``output_path`` as
    one JSON object per line (UTF-8, non-ASCII characters kept as-is). The
    number of matching records and the output location are printed to stdout.

    Args:
        file1: Path of the JSONL file whose records are written out.
        file2: Path of the JSONL file used only to decide which keys to keep.
        output_path: Path of the JSONL file to create or overwrite.
    """
    dict1 = load_jsonl_set(file1)
    dict2 = load_jsonl_set(file2)

    # Walk dict1's keys (dicts preserve insertion order) instead of iterating
    # a set intersection: set order depends on string hash randomization, so
    # the output line order would otherwise differ from run to run.
    common_keys = [key for key in dict1 if key in dict2]
    print(f"交集样本数: {len(common_keys)}")

    with open(output_path, 'w', encoding='utf-8') as out:
        for key in tqdm(common_keys, desc="写入交集"):
            # The record content always comes from file1.
            out.write(json.dumps(dict1[key], ensure_ascii=False) + '\n')

    print(f"交集已保存到: {output_path}")

if __name__ == "__main__":
    # Exactly three positional arguments are required after the script name:
    # the two input JSONL files and the output path.
    cli_args = sys.argv[1:]
    if len(cli_args) != 3:
        print("用法: python intersect_jsonl.py file1.jsonl file2.jsonl output.jsonl")
        sys.exit(1)
    first_path, second_path, destination = cli_args
    main(first_path, second_path, destination)