File size: 1,251 Bytes
e791fa3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
import json
import sys
import os
from tqdm import tqdm
def load_jsonl_set(path):
    """Load a JSONL file into a dict keyed by each record's ``audios`` field.

    Every non-blank line is parsed as one JSON value. The key of a record is
    ``tuple(record.get('audios', []))`` so that it is hashable and can be
    compared across files.

    Lines that cannot be used are skipped rather than aborting the load:
    invalid JSON, a JSON value that is not an object, or an ``audios`` value
    that cannot be turned into a hashable tuple. The number of skipped lines
    is reported once on stderr so data loss is visible.

    Args:
        path: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        A dict mapping ``tuple(audios)`` to the parsed record. When several
        lines produce the same key, the last one wins. Records without an
        ``audios`` field all share the empty-tuple key ``()``.
    """
    data_dict = {}
    skipped = 0
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not data errors.
                continue
            try:
                item = json.loads(line)
                # A tuple of the audios list is used as the key: it is
                # hashable and identifies the sample.
                key = tuple(item.get('audios', []))
                data_dict[key] = item
            except (ValueError, AttributeError, TypeError):
                # ValueError: malformed JSON (JSONDecodeError subclasses it).
                # AttributeError: the JSON value is not an object (no .get).
                # TypeError: audios is not iterable or holds unhashable items.
                skipped += 1
    if skipped:
        print(f"警告: {path} 中有 {skipped} 行无法解析, 已跳过", file=sys.stderr)
    return data_dict
def main(file1, file2, output_path):
    """Write the records of ``file1`` whose ``audios`` key also occurs in ``file2``.

    Both inputs are loaded with ``load_jsonl_set``. For every key present in
    both, the record from ``file1`` is written to ``output_path`` as one JSON
    line (UTF-8, non-ASCII characters kept as-is).

    Output lines follow the order in which each key first appears in
    ``file1``, so repeated runs on the same inputs produce identical files.

    Args:
        file1: Path of the JSONL file whose records are written out.
        file2: Path of the JSONL file used only to decide which keys to keep.
        output_path: Path of the JSONL file to create or overwrite.
    """
    dict1 = load_jsonl_set(file1)
    dict2 = load_jsonl_set(file2)
    # Intersection, taken by walking dict1 in insertion order. Iterating a
    # set of string tuples instead would make the output order vary between
    # runs because of string hash randomisation.
    common_keys = [key for key in dict1 if key in dict2]
    print(f"交集样本数: {len(common_keys)}")
    with open(output_path, 'w', encoding='utf-8') as out:
        for key in tqdm(common_keys, desc="写入交集"):
            # The content of file1 is authoritative for shared keys.
            out.write(json.dumps(dict1[key], ensure_ascii=False) + '\n')
    print(f"交集已保存到: {output_path}")
if __name__ == "__main__":
    # Exactly three positional arguments are required: the two input files
    # and the output file.
    cli_args = sys.argv[1:]
    if len(cli_args) != 3:
        print("用法: python intersect_jsonl.py file1.jsonl file2.jsonl output.jsonl")
        sys.exit(1)
    first_path, second_path, result_path = cli_args
    main(first_path, second_path, result_path)
|