# File size: 1,292 Bytes
# e791fa3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import json
import tarfile
import os

# Input JSONL file path.
jsonl_path = "dataset_4JOB.jsonl"
# Output tar file name.
output_tar_path = "4JOB_train.tar"
# Set to True to keep the original path structure inside the archive;
# False keeps only the file name.
keep_directory_structure = False

def collect_audio_paths(jsonl_path):
    """Return the unique audio paths listed in a JSONL file.

    Each non-blank line is parsed as a JSON object and the entries of its
    ``audios`` value are collected. Records without an ``audios`` key, or
    with ``"audios": null``, contribute nothing.

    Args:
        jsonl_path: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        A list of the distinct audio paths, in order of first appearance.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # A dict preserves insertion order, so de-duplicating through its keys
    # yields the same list on every run (a set's iteration order for strings
    # changes with hash randomization).
    unique_paths = {}
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) are not records and
                # would make json.loads raise.
                continue
            record = json.loads(line)
            # ``or ()`` also covers an explicit null value for the key.
            for audio_path in record.get('audios') or ():
                unique_paths.setdefault(audio_path, None)
    return list(unique_paths)

def add_files_to_tar(tar_path, file_paths, keep_structure=False):
    """Write the given files into a new uncompressed tar archive.

    Paths that are not existing regular files are reported with a warning
    and skipped. If two inputs would be stored under the same archive name
    (typically files sharing a basename when ``keep_structure`` is False),
    only the first is archived and the others are reported, so the archive
    never contains members that overwrite each other on extraction.

    Args:
        tar_path: Path of the tar file to create (overwritten if present).
        file_paths: Iterable of file paths to add.
        keep_structure: When True each file is stored under its given path;
            when False only its base name is used.

    Raises:
        OSError: If the archive cannot be created or a file cannot be read.
    """
    used_names = set()
    with tarfile.open(tar_path, 'w') as tar:
        for path in file_paths:
            if not os.path.isfile(path):
                print(f"Warning: File not found - {path}")
                continue
            arcname = path if keep_structure else os.path.basename(path)
            if arcname in used_names:
                # A second member with the same name would silently replace
                # the first one when the archive is extracted.
                print(
                    f"Warning: Duplicate archive name {arcname!r}, "
                    f"skipped - {path}"
                )
                continue
            used_names.add(arcname)
            tar.add(path, arcname=arcname)

def main():
    """Collect the audio paths listed in ``jsonl_path`` and archive them.

    Reads the module-level settings ``jsonl_path``, ``output_tar_path`` and
    ``keep_directory_structure``, and prints a short progress report.
    """
    paths = collect_audio_paths(jsonl_path)
    total = len(paths)
    print(f"Collected {total} unique audio files.")
    add_files_to_tar(
        output_tar_path,
        paths,
        keep_structure=keep_directory_structure,
    )
    print(f"TAR file created: {output_tar_path}")


# Run the job only when executed as a script, not when imported.
if __name__ == "__main__":
    main()