"""Bundle every audio file referenced by a JSONL manifest into one tar archive.

Each line of the manifest is expected to be a JSON object; the paths listed
under its ``audios`` key are collected, de-duplicated, and added to the
output archive.
"""

import json
import os
import tarfile

jsonl_path = 'dataset_4JOB.jsonl'  # Input JSONL manifest path.
output_tar_path = '4JOB_train.tar'  # Output tar file name.
# True keeps each file's original path inside the archive;
# False stores only the file name.
keep_directory_structure = False


def collect_audio_paths(jsonl_path):
    """Return the sorted, de-duplicated audio paths listed in a JSONL file.

    Args:
        jsonl_path: Path to a UTF-8 JSONL file with one JSON object per line.

    Returns:
        A sorted list of the unique entries found under each record's
        ``audios`` key. Sorting makes the result (and therefore the member
        order of the archive built from it) reproducible across runs.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    audio_paths = set()
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline) are not records; passing
            # them to json.loads would raise.
            if not line.strip():
                continue
            data = json.loads(line)
            # `or []` also covers an explicit `"audios": null`.
            audios = data.get('audios') or []
            # A bare string would otherwise be iterated character by
            # character; treat it as a single path.
            if isinstance(audios, str):
                audios = [audios]
            audio_paths.update(audios)
    return sorted(audio_paths)


def add_files_to_tar(tar_path, file_paths, keep_structure=False):
    """Write the existing files in ``file_paths`` to an uncompressed tar.

    Args:
        tar_path: Destination archive path; an existing file is overwritten.
        file_paths: Iterable of file paths to add.
        keep_structure: When True, each member is stored under its original
            path; when False, only the base file name is used.

    Paths that are not regular files are reported and skipped. A path whose
    archive name was already used by an earlier entry is reported and
    skipped as well, so the archive never contains two members with the
    same name (on extraction one would silently overwrite the other).
    """
    used_arcnames = set()
    with tarfile.open(tar_path, 'w') as tar:
        for path in file_paths:
            if not os.path.isfile(path):
                print(f"Warning: File not found - {path}")
                continue
            arcname = path if keep_structure else os.path.basename(path)
            if arcname in used_arcnames:
                print(
                    f"Warning: Duplicate archive name '{arcname}' - "
                    f"skipping {path}"
                )
                continue
            used_arcnames.add(arcname)
            tar.add(path, arcname=arcname)


def main():
    """Collect the audio paths from the manifest and pack them into the tar."""
    audio_paths = collect_audio_paths(jsonl_path)
    print(f"Collected {len(audio_paths)} unique audio files.")
    add_files_to_tar(
        output_tar_path,
        audio_paths,
        keep_structure=keep_directory_structure,
    )
    print(f"TAR file created: {output_tar_path}")


if __name__ == '__main__':
    main()