# Source: interactSpeech/.ipynb_checkpoints/maketar-checkpoint.py
# Uploaded by Student0809 ("Add files using upload-large-folder tool").
# Commit e791fa3 (verified).
import json
import tarfile
import os
# Script configuration, read by main().
jsonl_path = 'dataset_4JOB.jsonl' # Path of the input JSONL file
output_tar_path = '4JOB_train.tar' # Name of the output tar file
keep_directory_structure = False # True keeps each file's original path inside the archive; False keeps only the file name
def collect_audio_paths(jsonl_path):
    """Collect the unique audio paths referenced by a JSONL file.

    Every non-blank line is parsed as a JSON object and its ``audios`` entry
    is read. Records without the key, or with a null/empty value, contribute
    nothing. A bare string value is treated as a single path.

    Args:
        jsonl_path: Path of a UTF-8 encoded JSONL file.

    Returns:
        A list of distinct audio paths, in order of first appearance.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # A dict is used as an insertion-ordered set so the result (and therefore
    # the archive built from it) is the same on every run.
    unique_paths = {}
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not records.
                continue
            data = json.loads(line)
            # `or []` also covers an explicit `"audios": null`.
            audios = data.get('audios') or []
            if isinstance(audios, str):
                # Iterating a string would yield single characters.
                audios = [audios]
            for audio_path in audios:
                unique_paths.setdefault(audio_path, None)
    return list(unique_paths)
def add_files_to_tar(tar_path, file_paths, keep_structure=False):
    """Write the given files into a new, uncompressed tar archive.

    Paths that are not existing regular files are reported and skipped.
    A file whose archive name is already used by an earlier file is also
    reported and skipped, so every member name in the archive is unique.

    Args:
        tar_path: Path of the tar file to create (overwritten if present).
        file_paths: Iterable of file paths to add.
        keep_structure: If True, each file is stored under its original
            path; if False, only under its base name.
    """
    # With keep_structure=False, files from different directories can share a
    # base name; adding both would create duplicate members where one silently
    # shadows the other on extraction.
    seen_arcnames = set()
    with tarfile.open(tar_path, 'w') as tar:
        for path in file_paths:
            if not os.path.isfile(path):
                print(f"Warning: File not found - {path}")
                continue
            arcname = path if keep_structure else os.path.basename(path)
            if arcname in seen_arcnames:
                print(f"Warning: Duplicate archive name '{arcname}' - skipping {path}")
                continue
            seen_arcnames.add(arcname)
            tar.add(path, arcname=arcname)
def main():
    """Pack every audio file listed in the JSONL dataset into one tar file.

    Uses the module-level ``jsonl_path``, ``output_tar_path`` and
    ``keep_directory_structure`` settings and prints progress to stdout.
    """
    paths = collect_audio_paths(jsonl_path)
    total = len(paths)
    print(f"Collected {total} unique audio files.")

    add_files_to_tar(
        output_tar_path,
        paths,
        keep_structure=keep_directory_structure,
    )
    print(f"TAR file created: {output_tar_path}")
# Run the packaging step only when executed as a script, not when imported.
if __name__ == '__main__':
    main()