|
|
import json |
|
|
import tarfile |
|
|
import os |
|
|
|
|
|
# --- Script configuration -------------------------------------------------

# When False, files are stored in the archive under their base name only;
# when True, the path recorded in the JSONL file is used as the member name.
keep_directory_structure = False

# JSON Lines input; main() hands this to collect_audio_paths().
jsonl_path = 'dataset_4JOB.jsonl'

# Destination of the uncompressed TAR archive written by add_files_to_tar().
output_tar_path = '4JOB_train.tar'
|
|
|
|
|
def collect_audio_paths(jsonl_path):
    """Collect the unique audio paths referenced by a JSON Lines file.

    Every non-blank line is parsed as a JSON object and the entries found
    under its ``'audios'`` key are gathered. Records without that key, or
    whose value is ``null`` or empty, contribute nothing.

    Args:
        jsonl_path: Path of the UTF-8 encoded JSON Lines file to read.

    Returns:
        A list of the distinct audio paths, in the order they were first
        encountered, so repeated runs over the same input give the same
        result.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    seen = set()
    ordered_paths = []

    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at end of file) are
                # not records; json.loads would raise on them.
                continue

            data = json.loads(line)

            # `or ()` also covers an explicit `"audios": null`.
            audios = data.get('audios') or ()
            if isinstance(audios, str):
                # A bare string is a single path; iterating it would yield
                # individual characters.
                audios = (audios,)

            for audio_path in audios:
                if audio_path not in seen:
                    seen.add(audio_path)
                    ordered_paths.append(audio_path)

    return ordered_paths
|
|
|
|
|
def add_files_to_tar(tar_path, file_paths, keep_structure=False):
    """Write the given files into a new, uncompressed TAR archive.

    Paths that are not existing regular files are reported with a warning
    and skipped. If two source files would be stored under the same member
    name (typically different directories sharing a base name when
    ``keep_structure`` is False), only the first is archived and the later
    ones are reported and skipped; otherwise they would silently overwrite
    each other when the archive is extracted.

    Args:
        tar_path: Destination path of the archive; an existing file is
            overwritten.
        file_paths: Iterable of source file paths.
        keep_structure: When True, each file is stored under its path as
            given; when False, only under its base name.

    Returns:
        None.

    Raises:
        OSError: If the archive cannot be created or a file cannot be read.
    """
    # Member name -> source path that claimed it first.
    claimed = {}

    with tarfile.open(tar_path, 'w') as tar:
        for path in file_paths:
            if not os.path.isfile(path):
                print(f"Warning: File not found - {path}")
                continue

            arcname = path if keep_structure else os.path.basename(path)

            if arcname in claimed:
                print(
                    f"Warning: Skipping {path} - archive name '{arcname}' "
                    f"already used by {claimed[arcname]}"
                )
                continue

            claimed[arcname] = path
            tar.add(path, arcname=arcname)
|
|
|
|
|
def main():
    """Build the TAR archive from the audio files listed in the JSONL file.

    Reads the module-level settings ``jsonl_path``, ``output_tar_path`` and
    ``keep_directory_structure``, and prints a short progress report.
    """
    # Step 1: gather the de-duplicated audio paths from the dataset file.
    audio_paths = collect_audio_paths(jsonl_path)
    print(f"Collected {len(audio_paths)} unique audio files.")

    # Step 2: pack them into the archive.
    add_files_to_tar(
        output_tar_path,
        audio_paths,
        keep_structure=keep_directory_structure,
    )
    print(f"TAR file created: {output_tar_path}")
|
|
|
|
|
# Run the packing job only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|
|
|