|
|
import json |
|
|
import re |
|
|
from typing import Union, Dict |
|
|
from pathlib import Path |
|
|
import os |
|
|
|
|
|
# Default value of the `max_len` parameter of `sanitize_filename`, i.e. the
# slice bound applied to a sanitized name when the caller passes no limit.
MAX_FILE_NAME_LENGTH = 100
|
|
|
|
|
|
|
|
def read_jsonl_to_mapping(
    jsonl_file: Union[str, Path],
    key_col: str,
    value_col: str,
    base_path=None,
    overwrite=True,
) -> Dict[str, str]:
    """
    Build a dict from two fields of every record in a JSON Lines file.

    Each non-blank line of `jsonl_file` is parsed as one JSON object. The
    value stored under `key_col` becomes the dict key and the value stored
    under `value_col` becomes the dict value.

    Args:
        jsonl_file: Path of the JSON Lines file; it is decoded as UTF-8.
        key_col: Field whose value is used as the mapping key.
        value_col: Field whose value is used as the mapping value.
        base_path: When truthy, every value is replaced by
            `os.path.join(base_path, value)`.
        overwrite: Duplicate-key policy. When true, the last record seen
            for a key wins; when false, the first record is kept.

    Returns:
        The mapping from `key_col` values to (optionally prefixed)
        `value_col` values.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks `key_col` or `value_col`.
    """
    mapping = {}
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(jsonl_file, 'r', encoding='utf-8') as file:
        # Iterate the file lazily instead of loading every line up front.
        for line in file:
            line = line.strip()
            if not line:
                # Blank separator/trailing lines are not records.
                continue
            data = json.loads(line)
            key = data[key_col]
            value = data[value_col]
            if base_path:
                value = os.path.join(base_path, value)
            if overwrite or key not in mapping:
                mapping[key] = value
    return mapping
|
|
|
|
|
|
|
|
def sanitize_filename(name: str, max_len: int = MAX_FILE_NAME_LENGTH) -> str:
    """
    Replace filename-unsafe characters in `name` and truncate the result.

    Every backslash, forward slash, asterisk, question mark, colon, double
    quote, angle bracket and vertical bar is replaced by an underscore; the
    cleaned string is then sliced with `[:max_len]`.

    Args:
        name: The string to turn into a filename.
        max_len: Slice bound applied to the cleaned string.

    Returns:
        The cleaned, truncated string.
    """
    # The character class already contains '/', so no separate replace pass
    # is needed for it.
    cleaned = re.sub(r'[\\/*?:"<>|]', '_', name)
    # Slicing clamps to the string length on its own, so an explicit
    # min(len(...), max_len) is unnecessary.
    return cleaned[:max_len]
|
|
|
|
|
|
|
|
def transform_gen_fn_to_id(audio_file: Path, task: str) -> str:
    """
    Derive an id string from the stem of `audio_file` according to `task`.

    Rules, by value of `task`:
        "svs": the part of the stem before the first underscore.
        "sr": the stem unchanged.
        "tta", "sta_test", "tta_test": first 12 characters of the stem
            followed by ".wav".
        "ttm": first 11 characters of the stem.
        "v2a": the stem with everything from its last underscore onwards
            removed, followed by ".mp4".
        "sta_base": "Y", then the first 11 characters of the stem, then
            ".wav".
        anything else: the stem unchanged.
    """
    stem = audio_file.stem

    if task == "svs":
        return stem.split("_")[0]
    if task in ("tta", "sta_test", "tta_test"):
        return stem[:12] + '.wav'
    if task == "ttm":
        return stem[:11]
    if task == "v2a":
        prefix = stem.rsplit("_", 1)[0]
        return prefix + ".mp4"
    if task == "sta_base":
        return 'Y' + stem[:11] + '.wav'

    # "sr" and every unrecognised task use the stem as-is.
    return stem
|
|
|
|
|
|
|
|
def audio_dir_to_mapping(audio_dir: str | Path, task: str) -> dict:
    """
    Map an id derived from each entry of `audio_dir` to that entry's path.

    The directory entries are visited in sorted order. For each one the key
    is `transform_gen_fn_to_id(entry, task)` and the value is the entry's
    resolved path as a string. When several entries produce the same id,
    the entry that sorts last provides the stored path.
    """
    entries = list(Path(audio_dir).iterdir())
    entries.sort()
    return {
        transform_gen_fn_to_id(entry, task): str(entry.resolve())
        for entry in entries
    }
|
|
|