| |
| |
| |
| |
|
|
| import json |
| from tqdm import tqdm |
| import os |
| import librosa |
|
|
| from utils.util import has_existed |
|
|
|
|
def get_lines(file, encoding="utf-8"):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        file: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that non-ASCII content is decoded the same way on every platform
            instead of depending on the locale's default encoding.

    Returns:
        A list with one stripped string per line of the file. Blank lines are
        kept as empty strings.
    """
    with open(file, "r", encoding=encoding) as f:
        return [line.strip() for line in f]
|
|
|
|
def _audio_duration(audio_file):
    """Return the duration of ``audio_file`` as reported by ``librosa.get_duration``."""
    try:
        # librosa >= 0.10 names this keyword ``path``; ``filename`` is deprecated.
        return librosa.get_duration(path=audio_file)
    except TypeError:
        # Older librosa releases only know the ``filename`` keyword.
        return librosa.get_duration(filename=audio_file)


def get_uid2utt(opencpop_path, dataset, dataset_type):
    """Build the utterance metadata list for one split of the dataset.

    Reads ``<opencpop_path>/segments/<dataset_type>.txt``, takes the text
    before the first ``|`` on each line as the utterance id, and looks up the
    matching wav file under ``<opencpop_path>/segments/wavs``.

    Args:
        opencpop_path: Root directory of the dataset.
        dataset: Dataset name stored in each entry's ``"Dataset"`` field.
        dataset_type: Name of the split file (without ``.txt``) to read.

    Returns:
        A tuple ``(uid2utt, hours)`` where ``uid2utt`` is a list of dicts with
        the keys ``Dataset``, ``index``, ``Singer``, ``Uid``, ``Path`` and
        ``Duration``, and ``hours`` is the summed duration divided by 3600.
    """
    split_file = os.path.join(opencpop_path, "segments", "{}.txt".format(dataset_type))
    # Blank lines carry no utterance id; keeping them would produce a bogus
    # ".wav" path and make the duration lookup fail.
    entries = [line for line in get_lines(split_file) if line]

    uid2utt = []
    total_duration = 0
    for index, entry in enumerate(tqdm(entries)):
        uid = entry.split("|")[0]
        audio_file = os.path.join(opencpop_path, "segments/wavs/{}.wav".format(uid))
        duration = _audio_duration(audio_file)

        uid2utt.append(
            {
                "Dataset": dataset,
                "index": index,
                "Singer": "female1",
                "Uid": uid,
                "Path": audio_file,
                "Duration": duration,
            }
        )
        total_duration += duration

    return uid2utt, total_duration / 3600
|
|
|
|
def main(dataset, output_path, dataset_path):
    """Write the ``train`` and ``test`` metadata JSON files for the dataset.

    For each split, the metadata returned by ``get_uid2utt`` is dumped to
    ``<output_path>/<dataset>/<split>.json`` and the split's total duration in
    hours is printed. A split is skipped when ``has_existed`` returns a truthy
    value for its output file (presumably meaning the file is already
    present; confirm against ``utils.util.has_existed``).

    Args:
        dataset: Dataset name; used as the output sub-directory and stored in
            every metadata entry.
        output_path: Directory under which the ``<dataset>`` folder is created.
        dataset_path: Root directory of the dataset on disk.
    """
    print("-" * 10)
    print("Dataset splits for {}...\n".format(dataset))

    save_dir = os.path.join(output_path, dataset)
    for dataset_type in ["train", "test"]:
        output_file = os.path.join(save_dir, "{}.json".format(dataset_type))
        if has_existed(output_file):
            continue

        res, hours = get_uid2utt(dataset_path, dataset, dataset_type)

        os.makedirs(save_dir, exist_ok=True)
        # ensure_ascii=False emits raw non-ASCII characters, so the file must
        # be opened as UTF-8 rather than with the locale's default encoding.
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(res, f, indent=4, ensure_ascii=False)

        print("{}_{}_hours= {}".format(dataset, dataset_type, hours))
|
|