| import sys, os | |
| from tqdm import tqdm | |
| import json | |
# Usage: <script> METALST WAV_DIR DATASET_LST OUTPUT
#
# Pairs line i of METALST ("utt||text") with JSON line i of DATASET_LST
# (an object with an "audio" key) and writes one
# "generated_wav|prompt_wav|text" line to OUTPUT for every utterance whose
# generated wav exists in WAV_DIR.


def load_prompt_paths(dataset_lst, wav_dir):
    """Return one prompt-audio path per JSON line of *dataset_lst*.

    Each line must be a JSON object with an ``"audio"`` key; its value is
    joined onto ``wav_dir/../``.

    Raises:
        json.JSONDecodeError: if a line is not valid JSON.
        KeyError: if a JSON object has no ``"audio"`` key.
    """
    with open(dataset_lst, "r", encoding="utf-8") as f:
        return [
            os.path.join(wav_dir, '../', json.loads(line)['audio'])
            for line in f
        ]


def format_result_line(line, prompt_wav, wav_dir, meta_dir):
    """Build the output record for one metalst line.

    Args:
        line: raw metalst line of the form ``utt||text``; *utt* may carry a
            trailing ``.wav``.
        prompt_wav: prompt-audio path paired with this line.
        wav_dir: directory that is checked for ``<utt>.wav``.
        meta_dir: directory a relative *prompt_wav* is resolved against.

    Returns:
        ``"<wav_dir>/<utt>.wav|<prompt_wav>|<text>"`` (no trailing newline),
        or ``None`` when ``<wav_dir>/<utt>.wav`` does not exist.

    Raises:
        ValueError: if *line* does not contain exactly one ``||`` separator.
    """
    fields = line.strip().split('||')
    # Explicit check instead of `assert`, which is removed under `python -O`.
    if len(fields) != 2:
        raise ValueError(
            "expected exactly one '||' separator, got: {!r}".format(line)
        )
    utt, infer_text = fields
    # '|' is the output field delimiter, so it must not survive in the text.
    infer_text = infer_text.replace("|", "")
    if utt.endswith(".wav"):
        utt = utt[:-4]

    wav_path = os.path.join(wav_dir, utt + '.wav')
    if not os.path.exists(wav_path):
        return None

    if not os.path.isabs(prompt_wav):
        prompt_wav = os.path.join(meta_dir, prompt_wav)
    return '|'.join([wav_path, prompt_wav, infer_text])


def main(argv):
    """Run the conversion using ``argv[1:5]`` as the four path arguments."""
    if len(argv) < 5:
        sys.exit(
            "usage: {} METALST WAV_DIR DATASET_LST OUTPUT".format(argv[0])
        )
    metalst = argv[1]
    wav_dir = argv[2]
    dataset_lst = argv[3]
    wav_res_ref_text = argv[4]

    with open(metalst, encoding="utf-8") as f:
        lines = f.readlines()
    prompt_wavs = load_prompt_paths(dataset_lst, wav_dir)

    # Entries are paired by position; fail before creating the output file
    # rather than with an IndexError after part of it has been written.
    if len(lines) > len(prompt_wavs):
        raise ValueError(
            "{} has {} lines but {} only has {} entries".format(
                metalst, len(lines), dataset_lst, len(prompt_wavs)
            )
        )

    meta_dir = os.path.dirname(metalst)
    with open(wav_res_ref_text, 'w', encoding="utf-8") as f_w:
        # Wrapping the list itself lets tqdm read its length and show a total.
        for line, prompt_wav in zip(tqdm(lines), prompt_wavs):
            out_line = format_result_line(line, prompt_wav, wav_dir, meta_dir)
            if out_line is not None:
                f_w.write(out_line + '\n')


if __name__ == "__main__":
    main(sys.argv)