| | import json |
| | from random import shuffle |
| |
|
| | import tqdm |
| | from text.cleaner import clean_text |
| | from collections import defaultdict |
| | import shutil |
# Pipeline stages to execute; each stage below is guarded by `N in stage`.
#   1 - clean the raw transcriptions
#   2 - build the speaker map and the train/val lists
#   3 - write the speaker information into the config (needs stage 2)
stage = [1, 2, 3]

# Raw annotations, one `utt|spk|language|text` record per line.
transcription_path = "filelists/short_character_anno.list"
# Output lists written by stage 2.
train_path = "filelists/train.list"
val_path = "filelists/val.list"
# JSON config that stage 3 rewrites in place.
config_path = "configs/config.json"

# Number of utterances taken from each speaker for validation.
val_per_spk = 4
# Upper bound on the validation list size across all speakers.
max_val_total = 8
| |
|
if 1 in stage:
    # Stage 1: run every `utt|spk|language|text` record through clean_text and
    # write the result to "<transcription_path>.cleaned" as
    # `utt|spk|language|norm_text|phones|tones|word2ph`.
    #
    # The input is opened first so a missing transcription file fails before
    # the output file is created/truncated, and both handles are closed by
    # the `with` statement.
    with open(transcription_path, encoding='utf-8') as src, \
            open(transcription_path + '.cleaned', 'w', encoding='utf-8') as f:
        # readlines() keeps the list so tqdm can show a total.
        for line in tqdm.tqdm(src.readlines()):
            record = line.strip()
            if not record:
                # Blank lines carry no record; skip them quietly.
                continue
            try:
                utt, spk, language, text = record.split('|')
                norm_text, phones, tones, word2ph = clean_text(text, language)
                f.write('{}|{}|{}|{}|{}|{}|{}\n'.format(
                    utt, spk, language, norm_text,
                    ' '.join(phones),
                    " ".join(str(i) for i in tones),
                    " ".join(str(i) for i in word2ph)))
            except Exception as err:
                # Best effort: report the offending record and keep going.
                # The raw record is printed (not `utt`) because `utt` is
                # unbound or stale when the split itself is what failed.
                # `Exception` rather than a bare except so Ctrl-C still
                # aborts the run.
                print("err!", record, repr(err))
| |
|
if 2 in stage:
    # Stage 2: group the cleaned records by speaker, give each speaker an
    # integer id in order of first appearance, and write the train/val lists.
    cleaned_path = transcription_path + '.cleaned'
    spk_utt_map = defaultdict(list)
    spk_id_map = {}
    current_sid = 0

    with open(cleaned_path, encoding='utf-8') as f:
        for line in f:
            # Only the speaker column is used; the 7-way unpack is kept so a
            # malformed record still fails loudly with ValueError.
            _utt, spk, _language, _text, _phones, _tones, _word2ph = \
                line.strip().split('|')
            # Keep the raw line (with its newline) so it can be written back
            # verbatim.
            spk_utt_map[spk].append(line)
            if spk not in spk_id_map:
                spk_id_map[spk] = current_sid
                current_sid += 1

    # Random per-speaker split: the first `val_per_spk` shuffled utterances of
    # each speaker go to validation, the rest to training.
    train_list = []
    val_list = []
    for spk, utts in spk_utt_map.items():
        shuffle(utts)
        val_list += utts[:val_per_spk]
        train_list += utts[val_per_spk:]
    # Cap the validation list; the overflow is returned to the training split.
    if len(val_list) > max_val_total:
        train_list += val_list[max_val_total:]
        val_list = val_list[:max_val_total]

    # The training file is a verbatim copy of the whole cleaned file. This
    # reproduces the on-disk result of the previous code, which wrote
    # `train_list` to `train_path` and then immediately overwrote it by
    # copying the cleaned file to a hard-coded './filelists/train.list'.
    # The copy now targets `train_path` so the two cannot drift apart.
    # NOTE(review): as a consequence the validation utterances are also in
    # the training file and `train_list` is not written anywhere. If a
    # held-out split is intended, write `train_list` to `train_path` here
    # instead of copying - confirm with the owner.
    shutil.copy(cleaned_path, train_path)

    with open(val_path, "w", encoding='utf-8') as f:
        f.writelines(val_list)
| |
|
if 3 in stage:
    # Stage 3: record the speaker count and the speaker -> id mapping built by
    # stage 2 in the JSON config, rewriting the file in place.
    #
    # Raised explicitly (instead of using an `assert` statement) so the guard
    # is not stripped under `python -O`; without it the code below would fail
    # with a NameError on `current_sid`.
    if 2 not in stage:
        raise AssertionError(
            "stage 3 needs the speaker map built by stage 2; add 2 to `stage`")

    # Read with the same encoding the file is written with below: the dump
    # uses ensure_ascii=False, so non-ASCII speaker names are stored as raw
    # UTF-8 and the platform default encoding may not decode them.
    with open(config_path, encoding='utf-8') as f:
        config = json.load(f)

    config['data']["n_speakers"] = current_sid
    config["data"]['spk2id'] = spk_id_map

    with open(config_path, 'w', encoding='utf-8') as f:
        json.dump(config, f, indent=2, ensure_ascii=False)
| |
|