| import os
|
| import json
|
| from shutil import copyfile
|
| import pandas as pd
|
| import librosa
|
| from utils import load_wav_to_torch
|
| from mel_processing import spectrogram_torch
|
| import torch
|
| import utils
|
| from models import SynthesizerTrn
|
| from text.symbols import symbols
|
| import numpy as np
|
| from scipy.io import wavfile
|
|
|
|
|
# Run on GPU when one is available, otherwise fall back to CPU.
devices = "cuda" if torch.cuda.is_available() else "cpu"

# Training/inference hyper-parameters for the AISHELL3 base configuration.
hps = utils.get_hparams_from_file("./configs/aishell3_base.json")

# Multi-speaker VITS synthesizer; n_speakers matches the AISHELL3 corpus
# used at training time.
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=175,
    **hps.model,
)
if devices == 'cuda':
    net_g = net_g.cuda()
net_g.eval()  # inference only — disable dropout etc.

# Restore generator weights from the trained checkpoint.
utils.load_checkpoint("./ckpt/G_1379000.pth", net_g, None)
|
|
|
|
|
def mos(speaker_id, n_s):
    """Build a MOS (mean-opinion-score) listening set of voice conversions.

    For every ordered pair (src, tgt) of distinct speakers in *speaker_id*,
    up to *n_s* source utterances longer than 5 s are voice-converted to the
    target speaker.  Each output wav concatenates: the source utterance, a
    random 1-2 s silence, one >=5 s reference utterance of the target
    speaker, another silence, and finally the converted audio.  A CSV
    manifest of everything generated is written alongside the wavs.

    Args:
        speaker_id: 1-based speaker ids to cross-convert
            (configs/speakers.json stores 0-based ids, hence the ``s - 1``).
        n_s: number of source utterances to convert per speaker pair.

    Returns:
        pandas.DataFrame with columns source_sid / target_sid / file /
        target_file (also saved as MOS.csv in ``save_path``).
    """
    # NOTE(review): the original body overwrote both parameters with
    # hard-coded values, making the signature dead code; they are now honored.
    ind = 1  # running index embedded in output file names

    # Resolve each requested speaker id to its directory name.
    with open('./configs/speakers.json', 'r', encoding='utf-8') as f:
        speaker_name_all = json.load(f)
    speaker_name = []
    for s in speaker_id:
        for name, sid0 in speaker_name_all.items():
            if s - 1 == sid0:
                speaker_name.append(name)

    source_sid = []
    target_sid = []
    file = []
    target_file = []
    raw_path = '/home/admin/yuanxin/3.tempData/FastSpeech2/raw_data/AISHELL3-Pro'
    save_path = '/home/admin/yuanxin/3.tempData/FastSpeech2/raw_data/AISHELL3-MOS'
    os.makedirs(save_path, exist_ok=True)

    for i in range(len(speaker_id)):
        for j in range(len(speaker_id)):
            if i == j:
                continue

            temp_file_list = os.listdir(os.path.join(raw_path, speaker_name[i]))
            os.makedirs(os.path.join(save_path, speaker_name[i]), exist_ok=True)
            count = 0
            for f in temp_file_list:
                # Originally `count > n_s`, which produced n_s + 1 clips
                # per pair — off by one against the parameter's meaning.
                if count >= n_s:
                    break
                if '.wav' not in f:
                    continue

                # librosa.load returns float audio in [-1, 1] resampled to
                # its default 22050 Hz.
                wav, fs = librosa.load(os.path.join(raw_path, speaker_name[i], f))
                print(len(wav) / fs)
                if len(wav) / fs < 5:
                    continue

                # Random 1-2 s silence used as a separator between segments.
                space_time = np.zeros(int(np.random.uniform(1, 2, 1)[0] * 22050), dtype=np.int16)
                wav = wav * 32767.0  # scale to int16 range for the final file
                wav = np.concatenate((wav, space_time))

                # Append the first >=5 s utterance of the target speaker as
                # a voice reference for the listener.
                tar_file_list = os.listdir(os.path.join(raw_path, speaker_name[j]))
                for tar_f in tar_file_list:
                    if '.wav' in tar_f:
                        tar_wav, tar_fs = librosa.load(os.path.join(raw_path, speaker_name[j], tar_f))
                        if len(tar_wav) / tar_fs >= 5:
                            tar_wav = tar_wav * 32767.0
                            wav = np.concatenate((wav, tar_wav))
                            wav = np.concatenate((wav, space_time))
                            break

                source_sid.append(speaker_id[i])
                target_sid.append(speaker_id[j])
                file.append(f)
                target_file.append(speaker_name[j])
                count += 1

                # Keep a copy of the raw source utterance next to the output.
                copyfile(os.path.join(raw_path, speaker_name[i], f),
                         os.path.join(save_path, speaker_name[i], f))

                # Voice-convert the source utterance to the target speaker.
                audio, sampling_rate = load_wav_to_torch(os.path.join(raw_path, speaker_name[i], f))
                audio_norm = audio / 32768.0
                audio_norm = audio_norm.unsqueeze(0)
                spec = spectrogram_torch(audio_norm, 1024,
                                         22050, 256, 1024,
                                         center=False)
                with torch.no_grad():
                    # Originally hard-coded .cuda(); use the module-level
                    # device so CPU-only hosts do not crash.
                    spec = spec.to(devices)
                    spec_lengths = torch.LongTensor([spec.shape[2]]).to(devices)
                    sid_src = torch.LongTensor([speaker_id[i]]).to(devices)
                    sid_tgt2 = torch.LongTensor([speaker_id[j]]).to(devices)
                    audio = net_g.voice_conversion(spec, spec_lengths, sid_src, sid_tgt=sid_tgt2)[0][
                        0, 0].data.cpu().float().numpy()
                audio = audio * 32767.0
                wav = np.concatenate((wav, audio))

                wavfile.write(
                    os.path.join(save_path, str(ind) + '_' + str(speaker_id[i]) + "_" + str(speaker_id[j]) + "_"
                                 + str(count) + ".wav"),
                    22050,
                    wav.astype(np.int16),
                )
                ind += 1

    df = pd.DataFrame({"source_sid": source_sid, "target_sid": target_sid, "file": file,
                       "target_file": target_file})
    df.to_csv(os.path.join(save_path, "MOS.csv"), index=False)
    return df
|
|
|
|
|
| if __name__ == "__main__":
|
| mos([41, 101, 168, 172, 2, 14, 27, 28, 112], 1)
|
|
|
|
|