# EmpathyVC / mos.py
# admin — init app (commit d47c0cc)
import os
import json
from shutil import copyfile
import pandas as pd
import librosa
from utils import load_wav_to_torch
from mel_processing import spectrogram_torch
import torch
import utils
from models import SynthesizerTrn
from text.symbols import symbols
import numpy as np
from scipy.io import wavfile
# ---------------------------------------------------------------------------
# Model setup — executed at import time.
# ---------------------------------------------------------------------------
# Pick the compute device; fall back to CPU when no GPU is available.
devices = "cuda" if torch.cuda.is_available() else "cpu"
# Load data/training hyper-parameters from the project config.
hps = utils.get_hparams_from_file("./configs/aishell3_base.json")
# Build the multi-speaker synthesizer used for voice conversion.
# NOTE(review): n_speakers=175 is hard-coded — confirm it matches the
# checkpoint loaded below rather than hps.data.n_speakers.
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=175,
    **hps.model)
if devices == 'cuda':
    net_g = net_g.cuda()
_ = net_g.eval()  # inference only — disable dropout etc.
# Restore generator weights; optimizer argument is None (no training resume).
_ = utils.load_checkpoint("./ckpt/G_1379000.pth", net_g, None)
def mos(speaker_id, n_s):
    """Build a MOS (mean-opinion-score) evaluation set of voice-converted clips.

    For every ordered pair (source, target) of the given speakers, pick up to
    ``n_s + 1`` source utterances that are at least 5 s long and write a clip
    laid out as [source wav | silence | target reference wav | silence |
    converted wav] under ``save_path``.  A CSV listing every generated pair is
    written alongside for later human scoring.

    Args:
        speaker_id: list of speaker ids to evaluate.  Each id must satisfy
            ``id - 1 in speakers.json values`` so it can be mapped to a
            speaker directory name.
        n_s: extra utterances per speaker pair — ``n_s + 1`` clips are
            produced for each ordered pair.

    Returns:
        pandas.DataFrame with columns source_sid, target_sid, file,
        target_file (also written to MOS.csv in ``save_path``).
    """
    # NOTE(review): the original body overwrote both parameters with
    # hard-coded values, so the signature was ignored; the arguments are now
    # honored.  The default invocation in __main__ passes the same values.
    ind = 1  # running index used in output file names
    speaker_name = []
    # Map each speaker id to its directory name via the name -> id table.
    # The table stores ids offset by one, hence the ``s - 1`` comparison.
    with open('./configs/speakers.json', 'r', encoding='utf-8') as spk_file:
        speaker_name_all = json.load(spk_file)
    for s in speaker_id:
        for name, sid in speaker_name_all.items():
            if s - 1 == sid:
                speaker_name.append(name)
    source_sid = []
    target_sid = []
    src_files = []      # renamed from ``file`` to avoid shadowing the builtin
    target_file = []
    raw_path = '/home/admin/yuanxin/3.tempData/FastSpeech2/raw_data/AISHELL3-Pro'
    save_path = '/home/admin/yuanxin/3.tempData/FastSpeech2/raw_data/AISHELL3-MOS'
    os.makedirs(save_path, exist_ok=True)
    # For every ordered (source, target) pair of distinct speakers,
    # take n_s + 1 qualifying utterances.
    for i in range(len(speaker_id)):
        for j in range(len(speaker_id)):
            if i == j:
                continue
            temp_file_list = os.listdir(os.path.join(raw_path, speaker_name[i]))
            os.makedirs(os.path.join(save_path, speaker_name[i]), exist_ok=True)
            count = 0
            for fname in temp_file_list:
                if count > n_s:
                    break  # enough utterances for this pair
                if '.wav' not in fname:
                    continue
                # Load the source utterance (librosa resamples to 22050 Hz by
                # default); keep it only when it is at least 5 s long.
                wav, fs = librosa.load(os.path.join(raw_path, speaker_name[i], fname))
                print(len(wav) / fs)
                if len(wav) / fs < 5:
                    continue  # too short — does not count toward n_s
                # Random 1–2 s silence separator between segments.
                space_time = np.zeros(int(np.random.uniform(1, 2, 1)[0] * 22050), dtype=np.int16)
                wav = wav * 32767.0  # librosa floats in [-1, 1] -> int16 range
                wav = np.concatenate((wav, space_time))
                # Append one >= 5 s reference utterance from the target speaker.
                tar_file_list = os.listdir(os.path.join(raw_path, speaker_name[j]))
                for tar_f in tar_file_list:
                    if '.wav' in tar_f:
                        tar_wav, tar_fs = librosa.load(os.path.join(raw_path, speaker_name[j], tar_f))
                        if len(tar_wav) / tar_fs >= 5:
                            tar_wav = tar_wav * 32767.0
                            wav = np.concatenate((wav, tar_wav))
                            wav = np.concatenate((wav, space_time))
                            break
                source_sid.append(speaker_id[i])
                target_sid.append(speaker_id[j])
                src_files.append(fname)
                target_file.append(speaker_name[j])
                count += 1
                # Keep a copy of the raw source utterance for reference.
                copyfile(os.path.join(raw_path, speaker_name[i], fname),
                         os.path.join(save_path, speaker_name[i], fname))
                # Voice conversion: source spectrogram -> target speaker voice.
                audio, sampling_rate = load_wav_to_torch(os.path.join(raw_path, speaker_name[i], fname))
                audio_norm = audio / 32768.0
                audio_norm = audio_norm.unsqueeze(0)
                spec = spectrogram_torch(audio_norm, 1024,
                                         22050, 256, 1024,
                                         center=False)
                with torch.no_grad():
                    # Use the module-level ``devices`` so this also runs on
                    # CPU-only hosts (original hard-coded .cuda() and crashed
                    # without a GPU).
                    spec = spec.to(devices)
                    spec_lengths = torch.LongTensor([spec.shape[2]]).to(devices)
                    # NOTE(review): raw ids are fed to the model here, while
                    # the name lookup above used id - 1 — confirm intended.
                    sid_src = torch.LongTensor([speaker_id[i]]).to(devices)
                    sid_tgt2 = torch.LongTensor([speaker_id[j]]).to(devices)
                    audio = net_g.voice_conversion(spec, spec_lengths, sid_src, sid_tgt=sid_tgt2)[0][
                        0, 0].data.cpu().float().numpy()
                audio = audio * 32767.0
                wav = np.concatenate((wav, audio))
                # Write the [source | silence | target | silence | converted] clip.
                wavfile.write(
                    os.path.join(save_path, str(ind) + '_' + str(speaker_id[i]) + "_" + str(speaker_id[j]) + "_"
                                 + str(count) + ".wav"),
                    22050,
                    wav.astype(np.int16),
                )
                ind += 1
    # Build the rating sheet used for human MOS scoring.
    df = pd.DataFrame({"source_sid": source_sid, "target_sid": target_sid, "file": src_files,
                       "target_file": target_file})
    df.to_csv(os.path.join(save_path, "MOS.csv"), index=False)
    return df
if __name__ == "__main__":
    # Default evaluation set: nine AISHELL3 speakers, one extra utterance
    # per ordered speaker pair.
    eval_speakers = [41, 101, 168, 172, 2, 14, 27, 28, 112]
    mos(eval_speakers, 1)