Spaces:

Vaishnavi0404
/

Text2Sing-DiffSinger

Running

App Files Files Community

Text2Sing-DiffSinger / DiffSinger /inference /svs /base_svs_infer.py

Vaishnavi0404

Upload 130 files

c8baff6 verified 11 months ago

raw

history blame contribute delete

12.2 kB

	import os

	import torch
	import numpy as np
	from modules.hifigan.hifigan import HifiGanGenerator
	from vocoders.hifigan import HifiGAN
	from inference.svs.opencpop.map import cpop_pinyin2ph_func

	from utils import load_ckpt
	from utils.hparams import set_hparams, hparams
	from utils.text_encoder import TokenTextEncoder
	from pypinyin import pinyin, lazy_pinyin, Style
	import librosa
	import glob
	import re


	class BaseSVSInfer:
	    """Base class for singing-voice-synthesis inference.

	    Converts a lyric/note description into model-ready tensors, loads a
	    HifiGAN generator as vocoder and drives one inference pass. Subclasses
	    must implement :meth:`build_model` and :meth:`forward_model`.
	    """

	    def __init__(self, hparams, device=None):
	        """Build the phoneme encoder, the acoustic model and the vocoder.

	        :param hparams: hyper-parameter mapping; stored as ``self.hparams`` and
	            read by the other methods of this class.
	        :param device: torch device (or device string); defaults to ``'cuda'``
	            when CUDA is available, else ``'cpu'``.
	        """
	        if device is None:
	            device = 'cuda' if torch.cuda.is_available() else 'cpu'
	        self.hparams = hparams
	        self.device = device

	        phone_list = ["AP", "SP", "a", "ai", "an", "ang", "ao", "b", "c", "ch", "d", "e", "ei", "en", "eng", "er", "f", "g",
	                      "h", "i", "ia", "ian", "iang", "iao", "ie", "in", "ing", "iong", "iu", "j", "k", "l", "m", "n", "o",
	                      "ong", "ou", "p", "q", "r", "s", "sh", "t", "u", "ua", "uai", "uan", "uang", "ui", "un", "uo", "v",
	                      "van", "ve", "vn", "w", "x", "y", "z", "zh"]
	        self.ph_encoder = TokenTextEncoder(None, vocab_list=phone_list, replace_oov=',')
	        self.pinyin2phs = cpop_pinyin2ph_func()
	        self.spk_map = {'opencpop': 0}

	        self.model = self.build_model()
	        self.model.eval()
	        self.model.to(self.device)
	        self.vocoder = self.build_vocoder()
	        self.vocoder.eval()
	        self.vocoder.to(self.device)

	    def build_model(self):
	        """Return the acoustic model; must be provided by subclasses."""
	        raise NotImplementedError

	    def forward_model(self, inp):
	        """Run the acoustic model on a preprocessed item; must be provided by subclasses."""
	        raise NotImplementedError

	    def build_vocoder(self):
	        """Load the HifiGAN generator from ``hparams['vocoder_ckpt']``.

	        The directory must contain ``config.yaml`` and at least one
	        ``model_ckpt_steps_<N>.ckpt`` file; the one with the largest ``<N>``
	        is loaded.

	        :return: the generator, weight norm removed, in eval mode on ``self.device``.
	        :raises FileNotFoundError: if no checkpoint file is found.
	        """
	        base_dir = self.hparams['vocoder_ckpt']
	        config_path = f'{base_dir}/config.yaml'

	        # Match on the file name only, so regex metacharacters or platform
	        # specific separators in ``base_dir`` cannot break the step lookup.
	        step_pattern = re.compile(r'model_ckpt_steps_(\d+)\.ckpt$')
	        steps_by_path = {}
	        for path in glob.glob(f'{base_dir}/model_ckpt_steps_*.ckpt'):
	            found = step_pattern.search(os.path.basename(path))
	            if found:
	                steps_by_path[path] = int(found.group(1))
	        if not steps_by_path:
	            raise FileNotFoundError(
	                f'No model_ckpt_steps_*.ckpt checkpoint found in {base_dir!r}')
	        ckpt = max(steps_by_path, key=steps_by_path.get)

	        print('| load HifiGAN: ', ckpt)
	        ckpt_dict = torch.load(ckpt, map_location="cpu")
	        config = set_hparams(config_path, global_hparams=False)
	        state = ckpt_dict["state_dict"]["model_gen"]
	        vocoder = HifiGanGenerator(config)
	        vocoder.load_state_dict(state, strict=True)
	        vocoder.remove_weight_norm()
	        vocoder = vocoder.eval().to(self.device)
	        return vocoder

	    def run_vocoder(self, c, **kwargs):
	        """Turn a mel-spectrogram batch into a waveform.

	        :param c: mel tensor; its last two axes are swapped before vocoding.
	        :param kwargs: optional ``f0``; passed to the vocoder only when
	            ``hparams['use_nsf']`` is truthy.
	        :return: the flattened vocoder output with a leading axis of size 1.
	        """
	        c = c.transpose(2, 1)  # [B, 80, T]
	        f0 = kwargs.get('f0')  # [B, T]
	        if f0 is not None and self.hparams.get('use_nsf'):
	            y = self.vocoder(c, f0).view(-1)
	        else:
	            y = self.vocoder(c).view(-1)
	        # [T]
	        return y[None]

	    def preprocess_word_level_input(self, inp):
	        """Expand word-level lyrics and notes into phoneme-level sequences.

	        :param inp: dict with ``'text'`` (lyrics), ``'notes'`` and
	            ``'notes_duration'``; the latter two hold one ``|``-separated
	            window per word, each window holding space-separated values.
	        :return: ``(ph_seq, note_lst, midi_dur_lst, is_slur)`` or ``None``
	            (after printing the reason) when the input is inconsistent.
	        """
	        # Pypinyin can't solve polyphonic words
	        text_raw = inp['text'].replace('最长', '最常').replace('长睫毛', '常睫毛') \
	            .replace('那么长', '那么常').replace('多长', '多常') \
	            .replace('很长', '很常')  # We hope someone could provide a better g2p module for us by opening pull requests.

	        # lyric: syllables that are not in the pinyin->phoneme table are dropped
	        pinyins = lazy_pinyin(text_raw, strict=False)
	        ph_per_word_lst = [self.pinyin2phs[syl.strip()] for syl in pinyins if syl.strip() in self.pinyin2phs]

	        # Note
	        note_per_word_lst = [x.strip() for x in inp['notes'].split('|') if x.strip() != '']
	        mididur_per_word_lst = [x.strip() for x in inp['notes_duration'].split('|') if x.strip() != '']

	        if len(note_per_word_lst) == len(ph_per_word_lst) == len(mididur_per_word_lst):
	            print('Pass word-notes check.')
	        else:
	            print("The number of words does't match the number of notes' windows. ",
	                  'You should split the note(s) for each word by | mark.')
	            print(ph_per_word_lst, note_per_word_lst, mididur_per_word_lst)
	            print(len(ph_per_word_lst), len(note_per_word_lst), len(mididur_per_word_lst))
	            return None

	        note_lst = []
	        ph_lst = []
	        midi_dur_lst = []
	        is_slur = []
	        for ph_per_word, note_window, dur_window in zip(
	                ph_per_word_lst, note_per_word_lst, mididur_per_word_lst):
	            # for phs in one word:
	            # single ph like ['ai'] or multiple phs like ['n', 'i']
	            ph_in_this_word = ph_per_word.split()

	            # for notes in one word:
	            # single note like ['D4'] or multiple notes like ['D4', 'E4'] which means a 'slur' here.
	            note_in_this_word = note_window.split()
	            midi_dur_in_this_word = dur_window.split()
	            if len(midi_dur_in_this_word) < len(note_in_this_word):
	                # Every note needs its own duration; report it like the other
	                # input errors instead of failing with an IndexError below.
	                print('The number of durations is smaller than the number of notes in a word: ',
	                      note_in_this_word, midi_dur_in_this_word)
	                return None

	            # Step 1.
	            # Deal with note of 'not slur' case or the first note of 'slur' case
	            # j        ie
	            # F#4/Gb4  F#4/Gb4
	            # 0        0
	            for ph in ph_in_this_word:
	                ph_lst.append(ph)
	                note_lst.append(note_in_this_word[0])
	                midi_dur_lst.append(midi_dur_in_this_word[0])
	                is_slur.append(0)

	            # Step 2.
	            # Deal with the 2nd, 3rd... notes of 'slur' case: repeat the last
	            # phoneme (the YUNMU) once per extra note.
	            # j        ie       ie
	            # F#4/Gb4  F#4/Gb4  C#4/Db4
	            # 0        0        1
	            for slur_note, slur_dur in zip(note_in_this_word[1:], midi_dur_in_this_word[1:]):
	                ph_lst.append(ph_in_this_word[-1])
	                note_lst.append(slur_note)
	                midi_dur_lst.append(slur_dur)
	                is_slur.append(1)
	        ph_seq = ' '.join(ph_lst)

	        # The three lists are appended to in lockstep, so their lengths always agree.
	        print(len(ph_lst), len(note_lst), len(midi_dur_lst))
	        print('Pass word-notes check.')
	        return ph_seq, note_lst, midi_dur_lst, is_slur

	    def preprocess_phoneme_level_input(self, inp):
	        """Parse phoneme-level input (like transcriptions.txt in Opencpop).

	        :param inp: dict with space-separated strings ``'ph_seq'``,
	            ``'note_seq'``, ``'note_dur_seq'`` and ``'is_slur_seq'``.
	        :return: ``(ph_seq, note_lst, midi_dur_lst, is_slur)`` where ``ph_seq``
	            is the unchanged string, the note and duration lists hold strings
	            and ``is_slur`` holds floats; ``None`` when the four sequences do
	            not have the same length.
	        :raises ValueError: if an ``is_slur_seq`` entry is not a number.
	        """
	        ph_seq = inp['ph_seq']
	        note_lst = inp['note_seq'].split()
	        midi_dur_lst = inp['note_dur_seq'].split()
	        is_slur = [float(x) for x in inp['is_slur_seq'].split()]
	        print(len(note_lst), len(ph_seq.split()), len(midi_dur_lst))
	        if len(note_lst) == len(ph_seq.split()) == len(midi_dur_lst) == len(is_slur):
	            print('Pass word-notes check.')
	        else:
	            print("The number of words does't match the number of notes' windows. ",
	                  'You should split the note(s) for each word by | mark.')
	            return None
	        return ph_seq, note_lst, midi_dur_lst, is_slur

	    def preprocess_input(self, inp, input_type='word'):
	        """Convert raw user input into a model-ready item.

	        :param inp: dict with ``'text'`` (str), optional ``'item_name'`` and
	            ``'spk_name'``, plus the keys required by the chosen
	            ``input_type`` (``'notes'``/``'notes_duration'`` for ``'word'``;
	            ``'ph_seq'``/``'note_seq'``/``'note_dur_seq'``/``'is_slur_seq'``
	            for ``'phoneme'``).
	        :param input_type: ``'word'`` or ``'phoneme'``.
	        :return: dict with item_name, text, ph, spk_id, ph_token, pitch_midi,
	            midi_dur, is_slur and ph_len; ``None`` (after printing the reason)
	            when the input is invalid.
	        :raises KeyError: if ``spk_name`` is not in ``self.spk_map``.
	        """
	        item_name = inp.get('item_name', '<ITEM_NAME>')
	        spk_name = inp.get('spk_name', 'opencpop')

	        # single spk
	        spk_id = self.spk_map[spk_name]

	        # get ph seq, note lst, midi dur lst, is slur lst.
	        if input_type == 'word':
	            ret = self.preprocess_word_level_input(inp)
	        elif input_type == 'phoneme':  # like transcriptions.txt in Opencpop dataset.
	            ret = self.preprocess_phoneme_level_input(inp)
	        else:
	            print('Invalid input type.')
	            return None

	        if ret:
	            ph_seq, note_lst, midi_dur_lst, is_slur = ret
	        else:
	            print('==========> Preprocess_word_level or phone_level input wrong.')
	            return None

	        # convert note lst to midi id; convert note dur lst to midi duration
	        try:
	            # 'rest' maps to 0; for names like 'F#4/Gb4' only the part before '/' is used.
	            midis = [librosa.note_to_midi(x.split("/")[0]) if x != 'rest' else 0
	                     for x in note_lst]
	            midi_dur_lst = [float(x) for x in midi_dur_lst]
	        except Exception as e:
	            print(e)
	            print('Invalid Input Type.')
	            return None

	        ph_token = self.ph_encoder.encode(ph_seq)
	        item = {'item_name': item_name, 'text': inp['text'], 'ph': ph_seq, 'spk_id': spk_id,
	                'ph_token': ph_token, 'pitch_midi': np.asarray(midis), 'midi_dur': np.asarray(midi_dur_lst),
	                'is_slur': np.asarray(is_slur), }
	        item['ph_len'] = len(item['ph_token'])
	        return item

	    def input_to_batch(self, item):
	        """Wrap one preprocessed item into a batch of size 1 on ``self.device``.

	        :param item: dict as returned by :meth:`preprocess_input`.
	        :return: dict of lists (item_name, text, ph) and tensors (txt_tokens,
	            txt_lengths, spk_ids, pitch_midi, midi_dur, is_slur). The three
	            note-level tensors are truncated to ``hparams['max_frames']``
	            entries along their second axis.
	        """
	        max_frames = self.hparams['max_frames']
	        item_names = [item['item_name']]
	        text = [item['text']]
	        ph = [item['ph']]
	        txt_tokens = torch.LongTensor(item['ph_token'])[None, :].to(self.device)
	        txt_lengths = torch.LongTensor([txt_tokens.shape[1]]).to(self.device)
	        # torch.LongTensor(<int>) would allocate an uninitialised tensor of that
	        # *size* (empty for id 0); wrap the id in a list to store its value.
	        # NOTE(review): yields shape [1] (one id per batch element) - confirm
	        # against what the model expects for `spk_ids`.
	        spk_ids = torch.LongTensor([item['spk_id']]).to(self.device)

	        pitch_midi = torch.LongTensor(item['pitch_midi'])[None, :max_frames].to(self.device)
	        midi_dur = torch.FloatTensor(item['midi_dur'])[None, :max_frames].to(self.device)
	        is_slur = torch.LongTensor(item['is_slur'])[None, :max_frames].to(self.device)

	        batch = {
	            'item_name': item_names,
	            'text': text,
	            'ph': ph,
	            'txt_tokens': txt_tokens,
	            'txt_lengths': txt_lengths,
	            'spk_ids': spk_ids,
	            'pitch_midi': pitch_midi,
	            'midi_dur': midi_dur,
	            'is_slur': is_slur
	        }
	        return batch

	    def postprocess_output(self, output):
	        """Hook for subclasses; returns the model output unchanged."""
	        return output

	    def infer_once(self, inp):
	        """Preprocess ``inp``, run the model and post-process its output.

	        :param inp: raw input dict; ``inp['input_type']`` selects ``'word'``
	            (default when missing or empty) or ``'phoneme'`` parsing.
	        :return: whatever :meth:`postprocess_output` returns.
	        :raises ValueError: if preprocessing rejects the input.
	        """
	        item = self.preprocess_input(inp, input_type=inp.get('input_type') or 'word')
	        if item is None:
	            # Fail here with a clear message rather than inside forward_model.
	            raise ValueError('Input preprocessing failed; see the printed messages for the reason.')
	        output = self.forward_model(item)
	        output = self.postprocess_output(output)
	        return output

	    @classmethod
	    def example_run(cls, inp):
	        """Load the global hparams, synthesise ``inp`` and save it to infer_out/example_out.wav."""
	        from utils.audio import save_wav
	        set_hparams(print_hparams=False)
	        infer_ins = cls(hparams)
	        out = infer_ins.infer_once(inp)
	        os.makedirs('infer_out', exist_ok=True)
	        save_wav(out, 'infer_out/example_out.wav', hparams['audio_sample_rate'])


	# if __name__ == '__main__':
	# debug
	# a = BaseSVSInfer(hparams)
	# a.preprocess_input({'text': '你说你不 SP 懂为何在这时牵手 AP',
	# 'notes': 'D#4/Eb4 | D#4/Eb4 | D#4/Eb4 | D#4/Eb4 | rest | D#4/Eb4 | D4 | D4 | D4 | D#4/Eb4 | F4 | D#4/Eb4 | D4 | rest',
	# 'notes_duration': '0.113740 | 0.329060 | 0.287950 | 0.133480 | 0.150900 | 0.484730 | 0.242010 | 0.180820 | 0.343570 | 0.152050 | 0.266720 | 0.280310 | 0.633300 | 0.444590'
	# })

	# b = {
	# 'text': '小酒窝长睫毛AP是你最美的记号',
	# 'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
	# 'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340'
	# }
	# c = {
	# 'text': '小酒窝长睫毛AP是你最美的记号',
	# 'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao',
	# 'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4',
	# 'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340',
	# 'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
	# } # input like Opencpop dataset.
	# a.preprocess_input(b)
	# a.preprocess_input(c, input_type='phoneme')