Xinsheng-Wang's picture
Upload folder using huggingface_hub
c7f3ffb verified
import numpy as np
import torch
def regulate_real_note_itv(note_itv, note_bd, word_bd, word_durs, hop_size, audio_sample_rate):
    """Snap note intervals to word boundaries and convert them to seconds.

    Args:
        note_itv: (num_notes, 2) array of note [start, end] frame indices.
        note_bd: frame-level 0/1 note-boundary array; num_notes == sum(note_bd) + 1.
        word_bd: frame-level 0/1 word-boundary array; must contain no more
            boundaries than note_bd.
        word_durs: (num_words,) per-word durations (presumably in seconds —
            confirm against caller); num_words == sum(word_bd) + 1.
        hop_size: hop length in samples, used to convert frames to samples.
        audio_sample_rate: sample rate in Hz, used to convert samples to seconds.

    Returns:
        note_itv_secs: (num_notes, 2) note intervals in seconds; endpoints that
            land exactly on a word boundary are replaced by the word's exact
            start/end time from word_durs.
        note2words: (num_notes,) 1-based word index assigned to each note
            ("mel2ph fashion"); stays 1 for notes whose endpoints hit no
            word boundary.
    """
    # regulate note_itv in seconds according to the correspondence between note_bd and word_bd
    assert note_itv.shape[0] == np.sum(note_bd) + 1
    assert np.sum(word_bd) <= np.sum(note_bd)
    assert word_durs.shape[0] == np.sum(word_bd) + 1, f"{word_durs.shape[0]} {np.sum(word_bd) + 1}"
    # mark each word-boundary frame with its 1-based boundary index
    word_bd = np.cumsum(word_bd) * word_bd  # [0,1,0,0,1,0,0,0] -> [0,1,0,0,2,0,0,0]
    word_itv = np.zeros((word_durs.shape[0], 2))  # (num_words, 2) word [start, end] times
    word_offsets = np.cumsum(word_durs)
    note2words = np.zeros(note_itv.shape[0], dtype=int)
    # build word intervals from cumulative durations: word idx ends where word idx+1 starts
    for idx in range(len(word_offsets) - 1):
        word_itv[idx, 1] = word_itv[idx + 1, 0] = word_offsets[idx]
    word_itv[-1, 1] = word_offsets[-1]
    note_itv_secs = note_itv * hop_size / audio_sample_rate  # frames -> seconds
    for idx, itv in enumerate(note_itv):
        start_idx, end_idx = itv
        if word_bd[start_idx] > 0:
            # note starts on the k-th word boundary -> snap to word k's start time
            word_dur_idx = word_bd[start_idx]
            note_itv_secs[idx, 0] = word_itv[word_dur_idx, 0]
            note2words[idx] = word_dur_idx
        if word_bd[end_idx] > 0:
            # note ends on the k-th word boundary -> snap to word (k-1)'s end time;
            # this word assignment overrides the start-based one when both endpoints
            # hit a boundary
            word_dur_idx = word_bd[end_idx] - 1
            note_itv_secs[idx, 1] = word_itv[word_dur_idx, 1]
            note2words[idx] = word_dur_idx
    note2words += 1  # mel2ph fashion: start from 1
    return note_itv_secs, note2words
def regulate_ill_slur(notes, note_itv, note2words):
    """Merge ill-formed slurs: consecutive notes with the same pitch that belong
    to the same word are collapsed into one note spanning their union interval.

    Args:
        notes: (num_notes,) array of note pitches.
        note_itv: (num_notes, 2) array of note [start, end] values.
        note2words: (num_notes,) word index for each note.

    Returns:
        Tuple (notes, note_itv, note2words) as numpy arrays with the same
        dtypes as the inputs, with repeated same-word, same-pitch notes merged.
    """
    out_notes, out_itv, out_words = [], [], []
    total = len(notes)
    start = 0
    while start < total:
        # advance `stop` past the run of notes mapped to the same word as `start`
        stop = start
        while stop < total and note2words[start] == note2words[stop]:
            stop += 1
        # always keep the first note of the run
        out_notes.append(notes[start])
        out_itv.append(note_itv[start].tolist())
        out_words.append(note2words[start])
        for j in range(start + 1, stop):
            if notes[j] == notes[j - 1]:
                # same pitch within the same word: absorb into the previous note
                out_itv[-1][1] = note_itv[j][1]
            else:
                out_notes.append(notes[j])
                out_itv.append(note_itv[j].tolist())
                out_words.append(note2words[j])
        start = stop
    out_notes = np.array(out_notes, dtype=notes.dtype)
    out_itv = np.array(out_itv, dtype=note_itv.dtype)
    out_words = np.array(out_words, dtype=note2words.dtype)
    return out_notes, out_itv, out_words
def bd_to_idxs(bd):
    """Return the positions of all boundary markers (entries equal to 1) in *bd*.

    Args:
        bd: sequence of 0/1 boundary flags, shape [T].

    Returns:
        List of indices where bd[idx] == 1, in increasing order.
    """
    # Idiomatic comprehension replaces the manual range(len(...))/append loop.
    return [idx for idx, flag in enumerate(bd) if flag == 1]
def bd_to_durs(bd):
    """Convert a 0/1 boundary sequence of length T into segment durations.

    Segments run from position 0 to the first boundary, between consecutive
    boundaries, and from the last boundary to T; the durations sum to len(bd).

    Args:
        bd: sequence of 0/1 boundary flags, shape [T].

    Returns:
        List of segment lengths (one more entry than there are boundaries).
    """
    boundary_positions = [pos for pos, flag in enumerate(bd) if flag == 1]
    # pad with the implicit start (0) and end (T) edges, then take differences
    edges = [0] + boundary_positions + [len(bd)]
    return [right - left for left, right in zip(edges[:-1], edges[1:])]
def get_mel_len(wav_len, hop_size):
    """Return the number of mel frames for *wav_len* samples: ceil(wav_len / hop_size).

    Uses the negate-floor-negate trick, which equals (wav_len + hop_size - 1) // hop_size
    for any integer wav_len and positive hop_size.
    """
    return -(-wav_len // hop_size)
def mel2token_to_dur(mel2token, T_txt=None, max_dur=None):
    """Convert a frame-to-token alignment into per-token frame counts.

    Args:
        mel2token: [T] or [B, T] of 1-based token indices per frame (0 = padding);
            torch tensor or any sequence convertible by torch.LongTensor.
        T_txt: number of tokens; defaults to mel2token.max().
        max_dur: if given, clamp every duration to this maximum.

    Returns:
        Durations of shape [T_txt] or [B, T_txt]; a numpy array if the input
        was not a torch tensor, otherwise a tensor.
    """
    was_tensor = isinstance(mel2token, torch.Tensor)
    if not was_tensor:
        mel2token = torch.LongTensor(mel2token)
    if T_txt is None:
        T_txt = mel2token.max()
    # remember whether a batch axis had to be added so it can be stripped again
    squeezed = mel2token.dim() == 1
    if squeezed:
        mel2token = mel2token.unsqueeze(0)
    n_batch = mel2token.shape[0]
    # count frames per token id; slot 0 collects padding and is dropped below
    dur = mel2token.new_zeros(n_batch, T_txt + 1)
    dur = dur.scatter_add(1, mel2token, torch.ones_like(mel2token))
    dur = dur[:, 1:]
    if max_dur is not None:
        dur = dur.clamp(max=max_dur)
    if not was_tensor:
        dur = dur.numpy()
    if squeezed:
        dur = dur[0]
    return dur
def align_word(word_durs, mel_len, hop_size, audio_sample_rate):
    """Build a frame-level word alignment from per-word durations in seconds.

    Args:
        word_durs: per-word durations in seconds.
        mel_len: total number of mel frames.
        hop_size: hop length in samples.
        audio_sample_rate: sample rate in Hz.

    Returns:
        Tuple (mel2word, dur_word): mel2word[t] is the 1-based index of the word
        covering frame t (0 where no word covers it); dur_word is the list of
        per-word frame counts derived via mel2token_to_dur.
    """
    mel2word = np.zeros([mel_len], int)
    elapsed = 0
    for w_idx, dur in enumerate(word_durs):
        # round each second offset to the nearest frame index
        first_frame = int(elapsed * audio_sample_rate / hop_size + 0.5)
        last_frame = int((elapsed + dur) * audio_sample_rate / hop_size + 0.5)
        mel2word[first_frame:last_frame] = w_idx + 1
        elapsed = elapsed + dur
    dur_word = mel2token_to_dur(mel2word)
    return mel2word, dur_word.tolist()