camenduru
/

singing_voice_conversion

Model card Files Files and versions

Metrics Training metrics Community

singing_voice_conversion / utils /duration.py

camenduru's picture

thanks to amphion ❤

f951701 about 2 years ago

history blame contribute delete

2.22 kB

	# Copyright (c) 2023 Amphion.
	#
	# This source code is licensed under the MIT license found in the
	# LICENSE file in the root directory of this source tree.

	import numpy as np
	import os
	import tgt


	def get_alignment(tier, cfg):
	    """Turn an aligned phone tier into phone labels and per-phone frame counts.

	    Silence labels ("sil", "sp", "spn") at the beginning and at the end of
	    the tier are discarded; silences lying between two ordinary phones are
	    kept.

	    Args:
	        tier: Object whose ``_objects`` attribute is iterated; each item
	            must expose ``start_time``, ``end_time`` and ``text``.
	        cfg: Mapping that provides ``"sample_rate"`` and ``"hop_size"``.

	    Returns:
	        A tuple ``(phones, durations, start_time, end_time)``:
	        ``phones`` is the list of kept labels, ``durations`` the matching
	        list of integer frame counts, ``start_time`` the start of the first
	        non-silent interval and ``end_time`` the end of the last non-silent
	        interval. Both times stay ``0`` when no ordinary phone is found.
	        NOTE(review): the times are presumably in seconds -- confirm.
	    """
	    sr = cfg["sample_rate"]
	    hop = cfg["hop_size"]
	    silence_labels = ("sil", "sp", "spn")

	    labels = []
	    frame_counts = []
	    first_onset = 0
	    last_offset = 0
	    keep_count = 0  # number of entries up to and including the last real phone

	    for interval in tier._objects:
	        onset = interval.start_time
	        offset = interval.end_time
	        label = interval.text
	        is_silence = label in silence_labels

	        # Nothing collected yet: skip leading silences, otherwise remember
	        # where the first real phone starts.
	        if not labels:
	            if is_silence:
	                continue
	            first_onset = onset

	        labels.append(label)
	        if not is_silence:
	            # Only ordinary phones move the trailing-trim boundary forward.
	            last_offset = offset
	            keep_count = len(labels)

	        # Frame count is the difference of the rounded frame indices of the
	        # interval's two edges.
	        first_frame = np.round(onset * sr / hop)
	        last_frame = np.round(offset * sr / hop)
	        frame_counts.append(int(last_frame - first_frame))

	    # Slicing at keep_count drops any trailing silences.
	    return labels[:keep_count], frame_counts[:keep_count], first_onset, last_offset


	def get_duration(utt, wav, cfg):
	    """Load the TextGrid alignment of one utterance and derive its durations.

	    Args:
	        utt: Mapping with the keys ``"Singer"``, ``"Uid"`` and ``"Dataset"``,
	            which are used to build the file paths.
	        wav: Not used by this function.
	        cfg: Config object read both by key (``cfg["sample_rate"]``) and by
	            attribute (``cfg.processed_dir``).

	    Returns:
	        ``None`` when the aligned start is not strictly before the aligned
	        end. Otherwise a tuple ``(duration, text, start, end)``: the
	        per-phone frame counts, the phones joined by spaces and wrapped in
	        braces, and the start/end returned by ``get_alignment`` multiplied
	        by the sample rate and truncated to ``int``.
	    """
	    singer, uid, corpus = utt["Singer"], utt["Uid"], utt["Dataset"]
	    sr = cfg["sample_rate"]

	    # print(cfg.processed_dir, dataset, speaker, basename)
	    wav_name = "{}.wav".format(uid)
	    lab_name = "{}.lab".format(uid)
	    grid_name = "{}.TextGrid".format(uid)
	    wav_path = os.path.join(cfg.processed_dir, corpus, "raw_data", singer, wav_name)
	    text_path = os.path.join(cfg.processed_dir, corpus, "raw_data", singer, lab_name)
	    tg_path = os.path.join(cfg.processed_dir, corpus, "TextGrid", singer, grid_name)

	    # Read the first line of the transcript; the value is not used below,
	    # but the read still fails if the .lab file is missing.
	    with open(text_path, "r") as lab_file:
	        raw_text = lab_file.readline().strip("\n")

	    # Fetch the "phones" tier and convert it to labels and frame counts.
	    grid = tgt.io.read_textgrid(tg_path)
	    phone_tier = grid.get_tier_by_name("phones")
	    phone, duration, start, end = get_alignment(phone_tier, cfg)

	    text = "".join(["{", " ".join(phone), "}"])
	    if not start < end:
	        return None

	    start_sample = int(sr * start)
	    end_sample = int(sr * end)
	    return duration, text, start_sample, end_sample