MakeDiffSinger / acoustic_forced_alignment /combine_tg.py

Upload folder using huggingface_hub

79cf5f5 verified over 1 year ago

4.06 kB

	import pathlib
	import re
	from typing import Dict, List

	import click
	import librosa
	import natsort
	import numpy
	import soundfile
	import textgrid
	import tqdm


	def remove_suffix(string, suffix_pattern):
	    """Strip a trailing match of a regular expression from a string.

	    The pattern is anchored by appending ``$`` to it, so a pattern with
	    top-level alternation should be grouped by the caller.

	    Args:
	        string: Text to strip (here: a file stem).
	        suffix_pattern: Regular expression source describing the suffix.

	    Returns:
	        ``string`` without the matched suffix, or ``string`` unchanged when
	        the pattern does not match or matches an empty suffix.
	    """
	    match = re.search(f'{suffix_pattern}$', string)
	    if not match:
	        return string
	    # Slice with an explicit end index: for an empty match the former
	    # ``string[:-0]`` is ``string[:0]`` and would discard the whole string.
	    return string[:len(string) - len(match.group())]


	def _append_shifted_intervals(source_tier, target_tier, offset, final_end):
	    """Copy the intervals of ``source_tier`` onto ``target_tier``, back to back.

	    Intervals are laid out consecutively starting at ``offset`` using their
	    own durations; the last interval is forced to end at ``final_end``.

	    Args:
	        source_tier: Tier of a segment TextGrid to copy from.
	        target_tier: Tier of the combined TextGrid to append to.
	        offset: Start time of the segment inside the combined timeline.
	        final_end: End time of the segment inside the combined timeline.
	    """
	    start = offset
	    last_index = len(source_tier) - 1
	    for index, interval in enumerate(source_tier):
	        # Snap the final interval to the segment end so the tier closes
	        # exactly where the segment's audio does.
	        end = final_end if index == last_index else start + interval.duration()
	        target_tier.add(minTime=start, maxTime=end, mark=interval.mark)
	        start = end


	@click.command(help='Combine segmented 2-tier TextGrids and wavs into 3-tier TextGrids and long wavs')
	@click.option(
	    '--wavs', required=True,
	    help='Directory containing the segmented wav files'
	)
	@click.option(
	    '--tg', required=False,
	    help='Directory containing the segmented TextGrid files (defaults to wav directory)'
	)
	@click.option(
	    '--out', required=True,
	    help='Path to output directory for combined files'
	)
	@click.option(
	    '--suffix', required=False, default=r'_\d+',
	    help='Filename suffix pattern for file combination'
	)
	@click.option(
	    '--wav_subtype', required=False, default='PCM_16',
	    help='Wav subtype (defaults to PCM_16)'
	)
	@click.option(
	    '--overwrite', is_flag=True,
	    help='Overwrite existing files'
	)
	def combine_tg(wavs, tg, out, suffix, wav_subtype, overwrite):
	    """Merge segment TextGrid/wav pairs into one TextGrid and one wav per group.

	    ``*.TextGrid`` files in the TextGrid directory are grouped by their stem
	    with the trailing ``suffix`` pattern removed. Each group is processed in
	    natural sort order: the matching wav of every segment is loaded, the
	    audio is concatenated, and a TextGrid with the tiers ``sentences`` (one
	    interval per segment, marked with the segment's stem), ``words`` (from
	    the segment's first tier) and ``phones`` (from its second tier) is built
	    on the combined timeline.

	    Args:
	        wavs: Directory containing the segmented wav files.
	        tg: Directory containing the segmented TextGrid files; the wav
	            directory is used when this is ``None``.
	        out: Output directory; created if missing.
	        suffix: Regular expression for the filename suffix that
	            distinguishes segments of the same recording.
	        wav_subtype: Subtype passed to ``soundfile.write``.
	        overwrite: Allow replacing existing output files.

	    Raises:
	        FileExistsError: An output file exists and ``overwrite`` is not set.
	        ValueError: Segments of one group have different sample rates.
	    """
	    wav_path_in = pathlib.Path(wavs)
	    tg_path_in = wav_path_in if tg is None else pathlib.Path(tg)
	    combined_path_out = pathlib.Path(out)
	    combined_path_out.mkdir(parents=True, exist_ok=True)

	    # Group the segment TextGrids by their stem without the segment suffix.
	    filelist: Dict[str, List[pathlib.Path]] = {}
	    for tg_file in tg_path_in.glob('*.TextGrid'):
	        filelist.setdefault(remove_suffix(tg_file.stem, suffix), []).append(tg_file)

	    for name, files in tqdm.tqdm(sorted(filelist.items(), key=lambda kv: kv[0])):
	        tg_file_out = combined_path_out / f'{name}.TextGrid'
	        wav_file_out = tg_file_out.with_suffix('.wav')
	        # Refuse to overwrite before any audio is loaded, so a clash is
	        # reported without first doing the expensive work for this group.
	        if wav_file_out.exists() and not overwrite:
	            raise FileExistsError(str(wav_file_out))
	        if tg_file_out.exists() and not overwrite:
	            raise FileExistsError(str(tg_file_out))

	        wav_segments = []
	        sentences_tier = textgrid.IntervalTier(name='sentences')
	        words_tier = textgrid.IntervalTier(name='words')
	        phones_tier = textgrid.IntervalTier(name='phones')
	        sentence_start = 0.
	        sr = None
	        for tg_file in natsort.natsorted(files):
	            wav_file = (wav_path_in / tg_file.name).with_suffix('.wav')
	            waveform, sr_ = librosa.load(wav_file, sr=None)
	            if sr is None:
	                sr = sr_
	            elif sr_ != sr:
	                # An explicit raise (not ``assert``) so the check also runs
	                # under ``python -O``.
	                raise ValueError(
	                    f'Cannot combine \'{tg_file.stem}\': incompatible samplerate ({sr_} != {sr})'
	                )
	            sentence_end = waveform.shape[0] / sr + sentence_start
	            wav_segments.append(waveform)
	            sentences_tier.add(minTime=sentence_start, maxTime=sentence_end, mark=wav_file.stem)

	            sentence_tg = textgrid.TextGrid()
	            sentence_tg.read(tg_file)
	            _append_shifted_intervals(sentence_tg[0], words_tier, sentence_start, sentence_end)
	            _append_shifted_intervals(sentence_tg[1], phones_tier, sentence_start, sentence_end)
	            sentence_start = sentence_end

	        combined_tg = textgrid.TextGrid()
	        combined_tg.append(sentences_tier)
	        combined_tg.append(words_tier)
	        combined_tg.append(phones_tier)

	        combined_tg.write(tg_file_out)
	        full_wav = numpy.concatenate(wav_segments)
	        soundfile.write(wav_file_out, full_wav, samplerate=sr, subtype=wav_subtype)


	# Script entry point: run the click command only when executed directly,
	# not when this module is imported.
	if __name__ == "__main__":
	    combine_tg()